def get_nearest_sentence_ids(query_index, db_index, topk, nprobe, batch_size=1024, use_gpu=True):
    """Find the ``topk`` nearest database entries for every vector stored in ``query_index``.

    Args:
        query_index: faiss index holding the query vectors; they are read back
            batch by batch with ``reconstruct_n``.
        db_index: faiss index that is searched.
        topk: number of neighbours returned per query.
        nprobe: value assigned to the ``nprobe`` parameter of ``db_index``
            (silently skipped when the index does not accept that parameter).
        batch_size: number of query vectors reconstructed and searched per step.
        use_gpu: when True, ``db_index`` is cloned to all GPUs before searching.

    Returns:
        ``(all_distances, all_sentence_ids)``: two arrays of shape
        ``(query_index.ntotal, topk)``; the ids are integers.
    """
    try:
        faiss.ParameterSpace().set_index_parameter(db_index, 'nprobe', nprobe)
    except RuntimeError as e:
        # Only the "index has no nprobe parameter" failure is tolerated.
        if 'could not set parameter nprobe' not in str(e):
            raise
    if use_gpu:
        db_index = faiss.index_cpu_to_all_gpus(db_index)
    n_queries = query_index.ntotal
    all_distances = np.empty((n_queries, topk))
    all_sentence_ids = np.empty((n_queries, topk), dtype=int)
    # Stepping by batch_size never produces an empty trailing batch, unlike
    # iterating over `ntotal // batch_size + 1` batches.
    for start_idx in range(0, n_queries, batch_size):
        end_idx = min(start_idx + batch_size, n_queries)
        # TODO: Do this in the background
        query_embeddings = query_index.reconstruct_n(start_idx, end_idx - start_idx)
        distances, sentence_ids = db_index.search(query_embeddings, topk)
        all_distances[start_idx:end_idx] = distances
        all_sentence_ids[start_idx:end_idx] = sentence_ids
    # If distances are sorted in descending order, we make them ascending
    # instead for the following code to work.
    # NOTE(review): with topk == 1 np.diff is empty, so this test is vacuously
    # true and the flip is applied whatever the metric -- confirm with callers.
    if np.all(np.diff(all_distances) <= 0):
        # This is tailored for transforming cosine similarity into a
        # pseudo-distance: the maximum cosine similarity is 1 (equal vectors),
        # hence 1 - cosine is never negative and is 0 for equal vectors.
        all_distances = 1 - all_distances
    return all_distances, all_sentence_ids.astype(int)
def __init__(self, dim=10, nlist=100, gpu=-1):
    """Create an IVF-Flat index with the L2 metric, optionally cloned to GPU.

    Args:
        dim: dimensionality of the indexed vectors.
        nlist: number of cluster centroids (inverted lists) of the IVF index.
        gpu: negative keeps the index on CPU, ``0`` clones it to GPU 0, any
            positive value clones it to all GPUs. If the GPU clone fails the
            CPU index is kept and a warning is emitted.
    """
    self.dim = dim
    self.nlist = nlist  # number of cluster centroids
    # Flat L2 index used as the coarse quantizer of the IVF index.
    quantizer = faiss.IndexFlatL2(dim)
    # faiss defines two similarity metrics: faiss.METRIC_L2 (Euclidean
    # distance) and faiss.METRIC_INNER_PRODUCT (inner product); L2 is
    # requested explicitly here.
    self.index = faiss.IndexIVFFlat(quantizer, dim, self.nlist, faiss.METRIC_L2)
    if gpu >= 0:
        try:
            if gpu == 0:
                # use a single GPU
                res = faiss.StandardGpuResources()
                gpu_index = faiss.index_cpu_to_gpu(res, 0, self.index)
            else:
                gpu_index = faiss.index_cpu_to_all_gpus(self.index)
        except Exception as err:
            # Best effort: stay on CPU, but say so instead of failing silently.
            import warnings
            warnings.warn("could not move faiss index to GPU, staying on CPU: %r" % (err,))
        else:
            self.index = gpu_index
    # data
    self.xb = None
def __init__(self, iterator=None, filename=None, embeddings=None, shape=None, device="cpu"):
    """Load a faiss index from ``filename`` or build a flat inner-product one.

    Args:
        iterator: stored unchanged on ``self.iterator``.
        filename: path of the index file. If it exists the index is read from
            it; otherwise the freshly built index is written there. ``None``
            disables both reading and writing.
        embeddings: vectors added to a newly built index; they are
            L2-normalised in place by ``faiss.normalize_L2``.
        shape: vector dimensionality handed to ``faiss.index_factory``.
        device: ``"cuda"`` clones the index to all GPUs; any other value
            leaves it on CPU.
    """
    self.iterator = iterator
    if filename is not None and os.path.exists(filename):
        print(f'Index file {filename}')
        self.index = faiss.read_index(filename)
    else:
        self.index = faiss.index_factory(shape, "Flat", faiss.METRIC_INNER_PRODUCT)
        # Modifies `embeddings` in place.
        faiss.normalize_L2(embeddings)
        self.index.add(embeddings)
        if filename is not None:
            faiss.write_index(self.index, filename)
            print(f'Index written at {filename}')
    if device == "cuda":
        print('Now running on CUDA')
        self.index = faiss.index_cpu_to_all_gpus(self.index)
    print(f'Index trained - {self.index.is_trained}')
def create_gpu(dim):
    """Return the index built by ``create_cpu(dim)`` with a multi-GPU flat L2 clustering index attached."""
    flat_on_gpus = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(dim))
    cpu_index = create_cpu(dim)
    cpu_index.clustering_index = flat_on_gpus
    # Second Python-side reference to the GPU index; the attribute name
    # suggests it exists to keep the object from being deallocated.
    cpu_index.dont_dealloc_me = flat_on_gpus
    return cpu_index
def get_nearestneighbors_faiss(xq, xb, k, device, needs_exact=True, verbose=False):
    """Return the ids of the ``k`` nearest rows of ``xb`` for every row of ``xq``.

    A flat L2 index is used when ``needs_exact`` is set or when running on
    ``"cuda"``; otherwise an ``HNSW32`` index with ``efSearch = 64`` is built.

    Args:
        xq: query vectors.
        xb: database vectors.
        k: number of neighbours per query.
        device: ``"cpu"`` or ``"cuda"``.
        needs_exact: force the flat index.
        verbose: print progress and timing.

    Returns:
        The id array produced by ``index.search``.
    """
    assert device in ["cpu", "cuda"]
    on_gpu = device == 'cuda'
    if verbose:
        print("Computing nearest neighbors (Faiss)")

    dim = xq.shape[1]
    if not needs_exact and not on_gpu:
        index = faiss.index_factory(dim, "HNSW32")
        index.hnsw.efSearch = 64
    else:
        index = faiss.IndexFlatL2(dim)
    if on_gpu:
        index = faiss.index_cpu_to_all_gpus(index)

    # The timer covers both adding the database and searching it.
    start = time.time()
    index.add(xb)
    neighbor_ids = index.search(xq, k)[1]
    if verbose:
        print(" NN search (%s) done in %.2f s" % (device, time.time() - start))
    return neighbor_ids
def load_index(path_index, mode="cpu"):
    """Read a faiss index from ``path_index``.

    With ``mode == "gpu"`` and at least one GPU reported by faiss, the index
    is cloned to all GPUs; otherwise the CPU index is returned.
    """
    cpu_index = faiss.read_index(path_index)
    # Short-circuit keeps get_num_gpus() from being called in CPU mode.
    if mode != "gpu" or faiss.get_num_gpus() <= 0:
        return cpu_index
    return faiss.index_cpu_to_all_gpus(cpu_index)
def get_knn(reference_embeddings, test_embeddings, k, embeddings_come_from_same_source=False):
    """
    Look up, for each row of test_embeddings, its k closest rows in
    reference_embeddings (flat L2 search, on all GPUs when faiss reports any).

    Args:
        reference_embeddings: numpy array of size (num_samples, dimensionality).
        test_embeddings: numpy array of size (num_samples2, dimensionality).
        k: int, number of nearest neighbors to find
        embeddings_come_from_same_source: if True, the first hit of every
            query (expected to be the query itself) is dropped.

    Returns:
        Index array with k columns.
    """
    dim = reference_embeddings.shape[1]
    logging.info("running k-nn with k=%d" % k)
    logging.info("embedding dimensionality is %d" % dim)
    knn_index = faiss.IndexFlatL2(dim)
    if faiss.get_num_gpus() > 0:
        knn_index = faiss.index_cpu_to_all_gpus(knn_index)
    knn_index.add(reference_embeddings)
    # One extra neighbour is always requested so the self-match can be dropped.
    neighbors = knn_index.search(test_embeddings, k + 1)[1]
    first = 1 if embeddings_come_from_same_source else 0
    return neighbors[:, first:first + k]
def faiss_knn(feats_train, targets_train, feats_val, targets_val, k):
    """Return the k-NN majority-vote accuracy (in percent) of the validation set.

    All four data arguments must expose ``.numpy()``. The training features
    are indexed with a sharded, float16 flat L2 index on all GPUs; ties
    between equally frequent labels are broken at random via ``shuffle``.
    """
    train_x = feats_train.numpy()
    train_y = targets_train.numpy()
    val_x = feats_val.numpy()
    val_y = targets_val.numpy()

    cloner_opts = faiss.GpuMultipleClonerOptions()
    cloner_opts.useFloat16 = True
    cloner_opts.shard = True
    flat_index = faiss.IndexFlatL2(train_x.shape[-1])
    gpu_index = faiss.index_cpu_to_all_gpus(flat_index, cloner_opts)
    gpu_index.add(train_x)

    _, neighbors = gpu_index.search(val_x, k)
    pred = np.zeros(neighbors.shape[0])
    for row in range(neighbors.shape[0]):
        votes = list(Counter(train_y[neighbors[row]]).items())
        # Shuffling first makes max() pick a random label among ties.
        shuffle(votes)
        pred[row] = max(votes, key=lambda label_count: label_count[1])[0]
    return 100.0 * (pred == val_y).mean()
def mine_nearest_neighbors(
        self, topk, calculate_accuracy=True,
        features_save_path="/scratch/b/bkantarc/jfern090/Projects/Lytica/results/tabledb/pretext/features.npy"):
    """Mine the ``topk`` nearest neighbours (inner product) of every stored feature.

    Args:
        topk: number of neighbours to mine per sample, not counting the
            sample itself.
        calculate_accuracy: also return the fraction of mined neighbours whose
            target equals the anchor's target.
        features_save_path: where the feature matrix is dumped with
            ``np.save``; pass ``None`` to skip the dump. The default is the
            location that used to be hard-coded.

    Returns:
        ``indices`` (``topk + 1`` columns, since each sample is also searched
        against itself), or ``(indices, accuracy)`` when
        ``calculate_accuracy`` is True.
    """
    import faiss
    features = self.features.cpu().numpy()
    dim = features.shape[1]
    index = faiss.IndexFlatIP(dim)
    index = faiss.index_cpu_to_all_gpus(index)
    index.add(features)
    _, indices = index.search(features, topk + 1)  # Sample itself is included
    if features_save_path is not None:
        np.save(features_save_path, features)
    if not calculate_accuracy:
        return indices
    # evaluate
    targets = self.targets.cpu().numpy()
    # Column 0 is assumed to be the sample itself and is excluded.
    neighbor_targets = np.take(targets, indices[:, 1:], axis=0)
    anchor_targets = np.repeat(targets.reshape(-1, 1), topk, axis=1)
    accuracy = np.mean(neighbor_targets == anchor_targets)
    return indices, accuracy
def build_faiss_index(nd_feats_array, mode):
    """
    Build a flat L2 faiss index over the given features.

    :param nd_feats_array: 2-D feature array; its second dimension is the
        vector dimensionality.
    :param mode: 0: CPU; 1: GPU; 2: Multi-GPU
    :return: the populated index
    :raises ValueError: if ``mode`` is not 0, 1 or 2 (previously ``None`` was
        returned silently)
    """
    if mode not in (0, 1, 2):
        raise ValueError(
            "mode must be 0 (CPU), 1 (GPU) or 2 (Multi-GPU), got {!r}".format(mode))
    d = nd_feats_array.shape[1]
    cpu_index = faiss.IndexFlatL2(d)  # build the index on CPU
    if mode == 0:
        print("[INFO] Is trained? >> {}".format(cpu_index.is_trained))
        index = cpu_index
    elif mode == 1:
        ngpus = faiss.get_num_gpus()
        print("[INFO] number of GPUs:", ngpus)
        res = faiss.StandardGpuResources()  # use a single GPU
        index = faiss.index_cpu_to_gpu(res, 0, cpu_index)
    else:
        index = faiss.index_cpu_to_all_gpus(cpu_index)  # build the index on multi GPUs
    index.add(nd_feats_array)  # add vectors to the index
    print("[INFO] Capacity of gallery: {}".format(index.ntotal))
    return index
def get_k(bases, xb, params, k_data, dates):
    """Search ``xb`` for the 10 entries closest to the picked stock's K-line.

    The query vector concatenates the open, close, low and high series (each
    divided by the first open value) and the volume series (divided by the
    first volume value).

    Args:
        bases: sequence mapping a row index of ``xb`` to its result entry.
        xb: database vectors added to the faiss index.
        params: request dict; reads ``kLineFirst``, ``pickedStockKLine``
            (``values``, ``volumes``) and ``pickedStockTicks`` (``values``).
        k_data, dates: forwarded to ``back_and_front``.

    Returns:
        ``jsonify(back_and_front(results, k_data, dates))``.
    """
    if params['kLineFirst']:
        dim = len(params['pickedStockKLine']['values'])
    else:
        dim = len(params['pickedStockTicks']['values'])
    # query
    k_line = params['pickedStockKLine']['values']
    volume = params['pickedStockKLine']['volumes']
    first_open = k_line[0][0]
    first_volume = volume[0][1]
    query = []
    # Columns 0..3 of each K-line entry are open, close, low, high.
    for column in range(4):
        query.extend(candle[column] / first_open for candle in k_line)
    query.extend(entry[1] / first_volume for entry in volume)
    xq = np.array([query]).astype('float32')
    # build index
    start = clock()
    cpu_index = faiss.IndexFlatL2(dim * 5)
    gpu_index = faiss.index_cpu_to_all_gpus(cpu_index)
    gpu_index.add(xb)
    D, I = gpu_index.search(xq, 10)  # have not done
    end = clock()
    print(end - start)
    results = [bases[row] for row in I[0]]
    print(results)
    return jsonify(back_and_front(results, k_data, dates))  # not yet
def faiss_search_impl(emb_q, emb_id, emb_size, shift, k=50, search_batch_sz=50000, gpu=True):
    """Search ``emb_id`` for the ``k`` nearest entries of every row of ``emb_q``.

    Queries are processed in chunks of ``search_batch_sz``. Returns two torch
    tensors: ``1 - value`` for every value reported by faiss, and the
    matching ids offset by ``shift``.
    """
    index = faiss.IndexFlat(emb_size)
    if gpu:
        index = faiss.index_cpu_to_all_gpus(index)
    index.add(emb_id)
    print('Total index =', index.ntotal)

    n_queries = len(emb_q)
    vals, inds = [], []
    for begin in tqdm(range(0, n_queries, search_batch_sz)):
        stop = min(begin + search_batch_sz, n_queries)
        batch_val, batch_ind = index.search(emb_q[begin:stop], k)
        vals.append(1 - torch.from_numpy(batch_val))
        inds.append(torch.from_numpy(batch_ind) + shift)
    # Drop the local references before concatenating the results.
    del index, emb_id, emb_q
    return torch.cat(vals), torch.cat(inds)
def _set_mips_index(self):
    """
    Build the flat inner-product Faiss index searched by this object and store
    it, wrapped in an IndexIDMap, on ``self.mips_index``.
    """
    try:
        import faiss
    except ImportError:
        raise Exception("Error: Please install faiss to use FaissMIPSIndex")

    def report(message):
        # Print only when mpu is uninitialised or on data-parallel rank 0.
        if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
            print(message, flush=True)

    report("\n> Building index")
    flat_index = faiss.IndexFlatIP(self.embed_size)

    if not self.use_gpu:
        # wrap the CPU index with IDMap so it supports IDs
        self.mips_index = faiss.IndexIDMap(flat_index)
        report(">> Initialized index on CPU")
    else:
        # cloner options: shard across GPUs and use float16
        cloner_config = faiss.GpuMultipleClonerOptions()
        cloner_config.shard = True
        cloner_config.useFloat16 = True
        on_gpus = faiss.index_cpu_to_all_gpus(flat_index, co=cloner_config)
        self.mips_index = faiss.IndexIDMap(on_gpus)
        report(">> Initialized index on GPU")

    # if we were constructed with a BlockData, load it automatically once the
    # FAISS structure exists
    if self.embed_data is not None:
        self.add_embed_data(self.embed_data)
def __init__(self, database, method):
    """Create the flat faiss index for ``method`` and populate it via ``self.add()``.

    ``method`` must be ``'cosine'`` (inner-product index) or ``'euclidean'``
    (L2 index); any other value raises ``KeyError``. The index is cloned to
    all GPUs when the ``CUDA_VISIBLE_DEVICES`` environment variable is set
    and non-empty.
    """
    super().__init__(database, method)
    index_classes = {
        'cosine': faiss.IndexFlatIP,
        'euclidean': faiss.IndexFlatL2,
    }
    index_cls = index_classes[method]
    self.index = index_cls(self.D)
    if os.environ.get('CUDA_VISIBLE_DEVICES'):
        self.index = faiss.index_cpu_to_all_gpus(self.index)
    self.add()
def compute_GT_sliced(xb, xq, k):
    """Compute exact k-NN ground truth of ``xq`` against ``xb`` slice by slice.

    ``xb`` is processed in blocks of 10**5 rows: each block is added to a
    multi-GPU flat L2 index, searched, merged into a ``ResultHeap`` and then
    removed from the index again.

    Args:
        xb: database vectors, shape ``(nb, d)``.
        xq: query vectors, shape ``(nq, d)``.
        k: number of neighbours per query.

    Returns:
        ``rh.I``, the ids held by the result heap after finalisation.
    """
    print("compute GT")
    t0 = time.time()
    nb, d = xb.shape
    nq, d = xq.shape
    rh = ResultHeap(nq, k)
    bs = 10 ** 5
    xqs = sanitize(xq)
    db_gt = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d))
    # compute ground-truth by blocks of bs, and add to heaps
    for i0 in range(0, nb, bs):
        i1 = min(nb, i0 + bs)
        xsl = sanitize(xb[i0:i1])
        db_gt.add(xsl)
        D, I = db_gt.search(xqs, k)
        # i0 is the offset of this block's ids within xb
        rh.add_batch_result(D, I, i0)
        db_gt.reset()
        # progress line, rewritten in place via the leading carriage return
        print("\r %d/%d, %.3f s" % (i0, nb, time.time() - t0), end="", flush=True)
    print()
    rh.finalize()
    gt_I = rh.I
    print("GT time: %.3f s" % (time.time() - t0))
    return gt_I
def knn_gpu(x, y, k, mem=5 * 1024 * 1024 * 1024):
    """Exact top-``k`` inner-product search of ``x`` against ``y`` in memory-bounded batches.

    Adapted From: https://github.com/google-research/xtreme/blob/
    522434d1aece34131d997a97ce7e9242a51a688a/third_party/utils_retrieve.py

    Args:
        x: query vectors, shape ``(nx, dim)``.
        y: database vectors, shape ``(ny, dim)``.
        k: number of neighbours per query.
        mem: byte budget; both inputs are cut into batches of
            ``mem // (dim * 4)`` rows (at least one row).

    Returns:
        ``(sim, ind)``: ``float32`` similarities and ``int64`` row indices
        into ``y``, both of shape ``(nx, k)``, best match first. When ``y``
        has fewer than ``k`` rows the unused trailing slots hold ``-inf`` and
        ``-1``.
    """
    import faiss
    dim = x.shape[1]
    batch_size = max(1, mem // (dim * 4))
    # -inf / -1 are placeholders; they only survive if y has fewer than k rows.
    sim = np.full((x.shape[0], k), -np.inf, dtype=np.float32)
    ind = np.full((x.shape[0], k), -1, dtype=np.int64)
    for xfrom in range(0, x.shape[0], batch_size):
        xto = min(xfrom + batch_size, x.shape[0])
        bsims, binds = [], []
        for yfrom in range(0, y.shape[0], batch_size):
            yto = min(yfrom + batch_size, y.shape[0])
            idx = faiss.IndexFlatIP(dim)
            idx = faiss.index_cpu_to_all_gpus(idx)
            idx.add(y[yfrom:yto])
            bsim, bind = idx.search(x[xfrom:xto], min(k, yto - yfrom))
            bsims.append(bsim)
            # shift batch-local ids to row numbers of y
            binds.append(bind + yfrom)
            del idx
        bsims = np.concatenate(bsims, axis=1)
        binds = np.concatenate(binds, axis=1)
        # Merge the per-batch candidates: best k columns per query row.
        order = np.argsort(-bsims, axis=1)[:, :k]
        found = order.shape[1]
        sim[xfrom:xto, :found] = np.take_along_axis(bsims, order, axis=1)
        ind[xfrom:xto, :found] = np.take_along_axis(binds, order, axis=1)
    return sim, ind
def _faiss_index_to_device( index: "faiss.Index", device: Optional[Union[int, List[int]]] = None) -> "faiss.Index": """ Sends a faiss index to a device. A device can either be a positive integer (GPU id), a negative integer (all GPUs), or a list of positive integers (select GPUs to use), or `None` for CPU. """ # If device is not specified, then it runs on CPU. if device is None: return index import faiss # noqa: F811 # If the device id is given as an integer if isinstance(device, int): # Positive integers are directly mapped to GPU ids if device > -1: faiss_res = faiss.StandardGpuResources() index = faiss.index_cpu_to_gpu(faiss_res, device, index) # And negative integers mean using all GPUs else: index = faiss.index_cpu_to_all_gpus(index) # Device ids given as a list mean mapping to those devices specified. elif isinstance(device, (list, tuple)): index = faiss.index_cpu_to_gpus_list(index, gpus=list(device)) else: raise TypeError( f"The argument type: {type(device)} is not expected. " + "Please pass in either nothing, a positive int, a negative int, or a list of positive ints." ) return index
def __init__(self, model, *, num_documents, doc_seq_len, documents_memmap_path,
             masks_memmap_path=None, num_evidence=4, reindex_batch_size=4,
             use_faiss_ann=False):
    """Set up the model, the document index and the document dataset.

    Args:
        model: model exposing ``dim``; it is moved to CUDA.
        num_documents: number of documents.
        doc_seq_len: sequence length of each document.
        documents_memmap_path: path of the documents memmap; the knn path is
            this path with ``.knn`` appended.
        masks_memmap_path: optional path of the masks memmap.
        num_evidence: forwarded to ``DocumentDataset``.
        reindex_batch_size: stored on ``self.reindex_batch_size``.
        use_faiss_ann: use ``FaissANN`` instead of a flat L2 index cloned to
            all GPUs.
    """
    super().__init__()
    self.dim = model.dim
    self.num_evidence = num_evidence
    self.model = model.cuda()
    self.num_docs = num_documents
    self.doc_shape = (num_documents, doc_seq_len)
    self.documents_path = documents_memmap_path
    self.knn_path = f'{self.documents_path}.knn'
    self.use_faiss_ann = use_faiss_ann

    if not use_faiss_ann:
        flat_index = faiss.IndexFlatL2(self.dim)
        self.index = faiss.index_cpu_to_all_gpus(flat_index)
    else:
        self.index = FaissANN(self.dim, self.num_docs)

    self.reindex_batch_size = reindex_batch_size
    # reindex() runs only after every attribute above has been set.
    self.reindex()

    self.dataset = DocumentDataset(
        num_documents,
        doc_seq_len,
        num_evidence,
        documents_memmap_path,
        masks_memmap_path,
    )
    self.dataset.set_knn_path(self.knn_path)
def search_gpu(query_path, refer_path, output, topk=100):
    """Run a multi-GPU inner-product search and pickle the results.

    Args:
        query_path: pickle read by ``loadFeaFromPickle`` giving the query
            features and their content keys.
        refer_path: same, for the reference features.
        output: path of the pickle file to write (protocol 2).
        topk: number of neighbours per query.

    The output maps every query content key to a list of
    ``(reference content, score)`` pairs in the order returned by faiss.
    """
    queryfeas, queryconts = loadFeaFromPickle(query_path)
    referfeas, referconts = loadFeaFromPickle(refer_path)
    assert queryfeas.shape[1] == referfeas.shape[1]
    dim = int(queryfeas.shape[1])
    print("=> query feature shape: {}".format(queryfeas.shape), file=sys.stderr)
    print("=> refer feature shape: {}".format(referfeas.shape), file=sys.stderr)
    start = time.time()
    ngpus = faiss.get_num_gpus()
    print("=> search use gpu number of GPUs: {}".format(ngpus), file=sys.stderr)
    cpu_index = faiss.IndexFlat(dim, faiss.METRIC_INNER_PRODUCT)  # build the index
    gpu_index = faiss.index_cpu_to_all_gpus(cpu_index)
    gpu_index.add(referfeas)  # add vectors to the index
    # Was `print(index.ntotal)`, a NameError: no `index` exists in this scope.
    print(gpu_index.ntotal)
    print("=> building gpu index success, total index number: {}".format(gpu_index.ntotal),
          file=sys.stderr)
    distance, ind = gpu_index.search(queryfeas, int(topk))
    assert distance.shape == ind.shape
    end = time.time()
    print("=> searching total use time {}".format(end - start), file=sys.stderr)
    outdic = {}
    for key_id in range(queryfeas.shape[0]):
        outdic[queryconts[key_id]] = [
            (referconts[ref_id], score)
            for ref_id, score in zip(ind[key_id], distance[key_id])
        ]
    print("=> convert search gpu result to output format success")
    # Context manager guarantees the output file is flushed and closed.
    with open(output, "wb") as out_file:
        pickle.dump(outdic, out_file, protocol=2)
def create_faiss_index(vecs, method, n_gpu):
    """
    Create a brute-force ("flat") FAISS index over ``vecs``, populating it on GPU(s) when requested.

    To create a GPU index with FAISS, one first needs to create it on CPU then
    copy it on GPU. A "flat" index uses no approximation techniques.

    Args:
        vecs: 2-D array of vectors to index.
        method: ``'l2'`` (L2 distance), ``'ip'`` (inner product) or ``'cos'``
            (cosine: vectors are L2-normalised, then inner product is used).
        n_gpu: number of GPUs to use; ``0`` (or less) keeps everything on CPU.

    Returns:
        A CPU index containing ``vecs`` (suitable for saving to disk).

    Raises:
        SystemExit: with status 1 when ``method`` is not one of the above.
    """
    if method not in ('l2', 'ip', 'cos'):
        print("Error: Please choose between L2 Distance ('l2'), Inner Product Distance ('ip') or Cosine Distance ('cos') as brute-force method for exact search. Exiting...")
        # A failure must not report success (this used to exit with status 0).
        sys.exit(1)

    # Build flat CPU index given the chosen method.
    if method == 'l2':
        index = faiss.IndexFlatL2(vecs.shape[1])  # Exact Search for L2
    else:
        if method == 'cos':
            # Cosine similarity comes down to normalizing the embeddings
            # beforehand and then applying inner product.
            vecs = preprocessing.normalize(vecs, norm='l2')
        index = faiss.IndexFlatIP(vecs.shape[1])  # Exact Search for Inner Product

    # Convert to flat GPU index.
    on_gpu = n_gpu > 0
    if on_gpu:
        co = faiss.GpuMultipleClonerOptions()
        # If using multiple GPUs, enable sharding so that the dataset is
        # divided across the GPUs rather than replicated.
        co.shard = True
        index = faiss.index_cpu_to_all_gpus(index, co=co, ngpu=n_gpu)

    # Add vectors to the index.
    index.add(vecs)

    # Convert back to cpu index (needed for saving it to disk); not needed
    # when the index never left the CPU.
    if on_gpu:
        index = faiss.index_gpu_to_cpu(index)
    return index
def knn_ground_truth(xq, db_iterator, k, metric_type=faiss.METRIC_L2):
    """Computes the exact KNN search results for a dataset that possibly
    does not fit in RAM but for which we have an iterator that
    returns it block by block.

    Args:
        xq: query vectors, shape ``(nq, d)``.
        db_iterator: iterable yielding database blocks of shape ``(ni, d)``.
        k: number of neighbours per query.
        metric_type: faiss metric used for the flat index.

    Returns:
        ``(D, I)``: the distances and database ids accumulated in the result
        heap over all blocks.
    """
    LOG.info("knn_ground_truth queries size %s k=%d" % (xq.shape, k))
    t0 = time.time()
    nq, d = xq.shape
    # For inner product larger is better, so the heap must keep the maxima;
    # the default heap keeps the k smallest values (correct for distances).
    if metric_type == faiss.METRIC_INNER_PRODUCT:
        rh = faiss.ResultHeap(nq, k, keep_max=True)
    else:
        rh = faiss.ResultHeap(nq, k)

    index = faiss.IndexFlat(d, metric_type)
    if faiss.get_num_gpus():
        LOG.info('running on %d GPUs' % faiss.get_num_gpus())
        index = faiss.index_cpu_to_all_gpus(index)

    # compute ground-truth by blocks, and add to heaps
    i0 = 0
    for xbi in db_iterator:
        ni = xbi.shape[0]
        index.add(xbi)
        D, I = index.search(xq, k)
        # block-local ids -> ids within the whole database
        I += i0
        rh.add_result(D, I)
        index.reset()
        i0 += ni
        LOG.info("%d db elements, %.3f s" % (i0, time.time() - t0))

    rh.finalize()
    LOG.info("GT time: %.3f s (%d vectors)" % (time.time() - t0, i0))
    return rh.D, rh.I
def knnGPU(x, y, k, mem=5 * 1024 * 1024 * 1024):
    """Exact top-``k`` inner-product search of ``x`` against ``y`` in memory-bounded batches.

    Args:
        x: query vectors, shape ``(nx, dim)``.
        y: database vectors, shape ``(ny, dim)``.
        k: number of neighbours per query.
        mem: byte budget; both inputs are cut into batches of
            ``mem // (dim * 4)`` rows (at least one row).

    Returns:
        ``(sim, ind)``: ``float32`` similarities and ``int64`` row indices
        into ``y``, both of shape ``(nx, k)``, best match first. When ``y``
        has fewer than ``k`` rows the unused trailing slots hold ``-inf`` and
        ``-1``.
    """
    dim = x.shape[1]
    batch_size = max(1, mem // (dim * 4))
    # -inf / -1 are placeholders; they only survive if y has fewer than k rows.
    sim = np.full((x.shape[0], k), -np.inf, dtype=np.float32)
    ind = np.full((x.shape[0], k), -1, dtype=np.int64)
    for xfrom in range(0, x.shape[0], batch_size):
        xto = min(xfrom + batch_size, x.shape[0])
        bsims, binds = [], []
        for yfrom in range(0, y.shape[0], batch_size):
            yto = min(yfrom + batch_size, y.shape[0])
            idx = faiss.IndexFlatIP(dim)
            idx = faiss.index_cpu_to_all_gpus(idx)
            idx.add(y[yfrom:yto])
            bsim, bind = idx.search(x[xfrom:xto], min(k, yto - yfrom))
            bsims.append(bsim)
            # shift batch-local ids to row numbers of y
            binds.append(bind + yfrom)
            del idx
        bsims = np.concatenate(bsims, axis=1)
        binds = np.concatenate(binds, axis=1)
        # Merge the per-batch candidates: best k columns per query row.
        order = np.argsort(-bsims, axis=1)[:, :k]
        found = order.shape[1]
        sim[xfrom:xto, :found] = np.take_along_axis(bsims, order, axis=1)
        ind[xfrom:xto, :found] = np.take_along_axis(binds, order, axis=1)
    return sim, ind
def run_kmeans(x, nmb_clusters):
    """
    Cluster the data with faiss k-means (20 iterations).

    Args:
        x: data
        nmb_clusters (int): number of clusters
    Returns:
        list: for every row of x, the id of its nearest centroid
    """
    data = c_f.to_numpy(x).astype(np.float32)
    _, dim = data.shape
    logging.info("running k-means clustering with k=%d" % nmb_clusters)
    logging.info("embedding dimensionality is %d" % dim)

    # faiss implementation of k-means
    kmeans = faiss.Clustering(dim, nmb_clusters)
    kmeans.niter = 20
    kmeans.max_points_per_centroid = 10000000

    centroid_index = faiss.IndexFlatL2(dim)
    if faiss.get_num_gpus() > 0:
        centroid_index = faiss.index_cpu_to_all_gpus(centroid_index)

    # perform the training
    kmeans.train(data, centroid_index)
    # After training the index is searched for each point's single closest entry.
    assignments = centroid_index.search(data, 1)[1]
    return [int(row[0]) for row in assignments]
def mine_nearest_neighbors(self, topk, calculate_accuracy=True):
    """Mine the ``topk`` nearest neighbours (inner product) of every stored feature.

    Returns ``indices`` (``topk + 1`` columns, because each sample is also
    searched against itself) or, when ``calculate_accuracy`` is True,
    ``(indices, accuracy)`` where accuracy is the fraction of mined
    neighbours sharing the anchor's target.
    """
    import faiss

    feats = self.features.cpu().numpy()
    feat_dim = feats.shape[1]
    searcher = faiss.index_cpu_to_all_gpus(faiss.IndexFlatIP(feat_dim))
    searcher.add(feats)
    # topk + 1 because the sample itself is part of the result
    indices = searcher.search(feats, topk + 1)[1]

    if not calculate_accuracy:
        return indices

    # evaluate
    labels = self.targets.cpu().numpy()
    # column 0 is skipped: it is taken to be the sample itself
    neighbor_targets = np.take(labels, indices[:, 1:], axis=0)
    anchor_targets = np.repeat(labels.reshape(-1, 1), topk, axis=1)
    accuracy = np.mean(neighbor_targets == anchor_targets)
    return indices, accuracy
def run(self, data: Dict[str, np.ndarray], faiss_task, output_dir, opts=None):
    """Load the cached faiss index for ``faiss_task`` or build and cache it.

    Args:
        data: dict whose values are the vectors to index, or an array of
            vectors.
        faiss_task: task name; the index is cached at
            ``{output_dir}/{faiss_task}.in``.
        output_dir: directory of the cache file.
        opts: optional settings. ``mips`` (default True) selects an
            inner-product IVF-Flat index instead of a flat L2 one; ``nlist``
            overrides the IVF list count (default ``4 * sqrt(len(data))``).

    Returns:
        The faiss index, cloned to all GPUs when faiss reports at least one.
    """
    opts = {} if opts is None else opts
    save_path = f"{output_dir}/{faiss_task}.in"
    n_gpus = faiss.get_num_gpus()

    if os.path.isfile(save_path):
        logger.info(f"Reading from saved path: {save_path}")
        index = faiss.read_index(save_path)
        # Same GPU guard as the build path below.
        if n_gpus > 0:
            index = faiss.index_cpu_to_all_gpus(index)
        return index

    logger.info(f"Constructing Faiss: {opts}")
    if isinstance(data, dict):
        data_db = np.array(list(data.values()))
    else:
        data_db = data
    logger.info(f"Shape of db {data_db.shape}")
    logger.info(f"Number of GPUs available: {n_gpus}")

    use_mips = opts.get("mips", True)
    dim = data_db.shape[1]
    if use_mips:
        logger.info("Building MIPS indexes")
        quantizer = faiss.IndexFlatIP(dim)
        index = faiss.IndexIVFFlat(
            quantizer,
            dim,
            opts.get("nlist", int(4 * math.sqrt(len(data)))),
            faiss.METRIC_INNER_PRODUCT,
        )
    else:
        index = faiss.IndexFlatL2(dim)
    if n_gpus > 0:
        logger.info("Building GPU model")
        index = faiss.index_cpu_to_all_gpus(index)

    logger.info("Builing Indexes")
    data_db = np.float32(data_db)
    if use_mips:
        # only the IVF index is trained before vectors are added
        index.train(data_db)
    index.add(data_db)
    logger.info(f"Gpu Index: {index.ntotal}")
    # write_index is given a CPU index; convert only if a GPU clone was made.
    cpu_index = faiss.index_gpu_to_cpu(index) if n_gpus > 0 else index
    faiss.write_index(cpu_index, save_path)
    return index
def __init__(self, target, nprobe=128, index_factory_str=None, verbose=False,
             mode='proxy', using_gpu=True):
    """Build, train and fill a faiss index over ``target``.

    Args:
        target: 2-D array of vectors; row ``i`` is stored under id ``i``.
        nprobe: ``nprobe`` value set on the CPU index before cloning.
        index_factory_str: faiss factory string; by default an ``IVF…,PQ32``
            string derived from the number of rows. The string must contain
            an ``IVF<n>`` component, because ``n`` is parsed from it.
        verbose: assigned to ``index.verbose``.
        mode: ``'proxy'`` (one sub-index per GPU inside an ``IndexProxy``) or
            ``'shard'`` (sharded multi-GPU clone); anything else raises
            ``KeyError``.
        using_gpu: in proxy mode, whether the sub-indexes are GPU clones or
            the CPU index itself.
    """
    self._res_list = []
    num_gpu = faiss.get_num_gpus()
    print('[faiss gpu] #GPU: {}'.format(num_gpu))

    size, dim = target.shape
    assert size > 0, "size: {}".format(size)
    if index_factory_str is None:
        index_factory_str = "IVF{},PQ{}".format(
            min(8192, 16 * round(np.sqrt(size))), 32)
    cpu_index = faiss.index_factory(dim, index_factory_str)
    cpu_index.nprobe = nprobe

    if mode == 'proxy':
        cloner = faiss.GpuClonerOptions()
        cloner.useFloat16 = True
        cloner.usePrecomputed = False
        index = faiss.IndexProxy()
        for gpu_no in range(num_gpu):
            resources = faiss.StandardGpuResources()
            # the resources object stays referenced on the instance
            self._res_list.append(resources)
            if using_gpu:
                sub_index = faiss.index_cpu_to_gpu(resources, gpu_no, cpu_index, cloner)
            else:
                sub_index = cpu_index
            index.addIndex(sub_index)
    elif mode == 'shard':
        cloner = faiss.GpuMultipleClonerOptions()
        cloner.useFloat16 = True
        cloner.usePrecomputed = False
        cloner.shard = True
        index = faiss.index_cpu_to_all_gpus(cpu_index, cloner, ngpu=num_gpu)
    else:
        raise KeyError("Unknown index mode")

    index = faiss.IndexIDMap(index)
    index.verbose = verbose

    # get nlist to decide how many samples used for training
    ivf_parts = [part for part in index_factory_str.split(",") if 'IVF' in part]
    nlist = int(float(ivf_parts[0].replace("IVF", "")))

    # training on nlist * 256 row indices drawn at random (with replacement)
    if not index.is_trained:
        train_rows = np.random.randint(0, size, nlist * 256)
        index.train(target[train_rows])

    # add with ids
    index.add_with_ids(target, np.arange(0, size))
    self.index = index
def range_ground_truth(xq, db_iterator, threshold, metric_type=faiss.METRIC_L2, shard=False, ngpu=-1):
    """Run an exact range search of ``xq`` over a database delivered block by
    block by ``db_iterator`` (for datasets that may not fit in RAM).

    Returns ``(lims, D, I)``: the results of query ``j`` are
    ``D[lims[j]:lims[j + 1]]`` and ``I[lims[j]:lims[j + 1]]``, with ids
    numbered across the whole database.
    """
    nq, d = xq.shape
    t0 = time.time()
    xq = np.ascontiguousarray(xq, dtype='float32')

    index = faiss.IndexFlat(d, metric_type)
    if ngpu == -1:
        ngpu = faiss.get_num_gpus()
    if ngpu:
        LOG.info('running on %d GPUs' % ngpu)
        cloner = faiss.GpuMultipleClonerOptions()
        cloner.shard = shard
        index_gpu = faiss.index_cpu_to_all_gpus(index, co=cloner, ngpu=ngpu)

    # compute ground-truth by blocks
    offset = 0
    dist_chunks = [[] for _ in range(nq)]
    id_chunks = [[] for _ in range(nq)]
    for block in db_iterator:
        block_len = block.shape[0]
        if ngpu > 0:
            index_gpu.add(block)
            lims_i, Di, Ii = range_search_gpu(xq, threshold, index_gpu, block)
            index_gpu.reset()
        else:
            index.add(block)
            lims_i, Di, Ii = index.range_search(xq, threshold)
            index.reset()
        # block-local ids -> ids within the whole database
        Ii += offset
        for j, (lo, hi) in enumerate(zip(lims_i[:-1], lims_i[1:])):
            if hi > lo:
                dist_chunks[j].append(Di[lo:hi])
                id_chunks[j].append(Ii[lo:hi])
        offset += block_len
        LOG.info("%d db elements, %.3f s" % (offset, time.time() - t0))

    # Queries without any hit contribute an empty, correctly typed array.
    no_ids = np.zeros(0, dtype='int64')
    no_dists = np.zeros(0, dtype='float32')
    per_query_D = [np.hstack(chunks) if chunks else no_dists for chunks in dist_chunks]
    per_query_I = [np.hstack(chunks) if chunks else no_ids for chunks in id_chunks]
    sizes = [len(ids) for ids in per_query_I]
    assert len(sizes) == nq
    lims = np.zeros(nq + 1, dtype="uint64")
    lims[1:] = np.cumsum(sizes)
    return lims, np.hstack(per_query_D), np.hstack(per_query_I)
def build_index(cfg: DictConfig, model: object):
    """
    Builds faiss index from index dataset specified in the config.

    Embeddings are loaded from disk when a cached file exists, otherwise they
    are computed with ``model``. If ``cfg.apply_pca`` is set their
    dimensionality is reduced with a PCA model that is fitted (and pickled)
    on first use. The resulting IVF-Flat index is trained on all GPUs and
    written to ``cfg.index_save_name``.

    Args:
        cfg: Config file specifying index parameters
        model: object providing ``setup_dataloader``; used only when the
            embeddings have to be extracted.
    """
    # Get index dataset embeddings
    # PCA model exists and index embeddings have already been PCAed, no need to re-extract/PCA them
    if cfg.apply_pca and os.path.isfile(cfg.pca.pca_save_name) and os.path.isfile(cfg.pca_embeddings_save_name):
        logging.info("Loading reduced dimensionality embeddings")
        # [:] copies the dataset into memory, so the file can be closed right away.
        with h5py.File(cfg.pca_embeddings_save_name, "r") as embeddings_file:
            embeddings = embeddings_file[cfg.index_ds.name][:]
    elif os.path.isfile(cfg.embedding_save_name):
        logging.info("Loading previously extracted index dataset embeddings")
        with h5py.File(cfg.embedding_save_name, "r") as embeddings_file:
            embeddings = embeddings_file[cfg.index_ds.name][:]
    else:
        logging.info("Encoding index dataset, this may take a while")
        index_dataloader = model.setup_dataloader(cfg.index_ds, is_index_data=True)
        embeddings, _ = get_index_embeddings(cfg, index_dataloader, model)

    # Create pca model to reduce dimensionality of index dataset and decrease memory footprint
    if cfg.apply_pca:
        # Need to train PCA model and apply PCA transformation with newly trained model
        if not os.path.isfile(cfg.pca.pca_save_name):
            logging.info("Fitting PCA model for embedding dimensionality reduction")
            pca_train_set = random.sample(list(embeddings), k=int(len(embeddings) * cfg.pca.sample_fraction))
            pca = PCA(n_components=cfg.pca.output_dim)
            pca.fit(pca_train_set)
            with open(cfg.pca.pca_save_name, "wb") as pca_file:
                pkl.dump(pca, pca_file)
            embeddings = reduce_embedding_dim(pca, embeddings, cfg)
        # PCA model already trained, just need to reduce dimensionality of all embeddings
        elif not os.path.isfile(cfg.pca_embeddings_save_name):
            with open(cfg.pca.pca_save_name, "rb") as pca_file:
                pca = pkl.load(pca_file)
            embeddings = reduce_embedding_dim(pca, embeddings, cfg)

    # Build faiss index from embeddings
    logging.info(f"Training index with embedding dim size {cfg.dims} using {faiss.get_num_gpus()} gpus")
    quantizer = faiss.IndexFlatL2(cfg.dims)
    index = faiss.IndexIVFFlat(quantizer, cfg.dims, cfg.nlist)
    index = faiss.index_cpu_to_all_gpus(index)
    index.train(embeddings)

    logging.info("Adding dataset embeddings to index")
    for i in tqdm(range(0, embeddings.shape[0], cfg.index_batch_size)):
        index.add(embeddings[i:i + cfg.index_batch_size])

    logging.info("Saving index")
    faiss.write_index(faiss.index_gpu_to_cpu(index), cfg.index_save_name)
    logging.info("Index built and saved")
def __init__(self, x, gpu_id, verbose=False):
    """Initialise the ``DatasetAssign`` base with ``x`` and create a flat L2 GPU index.

    ``gpu_id >= 0`` selects that single GPU; a negative value (-1) clones the
    index to all GPUs. ``verbose`` is accepted but not used here.
    """
    DatasetAssign.__init__(self, x)
    flat_index = faiss.IndexFlatL2(x.shape[1])
    if gpu_id < 0:
        # -1 -> assign to all GPUs
        self.index = faiss.index_cpu_to_all_gpus(flat_index)
    else:
        resources = faiss.StandardGpuResources()
        self.index = faiss.index_cpu_to_gpu(resources, gpu_id, flat_index)
def faiss_knn(feats_train, targets_train, feats_val, targets_val,
              feats_val_poisoned, targets_val_poisoned, k):
    """Evaluate k-NN majority-vote classification on a clean and a poisoned validation set.

    All data arguments must expose ``.numpy()``. The training features are
    indexed with a sharded, float16 flat L2 index on all GPUs.

    Returns:
        ``(acc, conf_matrix_clean, acc_poisoned, conf_matrix_poisoned)``:
        accuracies in percent, confusion matrices with ground truth on the
        rows and predictions on the columns.
    """
    feats_train = feats_train.numpy()
    targets_train = targets_train.numpy()
    feats_val = feats_val.numpy()
    targets_val = targets_val.numpy()

    cloner_opts = faiss.GpuMultipleClonerOptions()
    cloner_opts.useFloat16 = True
    cloner_opts.shard = True
    flat_index = faiss.IndexFlatL2(feats_train.shape[-1])
    gpu_index = faiss.index_cpu_to_all_gpus(flat_index, cloner_opts)
    gpu_index.add(feats_train)

    def evaluate(features, targets):
        # Search, vote per row, and fill a confusion matrix for one split.
        _, neighbors = gpu_index.search(features, k)
        n_classes = int(targets.max()) + 1
        # ROWS ground truth, COLUMNS pred
        confusion = np.zeros((n_classes, n_classes))
        predictions = np.zeros(neighbors.shape[0])
        for row in range(neighbors.shape[0]):
            votes = list(Counter(targets_train[neighbors[row]]).items())
            # Shuffling first makes max() pick a random label among ties.
            shuffle(votes)
            predictions[row] = max(votes, key=lambda label_count: label_count[1])[0]
            confusion[targets[row], int(predictions[row])] += 1
        accuracy = 100.0 * (predictions == targets).mean()
        return accuracy, confusion

    # Val clean
    acc, conf_matrix_clean = evaluate(feats_val, targets_val)

    # Val poisoned
    feats_val_poisoned = feats_val_poisoned.numpy()
    targets_val_poisoned = targets_val_poisoned.numpy()
    acc_poisoned, conf_matrix_poisoned = evaluate(feats_val_poisoned, targets_val_poisoned)

    return acc, conf_matrix_clean, acc_poisoned, conf_matrix_poisoned
import numpy as np
import faiss  # make faiss available

# Problem size.
d = 64      # dimension
nb = 100000  # database size
nq = 10000   # nb of queries

# Synthetic data: uniform random vectors whose first component grows with
# the row number. The seed makes the run reproducible.
np.random.seed(1234)
xb = np.random.random((nb, d)).astype('float32')
xb[:, 0] += np.arange(nb) / 1000.
xq = np.random.random((nq, d)).astype('float32')
xq[:, 0] += np.arange(nq) / 1000.

ngpus = faiss.get_num_gpus()
print("number of GPUs:", ngpus)

# Flat L2 index built on CPU, then cloned to every GPU.
cpu_index = faiss.IndexFlatL2(d)
gpu_index = faiss.index_cpu_to_all_gpus(cpu_index)

gpu_index.add(xb)  # add vectors to the index
print(gpu_index.ntotal)

k = 4  # we want to see 4 nearest neighbors
D, I = gpu_index.search(xq, k)  # actual search
print(I[:5])   # neighbors of the 5 first queries
print(I[-5:])  # neighbors of the 5 last queries