def query_distance(self, query_img_path, ref_path_list, embedding_weights):
    """Compute, layer by layer, distances from a query image to references.

    The query image is embedded with ``self._embed_image``; only the layers
    named in ``embedding_weights`` are kept and handed to
    ``self._build_query_gram_dict``.  For every resulting layer the matching
    index in ``self.index_dict`` is asked for the distance between the query
    and each reference listed in ``ref_path_list``.

    NOTE(review): no ``return`` is visible for this function and ``start`` is
    never read in the visible code - confirm whether the definition continues
    elsewhere.
    """
    embeddings = self._embed_image(query_img_path)
    selected = {}
    for position, layer in enumerate(self.layer_names):
        if layer in embedding_weights:
            selected[layer] = embeddings[position]
    query_gram_dict = self._build_query_gram_dict(selected)

    start = dt.datetime.now()
    dist_dict = {}

    # file_mapping goes index -> path; invert it to look references up by path
    path_to_index = {path: idx for idx, path in self.file_mapping.items()}
    ref_indices = [path_to_index[path] for path in ref_path_list]
    n_refs = len(ref_indices)

    for layer_name in query_gram_dict:
        gram = query_gram_dict[layer_name]
        # Row 0 holds the reference ids; with a single query and k == n_refs
        # only the first n_refs labels are consumed.
        labels = np.array([list(ref_indices), list(range(1, n_refs + 1))])
        distances = np.empty((1, n_refs), dtype='float32')
        self.index_dict[layer_name].compute_distance_subset(
            1,
            faiss.swig_ptr(gram),
            n_refs,
            faiss.swig_ptr(distances),
            faiss.swig_ptr(labels),
        )
        dist_dict[layer_name] = dict(zip(ref_indices, distances.flatten()))
def add_batch_result(self, D, I, i0):
    """Push one batch of search results into ``self.heaps``.

    ``D`` (distances) and ``I`` (ids) must both have shape
    ``(self.nq, self.k)``.  ``I`` is shifted by ``i0`` IN PLACE before being
    handed to the heaps, so the caller's array is modified.
    """
    expected_shape = (self.nq, self.k)
    assert D.shape == expected_shape
    assert I.shape == expected_shape

    I += i0
    dis_ptr = faiss.swig_ptr(D)
    ids_ptr = faiss.swig_ptr(I)
    self.heaps.addn_with_ids(self.k, dis_ptr, ids_ptr, self.k)
def avg_interclass_centroid_dist_dict(self):
    """Return, per class, the mean inner product to every other class centroid.

    Centroids are taken from ``self.V_mean_norm_dict`` (class -> vector) and
    stored in a ``faiss.IndexFlatIP``; for each class the values computed
    against all *other* centroids are averaged.

    Returns:
        dict: class -> mean of the values returned by
        ``compute_distance_subset`` for the other centroids.  With a single
        class the mean is taken over an empty array (NaN).
    """
    # Should I use norm_mean or mean_norm for centroid?
    # TODO: TEMPORARILY CHANGED TO V_mean_norm
    class_list = []
    v_list = []
    for class_, v in self.V_mean_norm_dict.items():
        class_list.append(class_)
        v_list.append(v)

    # faiss reads the raw buffers through swig_ptr, so make sure the matrix
    # is contiguous float32 once instead of relying on the dict's dtype.
    centroids = np.ascontiguousarray(np.stack(v_list), dtype='float32')
    n_classes = centroids.shape[0]
    dim = centroids.shape[1]

    index = faiss.IndexFlatIP(dim)
    index.add(centroids)

    # Every query is compared against all centroids but itself.
    n_others = n_classes - 1
    all_ids = np.arange(n_classes, dtype='int64')

    d = {}
    for i, class_ in enumerate(class_list):
        query = centroids[i:i + 1]  # (1, dim) view, still contiguous
        other_ids = np.ascontiguousarray(np.delete(all_ids, i))
        distances = np.empty((1, n_others), dtype='float32')
        # k must equal the number of labels / output slots (n_others).  The
        # previous code passed n_classes here, which made faiss write one
        # float past the end of `distances` and read one stray label.
        index.compute_distance_subset(
            1,
            faiss.swig_ptr(query),
            n_others,
            faiss.swig_ptr(distances),
            faiss.swig_ptr(other_ids),
        )
        distances = distances.flatten()
        print(f'centroid distances: {distances}')
        d[class_] = np.mean(distances)
    return d
def do_test_codec(self, nbit):
    """Check that a ProductQuantizer with hand-set centroids encodes exactly.

    Vectors are built by concatenating centroids, so encode followed by
    decode must give them back unchanged, and encoding through an external
    assignment index must produce the same codes.
    """
    ksub = 1 << nbit
    pq = faiss.ProductQuantizer(16, 2, nbit)

    # simulate training by writing random centroids into the quantizer
    rng = np.random.RandomState(123)
    centroids = rng.rand(2, ksub, 8).astype('float32')
    faiss.copy_array_to_vector(centroids.ravel(), pq.centroids)

    # vectors assembled from centroids can be encoded exactly
    idx = rng.randint(ksub, size=(100, 2))
    x = np.hstack([centroids[m, idx[:, m]] for m in range(2)])

    # encode / decode round trip
    codes = pq.compute_codes(x)
    decoded = pq.decode(codes)
    assert np.all(decoded == x)

    # encode again, this time through an external assignment index
    assign_index = faiss.IndexFlatL2(8)
    pq.assign_index = assign_index
    codes_external = np.empty((100, pq.code_size), dtype='uint8')
    pq.compute_codes_with_assign_index(
        faiss.swig_ptr(x), faiss.swig_ptr(codes_external), 100)
    assert np.all(codes == codes_external)
def bitvec_shuffle(a, order):
    """Build new bit vectors by picking bits of ``a`` in the given order.

    Args:
        a: 2-D uint8 array, one packed bit vector per row (``d`` bytes, i.e.
            ``d * 8`` bits per row).
        order: 1-D array with one source bit index per output bit
            (presumably int32, as expected by ``faiss.bitvec_shuffle`` -
            confirm against the faiss version in use).

    Returns:
        uint8 array with one row per row of ``a`` and enough bytes per row to
        hold ``len(order)`` bits.
    """
    n, d = a.shape
    db, = order.shape
    # Round the output width UP to whole bytes: allocating db // 8 bytes is
    # too small whenever db is not a multiple of 8, and faiss would then
    # write past the end of the buffer.
    bytes_per_row = (db + 7) // 8
    b = np.empty((n, bytes_per_row), dtype='uint8')
    faiss.bitvec_shuffle(
        n, d * 8, db,
        faiss.swig_ptr(order),
        faiss.swig_ptr(a),
        faiss.swig_ptr(b),
    )
    return b
def test_clipping(self):
    """Verify that clipped residual quantizers reproduce the code prefix and
    suffix produced by the full residual quantizer."""
    ds = datasets.SyntheticDataset(32, 1000, 100, 0)

    full_rq = faiss.ResidualQuantizer(ds.d, 5, 4)
    full_rq.train_type = faiss.ResidualQuantizer.Train_default
    full_rq.max_beam_size = 5
    full_rq.train(ds.get_train())
    # the equality checked below does not hold for a large beam size
    full_rq.max_beam_size = 1
    full_codes = full_rq.compute_codes(ds.get_database())

    # 2-step quantizer initialized from the full one
    prefix_rq = faiss.ResidualQuantizer(ds.d, 2, 4)
    prefix_rq.initialize_from(full_rq)
    self.assertEqual(prefix_rq.M, 2)
    prefix_codes = prefix_rq.compute_codes(ds.get_database())

    # 3-step quantizer initialized from the full one with argument 2; it
    # encodes what is left after decoding the prefix codes
    suffix_rq = faiss.ResidualQuantizer(ds.d, 3, 4)
    suffix_rq.initialize_from(full_rq, 2)
    self.assertEqual(suffix_rq.M, 3)
    suffix_codes = suffix_rq.compute_codes(
        ds.get_database() - prefix_rq.decode(prefix_codes))

    # The same reader walks the full code: first the prefix bits, then the
    # suffix bits, each compared with the matching clipped quantizer.
    for i in range(ds.nb):
        print(i, ds.nb)
        full_reader = faiss.BitstringReader(
            faiss.swig_ptr(full_codes[i]), full_rq.code_size)

        prefix_reader = faiss.BitstringReader(
            faiss.swig_ptr(prefix_codes[i]), prefix_rq.code_size)
        self.assertEqual(
            full_reader.read(prefix_rq.tot_bits),
            prefix_reader.read(prefix_rq.tot_bits),
        )

        suffix_reader = faiss.BitstringReader(
            faiss.swig_ptr(suffix_codes[i]), suffix_rq.code_size)
        self.assertEqual(
            full_reader.read(suffix_rq.tot_bits),
            suffix_reader.read(suffix_rq.tot_bits),
        )
def test_decode(self):
    """Test LSQ decode"""
    d, n, M, nbits = 16, 500, 4, 6
    K = 1 << nbits

    rng = np.random.RandomState(123)
    x = rng.rand(n, d).astype(np.float32)
    codes = rng.randint(0, K, (n, M)).astype(np.int32)

    lsq = faiss.LocalSearchQuantizer(d, M, nbits)
    lsq.train(x)

    # decode through faiss: pack the random codes, then decode them
    packed = np.zeros((n, lsq.code_size), dtype=np.uint8)
    decoded = np.zeros((n, d), dtype=np.float32)
    lsq.pack_codes(n, faiss.swig_ptr(codes), faiss.swig_ptr(packed))
    lsq.decode_c(faiss.swig_ptr(packed), faiss.swig_ptr(decoded), n)

    # decode in Python from a copy of the trained codebooks
    flat_codebooks = faiss.vector_float_to_array(lsq.codebooks)
    codebooks = flat_codebooks.reshape(M, K, d).copy()
    expected = decode_ref(x, codebooks, codes)

    np.testing.assert_allclose(decoded, expected, rtol=1e-6)
def get_neighbors(hnsw, i, level):
    " list the neighbors for node i at level "
    assert i < hnsw.levels.size()
    assert level < hnsw.levels.at(i)

    # neighbor_range writes the begin / end offsets into the two slots
    bounds = np.empty(2, 'uint64')
    begin_ptr = faiss.swig_ptr(bounds)
    end_ptr = faiss.swig_ptr(bounds[1:])
    hnsw.neighbor_range(i, level, begin_ptr, end_ptr)

    first, last = bounds
    neighbors = []
    for j in range(first, last):
        neighbors.append(hnsw.neighbors.at(j))
    return neighbors
def test_update_codebooks(self):
    """Test the codebook update step against the Python reference."""
    d, n, M, nbits = 16, 500, 4, 6
    K = 1 << nbits

    # set a larger value to make the updating process more stable
    lambd = 1e-2

    rng = np.random.RandomState(123)
    x = rng.rand(n, d).astype(np.float32)
    codes = rng.randint(0, K, (n, M)).astype(np.int32)

    lsq = faiss.LocalSearchQuantizer(d, M, nbits)
    lsq.lambd = lambd
    lsq.train(x)  # just for allocating memory for codebooks

    # snapshot of the codebooks before the update (not used afterwards)
    codebooks = faiss.vector_float_to_array(lsq.codebooks).reshape(M, K, d).copy()

    lsq.update_codebooks(faiss.swig_ptr(x), faiss.swig_ptr(codes), n)
    updated = faiss.vector_float_to_array(lsq.codebooks).reshape(M, K, d).copy()

    expected = update_codebooks_ref(x, codes, K, lambd)
    np.testing.assert_allclose(updated, expected, atol=1e-3)
def compute_GT_CPU(xb, xq, gt_sl):
    """Compute ground-truth nearest neighbours with a flat L2 index.

    The database ``xb`` is processed in blocks; each block is searched with
    all queries and the block results are merged into heaps that write
    directly into the returned arrays.

    Returns:
        (gt_I, gt_D): ids (int64) and distances (float32), both of shape
        ``(number of queries, gt_sl)``.
    """
    nq_gt, _ = xq.shape
    print("compute GT CPU")
    t0 = time.time()

    gt_I = np.zeros((nq_gt, gt_sl), dtype='int64')
    gt_D = np.zeros((nq_gt, gt_sl), dtype='float32')

    # the heaps operate directly on the memory of gt_D / gt_I
    result_heaps = faiss.float_maxheap_array_t()
    result_heaps.k = gt_sl
    result_heaps.nh = nq_gt
    result_heaps.val = faiss.swig_ptr(gt_D)
    result_heaps.ids = faiss.swig_ptr(gt_I)
    result_heaps.heapify()

    block_size = 10 ** 5
    _, d = xb.shape
    queries = sanitize(xq[:nq_gt])
    flat_index = faiss.IndexFlatL2(d)

    # compute ground-truth by blocks, and add each block's result to the heaps
    for i0, block in dataset_iterator(xb, IdentPreproc(d), block_size):
        flat_index.add(block)
        block_D, block_I = flat_index.search(queries, gt_sl)
        block_I += i0  # ids are block-local; shift by the block's offset
        result_heaps.addn_with_ids(
            gt_sl, faiss.swig_ptr(block_D), faiss.swig_ptr(block_I), gt_sl)
        flat_index.reset()

    result_heaps.reorder()
    print("GT CPU time: {} s".format(time.time() - t0))
    return gt_I, gt_D
def search_single_scan(index, xq, k, bs=128):
    """performs a search so that the inverted lists are accessed
    sequentially by blocks of size bs

    Args:
        index: an IVF index, optionally wrapped in an ``IndexPreTransform``.
        xq: query vectors.
        k: number of results per query.
        bs: number of consecutive inverted lists visited per pass.

    Returns:
        (D, I): the distance and id arrays of the result heap.

    The index's ``parallel_mode`` is changed only for the duration of the
    call and restored afterwards, even if a search raises.
    """
    # handle pretransform
    if isinstance(index, faiss.IndexPreTransform):
        xq = index.apply_py(xq)
        index = faiss.downcast_index(index.index)

    # coarse assignment
    nprobe = min(index.nprobe, index.nlist)
    coarse_dis, assign = index.quantizer.search(xq, nprobe)
    nlist = index.nlist
    assign_buckets = assign // bs
    nq = len(xq)

    rh = faiss.ResultHeap(nq, k)

    # The result heap is shared across passes, so the index must not
    # re-initialize it on every search_preassigned call.  Remember the
    # caller's setting: leaving the flag set would affect later searches on
    # the same index.
    saved_parallel_mode = index.parallel_mode
    index.parallel_mode = (
        saved_parallel_mode | index.PARALLEL_MODE_NO_HEAP_INIT)
    try:
        for l0 in range(0, nlist, bs):
            bucket_no = l0 // bs
            # keep only the probes that fall in the current block of lists;
            # -1 marks a probe to be skipped
            sub_assign = assign.copy()
            sub_assign[assign_buckets != bucket_no] = -1

            index.search_preassigned(
                nq, faiss.swig_ptr(xq), k,
                faiss.swig_ptr(sub_assign), faiss.swig_ptr(coarse_dis),
                faiss.swig_ptr(rh.D), faiss.swig_ptr(rh.I),
                False, None
            )
    finally:
        index.parallel_mode = saved_parallel_mode

    rh.finalize()
    return rh.D, rh.I
def get_invlist(invlists, l):
    """Return the content of inverted list ``l`` as ``(list_ids, list_codes)``.

    Both arrays are copies.  The codes are reshaped to
    ``(list size, code_size)``, or to a 3-D block layout when the inverted
    lists report ``INVALID_CODE_SIZE`` (BlockInvertedLists).
    """
    invlists = faiss.downcast_InvertedLists(invlists)
    n_entries = invlists.list_size(l)
    list_ids = np.zeros(n_entries, dtype='int64')

    ids_handle = None
    codes_handle = None
    try:
        ids_handle = invlists.get_ids(l)
        if n_entries > 0:
            faiss.memcpy(
                faiss.swig_ptr(list_ids), ids_handle, list_ids.nbytes)

        codes_handle = invlists.get_codes(l)
        if invlists.code_size == faiss.InvertedLists.INVALID_CODE_SIZE:
            # it's a BlockInvertedLists
            per_block = invlists.n_per_block
            block_bytes = invlists.block_size
            n_blocks = (n_entries + per_block - 1) // per_block
            codes_shape = (n_blocks, block_bytes // per_block, per_block)
        else:
            codes_shape = (n_entries, invlists.code_size)
        list_codes = np.zeros(codes_shape, dtype='uint8')

        if n_entries > 0:
            faiss.memcpy(
                faiss.swig_ptr(list_codes), codes_handle, list_codes.nbytes)
    finally:
        # hand back whatever was obtained, even on failure
        if ids_handle is not None:
            invlists.release_ids(l, ids_handle)
        if codes_handle is not None:
            invlists.release_codes(l, codes_handle)

    return list_ids, list_codes
def test_icm_encode_step(self):
    """Compare one faiss ICM encoding step with the Python reference."""
    d, n, M, nbits = 16, 500, 4, 6
    K = 1 << nbits

    # randomly generate codes, unary terms and binary terms
    rng = np.random.RandomState(123)
    codes = rng.randint(0, K, (n, M)).astype(np.int32)
    unaries = rng.rand(n, M, K).astype(np.float32)
    binaries = rng.rand(M, M, K, K).astype(np.float32)

    # faiss updates the codes in place, so work on a copy
    new_codes = codes.copy()
    lsq = faiss.LocalSearchQuantizer(d, M, nbits)
    lsq.icm_encode_step(
        faiss.swig_ptr(unaries),
        faiss.swig_ptr(binaries),
        faiss.swig_ptr(new_codes),
        n,
    )

    # same step computed in Python from the untouched codes
    expected = icm_encode_step_ref(unaries, binaries, codes)
    np.testing.assert_array_equal(new_codes, expected)
def compute_GT():
    """Compute ground-truth nearest neighbours on GPU, block by block.

    Reads the module-level ``xb``, ``xq``, ``nq_gt`` and ``gt_sl``.  The
    database is searched in blocks with a flat L2 index cloned to the GPUs
    returned by ``make_vres_vdev``; block results are merged into heaps that
    write directly into the result arrays.

    Returns:
        gt_I: int64 array of shape ``(nq_gt, gt_sl)`` with the neighbour ids.
    """
    # print() calls replace the Python 2 print statements, which are a
    # SyntaxError on Python 3.
    print("compute GT")
    t0 = time.time()

    gt_I = np.zeros((nq_gt, gt_sl), dtype='int64')
    gt_D = np.zeros((nq_gt, gt_sl), dtype='float32')

    # the heaps operate directly on the memory of gt_D / gt_I
    heaps = faiss.float_maxheap_array_t()
    heaps.k = gt_sl
    heaps.nh = nq_gt
    heaps.val = faiss.swig_ptr(gt_D)
    heaps.ids = faiss.swig_ptr(gt_I)
    heaps.heapify()

    bs = 10 ** 5
    n, d = xb.shape
    xqs = sanitize(xq[:nq_gt])

    db_gt = faiss.IndexFlatL2(d)
    vres, vdev = make_vres_vdev()
    db_gt_gpu = faiss.index_cpu_to_gpu_multiple(vres, vdev, db_gt)

    # compute ground-truth by blocks of bs, and add to heaps
    for i0, xsl in dataset_iterator(xb, IdentPreproc(d), bs):
        db_gt_gpu.add(xsl)
        D, I = db_gt_gpu.search(xqs, gt_sl)
        I += i0  # ids are block-local; shift by the block's offset
        heaps.addn_with_ids(
            gt_sl, faiss.swig_ptr(D), faiss.swig_ptr(I), gt_sl)
        db_gt_gpu.reset()
        # progress line, rewritten in place thanks to the leading "\r"
        print("\r %d/%d, %.3f s" % (i0, n, time.time() - t0), end=' ')
    print()
    heaps.reorder()

    print("GT time: %.3f s" % (time.time() - t0))
    return gt_I
def test_size_t_ptr(self):
    """Regression test for issue 1064: pass uint64 numpy pointers to
    ``hnsw.neighbor_range``."""
    index = faiss.IndexHNSWFlat(10, 32)
    hnsw = index.hnsw
    vectors = np.random.rand(100, 10).astype('float32')
    index.add(vectors)

    # two uint64 slots receive the begin / end of the neighbor range
    bounds = np.empty(2, 'uint64')
    begin_ptr = faiss.swig_ptr(bounds)
    end_ptr = faiss.swig_ptr(bounds[1:])
    hnsw.neighbor_range(23, 0, begin_ptr, end_ptr)
def evaluation_faiss(X, Y, Kset, args):
    """Evaluate embeddings with NMI, Recall@K and (outside 'inshop') R-stats.

    Neighbours are found with ``faiss.bruteForceKnn`` using the inner-product
    metric.  For the 'inshop' dataset the first ``len(args.gallery_labels)``
    rows of ``X`` form the gallery and the rest the queries; otherwise ``X``
    is both gallery and queries and the first neighbour is dropped.

    Returns:
        (nmi, recallK, RP, MAP); ``nmi`` is 0.0 when ``args.do_nmi`` is
        false, ``RP`` and ``MAP`` are 0 for 'inshop'.
    """
    not_inshop = args.data_name.lower() != 'inshop'

    if not_inshop:
        kmax = np.max(Kset + [args.max_r])  # search K
    else:
        kmax = np.max(Kset)
    test_class_dict = args.test_class_dict

    # compute NMI
    nmi = 0.0
    if args.do_nmi:
        classN = np.max(Y) + 1
        kmeans = KMeans(n_clusters=classN).fit(X)
        nmi = normalized_mutual_info_score(
            Y, kmeans.labels_, average_method='arithmetic')

    if not_inshop:
        # queries and gallery coincide; offset skips the first neighbour
        offset = 1
        X_query = X_gallery = X
        Y_query = Y_gallery = Y
    else:  # inshop
        offset = 0
        len_gallery = len(args.gallery_labels)
        X_gallery = X[:len_gallery, :]
        X_query = X[len_gallery:, :]
        Y_query = args.query_labels
        Y_gallery = args.gallery_labels

    nq, _ = X_query.shape
    ng, d = X_gallery.shape
    n_neighbors = kmax + offset
    I = np.empty([nq, n_neighbors], dtype='int64')
    D = np.empty([nq, n_neighbors], dtype='float32')

    res = faiss.StandardGpuResources()
    res.setDefaultNullStreamAllDevices()
    faiss.bruteForceKnn(
        res, faiss.METRIC_INNER_PRODUCT,
        faiss.swig_ptr(X_gallery), True, ng,
        faiss.swig_ptr(X_query), True, nq,
        d, int(n_neighbors),
        faiss.swig_ptr(D), faiss.swig_ptr(I),
    )

    indices = I[:, offset:]
    YNN = Y_gallery[indices]
    recallK = get_recallK(Y_query, YNN, Kset)

    if not_inshop:
        RP, MAP = get_Rstat(Y_query, YNN, test_class_dict)
    else:  # inshop
        RP = 0
        MAP = 0

    return nmi, recallK, RP, MAP
def to_binary(x):
    """Pack the signs of a real-valued matrix into bytes.

    Args:
        x: 2-D array of shape ``(n, d)`` with ``d`` a multiple of 8.

    Returns:
        uint8 array of shape ``(n, d // 8)``.  When the module-level
        ``faiss`` is ``None`` a numpy fallback is used, in which component
        ``8 * i + j`` sets bit ``j`` of byte ``i`` when it is ``>= 0``;
        otherwise the work is delegated to ``faiss.real_to_binary``.
    """
    n, d = x.shape
    assert d % 8 == 0
    if faiss is None:
        bits = (x >= 0).reshape(n, d // 8, 8)
        weights = (1 << np.arange(8)).astype('uint8')
        # Accumulate in uint8: a plain sum() widens to the platform integer,
        # which made this branch return a different dtype than the faiss
        # branch.  The largest possible sum is 255, so uint8 cannot overflow.
        return (bits * weights).sum(2, dtype='uint8')
    else:
        y = np.empty((n, d // 8), dtype='uint8')
        faiss.real_to_binary(n * d, faiss.swig_ptr(x), faiss.swig_ptr(y))
        return y
def eval_intersection_measure(gt_I, I):
    """ measure intersection measure (used for knngraph)

    For each of the first ``nq_gt`` queries (module-level value), counts how
    many of the ids in row ``q`` of ``I`` also appear among the first
    ``rank`` ground-truth ids of row ``q`` of ``gt_I``, where ``rank`` is the
    number of columns of ``I``.

    Returns:
        float: total intersection size divided by ``rank * nq_gt``.
    """
    rank = I.shape[1]
    assert gt_I.shape[1] >= rank

    # Convert once, and keep the converted arrays referenced for the whole
    # loop.  The previous code took a pointer to a per-row temporary
    # (I[q, :].astype(...)) that nothing kept alive while the C function ran;
    # the swig pointer presumably does not own the array - see the faiss
    # notes on swig_ptr.
    gt_I = np.ascontiguousarray(gt_I, dtype='int64')
    I = np.ascontiguousarray(I, dtype='int64')

    inter = 0
    for q in range(nq_gt):
        gt_row = gt_I[q, :]
        res_row = I[q, :]
        inter += faiss.ranklist_intersection_size(
            rank, faiss.swig_ptr(gt_row),
            rank, faiss.swig_ptr(res_row))
    return inter / float(rank * nq_gt)
def __init__(self, nq, k):
    " nq: number of query vectors, k: number of results per query "
    self.nq, self.k = nq, k
    # result buffers; the heap structure below works directly on their memory
    self.I = np.zeros((nq, k), dtype='int64')
    self.D = np.zeros((nq, k), dtype='float32')

    heap_array = faiss.float_maxheap_array_t()
    heap_array.k = k
    heap_array.nh = nq
    heap_array.val = faiss.swig_ptr(self.D)
    heap_array.ids = faiss.swig_ptr(self.I)
    heap_array.heapify()
    self.heaps = heap_array
def search_knn(xq, xb, k, distance_type=faiss.METRIC_L2):
    """ wrapper around the faiss knn functions without index

    Args:
        xq: query vectors, shape ``(nq, d)``.
        xb: database vectors, shape ``(nb, d)``.
        k: number of neighbours per query.
        distance_type: ``faiss.METRIC_L2`` or ``faiss.METRIC_INNER_PRODUCT``.

    Returns:
        (D, I): float32 distances and int64 ids, both of shape ``(nq, k)``.

    Raises:
        ValueError: if ``distance_type`` is not one of the two supported
            metrics (previously the uninitialized output buffers were
            returned silently in that case).
    """
    nq, d = xq.shape
    nb, d2 = xb.shape
    assert d == d2

    # Pick the heap type and the knn routine that go together: a max-heap
    # array with the L2 routine, a min-heap array with the inner-product one.
    if distance_type == faiss.METRIC_L2:
        heaps = faiss.float_maxheap_array_t()
        knn = faiss.knn_L2sqr
    elif distance_type == faiss.METRIC_INNER_PRODUCT:
        heaps = faiss.float_minheap_array_t()
        knn = faiss.knn_inner_product
    else:
        raise ValueError(
            "unsupported distance_type: %r" % (distance_type,))

    I = np.empty((nq, k), dtype='int64')
    D = np.empty((nq, k), dtype='float32')

    # the heaps write their results directly into D and I
    heaps.k = k
    heaps.nh = nq
    heaps.val = faiss.swig_ptr(D)
    heaps.ids = faiss.swig_ptr(I)
    knn(faiss.swig_ptr(xq), faiss.swig_ptr(xb), d, nq, nb, heaps)
    return D, I
def compute_GT_GPU(xb, xq, gt_sl):
    """Compute ground-truth nearest neighbours on all available GPUs.

    The database ``xb`` is processed in blocks with a flat L2 index cloned to
    every GPU reported by faiss; block results are merged into heaps that
    write directly into the returned arrays.

    Returns:
        (gt_I, gt_D): ids (int64) and distances (float32), both of shape
        ``(number of queries, gt_sl)``.
    """
    nq_gt, _ = xq.shape
    print("compute GT GPU")
    t0 = time.time()

    gt_I = np.zeros((nq_gt, gt_sl), dtype='int64')
    gt_D = np.zeros((nq_gt, gt_sl), dtype='float32')

    # the heaps operate directly on the memory of gt_D / gt_I
    result_heaps = faiss.float_maxheap_array_t()
    result_heaps.k = gt_sl
    result_heaps.nh = nq_gt
    result_heaps.val = faiss.swig_ptr(gt_D)
    result_heaps.ids = faiss.swig_ptr(gt_I)
    result_heaps.heapify()

    block_size = 10 ** 5
    # Please change this based on your GPU memory size.
    tempmem = 3500*1024*1024

    _, d = xb.shape
    queries = sanitize(xq[:nq_gt])

    # one resource object per GPU, each with the temp-memory budget above
    ngpu = faiss.get_num_gpus()
    gpu_resources = []
    for _gpu in range(ngpu):
        res = faiss.StandardGpuResources()
        res.setTempMemory(tempmem)
        gpu_resources.append(res)

    vres = faiss.GpuResourcesVector()
    vdev = faiss.IntVector()
    for device, res in enumerate(gpu_resources):
        vdev.push_back(device)
        vres.push_back(res)

    cpu_index = faiss.IndexFlatL2(d)
    gpu_index = faiss.index_cpu_to_gpu_multiple(vres, vdev, cpu_index)

    # compute ground-truth by blocks, and add each block's result to the heaps
    for i0, block in dataset_iterator(xb, IdentPreproc(d), block_size):
        gpu_index.add(block)
        block_D, block_I = gpu_index.search(queries, gt_sl)
        block_I += i0  # ids are block-local; shift by the block's offset
        result_heaps.addn_with_ids(
            gt_sl, faiss.swig_ptr(block_D), faiss.swig_ptr(block_I), gt_sl)
        gpu_index.reset()

    result_heaps.reorder()
    print("GT GPU time: {} s".format(time.time() - t0))
    return gt_I, gt_D
def __init__(self, nq, k):
    " nq: number of query vectors, k: number of results per query "
    self.nq, self.k = nq, k
    # result buffers; the heap structure below works directly on their memory
    self.I = np.zeros((nq, k), dtype='int64')
    self.D = np.zeros((nq, k), dtype='float32')

    # A min-heap is used instead of a max-heap: with cosine similarity the
    # most similar (i.e. closest) vectors score 1, whereas with distances the
    # closest score is 0.
    heap_array = faiss.float_minheap_array_t()
    heap_array.k = k
    heap_array.nh = nq
    heap_array.val = faiss.swig_ptr(self.D)
    heap_array.ids = faiss.swig_ptr(self.I)
    heap_array.heapify()
    self.heaps = heap_array
def add_preassigned(index_ivf, x, a, ids=None):
    """Add vectors to an IVF index whose list assignment is already known.

    ``x`` holds one vector per row, ``a`` one assignment per vector and
    ``ids`` (optional) one id per vector.  For binary IVF indexes the width
    of ``x`` is counted in bytes and compared with the index dimension in
    bits.
    """
    n, width = x.shape
    assert a.shape == (n, )

    is_binary = isinstance(index_ivf, faiss.IndexBinaryIVF)
    dim = width * 8 if is_binary else width
    assert dim == index_ivf.d

    ids_ptr = None
    if ids is not None:
        assert ids.shape == (n, )
        ids_ptr = faiss.swig_ptr(ids)

    index_ivf.add_core(n, faiss.swig_ptr(x), ids_ptr, faiss.swig_ptr(a))
def ivf_search_preassigned(self, xq, list_nos, coarse_dis, k):
    """Search the IVF index inside ``self.index`` with a given coarse
    assignment.

    ``list_nos`` and ``coarse_dis`` must have the same shape: one row per
    query and ``nprobe`` columns.

    Returns:
        (D, I): float32 distances and int64 ids, both of shape ``(n, k)``.
    """
    index_ivf = faiss.extract_index_ivf(self.index)

    n, d = xq.shape
    assert d == index_ivf.d
    n_rows, n_probe = list_nos.shape
    assert list_nos.shape == coarse_dis.shape
    assert n_rows == n
    assert n_probe == index_ivf.nprobe

    out_shape = (n, k)
    D = np.empty(out_shape, dtype='float32')
    I = np.empty(out_shape, dtype='int64')
    index_ivf.search_preassigned(
        n, faiss.swig_ptr(xq), k,
        faiss.swig_ptr(list_nos), faiss.swig_ptr(coarse_dis),
        faiss.swig_ptr(D), faiss.swig_ptr(I),
        False,
    )
    return D, I
def update(self, features_vectors: list, image_ids: list) -> str:
    """
    Replace the vectors stored under the given ids.

    The ids are first removed from the index, then the new vectors are
    inserted through ``self.insert`` with ``is_updating=True``.

    :param features_vectors: one feature vector per image id
    :param image_ids: ids whose vectors are to be replaced
    :return: a validation message when the input is rejected, otherwise the
        status returned by ``self.insert``
    """
    # Image ids are mandatory
    if image_ids is None:
        return messages.NO_IDS_SPECIFIED

    # Exactly one id per vector
    if len(image_ids) != len(features_vectors):
        return messages.DIMENSION_MISMATCH

    # Every vector must have the dimension of the index
    if any(len(vector) != self.dimension for vector in features_vectors):
        return messages.DIMENSION_ERROR

    # e.g. image_ids == [17] becomes a numpy int64 array holding 17
    ids = numpy.array(image_ids, dtype=numpy.int64)

    # Drop the current entries for these ids
    selector = IDSelectorBatch(ids.shape[0], faiss.swig_ptr(ids))
    self.index.remove_ids(selector)

    # Insert the new values and report the insert status
    return self.insert(features_vectors, image_ids, is_updating=True)
def test_progressive_dim(self):
    """Progressive-dimension clustering must reach a lower objective than
    plain k-means on the same data."""
    d, n, k = 32, 10000, 50
    xt, _, _ = get_dataset_2(d, n, 0, 0)

    # baseline: basic kmeans
    baseline = faiss.Kmeans(d, k)
    baseline.train(xt)

    clus = faiss.ProgressiveDimClustering(d, k)
    # read each field before writing it, so both accesses are exercised
    clus.verbose
    clus.verbose = True
    clus.progressive_dim_steps
    clus.progressive_dim_steps = 5
    factory = faiss.ProgressiveDimIndexFactory()
    clus.train(n, faiss.swig_ptr(xt), factory)

    stats_vector = clus.iteration_stats
    objectives = np.array([
        stats_vector.at(i).obj for i in range(stats_vector.size())
    ])

    # clustering objective should be a tad better
    self.assertLess(objectives[-1], baseline.obj[-1])

    # same test w/ Kmeans wrapper
    wrapped = faiss.Kmeans(d, k, progressive_dim_steps=5)
    wrapped.train(xt)
    self.assertLess(wrapped.obj[-1], baseline.obj[-1])
def run_kmeans(x, nmb_clusters, verbose=False, init_cents=None):
    """Run faiss k-means on ``x`` and assign every point to a centroid.

    The clustering uses a plain ``faiss.IndexFlatL2``; no GPU resources are
    set up in this function.

    Args:
        x: data, shape ``(n_data, d)``
        nmb_clusters (int): number of clusters
        verbose (bool): print the loss recorded by the clustering
        init_cents: optional initial centroids; converted to a contiguous
            float32 buffer before being copied into the clustering object
    Returns:
        tuple: ``(assignments, final_loss, centroids)`` where ``assignments``
        is a list with the cluster id of every row of ``x``, ``final_loss``
        is the last recorded loss and ``centroids`` has shape
        ``(nmb_clusters, d)``
    """
    _, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000

    if init_cents is not None:
        # The raw bytes are copied into a float vector, so the source must be
        # contiguous float32; the previous `size * 4` silently copied garbage
        # for any other dtype or layout.  Keeping the converted array in a
        # local also keeps it alive while memcpy runs.
        init_cents = np.ascontiguousarray(init_cents, dtype='float32')
        clus.centroids.resize(init_cents.size)
        faiss.memcpy(clus.centroids.data(), faiss.swig_ptr(init_cents),
                     init_cents.nbytes)

    index = faiss.IndexFlatL2(d)

    # perform the training
    clus.train(x, index)
    _, I = index.search(x, 1)

    losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    centroids = faiss.vector_to_array(clus.centroids).reshape(
        (nmb_clusters, d))
    return [int(n[0]) for n in I], losses[-1], centroids
def do_test_array_type(self, dtype):
    """Round-trip an array of the given dtype through ``swig_ptr`` and
    ``rev_swig_ptr`` and check that the values survive."""
    n_items = 12
    source = np.arange(n_items).astype(dtype)

    ptr = faiss.swig_ptr(source)
    print(ptr)

    recovered = faiss.rev_swig_ptr(ptr, 12)
    np.testing.assert_array_equal(source, recovered)
def update_index(image_id, image_vector, index_faiss=None):
    """Replace the vector stored under ``image_id`` in ``index_faiss``.

    The id is removed from the index, then the new vector is added under the
    same id.

    NOTE(review): ``index_faiss`` defaults to ``None`` but is used
    unconditionally, so callers must always pass an index.
    """
    ids = np.array([image_id], dtype=np.int64)

    # remove the current entry for this id
    selector = IDSelectorBatch(ids.shape[0], faiss.swig_ptr(ids))
    index_faiss.remove_ids(selector)

    # add the new vector back under the same id
    vectors = np.array([image_vector], dtype=np.float32)
    index_faiss.add_with_ids(vectors, ids)
def get_cluster_ids(self, list_num: int) -> np.ndarray:
    """Return a copy of the ids stored in one inverted list.

    The merged index is read from ``self.tempdir / self.MERGED_INDEX_NAME``
    and the ids of inverted list ``list_num`` are copied out of it.  When
    ``self.multi_id`` is set, the ids are passed through
    ``self._invert_cantor_pairing_vec`` before being returned.

    :param list_num: number of the inverted list (cluster) to read
    :return: array with the ids of that list
    """
    # TODO: assert IVF
    assert self.is_trained

    # This fixes problem with SWIG and numpy int
    list_num = int(list_num)

    index = faiss.read_index(str(self.tempdir / self.MERGED_INDEX_NAME))
    # Get the IVF from potentially opaque index
    invlists = faiss.extract_index_ivf(index).invlists

    list_size = invlists.list_size(list_num)
    list_ids = np.zeros(list_size, dtype=np.int64)

    temp_ids = invlists.get_ids(list_num)
    try:
        # Need to copy since memory will be deallocated along with the
        # invlist.  Nothing to copy for an empty list.
        if list_size > 0:
            faiss.memcpy(
                faiss.swig_ptr(list_ids), temp_ids, list_ids.nbytes)
    finally:
        # always hand the ids back, even if the copy fails
        invlists.release_ids(list_num, temp_ids)

    if self.multi_id:
        list_ids = self._invert_cantor_pairing_vec(list_ids)
    return list_ids
def compute_populated_index_2(preproc):
    """Populate the trained index with the database, assigning on GPU.

    Reads the module-level ``xb`` and ``add_batch_size``.  Work is organised
    as a 3-stage pipeline:

    - stage 1: load + preproc
    - stage 2: assign on GPU
    - stage 3: add to index

    Returns:
        (None, indexall): ``indexall`` is the populated index returned by
        ``prepare_trained_index``.
    """
    indexall = prepare_trained_index(preproc)

    stage1 = dataset_iterator(xb, preproc, add_batch_size)

    vres, vdev = make_vres_vdev()
    coarse_quantizer_gpu = faiss.index_cpu_to_gpu_multiple(
        vres, vdev, indexall.quantizer)

    def quantize(batch):
        # Tuple parameters (`def quantize((i0, xs))`) no longer exist in
        # Python 3, so the (offset, vectors) pair is unpacked here instead.
        i0, xs = batch
        _, assign = coarse_quantizer_gpu.search(xs, 1)
        return i0, xs, assign.ravel()

    stage2 = rate_limited_imap(quantize, stage1)

    print("add...")
    t0 = time.time()
    nb = xb.shape[0]

    for i0, xs, assign in stage2:
        i1 = i0 + xs.shape[0]
        # exact class match on purpose: the two index types take different
        # add_core signatures
        if indexall.__class__ == faiss.IndexIVFPQ:
            indexall.add_core_o(
                i1 - i0, faiss.swig_ptr(xs),
                None, None, faiss.swig_ptr(assign))
        elif indexall.__class__ == faiss.IndexIVFFlat:
            indexall.add_core(
                i1 - i0, faiss.swig_ptr(xs),
                None, faiss.swig_ptr(assign))
        else:
            assert False, "unsupported index type: %r" % (
                indexall.__class__,)

        # progress line, rewritten in place thanks to the leading "\r"
        print('\r%d/%d (%.3f s) ' % (i0, nb, time.time() - t0), end=' ')
        sys.stdout.flush()

    print("Add time: %.3f s" % (time.time() - t0))
    return None, indexall
def compute_GT():
    """Compute ground-truth nearest neighbours on GPU, block by block.

    Reads the module-level ``xb``, ``xq``, ``nq_gt`` and ``gt_sl``.  The
    database is searched in blocks with a flat L2 index cloned to the GPUs
    returned by ``make_vres_vdev``; block results are merged into heaps that
    write directly into the result arrays.

    Returns:
        gt_I: int64 array of shape ``(nq_gt, gt_sl)`` with the neighbour ids.
    """
    # print() calls replace the Python 2 print statements, which are a
    # SyntaxError on Python 3.
    print("compute GT")
    t0 = time.time()

    gt_I = np.zeros((nq_gt, gt_sl), dtype='int64')
    gt_D = np.zeros((nq_gt, gt_sl), dtype='float32')

    # the heaps operate directly on the memory of gt_D / gt_I
    heaps = faiss.float_maxheap_array_t()
    heaps.k = gt_sl
    heaps.nh = nq_gt
    heaps.val = faiss.swig_ptr(gt_D)
    heaps.ids = faiss.swig_ptr(gt_I)
    heaps.heapify()

    bs = 10 ** 5
    n, d = xb.shape
    xqs = sanitize(xq[:nq_gt])

    db_gt = faiss.IndexFlatL2(d)
    vres, vdev = make_vres_vdev()
    db_gt_gpu = faiss.index_cpu_to_gpu_multiple(vres, vdev, db_gt)

    # compute ground-truth by blocks of bs, and add to heaps
    for i0, xsl in dataset_iterator(xb, IdentPreproc(d), bs):
        db_gt_gpu.add(xsl)
        D, I = db_gt_gpu.search(xqs, gt_sl)
        I += i0  # ids are block-local; shift by the block's offset
        heaps.addn_with_ids(
            gt_sl, faiss.swig_ptr(D), faiss.swig_ptr(I), gt_sl)
        db_gt_gpu.reset()
        # progress line, rewritten in place thanks to the leading "\r"
        print("\r %d/%d, %.3f s" % (i0, n, time.time() - t0), end=' ')
    print()
    heaps.reorder()

    print("GT time: %.3f s" % (time.time() - t0))
    return gt_I