def test_2level(self):
    """Verify that 2-level clustering is not too sub-optimal."""
    dataset = datasets.SyntheticDataset(32, 10000, 0, 0)
    train_vectors = dataset.get_train()

    # Reference: plain k-means with 100 centroids.
    reference_kmeans = faiss.Kmeans(dataset.d, 100)
    reference_kmeans.train(train_vectors)
    reference_distances, _ = faiss.knn(
        train_vectors, reference_kmeans.centroids, 1
    )
    reference_error = reference_distances.sum()

    # Candidate: two-level clustering called with (10, 10).
    two_level_centroids, _ = clustering.two_level_clustering(
        train_vectors, 10, 10
    )
    two_level_distances, _ = faiss.knn(train_vectors, two_level_centroids, 1)
    two_level_error = two_level_distances.sum()

    # Accept up to 10% more summed nearest-centroid distance than k-means.
    self.assertLess(two_level_error, reference_error * 1.1)
def test_RCQ_knn(self):
    """Compare the RCQ residual quantizer's centroid routines to brute force.

    The centroid norms, the inner-product kNN and the L2 kNN computed by the
    quantizer are checked against values computed directly from the
    reconstructed centroids.
    """
    dataset = datasets.SyntheticDataset(32, 1000, 0, 123)
    train_vectors = dataset.get_train()
    queries = dataset.get_queries()
    n_queries = len(queries)
    k = 10

    # RQ 3+4+5 = 12 bits = 4096 centroids
    coarse_quantizer = faiss.index_factory(dataset.d, "RCQ1x3_1x4_1x5")
    coarse_quantizer.train(train_vectors)
    residual_quantizer = coarse_quantizer.rq
    centroids = coarse_quantizer.reconstruct_n(0, coarse_quantizer.ntotal)
    to_ptr = faiss.swig_ptr

    # test norms computation
    expected_norms = (centroids ** 2).sum(1)
    norms = np.zeros(1 << residual_quantizer.tot_bits, dtype="float32")
    residual_quantizer.compute_centroid_norms(to_ptr(norms))
    np.testing.assert_array_almost_equal(norms, expected_norms, decimal=5)

    # test IP search
    expected_D, expected_I = faiss.knn(
        queries, centroids, k, metric=faiss.METRIC_INNER_PRODUCT
    )
    actual_D = np.zeros_like(expected_D)
    actual_I = np.zeros_like(expected_I)
    residual_quantizer.knn_centroids_inner_product(
        n_queries, to_ptr(queries), k, to_ptr(actual_D), to_ptr(actual_I)
    )
    np.testing.assert_array_almost_equal(expected_D, actual_D, decimal=5)
    np.testing.assert_array_equal(expected_I, actual_I)

    # test L2 search (reuses the centroid norms computed above)
    expected_D, expected_I = faiss.knn(
        queries, centroids, k, metric=faiss.METRIC_L2
    )
    actual_D = np.zeros_like(expected_D)
    actual_I = np.zeros_like(expected_I)
    residual_quantizer.knn_centroids_L2(
        n_queries,
        to_ptr(queries),
        k,
        to_ptr(actual_D),
        to_ptr(actual_I),
        to_ptr(norms),
    )
    np.testing.assert_array_equal(expected_I, actual_I)
    np.testing.assert_array_almost_equal(expected_D, actual_D, decimal=5)
def test_small_data(self):
    """Check that an IMI index probing every list matches exhaustive search.

    Both the kNN results and the range-search results of the
    'IMI2x4,Flat' index are compared with brute-force results on the same
    database vectors.
    """
    d = 20
    # nlist = (2^4)^2 = 256
    index = faiss.index_factory(d, 'IMI2x4,Flat')

    # When nprobe >= nlist, it is equivalent to an IndexFlat.
    rs = np.random.RandomState(123)
    xt = rs.rand(100, d).astype('float32')
    xb = rs.rand(1000, d).astype('float32')

    index.train(xt)
    index.add(xb)
    index.nprobe = 2048
    k = 5
    xq = rs.rand(10, d).astype('float32')

    # test kNN search
    D, I = index.search(xq, k)
    ref_D, ref_I = faiss.knn(xq, xb, k)
    np.testing.assert_array_equal(D, ref_D)
    np.testing.assert_array_equal(I, ref_I)

    # test range search
    thresh = 0.1  # *squared* distance
    lims, D, I = index.range_search(xq, thresh)

    # The reference must come from the flat index: querying `index` twice
    # would compare the index with itself and could never fail.
    gt_index = faiss.IndexFlat(d)
    gt_index.add(xb)
    ref_lims, ref_D, ref_I = gt_index.range_search(xq, thresh)

    np.testing.assert_array_equal(lims, ref_lims)

    # NOTE(review): the order of results inside one query is not assumed to
    # be the same for both indexes, so each query's results are sorted by id
    # before being compared.
    for begin, end in zip(lims[:-1].tolist(), lims[1:].tolist()):
        order = I[begin:end].argsort()
        ref_order = ref_I[begin:end].argsort()
        np.testing.assert_array_equal(
            I[begin:end][order], ref_I[begin:end][ref_order]
        )
        np.testing.assert_array_equal(
            D[begin:end][order], ref_D[begin:end][ref_order]
        )
def test_query_iterator(self, metric=faiss.METRIC_L2):
    """Check range_search_max_results when queries arrive in batches.

    First the batched search must reproduce a plain range search at the
    same threshold. Then, with max_results set to half of the reference
    result count, the returned results must respect that cap and match a
    plain range search at the threshold the function returns.
    """
    dataset = datasets.SyntheticDataset(32, 0, 1000, 1000)
    queries = dataset.get_queries()
    database = dataset.get_database()

    # Use the mean distance to the 10th neighbour as the search radius.
    knn_distances, _ = faiss.knn(queries, database, 10, metric=metric)
    threshold = float(knn_distances[:, -1].mean())
    print(threshold)

    index = faiss.IndexFlat(32, metric)
    index.add(database)
    ref_lims, ref_D, ref_I = index.range_search(queries, threshold)

    def matrix_iterator(matrix, batch_size):
        """Yield consecutive row slices of at most `batch_size` rows."""
        for start in range(0, matrix.shape[0], batch_size):
            stop = start + batch_size
            yield matrix[start:stop]

    # check repro OK
    _, new_lims, new_D, new_I = range_search_max_results(
        index, matrix_iterator(queries, 100), threshold
    )
    evaluation.test_ref_range_results(
        ref_lims, ref_D, ref_I, new_lims, new_D, new_I
    )

    # Now cap the number of results at half of the reference count.
    max_res = ref_lims[-1] // 2
    new_threshold, new_lims, new_D, new_I = range_search_max_results(
        index, matrix_iterator(queries, 100), threshold, max_results=max_res
    )
    self.assertLessEqual(new_lims[-1], max_res)

    # The capped results must match a plain search at the returned threshold.
    ref_lims, ref_D, ref_I = index.range_search(queries, new_threshold)
    evaluation.test_ref_range_results(
        ref_lims, ref_D, ref_I, new_lims, new_D, new_I
    )
def test_sparse_routines(self):
    """Test the sparse assignment routines against a dense 1-NN search."""
    dataset = datasets.SyntheticDataset(1000, 2000, 0, 200)
    train_vectors = dataset.get_train().copy()
    faiss.normalize_L2(train_vectors)

    # Zero out the small components (around 10% non-zeros remain).
    keep = np.abs(train_vectors) > 0.045
    train_vectors[~keep] = 0

    centroids = dataset.get_queries()
    assert len(centroids) == 200

    sparse_vectors = scipy.sparse.csr_matrix(train_vectors)

    # Reference assignment computed on the densified matrix.
    ref_D, ref_I = faiss.knn(sparse_vectors.todense(), centroids, 1)
    expected_labels = ref_I.ravel()
    expected_distances = ref_D.ravel()

    distances, labels = clustering.sparse_assign_to_dense(
        sparse_vectors, centroids
    )
    np.testing.assert_array_equal(expected_labels, labels)
    np.testing.assert_array_almost_equal(
        expected_distances, distances, decimal=4
    )

    # Same check for the blocked variant, called with qbs=123, bbs=33, nt=4.
    distances, labels = clustering.sparse_assign_to_dense_blocks(
        sparse_vectors, centroids, qbs=123, bbs=33, nt=4
    )
    np.testing.assert_array_equal(expected_labels, labels)
    np.testing.assert_array_almost_equal(
        expected_distances, distances, decimal=4
    )
def subtest(self, d, K, metric):
    """Build an NNDescent kNN graph and check its accuracy against brute force.

    Args:
        d: dimension passed to get_dataset_2 and to the index.
        K: number of neighbours per node in the graph.
        metric: faiss metric type; must be one of METRIC_L1, METRIC_L2 or
            METRIC_INNER_PRODUCT (used as a key for the printed name).

    Raises:
        AssertionError: if the fraction of graph edges that appear among
            the exact neighbours is not above 0.99.
    """
    metric_names = {faiss.METRIC_L1: 'L1',
                    faiss.METRIC_L2: 'L2',
                    faiss.METRIC_INNER_PRODUCT: 'IP'}

    nb = 1000
    _, xb, _ = get_dataset_2(d, 0, nb, 0)

    # Exact neighbours: ask for K + 1 and drop the first column
    # (presumably the point itself, since the database is searched
    # against itself).
    _, knn = faiss.knn(xb, xb, K + 1, metric)
    knn = knn[:, 1:]

    index = faiss.IndexNNDescentFlat(d, K, metric)
    index.nndescent.S = 10
    index.nndescent.R = 32
    index.nndescent.L = K + 20
    index.nndescent.iter = 5
    index.verbose = True

    index.add(xb)
    graph = faiss.vector_to_array(index.nndescent.final_graph)
    graph = graph.reshape(nb, K)

    # An edge graph[i, j] is a hit when it equals any of knn[i, :].
    # Broadcasting to shape (nb, K, K) replaces the triple Python loop and
    # counts each edge at most once, exactly like the loop with `break`.
    hits = (graph[:, :, None] == knn[:, None, :]).any(axis=2)
    recalls = int(hits.sum())

    recall = 1.0 * recalls / (nb * K)
    print('Metric: {}, knng accuracy: {}'.format(metric_names[metric], recall))
    assert recall > 0.99
def test_python_kmeans(self):
    """Test the python implementation of kmeans."""
    n_centroids = 100
    n_iterations = 10

    dataset = datasets.SyntheticDataset(32, 10000, 0, 0)
    all_train = dataset.get_train()

    # bad distribution to stress-test split code:
    # the first half of the vectors are all copies of the first one
    train_vectors = all_train[:10000].copy()
    train_vectors[:5000] = all_train[0]

    # Reference: faiss.Kmeans with the same number of centroids/iterations.
    reference_kmeans = faiss.Kmeans(
        dataset.d, n_centroids, niter=n_iterations
    )
    reference_kmeans.train(train_vectors)
    reference_distances, _ = faiss.knn(
        train_vectors, reference_kmeans.centroids, 1
    )
    reference_error = reference_distances.sum()

    # Python implementation under test.
    assigner = clustering.DatasetAssign(train_vectors)
    python_centroids = clustering.kmeans(n_centroids, assigner, n_iterations)
    python_distances, _ = faiss.knn(train_vectors, python_centroids, 1)
    python_error = python_distances.sum()

    # 33517.645 and 33031.098
    self.assertLess(python_error, reference_error * 1.1)
def eval_codec(q, xq, xb, gt):
    """Encode and decode vectors with `q` and print codec quality figures.

    Prints the time spent in compute_codes on `xb`, the mean squared
    reconstruction error of `xb`, the fraction of queries whose nearest
    decoded database vector equals the first ground-truth column, and the
    mean L2 norm of the reconstruction residual.

    Args:
        q: codec object exposing compute_codes() and decode().
        xq: query vectors (one per row).
        xb: database vectors (one per row).
        gt: ground-truth neighbour ids; only column 0 is read.
    """
    t0 = time.time()
    codes = q.compute_codes(xb)
    t1 = time.time()
    xb_decoded = q.decode(codes)

    residual = xb - xb_decoded
    recons_err = (residual ** 2).sum() / xb.shape[0]
    # for compatibility with the codec benchmarks
    err_compat = np.linalg.norm(residual, axis=1).mean()

    # Search decoded queries among decoded database vectors.
    xq_decoded = q.decode(q.compute_codes(xq))
    _, I = faiss.knn(xq_decoded, xb_decoded, 1)

    # Normalise by the number of queries actually searched rather than by
    # an `nq` name that is not defined in this function.
    recall = (I[:, 0] == gt[:, 0]).sum() / xq.shape[0]
    print(
        f"\tencode time: {t1 - t0:.3f} reconstruction error: {recons_err:.3f} "
        f"1-recall@1: {recall:.4f} recons_err_compat {err_compat:.3f}")
def _knn_search(queries, data, k):
    """Perform exact knn search (should be replaced with approximate).

    Uses faiss.knn_gpu when CUDA is available and faiss.knn on numpy
    copies of the tensors otherwise.

    Returns:
        (distances, indices) of the k nearest keys. On the GPU path these
        are converted to numpy arrays; on the CPU path they are whatever
        faiss.knn returns.
    """
    # not the best way but should let me know that gpu is being used
    if not torch.cuda.is_available():
        queries_np = queries.detach().numpy()
        data_np = data.detach().numpy()
        return faiss.knn(queries_np, data_np, k)  # (distances, indices)

    gpu_resources = faiss.StandardGpuResources()
    distances, indices = faiss.knn_gpu(gpu_resources, queries, data, k)
    return distances.detach().cpu().numpy(), indices.detach().cpu().numpy()
def do_test_range(self, metric):
    """Check range_ground_truth against IndexFlat.range_search for `metric`.

    The database is fed to range_ground_truth in batches of 100 through
    the dataset's database_iterator.
    """
    dataset = datasets.SyntheticDataset(32, 0, 1000, 10)
    queries = dataset.get_queries()
    database = dataset.get_database()

    # Use the mean distance to the 10th neighbour as the search radius.
    knn_distances, _ = faiss.knn(queries, database, 10, metric=metric)
    threshold = float(knn_distances[:, -1].mean())

    flat_index = faiss.IndexFlat(32, metric)
    flat_index.add(database)
    expected = flat_index.range_search(queries, threshold)
    ref_lims, ref_D, ref_I = expected

    new_lims, new_D, new_I = range_ground_truth(
        queries,
        dataset.database_iterator(bs=100),
        threshold,
        metric_type=metric,
    )

    evaluation.test_ref_range_results(
        ref_lims, ref_D, ref_I, new_lims, new_D, new_I
    )
def test_rand_vector(self):
    """Test that rand_smooth_vectors output is reasonably compressible
    with a small PQ."""
    n_train, n_database = 1000, 200
    k = 10

    vectors = faiss.rand_smooth_vectors(1300, 32)
    train_vectors = vectors[:n_train]
    database = vectors[n_train:n_train + n_database]
    queries = vectors[n_train + n_database:]

    # Exact neighbours used as ground truth.
    _, ground_truth = faiss.knn(queries, database, k)

    pq_index = faiss.IndexPQ(32, 4, 4)
    pq_index.train(train_vectors)
    pq_index.add(database)
    _, found = pq_index.search(queries, k)

    n_intersection = faiss.eval_intersection(found, ground_truth)

    # 445 for SyntheticDataset
    self.assertGreater(n_intersection, 420)
    self.assertLess(n_intersection, 460)
def do_test(self, metric):
    """Check IndexRefine on top of a "PQ4x4np" index for `metric`.

    Two things are verified:
      * the distance reported at rank 5 of the refined results equals the
        exact distance (squared L2, or inner product for other metrics)
        between the query and the returned database vector;
      * the true nearest neighbour appears in the refined top-10 for as
        many queries as it appears in the unrefined top-100.
    """
    d = 32
    xt, xb, xq = get_dataset_2(d, 2000, 1000, 200)
    index1 = faiss.index_factory(d, "PQ4x4np", metric)
    _, Iref = faiss.knn(xq, xb, 10, metric)

    index1.train(xt)
    index1.add(xb)

    _, I1 = index1.search(xq, 100)

    # Number of queries whose true nearest neighbour is in the top-100.
    recall1 = (I1 == Iref[:, :1]).sum()

    # add refine index on top
    index_flat = faiss.IndexFlat(d, metric)
    index_flat.add(xb)

    index2 = faiss.IndexRefine(index1, index_flat)
    index2.k_factor = 10.0
    D2, I2 = index2.search(xq, 10)

    # check distance is computed properly
    for query, neighbor_ids, distances in zip(xq, I2, D2):
        neighbor = xb[neighbor_ids[5]]
        if metric == faiss.METRIC_L2:
            dref = ((query - neighbor) ** 2).sum()
        else:
            dref = np.dot(query, neighbor)
        np.testing.assert_almost_equal(dref, distances[5], decimal=5)

    # check that with refinement, the recall@10 is the same as
    # the original recall@100
    recall2 = (I2 == Iref[:, :1]).sum()
    # assertEquals is a deprecated alias that was removed in Python 3.12.
    self.assertEqual(recall1, recall2)
def perform_search(self, centroids):
    """Return faiss.knn's result for the 1 nearest centroid of each row of
    self.x."""
    distances, labels = faiss.knn(self.x, centroids, 1)
    return distances, labels
def get_groundtruth(self, k=100):
    """Return the ids of the k exact nearest neighbours of self.xq in self.xb.

    The search uses METRIC_L2 when self.metric is 'L2' and
    METRIC_INNER_PRODUCT for any other value.
    """
    if self.metric == 'L2':
        metric_type = faiss.METRIC_L2
    else:
        metric_type = faiss.METRIC_INNER_PRODUCT
    _, neighbor_ids = faiss.knn(self.xq, self.xb, k, metric_type)
    return neighbor_ids