def test_instantiate(self):
    """The PQ code dtype must widen as the codebook size Ks grows."""
    expectations = [
        (256, np.uint8),        # fits in one byte
        (500, np.uint16),       # > 256 needs two bytes
        (2**16 + 10, np.uint32),  # > 65536 needs four bytes
    ]
    for ks, expected_dtype in expectations:
        quantizer = nanopq.PQ(M=4, Ks=ks)
        self.assertEqual(quantizer.code_dtype, expected_dtype)
def test_fit(self):
    """fit() must set Ds and codewords correctly and be chainable."""
    N, D, M, Ks = 100, 12, 4, 10
    X = np.random.random((N, D)).astype(np.float32)
    pq = nanopq.PQ(M=M, Ks=Ks)
    pq.fit(X)
    # Use integer division: dimensions are integers, and comparing a shape
    # tuple against the float produced by `D / M` only worked by accident
    # (3 == 3.0). `D // M` states the intent.
    self.assertEqual(pq.Ds, D // M)
    self.assertEqual(pq.codewords.shape, (M, Ks, D // M))
    pq2 = nanopq.PQ(M=M, Ks=Ks).fit(X)  # Can be called as a chain
    self.assertTrue(np.allclose(pq.codewords, pq2.codewords))
def fit(self, vecs, iter=20, seed=123):
    """Given training vectors, train a codec (PQ or OPQ instance).

    This should be called first and only once.

    Args:
        vecs (np.ndarray): Training vectors with shape=(Nt, D) and
            dtype=np.float32.
        iter (int): The number of iterations for k-means of PQ/OPQ.
        seed (int): The seed for the random process.

    Returns:
        object: self
    """
    assert self.fine_quantizer is None, "`fit` should be called only once"
    assert vecs.dtype == np.float32
    # Instantiate and train the codec selected at construction time.
    if self.codec == "pq":
        self.fine_quantizer = nanopq.PQ(M=self.M, Ks=self.Ks, verbose=self.verbose)
        self.fine_quantizer.fit(vecs=vecs, iter=iter, seed=seed)
    elif self.codec == "opq":
        self.fine_quantizer = nanopq.OPQ(M=self.M, Ks=self.Ks, verbose=self.verbose)
        # rotation_iter is currently fixed to 10
        self.fine_quantizer.fit(vecs=vecs, pq_iter=iter, rotation_iter=10, seed=seed)
    # Set trained codewords to cpp impl
    self.impl_cpp.set_codewords(self.fine_quantizer.codewords)
    return self
def test_query_linear(self):
    """query_linear() must return sorted top-k lists and honor subset searches."""
    M, Ks = 4, 20
    N, D = 1000, 40
    X = np.random.random((N, D)).astype(np.float32)
    codec = nanopq.PQ(M=M, Ks=Ks, verbose=True).fit(vecs=X)
    e = rii.Rii(fine_quantizer=codec)
    e.add_configure(vecs=X, nlist=20)
    for n, q in enumerate(X[:10]):
        topk = 10
        no_subset = np.array([], dtype=np.int64)
        ids1, dists1 = e.impl_cpp.query_linear(q, topk, no_subset)
        # Both outputs are plain Python lists of the expected element types
        for seq, elem_type in ((ids1, int), (dists1, float)):
            self.assertTrue(isinstance(seq, list))
            self.assertTrue(isinstance(seq[0], elem_type))
        self.assertEqual(len(ids1), topk)
        self.assertEqual(len(ids1), len(dists1))
        # Distances must come back in non-decreasing order
        self.assertTrue(np.all(0 <= np.diff(dists1)))
        # The true NN is included in top 10 with high prob
        self.assertTrue(n in ids1)
        # Subset search w/ a full indices should be the same w/o target
        ids2, dists2 = e.impl_cpp.query_linear(q, topk, np.arange(N))
        self.assertListEqual(ids1, ids2)
        self.assertListEqual(dists1, dists2)
        # Subset search must only ever return ids from the subset
        S = np.array(
            [2, 24, 43, 55, 102, 139, 221, 542, 667, 873, 874, 899])
        ids3, dists3 = e.impl_cpp.query_linear(q, topk, S)
        self.assertTrue(np.all([candidate in S for candidate in ids3]))
def test_nanopq_to_faiss(self):
    """nanopq_to_faiss() must reject OPQ and preserve codes and search order."""
    D, M, Ks = 32, 4, 256
    Nt, Nb, Nq = 2000, 10000, 100
    Xt = np.random.rand(Nt, D).astype(np.float32)
    Xb = np.random.rand(Nb, D).astype(np.float32)
    Xq = np.random.rand(Nq, D).astype(np.float32)
    pq_nanopq = nanopq.PQ(M=M, Ks=Ks)
    pq_nanopq.fit(vecs=Xt)
    # opq is not supported
    with self.assertRaises(AssertionError):
        opq = nanopq.OPQ(M=M, Ks=Ks)
        nanopq.nanopq_to_faiss(opq)
    pq_faiss = nanopq.nanopq_to_faiss(pq_nanopq)  # IndexPQ
    # The two libraries must emit identical PQ codes
    Cb_nanopq = pq_nanopq.encode(vecs=Xb)
    Cb_faiss = pq_faiss.pq.compute_codes(x=Xb)  # ProductQuantizer in IndexPQ
    self.assertTrue(np.array_equal(Cb_nanopq, Cb_faiss))
    # Search results must agree as well
    topk = 10
    pq_faiss.add(Xb)
    _, ids1 = pq_faiss.search(x=Xq, k=topk)
    rows = []
    for xq in Xq:
        adists = pq_nanopq.dtable(query=xq).adist(codes=Cb_nanopq)
        rows.append(np.argsort(adists)[:topk])
    ids2 = np.array(rows)
    self.assertTrue(np.array_equal(ids1, ids2))
def test_faiss_nanopq_compare_accuracy(self):
    """Reconstruction error of nanopq must be within 1% (relative) of faiss."""
    D, M, Ks = 32, 4, 256
    Nt, Nb = 20000, 10000
    nbits = int(np.log2(Ks))
    assert nbits == 8
    Xt = np.random.rand(Nt, D).astype(np.float32)
    Xb = np.random.rand(Nb, D).astype(np.float32)
    # (Xq was generated here previously but never used; removed.)
    pq_faiss = faiss.IndexPQ(D, M, nbits)
    pq_faiss.train(x=Xt)
    Cb_faiss = pq_faiss.pq.compute_codes(Xb)
    Xb_faiss_ = pq_faiss.pq.decode(Cb_faiss)
    pq_nanopq = nanopq.PQ(M=M, Ks=Ks)
    pq_nanopq.fit(vecs=Xt)
    Cb_nanopq = pq_nanopq.encode(vecs=Xb)
    Xb_nanopq_ = pq_nanopq.decode(codes=Cb_nanopq)
    # Reconstruction error should be almost identical
    avg_relative_error_faiss = ((Xb - Xb_faiss_) ** 2).sum() / (Xb ** 2).sum()
    avg_relative_error_nanopq = ((Xb - Xb_nanopq_) ** 2).sum() / (Xb ** 2).sum()
    # abs() replaces the roundabout sqrt(x ** 2): same value, clearer intent.
    diff_rel = abs(
        (avg_relative_error_faiss - avg_relative_error_nanopq)
        / avg_relative_error_faiss
    )
    print("avg_rel_error_faiss:", avg_relative_error_faiss)
    print("avg_rel_error_nanopq:", avg_relative_error_nanopq)
    print("diff rel:", diff_rel)
    self.assertLess(diff_rel, 0.01)
def pq_search(source_dataset, query_vector):
    """Encode the dataset with PQ and print asymmetric distances to the query.

    Args:
        source_dataset: Training/database vectors for the quantizer.
        query_vector: A single query vector.
    """
    quantizer = nanopq.PQ(M=8)
    quantizer.fit(source_dataset)
    codes = quantizer.encode(source_dataset)
    # Asymmetric distance from the query to every PQ code, shape (N,)
    distances = quantizer.dtable(query_vector).adist(codes)
    print(distances)
def test_construct(self):
    """Construction must expose the codec shape, M/Ks, and a writable verbose flag."""
    M, Ks = 4, 20
    N, D = 1000, 40
    X = np.random.random((N, D)).astype(np.float32)
    codec = nanopq.PQ(M=M, Ks=Ks, verbose=True).fit(vecs=X)
    e = rii.Rii(fine_quantizer=codec)
    # Codewords: one (D/M)-dim table of Ks centroids per subspace
    self.assertEqual(e.fine_quantizer.codewords.shape, (M, Ks, D / M))
    self.assertEqual((e.M, e.Ks), (M, Ks))
    # verbose is propagated from the codec and can be toggled afterwards
    self.assertEqual(e.verbose, True)
    e.verbose = False
    self.assertEqual(e.verbose, False)
def test_add_configure(self):
    """add_configure() must equal add()+reconfigure() and be chainable."""
    M, Ks = 4, 20
    N, D = 1000, 40
    X = np.random.random((N, D)).astype(np.float32)

    def fresh_engine():
        # Each engine gets its own freshly trained codec, as before
        return rii.Rii(
            fine_quantizer=nanopq.PQ(M=M, Ks=Ks, verbose=True).fit(vecs=X))

    e1 = fresh_engine()
    e1.add_configure(vecs=X, nlist=20)
    e2 = fresh_engine()
    e2.add(vecs=X, update_posting_lists=False)
    e2.reconfigure(nlist=20)
    # The result of add_configure() should be the same as that of
    # (1) add(update_posting_lists=False) and (2) reconfigure()
    self.assertTrue(np.allclose(e1.codes, e2.codes))
    self.assertListEqual(e1.posting_lists, e2.posting_lists)
    # Can be called as a chain
    e3 = fresh_engine().add_configure(vecs=X, nlist=20)
    self.assertTrue(np.allclose(e1.codes, e3.codes))
    self.assertListEqual(e1.posting_lists, e3.posting_lists)
def test_pickle(self):
    """A fitted PQ must round-trip through pickle unchanged."""
    import pickle
    N, D, M, Ks = 100, 12, 4, 10
    X = np.random.random((N, D)).astype(np.float32)
    pq = nanopq.PQ(M=M, Ks=Ks)
    pq.fit(X)
    restored = pickle.loads(pickle.dumps(pq))
    # Scalar configuration survives the round trip
    for attr in ("M", "Ks", "verbose", "code_dtype", "Ds"):
        self.assertEqual(getattr(pq, attr), getattr(restored, attr))
    # Trained codewords survive, and equality holds overall
    self.assertTrue(np.allclose(pq.codewords, restored.codewords))
    self.assertTrue(pq == restored)
def test_eq(self):
    """Equality must hold for same-config/deep-copied PQs, before and after fit."""
    import copy
    N, D, M, Ks = 100, 12, 4, 10
    X = np.random.random((N, D)).astype(np.float32)
    pq1 = nanopq.PQ(M=M, Ks=Ks)
    pq2 = nanopq.PQ(M=M, Ks=Ks)
    pq3 = copy.deepcopy(pq1)
    pq4 = nanopq.PQ(M=M, Ks=2 * Ks)

    def check_relations():
        # pq1 equals itself, an identically configured PQ, and its deep copy,
        # but differs from a PQ with another Ks
        self.assertTrue(pq1 == pq1)
        self.assertTrue(pq1 == pq2)
        self.assertTrue(pq1 == pq3)
        self.assertTrue(pq1 != pq4)

    check_relations()  # untrained
    pq1.fit(X)
    pq2.fit(X)
    pq3 = copy.deepcopy(pq1)
    pq4.fit(X)
    check_relations()  # trained
def test_query_ivf(self):
    """query_ivf() must return sorted top-k results, respect subset targets,
    and agree with query_linear() when the candidate budget covers everything.
    """
    M, Ks = 20, 256
    N, D = 1000, 40
    X = np.random.random((N, D)).astype(np.float32)
    e = rii.Rii(fine_quantizer=nanopq.PQ(M=M, Ks=Ks, verbose=True).fit(
        vecs=X))
    e.add_configure(vecs=X, nlist=20)
    for n, q in enumerate(X[:10]):
        # L: number of candidates the IVF probe examines
        L = 200
        topk = 10
        # Empty target array means "search all vectors"
        ids1, dists1 = e.impl_cpp.query_ivf(q, topk,
                                            np.array([], dtype=np.int64), L)
        self.assertTrue(isinstance(ids1, list))
        self.assertTrue(isinstance(ids1[0], int))
        self.assertTrue(isinstance(dists1, list))
        self.assertTrue(isinstance(dists1[0], float))
        self.assertEqual(len(ids1), topk)
        self.assertEqual(len(ids1), len(dists1))
        self.assertTrue(
            np.all(0 <= np.diff(dists1)))  # Make sure dists1 is sorted
        # The true NN is included in top 10 with high prob
        # This might fail if the parameters are severe
        self.assertTrue(n in ids1)
        # Subset search w/ a full indices should be the same w/o target
        ids2, dists2 = e.impl_cpp.query_ivf(q, topk,
                                            np.arange(N, dtype=np.int64), L)
        self.assertListEqual(ids1, ids2)
        self.assertListEqual(dists1, dists2)
        # Subset search must only return ids from the given subset
        S = np.array(
            [2, 24, 43, 55, 102, 139, 221, 542, 667, 873, 874, 899],
            dtype=np.int64)
        ids3, dists3 = e.impl_cpp.query_ivf(q, topk, S, L)
        self.assertTrue(np.all([id in S for id in ids3]))
        # When target_ids is all vectors and L=all, the results is the same
        # as linear PQ scan
        ids4, dists4 = e.impl_cpp.query_ivf(q, topk,
                                            np.arange(N, dtype=np.int64), N)
        ids5, dists5 = e.impl_cpp.query_linear(
            q, topk, np.array([], dtype=np.int64))
        self.assertListEqual(ids4, ids5)
        self.assertListEqual(dists4, dists5)
        # When target_ids is specified and L is large, linear and ivf should
        # produce the same result
        ids6, dists6 = e.impl_cpp.query_ivf(q, topk, S, L)
        ids7, dists7 = e.impl_cpp.query_linear(q, topk, S)
        self.assertListEqual(ids6, ids7)
        self.assertListEqual(dists6, dists7)
def fit(self, x):
    """Train a PQ model on the given vectors and store the encoded data.

    Args:
        x: Training vectors; converted to a float32 ndarray of shape (N, D).

    Raises:
        RuntimeError: If fewer than ``self.minimum_required`` vectors are given.
    """
    vectors = np.float32(np.asarray(x))
    if len(vectors) < self.minimum_required:
        # Fixed ungrammatical message ("Too less data" -> "Too little data")
        raise RuntimeError(
            f"Too little data to train; need at least {self.minimum_required} vectors"
        )
    # Ks must stay below the number of training vectors so k-means has
    # more samples than clusters; cap at ks_max otherwise.
    pq_model = nanopq.PQ(M=self.m, Ks=min(len(vectors) - 1, self.ks_max))
    pq_model.fit(vecs=vectors, iter=self.n_iter, seed=random.randint(1, 10000))
    self.model = PQModelHolder(
        pq_model=pq_model,
        pq_codes=pq_model.encode(vecs=vectors),
        indexed_data=self.indexed_data,
    )
def test_encode_decode(self):
    """encode() must yield (N, M) uint8 codes; decode() must roughly invert it."""
    N, D, M, Ks = 100, 12, 4, 10
    X = np.random.random((N, D)).astype(np.float32)
    pq = nanopq.PQ(M=M, Ks=Ks)
    pq.fit(X)
    codes = pq.encode(X)  # encoded
    self.assertEqual(codes.shape, (N, M))
    self.assertEqual(codes.dtype, np.uint8)
    reconstructed = pq.decode(codes)
    self.assertEqual(X.shape, reconstructed.shape)
    # The reconstruction should be close to the original: small relative error
    relative_error = (
        np.linalg.norm(X - reconstructed) ** 2 / np.linalg.norm(X) ** 2)
    self.assertTrue(relative_error < 0.1)
def test_clear(self):
    """clear() must reset the engine to its just-constructed empty state."""
    M, Ks = 4, 20
    N, D = 1000, 40
    X = np.random.random((N, D)).astype(np.float32)
    codec = nanopq.PQ(M=M, Ks=Ks, verbose=True).fit(vecs=X)
    e = rii.Rii(fine_quantizer=codec)
    e.add_configure(vecs=X, nlist=20)
    e.clear()
    # Everything built by add_configure is gone again
    self.assertTrue(e.threshold is None)
    for attr, expected in (("N", 0), ("nlist", 0),
                           ("coarse_centers", None), ("codes", None)):
        self.assertEqual(getattr(e, attr), expected)
    self.assertEqual(len(e.posting_lists), 0)
def test_search(self):
    """dtable()/adist() must rank the query's own vector as the nearest code."""
    N, D, M, Ks = 100, 12, 4, 10
    X = np.random.random((N, D)).astype(np.float32)
    pq = nanopq.PQ(M=M, Ks=Ks)
    pq.fit(X)
    X_ = pq.encode(X)
    q = X[13]
    dtbl = pq.dtable(q)
    self.assertEqual(dtbl.dtable.shape, (M, Ks))
    dists = dtbl.adist(X_)
    self.assertEqual(len(dists), N)
    self.assertEqual(np.argmin(dists), 13)
    dists2 = pq.dtable(q).adist(X_)  # can be chained
    # assertAlmostEqual on two *lists* only passed via unittest's exact
    # `first == second` shortcut; compare element-wise with a tolerance.
    self.assertTrue(np.allclose(dists, dists2))
def test_simple_add_configure(self):
    """Vectors can be added in batches; reconfigure() rebuilds posting lists."""
    M, Ks = 4, 20
    N1, N2, D = 300, 700, 40
    X1 = np.random.random((N1, D)).astype(np.float32)
    X2 = np.random.random((N2, D)).astype(np.float32)
    codec = nanopq.PQ(M=M, Ks=Ks, verbose=True).fit(vecs=X1)
    e = rii.Rii(fine_quantizer=codec)
    # Two successive batches: N accumulates
    e.add(vecs=X1)
    self.assertEqual(e.N, N1)
    e.add(vecs=X2)
    self.assertEqual(e.N, N1 + N2)
    for nlist in [5, 100]:
        e.reconfigure(nlist=nlist)
        self.assertEqual(e.nlist, nlist)
        self.assertEqual(e.coarse_centers.shape, (nlist, M))
        self.assertEqual(len(e.posting_lists), nlist)
        # Every stored vector is assigned to exactly one posting list
        total_assigned = sum(len(plist) for plist in e.posting_lists)
        self.assertEqual(total_assigned, N1 + N2)
def test_pickle(self):
    """A configured Rii engine must round-trip through pickle unchanged."""
    import pickle
    M, Ks = 10, 256
    N, D = 1000, 40
    X = np.random.random((N, D)).astype(np.float32)
    codec = nanopq.PQ(M=M, Ks=Ks, verbose=True).fit(vecs=X)
    original = rii.Rii(fine_quantizer=codec)
    original.add_configure(vecs=X, nlist=20)
    restored = pickle.loads(pickle.dumps(original))
    # Configuration, codewords, codes, and posting lists all survive
    self.assertEqual((original.M, original.Ks, original.threshold),
                     (restored.M, restored.Ks, restored.threshold))
    self.assertTrue(np.allclose(original.coarse_centers,
                                restored.coarse_centers))
    self.assertTrue(np.allclose(original.codes, restored.codes))
    for pl1, pl2 in zip(original.posting_lists, restored.posting_lists):
        self.assertListEqual(pl1, pl2)
def test_merge(self):
    """merge() must concatenate the other engine's codes into this one,
    keeping this engine's existing posting-list structure (if any).
    """
    from itertools import chain
    M, Ks, N1, N2, D = 4, 20, 1000, 500, 40
    X1 = np.random.random((N1, D)).astype(np.float32)
    X2 = np.random.random((N2, D)).astype(np.float32)
    # Both engines share one trained codec
    codec = nanopq.PQ(M=M, Ks=Ks, verbose=True).fit(vecs=X1)
    e1 = rii.Rii(fine_quantizer=codec)
    e2 = rii.Rii(fine_quantizer=codec)
    # e1: empty  e2: empty -> merging two empty engines is a no-op
    e1.merge(e2)
    self.assertEqual((e1.N, e2.N), (0, 0))
    # e1: vecs  e2: empty
    e1.add_configure(vecs=X1)
    e1.merge(e2)  # posting lists are created in the above line
    self.assertEqual(e1.N, N1)
    self.assertEqual(e1.nlist, int(np.sqrt(N1)))  # Have posting lists
    e1.clear()
    # e1: empty  e2: vecs
    e2.add_configure(vecs=X2)
    e1.merge(e2)  # e1 didn't have posting lists
    self.assertEqual(e1.N, N2)
    self.assertEqual(e1.nlist, 0)  # No posting lists
    e1.clear()
    e2.clear()
    # e1: vecs  e2: vecs
    e1.add_configure(vecs=X1)
    e2.add_configure(vecs=X2)
    e1.merge(e2)
    self.assertEqual(e1.N, N1 + N2)
    self.assertEqual(e1.nlist, int(
        np.sqrt(N1)))  # posting lists are same as the original e1
    # Make sure everything is fine: merged codes equal encoding the
    # concatenated data, and the posting lists cover every id exactly once
    self.assertTrue(
        np.array_equal(e1.codes, codec.encode(np.vstack((X1, X2)))))
    self.assertEqual(sorted(chain(*e1.posting_lists)),
                     list(range(N1 + N2)))
def pq_dis():
    """Encode random vectors with PQ and print the 5 nearest codes to a query,
    with timing for the distance computation.
    """
    N, D = 10000, 128
    X = np.random.random((N, D)).astype(np.float32)  # 10,000 128-dim vectors
    query = np.random.random((D, )).astype(np.float32)  # a 128-dim vector
    # Instantiate with M=8 sub-spaces
    pq = nanopq.PQ(M=8, Ks=256)
    # Train with the top 1000 vectors
    pq.fit(X[:1000])
    # Encode to PQ-codes
    X_code = pq.encode(X)  # (10000, 8) with dtype=np.uint8
    time1 = datetime.datetime.now()
    # Results: create a distance table online, and compute Asymmetric
    # Distance to each PQ-code
    dists = pq.dtable(query).adist(X_code)
    nsmallestList = heapq.nsmallest(5, dists)
    print(nsmallestList)
    # Convert to a Python list ONCE; the previous code called
    # dists.tolist() inside the comprehension, rebuilding the whole list
    # for every lookup (quadratic work).
    dists_list = dists.tolist()
    indexs = [dists_list.index(i) for i in nsmallestList]
    print(indexs)
    print(dists[indexs])
    print("time", (datetime.datetime.now() - time1).microseconds)
def test_add_configure_small_number_of_vectors(self):
    """Adding vectors one at a time must match adding them all at once."""
    import copy
    M, Ks = 4, 20
    N, D = 1000, 40
    X = np.random.random((N, D)).astype(np.float32)
    e1 = rii.Rii(
        fine_quantizer=nanopq.PQ(M=M, Ks=Ks, verbose=True).fit(vecs=X))
    e2 = copy.deepcopy(e1)
    e3 = copy.deepcopy(e1)
    # Vectors can be added one by one
    for vec in X[:10]:
        e1.add_configure(vecs=vec.reshape(1, -1))
    self.assertEqual(e1.N, 10)
    # Result must equal add_configure() with the whole batch at once
    e2.add_configure(vecs=X[:10])
    self.assertTrue(np.allclose(e1.codes, e2.codes))
    self.assertEqual(e1.posting_lists, e2.posting_lists)
    # ... and several add() calls followed by one reconfigure()
    for vec in X[:10]:
        e3.add(vec.reshape(1, -1))
    e3.reconfigure()
    self.assertTrue(np.allclose(e1.codes, e3.codes))
    self.assertEqual(e1.posting_lists, e3.posting_lists)
Nt = 10000000 # Use top 10M vectors for training with path_train.open("rb") as f: for vec in texmex_python.reader.read_bvec_iter(f): Xt.append(vec) if len(Xt) == Nt: break Xt = np.array(Xt, dtype=np.float32) print("Xt.shape: {}, Xt.dtype: {}".format(Xt.shape, Xt.dtype)) # Train a PQ codec and save it M = 8 # The number of subspace. path_codec = p / 'cache/codec_m{}.pkl'.format(M) if not path_codec.exists(): print("Start to train a codec") codec = nanopq.PQ(M=M, verbose=True).fit(vecs=Xt) pickle.dump(codec, path_codec.open("wb")) print("Dump the codec in {}".format(path_codec)) else: print("Read a codec from cache: {}".format(path_codec)) codec = pickle.load(path_codec.open("rb")) # Construct a search engine path_engine = p / 'cache/engine_m{}.pkl'.format(M) if not path_engine.exists(): print("Start to construct a Rii engine") e = rii.Rii(fine_quantizer=codec) batch_size = 10000000 with path_base.open("rb") as f: for n, batch in enumerate(more_itertools.chunked(texmex_python.reader.read_bvec_iter(f), batch_size)):
# Build a dHash feature matrix from the image files, then search it with PQ.
datas = []
for file in files:
    # Read grayscale, resize, and compute a difference hash
    img_1 = cv2.imread(path + "/" + file, 0)
    img1 = cv2.resize(img_1, (65, 64), interpolation=cv2.INTER_LINEAR)
    dhash = dHash(img1, 64)
    data = list(map(int, dhash))
    datas.append(data)
datas = np.asarray(datas, dtype=np.float32)
N = len(datas)
# NOTE(review): images are resized to (65, 64) but D assumes 64*64 —
# D is unused below, but confirm it matches the dHash length if used later.
D = 64 * 64
query = datas[0]  # np.random.random((D,)).astype(np.float32)  # a 128-dim vector
# Instantiate with M=8 sub-spaces
pq = nanopq.PQ(M=8, Ks=48)
# Train with the top 1000 vectors
pq.fit(datas)
# Encode to PQ-codes
X_code = pq.encode(datas)  # (10000, 8) with dtype=np.uint8
time1 = datetime.datetime.now()
# Results: create a distance table online, and compute Asymmetric Distance
# to each PQ-code
dists = pq.dtable(query).adist(X_code)
nsmallestList = heapq.nsmallest(54, dists)
print(nsmallestList)
# Convert to a Python list ONCE; calling dists.tolist() inside the
# comprehension rebuilt the full list for every lookup (quadratic work).
dists_list = dists.tolist()
indexs = [dists_list.index(i) for i in nsmallestList]
print(indexs)
def train(self, vecs):
    """Train a PQ codec on `vecs` and build the Rii index around it."""
    quantizer = nanopq.PQ(M=self.M, verbose=False)
    codec = quantizer.fit(vecs=vecs)
    self.index = rii.Rii(fine_quantizer=codec)