def test_search_IP(self):
    """An ST_LUT_nonorm index that shares the codebooks of a trained
    inner-product IndexResidualQuantizer must return exactly the same
    search results as that trained index."""
    ds = datasets.SyntheticDataset(32, 1000, 200, 100)
    xt, xb, xq = ds.get_train(), ds.get_database(), ds.get_queries()

    # reference index: properly trained, searched by decoding
    index_ref = faiss.IndexResidualQuantizer(
        ds.d, 3, 4, faiss.METRIC_INNER_PRODUCT)
    index_ref.rq.train_type = faiss.ResidualQuantizer.Train_default
    index_ref.train(xt)
    index_ref.add(xb)
    Dref, Iref = index_ref.search(xq, 4)

    # second index: same codebooks, LUT-based search without norms
    AQ = faiss.AdditiveQuantizer
    index_lut = faiss.IndexResidualQuantizer(
        ds.d, 3, 4, faiss.METRIC_INNER_PRODUCT, AQ.ST_LUT_nonorm)
    index_lut.rq.codebooks = index_ref.rq.codebooks  # fake training
    index_lut.rq.is_trained = True
    index_lut.is_trained = True
    index_lut.add(xb)
    D2, I2 = index_lut.search(xq, 4)

    np.testing.assert_array_equal(Iref, I2)
    np.testing.assert_array_almost_equal(Dref, D2, decimal=5)
def do_test_accuracy(self, by_residual, st):
    """Accuracy checks for IndexIVFResidualQuantizer with search type *st*.

    Verifies that (1) the ground-truth intersection measure grows with
    nprobe (monotonically when encoding residuals), (2) an index survives
    a serialize/deserialize round trip with identical search results, and
    (3) an exhaustive IVF search (nprobe == nlist) matches a flat
    IndexResidualQuantizer that uses the same codebooks.
    """
    # Fix: removed two bare attribute-access statements
    # (`index.rq.train_type` / `iflat.rq.train_type`) that were no-ops.
    ds = datasets.SyntheticDataset(32, 3000, 1000, 100)

    quantizer = faiss.IndexFlatL2(ds.d)

    index = faiss.IndexIVFResidualQuantizer(
        quantizer, ds.d, 100, 3, 4,
        faiss.METRIC_L2, st
    )
    index.by_residual = by_residual

    index.rq.train_type = faiss.ResidualQuantizer.Train_default
    index.rq.max_beam_size = 30

    index.train(ds.get_train())
    index.add(ds.get_database())

    inters = []
    for nprobe in 1, 2, 5, 10, 20, 50:
        index.nprobe = nprobe
        D, I = index.search(ds.get_queries(), 10)
        inter = faiss.eval_intersection(I, ds.get_groundtruth(10))
        # print(st, "nprobe=", nprobe, "inter=", inter)
        inters.append(inter)

    # do a little I/O test
    index2 = faiss.deserialize_index(faiss.serialize_index(index))
    D2, I2 = index2.search(ds.get_queries(), 10)
    np.testing.assert_array_equal(I2, I)
    np.testing.assert_array_equal(D2, D)

    inters = np.array(inters)

    if by_residual:
        # check that we have increasing intersection measures with
        # nprobe
        self.assertTrue(np.all(inters[1:] >= inters[:-1]))
    else:
        # without residual encoding only the first few steps are reliable
        self.assertTrue(np.all(inters[1:3] >= inters[:2]))

    # check that we have the same result as the flat residual quantizer
    iflat = faiss.IndexResidualQuantizer(
        ds.d, 3, 4, faiss.METRIC_L2, st)
    iflat.rq.train_type = faiss.ResidualQuantizer.Train_default
    iflat.rq.max_beam_size = 30
    iflat.train(ds.get_train())
    # transplant the IVF's codebooks so both indexes encode identically
    iflat.rq.codebooks = index.rq.codebooks

    iflat.add(ds.get_database())
    Dref, Iref = iflat.search(ds.get_queries(), 10)

    # exhaustive search over all 100 lists == flat search
    index.nprobe = 100
    D2, I2 = index.search(ds.get_queries(), 10)
    np.testing.assert_array_almost_equal(Dref, D2, decimal=5)
    # there are many ties because the codes are so short
    self.assertLess((Iref != I2).sum(), Iref.size * 0.2)
def test_equiv_rq(self):
    """ make sure it is equivalent to search a RQ and to search an IVF
    with RCQ + RQ with the same codebooks. """
    ds = datasets.SyntheticDataset(32, 3000, 1000, 50)

    # make a flat RQ
    iflat = faiss.IndexResidualQuantizer(ds.d, 5, 4)
    iflat.rq.train_type = faiss.ResidualQuantizer.Train_default
    iflat.train(ds.get_train())
    iflat.add(ds.get_database())

    # ref search result
    Dref, Iref = iflat.search(ds.get_queries(), 10)

    # get its codebooks + encoded version of the dataset
    codebooks = get_additive_quantizer_codebooks(iflat.rq)
    codes = faiss.vector_to_array(iflat.codes).reshape(-1, iflat.code_size)

    # make an IVF with 2x4 + 3x4 = 5x4 bits
    # the coarse quantizer (RCQ) takes the first 2 codebooks,
    # the fine RQ the remaining 3
    ivf = faiss.index_factory(ds.d, "IVF256(RCQ2x4),RQ3x4")

    # initialize the codebooks
    rcq = faiss.downcast_index(ivf.quantizer)
    # copy the first rcq.rq.M codebooks into the coarse quantizer
    faiss.copy_array_to_vector(
        np.vstack(codebooks[:rcq.rq.M]).ravel(),
        rcq.rq.codebooks
    )
    rcq.rq.is_trained = True
    # translation of AdditiveCoarseQuantizer::train
    rcq.ntotal = 1 << rcq.rq.tot_bits
    rcq.centroid_norms.resize(rcq.ntotal)
    rcq.rq.compute_centroid_norms(rcq.centroid_norms.data())
    rcq.is_trained = True

    # remaining codebooks go to the IVF's fine residual quantizer
    faiss.copy_array_to_vector(
        np.vstack(codebooks[rcq.rq.M:]).ravel(),
        ivf.rq.codebooks
    )
    ivf.rq.is_trained = True
    ivf.is_trained = True

    # add the codes (this works because 2x4 is a multiple of 8 bits)
    ivf.add_sa_codes(codes)

    # perform exhaustive search
    ivf.nprobe = ivf.nlist
    Dnew, Inew = ivf.search(ds.get_queries(), 10)

    # exhaustive IVF search must reproduce the flat RQ results
    np.testing.assert_array_equal(Iref, Inew)
    np.testing.assert_array_almost_equal(Dref, Dnew, decimal=5)
def test_reestimate_codebook_2(self):
    """Training with Train_refine_codebook should give a measurably lower
    reconstruction error than plain training (train_type=0)."""
    ds = datasets.SyntheticDataset(32, 1000, 0, 0)
    xt = ds.get_train()

    # baseline: plain training, no codebook refinement
    index_plain = faiss.IndexResidualQuantizer(ds.d, 3, 4)
    index_plain.rq.train_type = 0
    index_plain.train(xt)
    recons = index_plain.sa_decode(index_plain.sa_encode(xt))
    err_before = ((xt - recons) ** 2).sum()

    # same geometry, but with codebook refinement enabled
    index_refined = faiss.IndexResidualQuantizer(ds.d, 3, 4)
    index_refined.rq.train_type = faiss.ResidualQuantizer.Train_refine_codebook
    index_refined.train(xt)
    recons = index_refined.sa_decode(index_refined.sa_encode(xt))
    err_after_refined = ((xt - recons) ** 2).sum()

    print(err_before, err_after_refined)
    # ref run 7474.98 / 7006.1777
    self.assertGreater(err_before, err_after_refined * 1.06)
def test_search_L2(self):
    """Compare the decoded-search reference against the stored-norm search
    types: ST_norm_float must reproduce the reference distances, while the
    quantized-norm variants may only lose accuracy."""
    ds = datasets.SyntheticDataset(32, 1000, 200, 100)
    xt, xb, xq = ds.get_train(), ds.get_database(), ds.get_queries()
    gt = ds.get_groundtruth(10)

    index_ref = faiss.IndexResidualQuantizer(ds.d, 3, 4)
    index_ref.rq.train_type = faiss.ResidualQuantizer.Train_default
    index_ref.rq.max_beam_size = 30
    index_ref.train(xt)

    # reference run w/ decoding
    index_ref.add(xb)
    Dref, Iref = index_ref.search(xq, 10)

    # 388
    inter_ref = faiss.eval_intersection(Iref, gt)

    AQ = faiss.AdditiveQuantizer
    norm_types = (
        AQ.ST_norm_float, AQ.ST_norm_qint8, AQ.ST_norm_qint4,
        AQ.ST_norm_cqint8, AQ.ST_norm_cqint4,
    )
    for st in norm_types:
        index2 = faiss.IndexResidualQuantizer(ds.d, 3, 4, faiss.METRIC_L2, st)
        index2.rq.max_beam_size = 30
        index2.train(xt)  # to get the norm bounds
        index2.rq.codebooks = index_ref.rq.codebooks  # fake training
        index2.add(xb)

        D2, I2 = index2.search(xq, 10)

        if st == AQ.ST_norm_float:
            # exact float norms: distances must match the reference
            np.testing.assert_array_almost_equal(Dref, D2, decimal=5)
            self.assertLess((Iref != I2).sum(), Iref.size * 0.05)
        else:
            # quantized norms: accuracy can only degrade
            inter_2 = faiss.eval_intersection(I2, gt)
            self.assertGreater(inter_ref, inter_2)
def test_io(self):
    """A serialize/deserialize round trip must not change the codes an
    IndexResidualQuantizer produces."""
    ds = datasets.SyntheticDataset(32, 1000, 100, 0)
    xt, xb = ds.get_train(), ds.get_database()

    index = faiss.IndexResidualQuantizer(ds.d, 3, 4)
    index.rq.train_type = faiss.ResidualQuantizer.Train_default
    index.train(xt)
    ref_codes = index.sa_encode(xb)

    # round-trip through the serialized representation
    restored = faiss.deserialize_index(faiss.serialize_index(index))
    restored_codes = restored.sa_encode(xb)

    np.testing.assert_array_equal(ref_codes, restored_codes)
def test_search_decompress(self):
    """Sanity-check the recall of a small 3x4 residual quantizer index."""
    ds = datasets.SyntheticDataset(32, 1000, 1000, 100)

    index = faiss.IndexResidualQuantizer(ds.d, 3, 4)
    index.rq.train_type = faiss.ResidualQuantizer.Train_default
    index.train(ds.get_train())
    index.add(ds.get_database())

    D, I = index.search(ds.get_queries(), 10)
    gt = ds.get_groundtruth()

    # recall@rank: fraction of queries whose true nearest neighbor
    # appears in the first `rank` results
    recalls = {}
    for rank in 1, 10, 100:
        recalls[rank] = (I[:, :rank] == gt[:, :1]).sum() / len(gt)

    # recalls are {1: 0.05, 10: 0.37, 100: 0.37}
    self.assertGreater(recalls[10], 0.35)
def test_reestimate_codebook(self):
    """retrain_AQ_codebook on the database vectors should lower the
    reconstruction error of an already-trained index."""
    ds = datasets.SyntheticDataset(32, 1000, 1000, 100)
    xt, xb = ds.get_train(), ds.get_database()

    index = faiss.IndexResidualQuantizer(ds.d, 3, 4)
    index.train(xt)
    # index.rq.verbose = True

    recons = index.sa_decode(index.sa_encode(xb))
    err_before = ((xb - recons) ** 2).sum()

    # test manual call of retrain_AQ_codebook
    ref_C, ref_codebook = retrain_AQ_codebook(index, xb)
    index.rq.retrain_AQ_codebook(len(xb), faiss.swig_ptr(xb))

    recons = index.sa_decode(index.sa_encode(xb))
    err_after = ((xb - recons) ** 2).sum()

    # ref run: 8347.857 vs. 7710.014
    self.assertGreater(err_before, err_after * 1.05)