Beispiel #1
0
    def eval_quant_loss(self, by_residual, metric=faiss.METRIC_L2):
        ds  = datasets.SyntheticDataset(32, 2000, 5000, 1000)

        index = faiss.index_factory(32, "IVF32,PQ16x4np", metric)
        index.train(ds.get_train())
        index.add(ds.get_database())
        index.nprobe = 4
        index.by_residual = by_residual
        Da, Ia = index.search(ds.get_queries(), 10)

        # loss due to int8 quantization of LUTs
        index2 = faiss.IndexIVFPQFastScan(index)
        index2.implem = 2
        Db, Ib = index2.search(ds.get_queries(), 10)

        m3 = three_metrics(Da, Ia, Db, Ib)


        # print(by_residual, metric, recall_at_1, recall_at_10, intersection_at_10)
        ref_results = {
            (True, 1): [0.985, 1.0, 9.872],
            (True, 0): [ 0.987, 1.0, 9.914],
            (False, 1): [0.991, 1.0, 9.907],
            (False, 0): [0.986, 1.0, 9.917],
        }

        ref = ref_results[(by_residual, metric)]

        self.assertGreaterEqual(m3[0], ref[0] * 0.995)
        self.assertGreaterEqual(m3[1], ref[1] * 0.995)
        self.assertGreaterEqual(m3[2], ref[2] * 0.995)
Beispiel #2
0
    def test_equiv_pq(self):
        ds  = datasets.SyntheticDataset(32, 2000, 200, 4)

        index = faiss.index_factory(32, "IVF1,PQ16x4np")
        index.by_residual = False
        # force coarse quantizer
        index.quantizer.add(np.zeros((1, 32), dtype='float32'))
        index.train(ds.get_train())
        index.add(ds.get_database())
        Dref, Iref = index.search(ds.get_queries(), 4)

        index_pq = faiss.index_factory(32, "PQ16x4np")
        index_pq.pq = index.pq
        index_pq.is_trained = True
        index_pq.codes = faiss. downcast_InvertedLists(
            index.invlists).codes.at(0)
        index_pq.ntotal = index.ntotal
        Dnew, Inew = index_pq.search(ds.get_queries(), 4)

        np.testing.assert_array_equal(Iref, Inew)
        np.testing.assert_array_equal(Dref, Dnew)

        index_pq2 = faiss.IndexPQFastScan(index_pq)
        index_pq2.implem = 12
        Dref, Iref = index_pq2.search(ds.get_queries(), 4)

        index2 = faiss.IndexIVFPQFastScan(index)
        index2.implem = 12
        Dnew, Inew = index2.search(ds.get_queries(), 4)
        np.testing.assert_array_equal(Iref, Inew)
        np.testing.assert_array_equal(Dref, Dnew)
    def test_clipping(self):
        """ verify that a clipped residual quantizer gives the same
        code prefix + suffix as the full RQ """
        ds = datasets.SyntheticDataset(32, 1000, 100, 0)

        rq = faiss.ResidualQuantizer(ds.d, 5, 4)
        rq.train_type = faiss.ResidualQuantizer.Train_default
        rq.max_beam_size = 5
        rq.train(ds.get_train())

        rq.max_beam_size = 1  # is not he same for a large beam size
        codes = rq.compute_codes(ds.get_database())

        rq2 = faiss.ResidualQuantizer(ds.d, 2, 4)
        rq2.initialize_from(rq)
        self.assertEqual(rq2.M, 2)
        # verify that the beginning of the codes are the same
        codes2 = rq2.compute_codes(ds.get_database())

        rq3 = faiss.ResidualQuantizer(ds.d, 3, 4)
        rq3.initialize_from(rq, 2)
        self.assertEqual(rq3.M, 3)
        codes3 = rq3.compute_codes(ds.get_database() - rq2.decode(codes2))

        # verify that prefixes are the same
        for i in range(ds.nb):
            print(i, ds.nb)
            br = faiss.BitstringReader(faiss.swig_ptr(codes[i]), rq.code_size)
            br2 = faiss.BitstringReader(faiss.swig_ptr(codes2[i]),
                                        rq2.code_size)
            self.assertEqual(br.read(rq2.tot_bits), br2.read(rq2.tot_bits))
            br3 = faiss.BitstringReader(faiss.swig_ptr(codes3[i]),
                                        rq3.code_size)
            self.assertEqual(br.read(rq3.tot_bits), br3.read(rq3.tot_bits))
Beispiel #4
0
    def test_sparse_routines(self):
        """ the sparse assignment routine """
        ds = datasets.SyntheticDataset(1000, 2000, 0, 200)
        xt = ds.get_train().copy()
        faiss.normalize_L2(xt)

        mask = np.abs(xt) > 0.045
        # print("fraction:", mask.sum() / mask.size) # around 10% non-zeros
        xt[np.logical_not(mask)] = 0

        centroids = ds.get_queries()
        assert len(centroids) == 200

        xsparse = scipy.sparse.csr_matrix(xt)

        Dref, Iref = faiss.knn(xsparse.todense(), centroids, 1)
        D, I = clustering.sparse_assign_to_dense(xsparse, centroids)

        np.testing.assert_array_equal(Iref.ravel(), I)
        np.testing.assert_array_almost_equal(Dref.ravel(), D, decimal=4)

        D, I = clustering.sparse_assign_to_dense_blocks(xsparse,
                                                        centroids,
                                                        qbs=123,
                                                        bbs=33,
                                                        nt=4)

        np.testing.assert_array_equal(Iref.ravel(), I)
        np.testing.assert_array_almost_equal(Dref.ravel(), D, decimal=4)
Beispiel #5
0
    def subtest_from_idxaq(self, implem, metric):
        if metric == 'L2':
            metric_type = faiss.METRIC_L2
            st = '_Nrq2x4'
        else:
            metric_type = faiss.METRIC_INNER_PRODUCT
            st = ''

        d = 16
        ds = datasets.SyntheticDataset(d, 1000, 2000, 1000, metric=metric)
        gt = ds.get_groundtruth(k=1)
        index = faiss.index_factory(d, 'RQ8x4' + st, metric_type)
        index.train(ds.get_train())
        index.add(ds.get_database())
        index.nprobe = 16
        Dref, Iref = index.search(ds.get_queries(), 1)

        indexfs = faiss.IndexAdditiveQuantizerFastScan(index)
        indexfs.implem = implem
        D1, I1 = indexfs.search(ds.get_queries(), 1)

        nq = Iref.shape[0]
        recall_ref = (Iref == gt).sum() / nq
        recall1 = (I1 == gt).sum() / nq
        print(recall_ref, recall1)
        assert abs(recall_ref - recall1) < 0.05
Beispiel #6
0
    def do_test(self, by_residual, metric=faiss.METRIC_L2, d=32):
        ds = datasets.SyntheticDataset(d, 2000, 5000, 200)

        index = faiss.index_factory(d, f"IVF32,PQ{d//2}x4np", metric)
        # force coarse quantizer
        # index.quantizer.add(np.zeros((1, 32), dtype='float32'))
        index.by_residual = by_residual
        index.train(ds.get_train())
        index.add(ds.get_database())
        index.nprobe = 4

        index2 = faiss.IndexIVFPQFastScan(index)
        index2.implem = 2
        Dref, Iref = index2.search(ds.get_queries(), 4)
        index2 = faiss.IndexIVFPQFastScan(index)
        index2.implem = self.IMPLEM
        Dnew, Inew = index2.search(ds.get_queries(), 4)

        verify_with_draws(self, Dref, Iref, Dnew, Inew)

        stats = faiss.cvar.indexIVF_stats
        stats.reset()

        # also verify with single result
        Dnew, Inew = index2.search(ds.get_queries(), 1)
        for q in range(len(Dref)):
            if Dref[q, 1] == Dref[q, 0]:
                # then we cannot conclude
                continue
            self.assertEqual(Iref[q, 0], Inew[q, 0])
            np.testing.assert_almost_equal(Dref[q, 0], Dnew[q, 0], decimal=5)

        self.assertGreater(stats.ndis, 0)
Beispiel #7
0
 def test_synthetic(self):
     ds = datasets.SyntheticDataset(32, 1000, 2000, 10)
     xq = ds.get_queries()
     self.assertEqual(xq.shape, (10, 32))
     xb = ds.get_database()
     self.assertEqual(xb.shape, (2000, 32))
     ds.check_sizes()
    def test_lut(self):
        """test compute_LUT function"""
        ds = datasets.SyntheticDataset(16, 1000, 0, 100)

        xt = ds.get_train()
        xq = ds.get_queries()

        nsplits = 2
        Msub = 2
        nbits = 4
        nq, d = xq.shape
        dsub = d // nsplits

        plsq = faiss.ProductLocalSearchQuantizer(ds.d, nsplits, Msub, nbits)
        plsq.train(xt)

        subcodebook_size = Msub * (1 << nbits)
        codebook_size = nsplits * subcodebook_size
        lut = np.zeros((nq, codebook_size), dtype=np.float32)
        plsq.compute_LUT(nq, sp(xq), sp(lut))

        codebooks = faiss.vector_to_array(plsq.codebooks)
        codebooks = codebooks.reshape(nsplits, subcodebook_size, dsub)
        xq = xq.reshape(nq, nsplits, dsub)
        lut_ref = np.zeros((nq, nsplits, subcodebook_size), dtype=np.float32)
        for i in range(nsplits):
            lut_ref[:, i] = xq[:, i] @ codebooks[i].T
        lut_ref = lut_ref.reshape(nq, codebook_size)

        # max rtoal in OSX: 2.87e-6
        np.testing.assert_allclose(lut, lut_ref, rtol=5e-06)
    def test_IndexLocalSearchQuantizer(self):
        ds = datasets.SyntheticDataset(32, 1000, 200, 100)
        gt = ds.get_groundtruth(10)

        ir = faiss.IndexLocalSearchQuantizer(ds.d, 4, 5)
        ir.train(ds.get_train())
        ir.add(ds.get_database())
        Dref, Iref = ir.search(ds.get_queries(), 10)
        inter_ref = faiss.eval_intersection(Iref, gt)

        # 467
        self.assertGreater(inter_ref, 460)

        AQ = faiss.AdditiveQuantizer
        ir2 = faiss.IndexLocalSearchQuantizer(ds.d, 4, 5, faiss.METRIC_L2,
                                              AQ.ST_norm_float)

        ir2.train(ds.get_train())  # just to set flags properly
        ir2.lsq.codebooks = ir.lsq.codebooks

        ir2.add(ds.get_database())
        D2, I2 = ir2.search(ds.get_queries(), 10)
        np.testing.assert_array_almost_equal(Dref, D2, decimal=5)
        self.assertLess((Iref != I2).sum(), Iref.size * 0.01)

        # test I/O
        ir3 = faiss.deserialize_index(faiss.serialize_index(ir))
        D3, I3 = ir3.search(ds.get_queries(), 10)
        np.testing.assert_array_equal(Iref, I3)
        np.testing.assert_array_equal(Dref, D3)
Beispiel #10
0
    def test_constructor(self):
        d = 32
        ds = datasets.SyntheticDataset(d, 2000, 5000, 200)

        index = faiss.index_factory(d, f'PQ{d//2}x4np')
        index.train(ds.get_train())
        index.add(ds.get_database())
        Dref, Iref = index.search(ds.get_queries(), 10)
        nq = Iref.shape[0]

        index2 = faiss.IndexPQFastScan(d, d // 2, 4)
        index2.train(ds.get_train())
        index2.add(ds.get_database())
        Dnew, Inew = index2.search(ds.get_queries(), 10)

        recall_at_1 = (Iref[:, 0] == Inew[:, 0]).sum() / nq

        self.assertGreater(recall_at_1, 0.99)

        data = faiss.serialize_index(index2)
        index3 = faiss.deserialize_index(data)

        self.assertEqual(index2.implem, index3.implem)

        D3, I3 = index3.search(ds.get_queries(), 10)
        np.testing.assert_array_equal(D3, Dnew)
        np.testing.assert_array_equal(I3, Inew)
Beispiel #11
0
    def do_test(self, metric_type):
        ds = datasets.SyntheticDataset(32, 0, 1000, 200)
        index = faiss.IndexFlat(ds.d, metric_type)
        index.add(ds.get_database())

        # find a reasonable radius
        D, _ = index.search(ds.get_queries(), 10)
        radius0 = float(np.median(D[:, -1]))

        # baseline = search with that radius
        lims_ref, Dref, Iref = index.range_search(ds.get_queries(), radius0)

        # now see if using just the total number of results, we can get back the same
        # result table
        query_iterator = exponential_query_iterator(ds.get_queries())

        init_radius = 1e10 if metric_type == faiss.METRIC_L2 else -1e10
        radius1, lims_new, Dnew, Inew = range_search_max_results(
            index,
            query_iterator,
            init_radius,
            min_results=Dref.size,
            clip_to_min=True)

        evaluation.test_ref_range_results(lims_ref, Dref, Iref, lims_new, Dnew,
                                          Inew)
Beispiel #12
0
    def test_search_IP(self):
        ds = datasets.SyntheticDataset(32, 1000, 200, 100)

        xt = ds.get_train()
        xb = ds.get_database()
        xq = ds.get_queries()

        ir = faiss.IndexResidualQuantizer(
            ds.d, 3, 4, faiss.METRIC_INNER_PRODUCT)
        ir.rq.train_type = faiss.ResidualQuantizer.Train_default
        ir.train(xt)

        ir.add(xb)

        Dref, Iref = ir.search(xq, 4)

        AQ = faiss.AdditiveQuantizer
        ir2 = faiss.IndexResidualQuantizer(
            ds.d, 3, 4, faiss.METRIC_INNER_PRODUCT, AQ.ST_LUT_nonorm)
        ir2.rq.codebooks = ir.rq.codebooks    # fake training
        ir2.rq.is_trained = True
        ir2.is_trained = True
        ir2.add(xb)

        D2, I2 = ir2.search(xq, 4)

        np.testing.assert_array_equal(Iref, I2)
        np.testing.assert_array_almost_equal(Dref, D2, decimal=5)
Beispiel #13
0
    def subtest_accuracy(self, aq, st, implem, metric_type='L2'):
        """
        Compare IndexAdditiveQuantizerFastScan with IndexAQ (qint8)
        """
        d = 16
        # ds = datasets.SyntheticDataset(d, 1000, 2000, 1000, metric_type)
        ds = datasets.SyntheticDataset(d, 1000, 1000, 500, metric_type)
        gt = ds.get_groundtruth(k=1)

        if metric_type == 'L2':
            metric = faiss.METRIC_L2
            postfix1 = '_Nqint8'
            postfix2 = f'_N{st}2x4'
        else:
            metric = faiss.METRIC_INNER_PRODUCT
            postfix1 = postfix2 = ''

        index = faiss.index_factory(d, f'{aq}3x4{postfix1}', metric)
        index.train(ds.get_train())
        index.add(ds.get_database())
        Dref, Iref = index.search(ds.get_queries(), 1)

        indexfs = faiss.index_factory(d, f'{aq}3x4fs_32{postfix2}', metric)
        indexfs.train(ds.get_train())
        indexfs.add(ds.get_database())
        indexfs.implem = implem
        Da, Ia = indexfs.search(ds.get_queries(), 1)

        nq = Iref.shape[0]
        recall_ref = (Iref == gt).sum() / nq
        recall = (Ia == gt).sum() / nq

        print(aq, st, implem, metric_type, recall_ref, recall)
        assert abs(recall_ref - recall) < 0.05
Beispiel #14
0
    def test_ivfsq(self):
        ds = datasets.SyntheticDataset(32, 3000, 1000, 100)

        xt = ds.get_train()
        xb = ds.get_database()

        gt = ds.get_groundtruth(1)

        # RQ 2x5 = 10 bits = 1024 centroids
        index = faiss.index_factory(ds.d, "IVF1024(RCQ2x5),SQ8")
        quantizer = faiss.downcast_index(index.quantizer)
        rq = quantizer.rq
        rq.train_type = faiss.ResidualQuantizer.Train_default

        index.train(xt)
        index.add(xb)

        # make sure that increasing the nprobe increases accuracy

        index.nprobe = 10
        D, I = index.search(ds.get_queries(), 10)
        r10 = (I == gt[None, :]).sum() / ds.nq

        index.nprobe = 40
        D, I = index.search(ds.get_queries(), 10)
        r40 = (I == gt[None, :]).sum() / ds.nq

        self.assertGreater(r40, r10)
Beispiel #15
0
    def test_PQ4_speed(self):
        ds = datasets.SyntheticDataset(32, 2000, 5000, 1000)
        xt = ds.get_train()
        xb = ds.get_database()
        xq = ds.get_queries()

        index = faiss.index_factory(32, 'PQ16x4')
        index.train(xt)
        index.add(xb)

        t0 = time.time()
        D1, I1 = index.search(xq, 10)
        t1 = time.time()
        pq_t = t1 - t0
        print('PQ16x4 search time:', pq_t)

        index2 = faiss.index_factory(32, 'PQ16x4fs')
        index2.train(xt)
        index2.add(xb)

        t0 = time.time()
        D2, I2 = index2.search(xq, 10)
        t1 = time.time()
        pqfs_t = t1 - t0
        print('PQ16x4fs search time:', pqfs_t)
        self.assertLess(pqfs_t * 5, pq_t)
Beispiel #16
0
    def test_training(self):
        """check that the error is in the same ballpark as PQ """
        ds = datasets.SyntheticDataset(32, 3000, 1000, 0)

        xt = ds.get_train()
        xb = ds.get_database()

        rq = faiss.ResidualQuantizer(ds.d, 4, 6)
        rq.verbose
        rq.verbose = True
        #
        rq.train_type = faiss.ResidualQuantizer.Train_default
        rq.cp.verbose
        # rq.cp.verbose = True
        rq.train(xt)
        err_rq = eval_codec(rq, xb)

        pq = faiss.ProductQuantizer(ds.d, 4, 6)
        pq.train(xt)
        err_pq = eval_codec(pq, xb)

        # in practice RQ is often better than PQ but it does not the case here, so just check
        # that we are within some factor.
        print(err_pq, err_rq)
        self.assertLess(err_rq, err_pq * 1.2)
Beispiel #17
0
    def test_query_iterator(self, metric=faiss.METRIC_L2):
        ds = datasets.SyntheticDataset(32, 0, 1000, 1000)
        xq = ds.get_queries()
        xb = ds.get_database()
        D, I = faiss.knn(xq, xb, 10, metric=metric)
        threshold = float(D[:, -1].mean())
        print(threshold)

        index = faiss.IndexFlat(32, metric)
        index.add(xb)
        ref_lims, ref_D, ref_I = index.range_search(xq, threshold)

        def matrix_iterator(xb, bs):
            for i0 in range(0, xb.shape[0], bs):
                yield xb[i0:i0 + bs]

        # check repro OK
        _, new_lims, new_D, new_I = range_search_max_results(
            index, matrix_iterator(xq, 100), threshold)

        evaluation.test_ref_range_results(ref_lims, ref_D, ref_I, new_lims,
                                          new_D, new_I)

        max_res = ref_lims[-1] // 2

        new_threshold, new_lims, new_D, new_I = range_search_max_results(
            index, matrix_iterator(xq, 100), threshold, max_results=max_res)

        self.assertLessEqual(new_lims[-1], max_res)

        ref_lims, ref_D, ref_I = index.range_search(xq, new_threshold)

        evaluation.test_ref_range_results(ref_lims, ref_D, ref_I, new_lims,
                                          new_D, new_I)
Beispiel #18
0
    def do_test_rounding(self, implem=4, metric=faiss.METRIC_L2):
        ds = datasets.SyntheticDataset(32, 2000, 5000, 200)

        index = faiss.index_factory(32, 'PQ16x4', metric)
        index.train(ds.get_train())
        index.add(ds.get_database())
        Dref, Iref = index.search(ds.get_queries(), 10)
        nq = Iref.shape[0]

        index2 = faiss.IndexPQFastScan(index)

        # simply repro normal search
        index2.implem = 2
        D2, I2 = index2.search(ds.get_queries(), 10)
        np.testing.assert_array_equal(I2, Iref)
        np.testing.assert_array_equal(D2, Dref)

        # rounded LUT with correction
        index2.implem = implem
        D4, I4 = index2.search(ds.get_queries(), 10)
        # check accuracy of indexes
        recalls = {}
        for rank in 1, 10:
            recalls[rank] = (Iref[:, :1] == I4[:, :rank]).sum() / nq

        min_r1 = 0.98 if metric == faiss.METRIC_INNER_PRODUCT else 0.99
        self.assertGreater(recalls[1], min_r1)
        self.assertGreater(recalls[10], 0.995)
        # check accuracy of distances
        # err3 = ((D3 - D2) ** 2).sum()
        err4 = ((D4 - D2) ** 2).sum()
        nf = (D2 ** 2).sum()
        self.assertLess(err4, nf * 1e-4)
    def eval_index_accuracy(self, factory_key):
        # just do a single test, most search functions are already stress
        # tested in test_residual_quantizer.py
        ds = datasets.SyntheticDataset(32, 3000, 1000, 100)
        index = faiss.index_factory(ds.d, factory_key)

        index.train(ds.get_train())
        index.add(ds.get_database())

        inters = []
        for nprobe in 1, 2, 5, 10, 20, 50:
            index.nprobe = nprobe
            D, I = index.search(ds.get_queries(), 10)
            inter = faiss.eval_intersection(I, ds.get_groundtruth(10))
            # print("nprobe=", nprobe, "inter=", inter)
            inters.append(inter)

        inters = np.array(inters)
        # in fact the results should be the same for the decoding and the
        # reconstructing versions
        self.assertTrue(np.all(inters[1:] >= inters[:-1]))

        # do a little I/O test
        index2 = faiss.deserialize_index(faiss.serialize_index(index))
        D2, I2 = index2.search(ds.get_queries(), 10)
        np.testing.assert_array_equal(I2, I)
        np.testing.assert_array_equal(D2, D)
Beispiel #20
0
    def test_precomp(self):
        ds = datasets.SyntheticDataset(32, 1000, 1000, 0)

        # make sure it work with varying nb of bits
        nbits = faiss.UInt64Vector()
        nbits.push_back(5)
        nbits.push_back(6)
        nbits.push_back(7)

        rq = faiss.ResidualQuantizer(ds.d, nbits)
        rq.train_type = faiss.ResidualQuantizer.Train_default
        rq.train(ds.get_train())

        codebooks = get_additive_quantizer_codebooks(rq)
        precomp = precomp_codebooks(codebooks)
        codebook_cross_prods_ref, cent_norms_ref = precomp

        # check C++ precomp tables
        codebook_cross_prods_ref = np.hstack([
            np.vstack(c) for c in codebook_cross_prods_ref])

        rq.compute_codebook_tables()
        codebook_cross_prods = faiss.vector_to_array(
            rq.codebook_cross_products)
        codebook_cross_prods = codebook_cross_prods.reshape(
            rq.total_codebook_size, rq.total_codebook_size)
        cent_norms = faiss.vector_to_array(rq.cent_norms)

        np.testing.assert_array_almost_equal(
            codebook_cross_prods, codebook_cross_prods_ref, decimal=5)
        np.testing.assert_array_almost_equal(
            np.hstack(cent_norms_ref), cent_norms, decimal=5)

        # validate that the python tab-based encoding works
        xb = ds.get_database()
        ref_codes, _, _ = beam_search_encoding_ref(codebooks, xb, 7)
        new_codes, _ = beam_search_encoding_tab(codebooks, xb, 7, precomp)
        np.testing.assert_array_equal(ref_codes, new_codes)

        # validate the C++ beam_search_encode_step_tab function
        beam_search_encoding_tab(codebooks, xb, 7, precomp, implem="ref cpp")

        # check implem w/ residuals
        n = ref_codes.shape[0]
        sp = faiss.swig_ptr
        ref_codes_packed = np.zeros((n, rq.code_size), dtype='uint8')
        ref_codes_int32 = ref_codes.astype('int32')
        rq.pack_codes(
            n, sp(ref_codes_int32),
            sp(ref_codes_packed), rq.M * ref_codes.shape[1]
        )

        rq.max_beam_size = 7
        codes_ref_residuals = rq.compute_codes(xb)
        np.testing.assert_array_equal(ref_codes_packed, codes_ref_residuals)

        rq.use_beam_LUT = 1
        codes_new = rq.compute_codes(xb)
        np.testing.assert_array_equal(codes_ref_residuals, codes_new)
Beispiel #21
0
 def test_synthetic_iterator(self):
     ds = datasets.SyntheticDataset(32, 1000, 2000, 10)
     xb = ds.get_database()
     xb2 = []
     for xbi in ds.database_iterator():
         xb2.append(xbi)
     xb2 = np.vstack(xb2)
     np.testing.assert_array_equal(xb, xb2)
Beispiel #22
0
 def test_synthetic_ip(self):
     ds = datasets.SyntheticDataset(32, 1000, 2000, 10, "IP")
     index = faiss.IndexFlatIP(32)
     index.add(ds.get_database())
     np.testing.assert_array_equal(
         ds.get_groundtruth(100),
         index.search(ds.get_queries(), 100)[1]
     )
Beispiel #23
0
    def do_test_accuracy(self, by_residual, st):
        ds = datasets.SyntheticDataset(32, 3000, 1000, 100)

        quantizer = faiss.IndexFlatL2(ds.d)

        index = faiss.IndexIVFResidualQuantizer(
            quantizer, ds.d, 100, 3, 4,
            faiss.METRIC_L2, st
        )
        index.by_residual = by_residual

        index.rq.train_type
        index.rq.train_type = faiss.ResidualQuantizer.Train_default
        index.rq.max_beam_size = 30

        index.train(ds.get_train())
        index.add(ds.get_database())

        inters = []
        for nprobe in 1, 2, 5, 10, 20, 50:
            index.nprobe = nprobe
            D, I = index.search(ds.get_queries(), 10)
            inter = faiss.eval_intersection(I, ds.get_groundtruth(10))
            # print(st, "nprobe=", nprobe, "inter=", inter)
            inters.append(inter)

        # do a little I/O test
        index2 = faiss.deserialize_index(faiss.serialize_index(index))
        D2, I2 = index2.search(ds.get_queries(), 10)
        np.testing.assert_array_equal(I2, I)
        np.testing.assert_array_equal(D2, D)

        inters = np.array(inters)

        if by_residual:
            # check that we have increasing intersection measures with
            # nprobe
            self.assertTrue(np.all(inters[1:] >= inters[:-1]))
        else:
            self.assertTrue(np.all(inters[1:3] >= inters[:2]))
            # check that we have the same result as the flat residual quantizer
            iflat = faiss.IndexResidualQuantizer(
                ds.d, 3, 4, faiss.METRIC_L2, st)
            iflat.rq.train_type
            iflat.rq.train_type = faiss.ResidualQuantizer.Train_default
            iflat.rq.max_beam_size = 30
            iflat.train(ds.get_train())
            iflat.rq.codebooks = index.rq.codebooks

            iflat.add(ds.get_database())
            Dref, Iref = iflat.search(ds.get_queries(), 10)

            index.nprobe = 100
            D2, I2 = index.search(ds.get_queries(), 10)
            np.testing.assert_array_almost_equal(Dref, D2, decimal=5)
            # there are many ties because the codes are so short
            self.assertLess((Iref != I2).sum(), Iref.size * 0.2)
Beispiel #24
0
    def test_binary(self):
        ds = datasets.SyntheticDataset(128, 2000, 2000, 200)

        d = ds.d
        xt = ds.get_train()
        xq = ds.get_queries()
        xb = ds.get_database()

        # define alternative quantizer on the 20 first dims of vectors (will be in float)
        km = faiss.Kmeans(20, 50)
        km.train(xt[:, :20].copy())
        alt_quantizer = km.index

        binarizer = faiss.index_factory(d, "ITQ,LSHt")
        binarizer.train(xt)

        xb_bin = binarizer.sa_encode(xb)
        xq_bin = binarizer.sa_encode(xq)

        index = faiss.index_binary_factory(d, "BIVF200")

        fake_centroids = np.zeros((index.nlist, index.d // 8), dtype="uint8")
        index.quantizer.add(fake_centroids)
        index.is_trained = True

        # add elements xb
        a = alt_quantizer.search(xb[:, :20].copy(), 1)[1].ravel()
        ivf_tools.add_preassigned(index, xb_bin, a)

        # search elements xq, increase nprobe, check 4 first results w/ groundtruth
        prev_inter_perf = 0
        for nprobe in 1, 10, 20:

            index.nprobe = nprobe
            a = alt_quantizer.search(xq[:, :20].copy(), index.nprobe)[1]
            D, I = ivf_tools.search_preassigned(index, xq_bin, 4, a)
            inter_perf = (I == ds.get_groundtruth()[:, :4]).sum() / I.size
            self.assertTrue(inter_perf >= prev_inter_perf)
            prev_inter_perf = inter_perf

        # test range search

        index.nprobe = 20

        a = alt_quantizer.search(xq[:, :20].copy(), index.nprobe)[1]

        # just to find a reasonable radius
        D, I = ivf_tools.search_preassigned(index, xq_bin, 4, a)
        radius = int(D.max() + 1)

        lims, DR, IR = ivf_tools.range_search_preassigned(
            index, xq_bin, radius, a)

        # with that radius the k-NN results are a subset of the range search results
        for q in range(len(xq)):
            l0, l1 = lims[q], lims[q + 1]
            self.assertTrue(set(I[q]) <= set(IR[l0:l1]))
Beispiel #25
0
    def test_float(self):
        ds = datasets.SyntheticDataset(128, 2000, 2000, 200)

        d = ds.d
        xt = ds.get_train()
        xq = ds.get_queries()
        xb = ds.get_database()

        # define alternative quantizer on the 20 first dims of vectors
        km = faiss.Kmeans(20, 50)
        km.train(xt[:, :20].copy())
        alt_quantizer = km.index

        index = faiss.index_factory(d, "IVF50,PQ16np")
        index.by_residual = False

        # (optional) fake coarse quantizer
        fake_centroids = np.zeros((index.nlist, index.d), dtype="float32")
        index.quantizer.add(fake_centroids)

        # train the PQ part
        index.train(xt)

        # add elements xb
        a = alt_quantizer.search(xb[:, :20].copy(), 1)[1].ravel()
        ivf_tools.add_preassigned(index, xb, a)

        # search elements xq, increase nprobe, check 4 first results w/ groundtruth
        prev_inter_perf = 0
        for nprobe in 1, 10, 20:

            index.nprobe = nprobe
            a = alt_quantizer.search(xq[:, :20].copy(), index.nprobe)[1]
            D, I = ivf_tools.search_preassigned(index, xq, 4, a)
            inter_perf = faiss.eval_intersection(I,
                                                 ds.get_groundtruth()[:, :4])
            self.assertTrue(inter_perf >= prev_inter_perf)
            prev_inter_perf = inter_perf

        # test range search

        index.nprobe = 20

        a = alt_quantizer.search(xq[:, :20].copy(), index.nprobe)[1]

        # just to find a reasonable radius
        D, I = ivf_tools.search_preassigned(index, xq, 4, a)
        radius = D.max() * 1.01

        lims, DR, IR = ivf_tools.range_search_preassigned(index, xq, radius, a)

        # with that radius the k-NN results are a subset of the range search results
        for q in range(len(xq)):
            l0, l1 = lims[q], lims[q + 1]
            self.assertTrue(set(I[q]) <= set(IR[l0:l1]))
Beispiel #26
0
    def test_2level(self):
        " verify that 2-level clustering is not too sub-optimal "
        ds = datasets.SyntheticDataset(32, 10000, 0, 0)
        xt = ds.get_train()
        km_ref = faiss.Kmeans(ds.d, 100)
        km_ref.train(xt)
        err = faiss.knn(xt, km_ref.centroids, 1)[0].sum()

        centroids2, _ = clustering.two_level_clustering(xt, 10, 10)
        err2 = faiss.knn(xt, centroids2, 1)[0].sum()

        self.assertLess(err2, err * 1.1)
Beispiel #27
0
    def test_equiv_rq(self):
        """
        make sure it is equivalent to search a RQ and to search an IVF
        with RCQ + RQ with the same codebooks.
        """
        ds = datasets.SyntheticDataset(32, 3000, 1000, 50)

        # make a flat RQ
        iflat = faiss.IndexResidualQuantizer(ds.d, 5, 4)
        iflat.rq.train_type = faiss.ResidualQuantizer.Train_default
        iflat.train(ds.get_train())
        iflat.add(ds.get_database())

        # ref search result
        Dref, Iref = iflat.search(ds.get_queries(), 10)

        # get its codebooks + encoded version of the dataset
        codebooks = get_additive_quantizer_codebooks(iflat.rq)
        codes = faiss.vector_to_array(iflat.codes).reshape(-1, iflat.code_size)

        # make an IVF with 2x4 + 3x4 = 5x4 bits
        ivf = faiss.index_factory(ds.d, "IVF256(RCQ2x4),RQ3x4")

        # initialize the codebooks
        rcq = faiss.downcast_index(ivf.quantizer)
        faiss.copy_array_to_vector(
            np.vstack(codebooks[:rcq.rq.M]).ravel(),
            rcq.rq.codebooks
        )
        rcq.rq.is_trained = True
        # translation of AdditiveCoarseQuantizer::train
        rcq.ntotal = 1 << rcq.rq.tot_bits
        rcq.centroid_norms.resize(rcq.ntotal)
        rcq.rq.compute_centroid_norms(rcq.centroid_norms.data())
        rcq.is_trained = True

        faiss.copy_array_to_vector(
            np.vstack(codebooks[rcq.rq.M:]).ravel(),
            ivf.rq.codebooks
        )
        ivf.rq.is_trained = True
        ivf.is_trained = True

        # add the codes (this works because 2x4 is a multiple of 8 bits)
        ivf.add_sa_codes(codes)

        # perform exhaustive search
        ivf.nprobe = ivf.nlist

        Dnew, Inew = ivf.search(ds.get_queries(), 10)

        np.testing.assert_array_equal(Iref, Inew)
        np.testing.assert_array_almost_equal(Dref, Dnew, decimal=5)
Beispiel #28
0
    def test_aqfastscan(self):
        ds = datasets.SyntheticDataset(20, 1000, 1000, 0)

        index = faiss.index_factory(20, 'RQ5x4_Nrq2x4')
        index.train(ds.get_train())
        index.add(ds.get_database())
        recons = index.reconstruct_n(0, index.ntotal)

        index2 = faiss.IndexAdditiveQuantizerFastScan(index)
        recons2 = index2.reconstruct_n(0, index.ntotal)

        np.testing.assert_array_equal(recons, recons2)
Beispiel #29
0
    def test_RCQ_knn(self):
        ds = datasets.SyntheticDataset(32, 1000, 0, 123)
        xt = ds.get_train()
        xq = ds.get_queries()

        # RQ 3+4+5 = 12 bits = 4096 centroids
        rcq = faiss.index_factory(ds.d, "RCQ1x3_1x4_1x5")
        rcq.train(xt)

        aq = rcq.rq

        cents = rcq.reconstruct_n(0, rcq.ntotal)

        sp = faiss.swig_ptr

        # test norms computation

        norms_ref = (cents ** 2).sum(1)
        norms = np.zeros(1 << aq.tot_bits, dtype="float32")
        aq.compute_centroid_norms(sp(norms))

        np.testing.assert_array_almost_equal(norms, norms_ref, decimal=5)

        # test IP search

        Dref, Iref = faiss.knn(
            xq, cents, 10,
            metric=faiss.METRIC_INNER_PRODUCT
        )

        Dnew = np.zeros_like(Dref)
        Inew = np.zeros_like(Iref)

        aq.knn_centroids_inner_product(len(xq), sp(xq), 10, sp(Dnew), sp(Inew))

        np.testing.assert_array_almost_equal(Dref, Dnew, decimal=5)
        np.testing.assert_array_equal(Iref, Inew)

        # test L2 search

        Dref, Iref = faiss.knn(xq, cents, 10, metric=faiss.METRIC_L2)

        Dnew = np.zeros_like(Dref)
        Inew = np.zeros_like(Iref)

        aq.knn_centroids_L2(len(xq), sp(xq), 10, sp(Dnew), sp(Inew), sp(norms))

        np.testing.assert_array_equal(Iref, Inew)
        np.testing.assert_array_almost_equal(Dref, Dnew, decimal=5)
Beispiel #30
0
    def test_PQ4_accuracy(self):
        ds  = datasets.SyntheticDataset(32, 2000, 5000, 1000)

        index_gt = faiss.IndexFlatL2(32)
        index_gt.add(ds.get_database())
        Dref, Iref = index_gt.search(ds.get_queries(), 10)

        index = faiss.index_factory(32, 'PQ16x4')
        index.train(ds.get_train())
        index.add(ds.get_database())
        Da, Ia = index.search(ds.get_queries(), 10)

        nq = Iref.shape[0]
        recall_at_1 = (Iref[:, 0] == Ia[:, 0]).sum() / nq
        assert recall_at_1 > 0.6