Example #1
    def do_mmappedIO(self, sparse, in_pretransform=False):
        d = 10
        nb = 1000
        nq = 200
        nt = 200
        xt, xb, xq = get_dataset_2(d, nt, nb, nq)

        quantizer = faiss.IndexFlatL2(d)
        index1 = faiss.IndexIVFFlat(quantizer, d, 20)
        if sparse:
            # makes the inverted lists sparse because all elements get
            # assigned to the same invlist
            xt += (np.ones(10) * 1000).astype('float32')

        if in_pretransform:
            # make sure it still works when wrapped in an IndexPreTransform
            index1 = faiss.IndexPreTransform(index1)

        index1.train(xt)
        index1.add(xb)

        _, fname = tempfile.mkstemp()
        try:

            faiss.write_index(index1, fname)

            index2 = faiss.read_index(fname)
            self.compare_results(index1, index2, xq)

            index3 = faiss.read_index(fname, faiss.IO_FLAG_MMAP)
            self.compare_results(index1, index3, xq)
        finally:
            if os.path.exists(fname):
                os.unlink(fname)
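
All of these snippets are test methods from the faiss test suite, so they assume import faiss, import numpy as np, plus os, tempfile, shutil and unittest where used, and the shared helper get_dataset_2(d, nt, nb, nq) returning (xt, xb, xq) float32 train/base/query arrays. The upstream helper produces structured synthetic data; the sketch below is only a shape-compatible stand-in (an assumption, not the original implementation) that is enough to run most of the examples. Snippets that call knn_gpu or knn_ground_truth additionally rely on a GPU-enabled faiss build and the faiss.contrib helpers.

import numpy as np

def get_dataset_2(d, nt, nb, nq):
    """Shape-compatible stand-in for the faiss test helper: returns
    (train, base, query) float32 arrays of shapes (nt, d), (nb, d), (nq, d).
    The real helper generates structured synthetic data; random data is
    enough to make most of the snippets runnable."""
    rs = np.random.RandomState(1234)                 # deterministic data
    x = rs.rand(nt + nb + nq, d).astype('float32')
    return x[:nt], x[nt:nt + nb], x[nt + nb:]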
Example #2
    def test_serialize_to_vector(self):
        d = 10
        nb = 1000
        nq = 200
        nt = 500
        xt, xb, xq = get_dataset_2(d, nt, nb, nq)

        index = faiss.IndexFlatL2(d)
        index.add(xb)

        Dref, Iref = index.search(xq, 5)

        writer = faiss.VectorIOWriter()
        faiss.write_index(index, writer)

        ar_data = faiss.vector_to_array(writer.data)

        # direct transfer of vector
        reader = faiss.VectorIOReader()
        reader.data.swap(writer.data)

        index2 = faiss.read_index(reader)

        Dnew, Inew = index2.search(xq, 5)
        assert np.all(Dnew == Dref) and np.all(Inew == Iref)

        # from intermediate numpy array
        reader = faiss.VectorIOReader()
        faiss.copy_array_to_vector(ar_data, reader.data)

        index3 = faiss.read_index(reader)

        Dnew, Inew = index3.search(xq, 5)
        assert np.all(Dnew == Dref) and np.all(Inew == Iref)
Example #3
    def test_IMI_2(self):
        d = 32
        nb = 1000
        nt = 1500
        nq = 200

        (xt, xb, xq) = get_dataset_2(d, nt, nb, nq)
        d = xt.shape[1]

        gt_index = faiss.IndexFlatL2(d)
        gt_index.add(xb)
        D, gt_nns = gt_index.search(xq, 1)

        ############# redo including training
        nbits = 5
        ai0 = faiss.IndexFlatL2(int(d / 2))
        ai1 = faiss.IndexFlatL2(int(d / 2))

        coarse_quantizer = faiss.MultiIndexQuantizer2(d, nbits, ai0, ai1)
        index = faiss.IndexIVFPQ(coarse_quantizer, d, (1 << nbits) ** 2, 8, 8)
        index.quantizer_trains_alone = 1
        index.train(xt)
        index.add(xb)
        index.nprobe = 100
        D, nns = index.search(xq, 10)
        n_ok = (nns == gt_nns).sum()

        # should return the same result
        self.assertGreater(n_ok, 165)
Example #4
    def test_slice_vstack(self):
        d = 10
        nb = 1000
        nq = 100
        nt = 200

        xt, xb, xq = get_dataset_2(d, nb, nt, nq)

        quantizer = faiss.IndexFlatL2(d)
        index = faiss.IndexIVFFlat(quantizer, d, 30)

        index.train(xt)
        index.add(xb)
        Dref, Iref = index.search(xq, 10)

        # faiss.wait()

        il0 = index.invlists
        ils = []
        ilv = faiss.InvertedListsPtrVector()
        for sl in 0, 1, 2:
            il = faiss.SliceInvertedLists(il0, sl * 10, sl * 10 + 10)
            ils.append(il)
            ilv.push_back(il)

        il2 = faiss.VStackInvertedLists(ilv.size(), ilv.data())

        index2 = faiss.IndexIVFFlat(quantizer, d, 30)
        index2.replace_invlists(il2)
        index2.ntotal = index.ntotal

        D, I = index2.search(xq, 10)
        assert np.all(D == Dref)
        assert np.all(I == Iref)
Example #5
    def test_encoded(self):
        d = 32
        k = 5
        xt, xb, xq = get_dataset_2(d, 1000, 0, 0)

        # make sure that training on a compressed then decompressed
        # dataset gives the same result as decompressing on-the-fly

        codec = faiss.IndexScalarQuantizer(d, faiss.ScalarQuantizer.QT_4bit)
        codec.train(xt)
        codes = codec.sa_encode(xt)

        xt2 = codec.sa_decode(codes)

        clus = faiss.Clustering(d, k)
        # clus.verbose = True
        clus.niter = 0
        index = faiss.IndexFlatL2(d)
        clus.train(xt2, index)
        ref_centroids = faiss.vector_to_array(clus.centroids).reshape(-1, d)

        _, ref_errs = index.search(xt2, 1)

        clus = faiss.Clustering(d, k)
        # clus.verbose = True
        clus.niter = 0
        clus.decode_block_size = 120
        index = faiss.IndexFlatL2(d)
        clus.train_encoded(codes, codec, index)
        new_centroids = faiss.vector_to_array(clus.centroids).reshape(-1, d)

        _, new_errs = index.search(xt2, 1)

        # It's the same operation, so the results should be bit-exact
        self.assertTrue(np.all(ref_centroids == new_centroids))
Example #6
    def test_stats(self):
        d = 32
        k = 5
        xt, xb, xq = get_dataset_2(d, 1000, 0, 0)
        km = faiss.Kmeans(d, k, niter=4)
        km.train(xt)
        assert list(km.obj) == [st['obj'] for st in km.iteration_stats]
Example #7
    def test_IVFPQ_non8bit(self):
        d = 16
        xt, xb, xq = get_dataset_2(d, 10000, 2000, 200)
        nlist = 64

        gt_index = faiss.IndexFlat(d)
        gt_index.add(xb)
        gt_D, gt_I = gt_index.search(xq, 10)

        quantizer = faiss.IndexFlat(d)
        ninter = {}
        for v in '2x8', '8x2':
            if v == '8x2':
                index = faiss.IndexIVFPQ(quantizer, d, nlist, 2, 8)
            else:
                index = faiss.IndexIVFPQ(quantizer, d, nlist, 8, 2)
            index.train(xt)
            index.add(xb)
            index.nprobe = 16

            D, I = index.search(xq, 10)
            ninter[v] = faiss.eval_intersection(I, gt_I)
        print('ninter=', ninter)
        # this should be the case but we don't observe
        # that... Probably too few test points
        #  assert ninter['2x8'] > ninter['8x2']
        # ref numbers on 2019-11-02
        assert abs(ninter['2x8'] - 458) < 4
        assert abs(ninter['8x2'] - 465) < 4
Example #8
    def do_encode_twice(self, factory_key):
        d = 96
        nb = 1000
        nq = 0
        nt = 2000

        xt, x, _ = get_dataset_2(d, nt, nb, nq)

        assert x.size > 0

        codec = faiss.index_factory(d, factory_key)

        codec.train(xt)

        codes = codec.sa_encode(x)
        x2 = codec.sa_decode(codes)

        codes2 = codec.sa_encode(x2)

        if 'IVF' not in factory_key:
            self.assertTrue(np.all(codes == codes2))
        else:
            # some rows are not reconstructed exactly because they
            # flip into another quantization cell
            nrowdiff = (codes != codes2).any(axis=1).sum()
            self.assertTrue(nrowdiff < 10)

        x3 = codec.sa_decode(codes2)
        if 'IVF' not in factory_key:
            self.assertTrue(np.allclose(x2, x3))
        else:
            diffs = np.abs(x2 - x3).sum(axis=1)
            avg = np.abs(x2).sum(axis=1).mean()
            diffs.sort()
            assert diffs[-10] < avg * 1e-5
Example #9
    def compare_accuracy(self, lowac, highac, max_errs=(1e10, 1e10)):
        d = 96
        nb = 1000
        nq = 0
        nt = 2000

        xt, x, _ = get_dataset_2(d, nt, nb, nq)

        errs = []

        for factory_string in lowac, highac:

            codec = faiss.index_factory(d, factory_string)
            print('sa codec: code size %d' % codec.sa_code_size())
            codec.train(xt)

            codes = codec.sa_encode(x)
            x2 = codec.sa_decode(codes)

            err = ((x - x2) ** 2).sum()
            errs.append(err)

        print(errs)
        self.assertGreater(errs[0], errs[1])

        self.assertGreater(max_errs[0], errs[0])
        self.assertGreater(max_errs[1], errs[1])

        # just a small IndexLattice I/O test
        if 'Lattice' in highac:
            codec2 = faiss.deserialize_index(
                faiss.serialize_index(codec))
            codes = codec2.sa_encode(x)
            x3 = codec2.sa_decode(codes)
            self.assertTrue(np.all(x2 == x3))
Example #10
    def subtest(self, d, K, metric):
        metric_names = {faiss.METRIC_L1: 'L1',
                        faiss.METRIC_L2: 'L2',
                        faiss.METRIC_INNER_PRODUCT: 'IP'}

        nb = 1000
        _, xb, _ = get_dataset_2(d, 0, nb, 0)

        _, knn = faiss.knn(xb, xb, K + 1, metric)
        knn = knn[:, 1:]

        index = faiss.IndexNNDescentFlat(d, K, metric)
        index.nndescent.S = 10
        index.nndescent.R = 32
        index.nndescent.L = K + 20
        index.nndescent.iter = 5
        index.verbose = True

        index.add(xb)
        graph = index.nndescent.final_graph
        graph = faiss.vector_to_array(graph)
        graph = graph.reshape(nb, K)

        recalls = 0
        for i in range(nb):
            for j in range(K):
                for k in range(K):
                    if graph[i, j] == knn[i, k]:
                        recalls += 1
                        break
        recall = 1.0 * recalls / (nb * K)
        print('Metric: {}, knng accuracy: {}'.format(metric_names[metric], recall))
        assert recall > 0.99
Example #11
    def test_IndexIVFPQ(self):
        d = 32
        nb = 1000
        nt = 1500
        nq = 200

        (xt, xb, xq) = get_dataset_2(d, nt, nb, nq)

        coarse_quantizer = faiss.IndexFlatL2(d)
        index = faiss.IndexIVFPQ(coarse_quantizer, d, 32, 8, 8)
        index.cp.min_points_per_centroid = 5    # quiet warning
        index.train(xt)
        index.add(xb)

        # invalid nprobe
        index.nprobe = 0
        k = 10
        self.assertRaises(RuntimeError, index.search, xq, k)

        # invalid k
        index.nprobe = 4
        k = -10
        self.assertRaises(AssertionError, index.search, xq, k)

        # valid params
        index.nprobe = 4
        k = 10
        D, nns = index.search(xq, k)

        self.assertEqual(D.shape[0], nq)
        self.assertEqual(D.shape[1], k)
Example #12
    def subtest(self, mt):
        d = 32
        xt, xb, xq = get_dataset_2(d, 1000, 2000, 200)
        nlist = 64

        gt_index = faiss.IndexFlat(d, mt)
        gt_index.add(xb)
        gt_D, gt_I = gt_index.search(xq, 10)
        quantizer = faiss.IndexFlat(d, mt)
        for qname in '8bit 4bit 8bit_uniform 4bit_uniform fp16'.split():
            qtype = getattr(faiss.ScalarQuantizer, 'QT_' + qname)
            index = faiss.IndexIVFScalarQuantizer(quantizer, d, nlist, qtype,
                                                  mt)
            index.train(xt)
            index.add(xb)
            index.nprobe = 4  # hopefully more robust than 1
            D, I = index.search(xq, 10)
            ninter = faiss.eval_intersection(I, gt_I)
            print('(%d, %s): %d, ' % (mt, repr(qname), ninter))
            assert abs(ninter - self.ref_results[(mt, qname)]) <= 9

            D2, I2 = self.subtest_add2col(xb, xq, index, qname)

            assert np.all(I2 == I)

            # also test range search

            if mt == faiss.METRIC_INNER_PRODUCT:
                radius = float(D[:, -1].max())
            else:
                radius = float(D[:, -1].min())
            print('radius', radius)

            lims, D3, I3 = index.range_search(xq, radius)
            ntot = ndiff = 0
            for i in range(len(xq)):
                l0, l1 = lims[i], lims[i + 1]
                Inew = set(I3[l0:l1])
                if mt == faiss.METRIC_INNER_PRODUCT:
                    mask = D2[i] > radius
                else:
                    mask = D2[i] < radius
                Iref = set(I2[i, mask])
                ndiff += len(Inew ^ Iref)
                ntot += len(Iref)
            print('ndiff %d / %d' % (ndiff, ntot))
            assert ndiff < ntot * 0.01

            for pm in 1, 2:
                print('parallel_mode=%d' % pm)
                index.parallel_mode = pm
                lims4, D4, I4 = index.range_search(xq, radius)
                print('sizes', lims4[1:] - lims4[:-1])
                for qno in range(len(lims) - 1):
                    Iref = I3[lims[qno]:lims[qno + 1]]
                    Inew = I4[lims4[qno]:lims4[qno + 1]]
                    assert set(Iref) == set(
                        Inew), "q %d ref %s new %s" % (qno, Iref, Inew)
Example #13
    def __init__(self, *args, **kwargs):
        unittest.TestCase.__init__(self, *args, **kwargs)
        d = 32
        nt = 0
        nb = 1500
        nq = 500
        self.GK = 32

        _, self.xb, self.xq = get_dataset_2(d, nt, nb, nq)
Example #14
    def test_polysemous_OOM(self):
        """ this used to cause OOM when training polysemous with large
        nb bits"""
        d = 32
        M = 4  # number of PQ sub-quantizers; assumed value, undefined in the original snippet
        xt, xb, xq = get_dataset_2(d, 10000, 0, 0)
        index = faiss.IndexPQ(d, M, 13)
        index.do_polysemous_training = True
        index.pq.cp.niter = 0
        index.polysemous_training.max_memory = 128 * 1024 * 1024
        self.assertRaises(RuntimeError, index.train, xt)
Example #15
    def test_init(self):
        d = 32
        k = 5
        xt, xb, xq = get_dataset_2(d, 1000, 0, 0)
        km = faiss.Kmeans(d, k, niter=4)
        km.train(xt)

        km2 = faiss.Kmeans(d, k, niter=4)
        km2.train(xt, init_centroids=km.centroids)

        # check that the initial objective is better for km2 than for km
        self.assertGreater(km.obj[0], km2.obj[0] * 1.01)
Example #16
    def __init__(self, *args, **kwargs):
        unittest.TestCase.__init__(self, *args, **kwargs)
        d = 32
        nt = 0
        nb = 1500
        nq = 500

        (_, self.xb, self.xq) = get_dataset_2(d, nt, nb, nq)
        index = faiss.IndexFlatL2(d)
        index.add(self.xb)
        Dref, Iref = index.search(self.xq, 1)
        self.Iref = Iref
Example #17
    def test_bf_knn(self):
        d = 64
        k = 10
        xt, xb, xq = get_dataset_2(d, 0, 10000, 100)

        index = faiss.IndexFlatL2(d)
        index.add(xb)
        Dref, Iref = index.search(xq, k)

        res = faiss.StandardGpuResources()

        D, I = knn_gpu(res, xb, xq, k)

        np.testing.assert_array_equal(Iref, I)
        np.testing.assert_almost_equal(Dref, D, decimal=4)

        # Test transpositions
        xbt = np.ascontiguousarray(xb.T)

        D, I = knn_gpu(res, xbt.T, xq, k)

        np.testing.assert_array_equal(Iref, I)
        np.testing.assert_almost_equal(Dref, D, decimal=4)

        xqt = np.ascontiguousarray(xq.T)

        D, I = knn_gpu(res, xb, xqt.T, k)

        np.testing.assert_array_equal(Iref, I)
        np.testing.assert_almost_equal(Dref, D, decimal=4)

        D, I = knn_gpu(res, xbt.T, xqt.T, k)

        np.testing.assert_array_equal(Iref, I)
        np.testing.assert_almost_equal(Dref, D, decimal=4)

        # Test f16 data types
        xb16 = xb.astype(np.float16)
        xq16 = xq.astype(np.float16)

        D, I = knn_gpu(res, xb16, xq16, k)

        np.testing.assert_array_equal(Iref, I)
        np.testing.assert_almost_equal(Dref, D, decimal=4)

        # Test i32 indices
        I32 = np.empty((xq.shape[0], k), dtype=np.int32)

        D, _ = knn_gpu(res, xb, xq, k, I=I32)

        np.testing.assert_array_equal(Iref, I32)
        np.testing.assert_almost_equal(Dref, D, decimal=4)
Example #18
    def test_4variants_ivf(self):
        d = 32
        nt = 2500
        nq = 400
        nb = 5000

        (xt, xb, xq) = get_dataset_2(d, nt, nb, nq)

        # common quantizer
        quantizer = faiss.IndexFlatL2(d)

        ncent = 64

        index_gt = faiss.IndexFlatL2(d)
        index_gt.add(xb)
        D, I_ref = index_gt.search(xq, 10)

        nok = {}

        index = faiss.IndexIVFFlat(quantizer, d, ncent, faiss.METRIC_L2)
        index.cp.min_points_per_centroid = 5  # quiet warning
        index.nprobe = 4
        index.train(xt)
        index.add(xb)
        D, I = index.search(xq, 10)
        nok['flat'] = (I[:, 0] == I_ref[:, 0]).sum()

        for qname in "QT_4bit QT_4bit_uniform QT_8bit QT_8bit_uniform QT_fp16".split(
        ):
            qtype = getattr(faiss.ScalarQuantizer, qname)
            index = faiss.IndexIVFScalarQuantizer(quantizer, d, ncent, qtype,
                                                  faiss.METRIC_L2)

            index.nprobe = 4
            index.train(xt)
            index.add(xb)
            D, I = index.search(xq, 10)

            nok[qname] = (I[:, 0] == I_ref[:, 0]).sum()
        print(nok, nq)

        self.assertGreaterEqual(nok['flat'], nq * 0.6)
        # The tests below are a bit fragile, it happens that the
        # ordering between uniform and non-uniform are reverted,
        # probably because the dataset is small, which introduces
        # jitter
        self.assertGreaterEqual(nok['flat'], nok['QT_8bit'])
        self.assertGreaterEqual(nok['QT_8bit'], nok['QT_4bit'])
        self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform'])
        self.assertGreaterEqual(nok['QT_4bit'], nok['QT_4bit_uniform'])
        self.assertGreaterEqual(nok['QT_fp16'], nok['QT_8bit'])
Example #19
    def subtest_8bit_direct(self, metric_type, d):
        xt, xb, xq = get_dataset_2(d, 500, 1000, 30)

        # rescale everything to get integer
        tmin, tmax = xt.min(), xt.max()

        def rescale(x):
            x = np.floor((x - tmin) * 256 / (tmax - tmin))
            x[x < 0] = 0
            x[x > 255] = 255
            return x

        xt = rescale(xt)
        xb = rescale(xb)
        xq = rescale(xq)

        gt_index = faiss.IndexFlat(d, metric_type)
        gt_index.add(xb)
        Dref, Iref = gt_index.search(xq, 10)

        index = faiss.IndexScalarQuantizer(
            d, faiss.ScalarQuantizer.QT_8bit_direct, metric_type)
        index.add(xb)
        D, I = index.search(xq, 10)

        assert np.all(I == Iref)
        assert np.all(D == Dref)

        # same, with IVF

        nlist = 64
        quantizer = faiss.IndexFlat(d, metric_type)

        gt_index = faiss.IndexIVFFlat(quantizer, d, nlist, metric_type)
        gt_index.nprobe = 4
        gt_index.train(xt)
        gt_index.add(xb)
        Dref, Iref = gt_index.search(xq, 10)

        index = faiss.IndexIVFScalarQuantizer(
            quantizer, d, nlist,
            faiss.ScalarQuantizer.QT_8bit_direct, metric_type)
        index.nprobe = 4
        index.by_residual = False
        index.train(xt)
        index.add(xb)
        D, I = index.search(xq, 10)

        assert np.all(I == Iref)
        assert np.all(D == Dref)
Example #20
    def test_IMI(self):
        d = 32
        nb = 1000
        nt = 1500
        nq = 200

        (xt, xb, xq) = get_dataset_2(d, nt, nb, nq)
        d = xt.shape[1]

        gt_index = faiss.IndexFlatL2(d)
        gt_index.add(xb)
        D, gt_nns = gt_index.search(xq, 1)

        nbits = 5
        coarse_quantizer = faiss.MultiIndexQuantizer(d, 2, nbits)
        index = faiss.IndexIVFPQ(coarse_quantizer, d, (1 << nbits)**2, 8, 8)
        index.quantizer_trains_alone = 1
        index.train(xt)
        index.add(xb)
        index.nprobe = 100
        D, nns = index.search(xq, 10)
        n_ok = (nns == gt_nns).sum()

        # Should return 166 on mac, and 170 on linux.
        self.assertGreater(n_ok, 165)

        ############# replace with explicit assignment indexes
        nbits = 5
        pq = coarse_quantizer.pq
        centroids = faiss.vector_to_array(pq.centroids)
        centroids = centroids.reshape(pq.M, pq.ksub, pq.dsub)
        ai0 = faiss.IndexFlatL2(pq.dsub)
        ai0.add(centroids[0])
        ai1 = faiss.IndexFlatL2(pq.dsub)
        ai1.add(centroids[1])

        coarse_quantizer_2 = faiss.MultiIndexQuantizer2(d, nbits, ai0, ai1)
        coarse_quantizer_2.pq = pq
        coarse_quantizer_2.is_trained = True

        index.quantizer = coarse_quantizer_2

        index.reset()
        index.add(xb)

        D, nns = index.search(xq, 10)
        n_ok = (nns == gt_nns).sum()

        # should return the same result
        self.assertGreater(n_ok, 165)
Example #21
    def subtest(self, mt):
        d = 32
        xt, xb, xq = get_dataset_2(d, 1000, 2000, 200)
        nlist = 64

        gt_index = faiss.IndexFlat(d, mt)
        gt_index.add(xb)
        gt_D, gt_I = gt_index.search(xq, 10)
        quantizer = faiss.IndexFlat(d, mt)
        for by_residual in True, False:

            index = faiss.IndexIVFPQ(quantizer, d, nlist, 4, 8)
            index.metric_type = mt
            index.by_residual = by_residual
            if by_residual:
                # perform cheap polysemous training
                index.do_polysemous_training = True
                pt = faiss.PolysemousTraining()
                pt.n_iter = 50000
                pt.n_redo = 1
                index.polysemous_training = pt

            index.train(xt)
            index.add(xb)
            index.nprobe = 4
            D, I = index.search(xq, 10)

            ninter = faiss.eval_intersection(I, gt_I)
            print('(%d, %s): %d, ' % (mt, by_residual, ninter))

            assert ninter >= self.ref_results[mt, by_residual] - 2

            index.use_precomputed_table = 0
            D2, I2 = index.search(xq, 10)
            assert np.all(I == I2)

            if by_residual:

                index.use_precomputed_table = 1
                index.polysemous_ht = 20
                D, I = index.search(xq, 10)
                ninter = faiss.eval_intersection(I, gt_I)
                print('(%d, %s, %d): %d, ' %
                      (mt, by_residual, index.polysemous_ht, ninter))

                # polysemous behaves bizarrely on ARM
                assert (
                    ninter >=
                    self.ref_results[mt, by_residual, index.polysemous_ht] - 4)
Example #22
    def test_rename(self):
        d = 10
        nb = 500
        nq = 100
        nt = 100

        xt, xb, xq = get_dataset_2(d, nt, nb, nq)

        quantizer = faiss.IndexFlatL2(d)

        index1 = faiss.IndexIVFFlat(quantizer, d, 20)
        index1.train(xt)

        dirname = tempfile.mkdtemp()

        try:

            # make an index with ondisk invlists
            invlists = faiss.OnDiskInvertedLists(
                index1.nlist, index1.code_size,
                dirname + '/aa.ondisk')
            index1.replace_invlists(invlists)
            index1.add(xb)
            D1, I1 = index1.search(xq, 10)
            faiss.write_index(index1, dirname + '/aa.ivf')

            # move the index elsewhere
            os.mkdir(dirname + '/1')
            for fname in 'aa.ondisk', 'aa.ivf':
                os.rename(dirname + '/' + fname,
                          dirname + '/1/' + fname)

            # try to read it: fails!
            try:
                index2 = faiss.read_index(dirname + '/1/aa.ivf')
            except RuntimeError:
                pass   # normal
            else:
                assert False

            # read it with magic flag
            index2 = faiss.read_index(dirname + '/1/aa.ivf',
                                      faiss.IO_FLAG_ONDISK_SAME_DIR)
            D2, I2 = index2.search(xq, 10)
            assert np.all(I1 == I2)

        finally:
            shutil.rmtree(dirname)
Example #23
    def test_parallel_mode(self):
        d = 32
        xt, xb, xq = get_dataset_2(d, 2000, 1000, 200)

        index = faiss.index_factory(d, "IVF64,SQ8")
        index.train(xt)
        index.add(xb)
        index.nprobe = 4   # hopefully more robust than 1
        Dref, Iref = index.search(xq, 10)

        for pm in 1, 2, 3:
            index.parallel_mode = pm

            Dnew, Inew = index.search(xq, 10)
            np.testing.assert_array_equal(Iref, Inew)
            np.testing.assert_array_equal(Dref, Dnew)
Example #24
    def do_test(self, nq, metric_type=faiss.METRIC_L2, k=10):
        d = 32
        nb = 1000
        nt = 0

        (xt, xb, xq) = get_dataset_2(d, nt, nb, nq)
        index = faiss.IndexFlat(d, metric_type)

        ### k-NN search

        index.add(xb)
        D1, I1 = index.search(xq, k)

        if metric_type == faiss.METRIC_L2:
            all_dis = ((xq.reshape(nq, 1, d) - xb.reshape(1, nb, d)) ** 2).sum(2)
            Iref = all_dis.argsort(axis=1)[:, :k]
        else:
            all_dis = np.dot(xq, xb.T)
            Iref = all_dis.argsort(axis=1)[:, ::-1][:, :k]

        Dref = all_dis[np.arange(nq)[:, None], Iref]
        self.assertLessEqual((Iref != I1).sum(), Iref.size * 0.0001)
        #  np.testing.assert_equal(Iref, I1)
        np.testing.assert_almost_equal(Dref, D1, decimal=5)

        ### Range search

        radius = float(np.median(Dref[:, -1]))

        lims, D2, I2 = index.range_search(xq, radius)

        for i in range(nq):
            l0, l1 = lims[i:i + 2]
            _, Il = D2[l0:l1], I2[l0:l1]
            if metric_type == faiss.METRIC_L2:
                Ilref, = np.where(all_dis[i] < radius)
            else:
                Ilref, = np.where(all_dis[i] > radius)
            Il.sort()
            Ilref.sort()
            np.testing.assert_equal(Il, Ilref)
            np.testing.assert_almost_equal(
                all_dis[i, Ilref], D2[l0:l1],
                decimal=5
            )
Example #25
    def test_progressive_dim(self):
        d = 32
        n = 10000
        k = 50
        xt, _, _ = get_dataset_2(d, n, 0, 0)

        # basic kmeans
        kmeans = faiss.Kmeans(d, k, gpu=True)
        kmeans.train(xt)

        pca = faiss.PCAMatrix(d, d)
        pca.train(xt)
        xt_pca = pca.apply(xt)

        # same test with progressive-dim Kmeans on the PCA-rotated data
        kmeans2 = faiss.Kmeans(d, k, progressive_dim_steps=5, gpu=True)
        kmeans2.train(xt_pca)
        self.assertLess(kmeans2.obj[-1], kmeans.obj[-1])
Example #26
    def test_compute_GT(self):
        d = 64
        xt, xb, xq = get_dataset_2(d, 0, 10000, 100)

        index = faiss.IndexFlatL2(d)
        index.add(xb)
        Dref, Iref = index.search(xq, 10)

        # iterator function on the matrix

        def matrix_iterator(xb, bs):
            for i0 in range(0, xb.shape[0], bs):
                yield xb[i0:i0 + bs]

        Dnew, Inew = knn_ground_truth(xq, matrix_iterator(xb, 1000), 10)

        np.testing.assert_array_equal(Iref, Inew)
        np.testing.assert_almost_equal(Dref, Dnew, decimal=4)
Example #27
    def do_test_knn(self, mt):
        d = 10
        nb = 100
        nq = 50
        nt = 0
        xt, xb, xq = get_dataset_2(d, nt, nb, nq)

        index = faiss.IndexFlat(d, mt)
        index.add(xb)

        D, I = index.search(xq, 10)

        dis = faiss.pairwise_distances(xq, xb, mt)
        o = dis.argsort(axis=1)
        assert np.all(I == o[:, :10])

        for q in range(nq):
            assert np.all(D[q] == dis[q, I[q]])
Example #28
    def test_hnsw(self):

        d = 10
        nb = 1000
        nq = 100
        nt = 0
        xt, xb, xq = get_dataset_2(d, nt, nb, nq)

        mt = faiss.METRIC_L1

        index = faiss.IndexHNSW(faiss.IndexFlat(d, mt))
        index.add(xb)

        D, I = index.search(xq, 10)

        dis = faiss.pairwise_distances(xq, xb, mt)

        for q in range(nq):
            assert np.all(D[q] == dis[q, I[q]])
Example #29
    def do_merge_then_remove(self, ondisk):
        d = 10
        nb = 1000
        nq = 200
        nt = 200

        xt, xb, xq = get_dataset_2(d, nt, nb, nq)

        quantizer = faiss.IndexFlatL2(d)

        index1 = faiss.IndexIVFFlat(quantizer, d, 20)
        index1.train(xt)

        filename = None
        if ondisk:
            filename = tempfile.mkstemp()[1]
            invlists = faiss.OnDiskInvertedLists(
                index1.nlist, index1.code_size,
                filename)
            index1.replace_invlists(invlists)

        index1.add(xb[:int(nb / 2)])

        index2 = faiss.IndexIVFFlat(quantizer, d, 20)
        assert index2.is_trained
        index2.add(xb[int(nb / 2):])

        Dref, Iref = index1.search(xq, 10)
        index1.merge_from(index2, int(nb / 2))

        assert index1.ntotal == nb

        index1.remove_ids(faiss.IDSelectorRange(int(nb / 2), nb))

        assert index1.ntotal == int(nb / 2)
        Dnew, Inew = index1.search(xq, 10)

        assert np.all(Dnew == Dref)
        assert np.all(Inew == Iref)

        if filename is not None:
            os.unlink(filename)
Example #30
    def test_stop_words(self):
        d = 10
        nb = 1000
        nq = 1
        nt = 200

        xt, xb, xq = get_dataset_2(d, nt, nb, nq)

        index = faiss.index_factory(d, "IVF32,Flat")
        index.nprobe = 4
        index.train(xt)
        index.add(xb)
        Dref, Iref = index.search(xq, 10)

        il = index.invlists
        maxsz = max(il.list_size(i) for i in range(il.nlist))

        il2 = faiss.StopWordsInvertedLists(il, maxsz + 1)
        # keep the original invlists alive while the wrapper is swapped in
        index.own_invlists = False

        index.replace_invlists(il2, False)
        D1, I1 = index.search(xq, 10)
        np.testing.assert_array_equal(Dref, D1)
        np.testing.assert_array_equal(Iref, I1)

        # cleanup to avoid segfault on exit
        index.replace_invlists(il, False)

        # voluntarily unbalance one invlist
        i = int(I1[0, 0])
        index.add(np.vstack([xb[i]] * (maxsz + 10)))

        # introduce stopwords again
        index.replace_invlists(il2, False)

        D2, I2 = index.search(xq, 10)
        self.assertFalse(i in list(I2.ravel()))

        # avoid mem leak
        index.replace_invlists(il, True)