def do_mmappedIO(self, sparse, in_pretransform=False):
    """Write an IVF index to disk and compare it with two re-read copies.

    The index is read back once with the default flags and once with
    ``faiss.IO_FLAG_MMAP``; both copies are checked against the
    in-memory index through ``self.compare_results``.

    Args:
        sparse: when true, the training set is shifted by a large
            constant before training (see the comment below).
        in_pretransform: when true, the IVF index is wrapped in an
            ``IndexPreTransform`` before training and adding.
    """
    d = 10
    nb = 1000
    nq = 200
    nt = 200
    xt, xb, xq = get_dataset_2(d, nt, nb, nq)

    quantizer = faiss.IndexFlatL2(d)
    index1 = faiss.IndexIVFFlat(quantizer, d, 20)
    if sparse:
        # makes the inverted lists sparse because all elements get
        # assigned to the same invlist
        xt += (np.ones(d) * 1000).astype('float32')

    if in_pretransform:
        # make sure it still works when wrapped in an IndexPreTransform
        index1 = faiss.IndexPreTransform(index1)

    index1.train(xt)
    index1.add(xb)

    fd, fname = tempfile.mkstemp()
    # mkstemp hands back an already-open descriptor; only the path is
    # needed here, so close it right away instead of leaking it.
    os.close(fd)
    try:
        faiss.write_index(index1, fname)

        index2 = faiss.read_index(fname)
        self.compare_results(index1, index2, xq)

        index3 = faiss.read_index(fname, faiss.IO_FLAG_MMAP)
        self.compare_results(index1, index3, xq)
    finally:
        if os.path.exists(fname):
            os.unlink(fname)
def test_serialize_to_vector(self):
    """Serialize a flat index into a VectorIOWriter and read it back twice.

    The index is restored once by swapping the writer's vector into a
    reader and once by copying a numpy snapshot of the bytes; both
    restored indexes must return exactly the reference search results.
    """
    dim = 10
    n_base, n_query, n_train = 1000, 200, 500
    topk = 5
    _, base, queries = get_dataset_2(dim, n_train, n_base, n_query)

    flat = faiss.IndexFlatL2(dim)
    flat.add(base)
    expected_d, expected_i = flat.search(queries, topk)

    sink = faiss.VectorIOWriter()
    faiss.write_index(flat, sink)
    # taken before the swap below empties / replaces sink.data
    serialized = faiss.vector_to_array(sink.data)

    def check_restored(source):
        restored = faiss.read_index(source)
        got_d, got_i = restored.search(queries, topk)
        assert np.all(got_d == expected_d) and np.all(got_i == expected_i)

    # direct transfer of vector
    swapped_reader = faiss.VectorIOReader()
    swapped_reader.data.swap(sink.data)
    check_restored(swapped_reader)

    # from intermediate numpy array
    copied_reader = faiss.VectorIOReader()
    faiss.copy_array_to_vector(serialized, copied_reader.data)
    check_restored(copied_reader)
def test_IMI_2(self):
    """Train an IVFPQ index on a MultiIndexQuantizer2 and check its recall.

    Counts how many entries of the 10-NN result equal the exact
    1-NN from a flat L2 index and requires more than 165 matches.
    """
    n_base, n_train, n_query = 1000, 1500, 200
    train, base, queries = get_dataset_2(32, n_train, n_base, n_query)
    dim = train.shape[1]

    exact = faiss.IndexFlatL2(dim)
    exact.add(base)
    _, true_nn = exact.search(queries, 1)

    # redo including training
    code_bits = 5
    half_dim = int(dim / 2)
    assign_first = faiss.IndexFlatL2(half_dim)
    assign_second = faiss.IndexFlatL2(half_dim)
    imi = faiss.MultiIndexQuantizer2(
        dim, code_bits, assign_first, assign_second)

    n_cells = (1 << code_bits) ** 2
    ivfpq = faiss.IndexIVFPQ(imi, dim, n_cells, 8, 8)
    ivfpq.quantizer_trains_alone = 1
    ivfpq.train(train)
    ivfpq.add(base)
    ivfpq.nprobe = 100
    _, found = ivfpq.search(queries, 10)

    # true_nn has one column and broadcasts against the 10 results
    hits = (found == true_nn).sum()
    self.assertGreater(hits, 165)
def test_slice_vstack(self):
    """Slice inverted lists in three parts, stack them back, compare searches.

    An index built on the re-stacked lists must return exactly the
    same distances and labels as the original index.
    """
    dim = 10
    nb = 1000
    nq = 100
    nt = 200
    n_lists = 30
    slice_width = 10

    # NOTE(review): nb and nt are passed in the opposite order from the
    # other get_dataset_2 calls in this file -- confirm this is intended.
    train, base, queries = get_dataset_2(dim, nb, nt, nq)

    coarse = faiss.IndexFlatL2(dim)
    original = faiss.IndexIVFFlat(coarse, dim, n_lists)
    original.train(train)
    original.add(base)
    expected_d, expected_i = original.search(queries, 10)

    source_lists = original.invlists
    # hold Python references to every slice for the rest of the test
    kept_slices = []
    slice_vector = faiss.InvertedListsPtrVector()
    for start in range(0, 3 * slice_width, slice_width):
        piece = faiss.SliceInvertedLists(
            source_lists, start, start + slice_width)
        kept_slices.append(piece)
        slice_vector.push_back(piece)

    stacked = faiss.VStackInvertedLists(
        slice_vector.size(), slice_vector.data())

    rebuilt = faiss.IndexIVFFlat(coarse, dim, n_lists)
    rebuilt.replace_invlists(stacked)
    rebuilt.ntotal = original.ntotal

    got_d, got_i = rebuilt.search(queries, 10)
    assert np.all(got_d == expected_d)
    assert np.all(got_i == expected_i)
def test_encoded(self):
    """Clustering on decoded vectors must match clustering on codes.

    Training ``faiss.Clustering`` on vectors that were encoded then
    decoded with a 4-bit scalar quantizer should yield bit-identical
    centroids to ``train_encoded`` on the codes themselves.
    """
    dim = 32
    n_clusters = 5
    train, _, _ = get_dataset_2(dim, 1000, 0, 0)

    codec = faiss.IndexScalarQuantizer(dim, faiss.ScalarQuantizer.QT_4bit)
    codec.train(train)
    codes = codec.sa_encode(train)
    decoded = codec.sa_decode(codes)

    def centroids_of(clustering):
        flat = faiss.vector_to_array(clustering.centroids)
        return flat.reshape(-1, dim)

    # reference: cluster the explicitly decoded vectors
    ref_clus = faiss.Clustering(dim, n_clusters)
    ref_clus.niter = 0
    ref_index = faiss.IndexFlatL2(dim)
    ref_clus.train(decoded, ref_index)
    ref_centroids = centroids_of(ref_clus)
    _, ref_errs = ref_index.search(decoded, 1)

    # same thing, decoding on the fly in blocks
    enc_clus = faiss.Clustering(dim, n_clusters)
    enc_clus.niter = 0
    enc_clus.decode_block_size = 120
    enc_index = faiss.IndexFlatL2(dim)
    enc_clus.train_encoded(codes, codec, enc_index)
    new_centroids = centroids_of(enc_clus)
    _, new_errs = enc_index.search(decoded, 1)

    # It's the same operation, so should be bit-exact the same
    self.assertTrue(np.all(ref_centroids == new_centroids))
def test_stats(self):
    """The Kmeans ``obj`` trace must equal the 'obj' field of iteration_stats."""
    dim, n_clusters = 32, 5
    train, _, _ = get_dataset_2(dim, 1000, 0, 0)

    kmeans = faiss.Kmeans(dim, n_clusters, niter=4)
    kmeans.train(train)

    per_iteration_obj = [stats['obj'] for stats in kmeans.iteration_stats]
    assert list(kmeans.obj) == per_iteration_obj
def test_IVFPQ_non8bit(self):
    """Compare two IVFPQ configurations against hard-coded reference counts.

    For each variant the 10-NN intersection with a flat index is
    computed and must be within 4 of the recorded reference number.
    """
    d = 16
    xt, xb, xq = get_dataset_2(d, 10000, 2000, 200)
    nlist = 64

    exact = faiss.IndexFlat(d)
    exact.add(xb)
    _, gt_I = exact.search(xq, 10)

    quantizer = faiss.IndexFlat(d)

    # variant label -> the two trailing IndexIVFPQ constructor arguments
    pq_args = {
        '2x8': (8, 2),
        '8x2': (2, 8),
    }

    ninter = {}
    for label, (arg_a, arg_b) in pq_args.items():
        index = faiss.IndexIVFPQ(quantizer, d, nlist, arg_a, arg_b)
        index.train(xt)
        index.add(xb)
        # NOTE(review): `npobe` looks like a typo for `nprobe`; as written
        # it assigns a different attribute name. Left untouched because
        # the reference numbers below may have been recorded with this
        # spelling -- confirm and re-baseline before changing it.
        index.npobe = 16
        _, I = index.search(xq, 10)
        ninter[label] = faiss.eval_intersection(I, gt_I)

    print('ninter=', ninter)

    # this should be the case but we don't observe
    # that... Probably too few test points
    #  assert ninter['2x8'] > ninter['8x2']
    # ref numbers on 2019-11-02
    assert abs(ninter['2x8'] - 458) < 4
    assert abs(ninter['8x2'] - 465) < 4
def do_encode_twice(self, factory_key):
    """Check that encode -> decode -> encode is (nearly) a fixed point.

    Args:
        factory_key: index_factory string describing the codec. Keys
            containing 'IVF' are held to a looser standard: fewer than
            10 code rows may differ, and the 10th-largest reconstruction
            difference must stay below 1e-5 of the mean L1 norm.
    """
    d = 96
    nb = 1000
    nq = 0
    nt = 2000
    xt, x, _ = get_dataset_2(d, nt, nb, nq)
    assert x.size > 0

    uses_ivf = 'IVF' in factory_key

    codec = faiss.index_factory(d, factory_key)
    codec.train(xt)

    first_codes = codec.sa_encode(x)
    first_decoded = codec.sa_decode(first_codes)
    second_codes = codec.sa_encode(first_decoded)

    if uses_ivf:
        # some rows are not reconstructed exactly because they
        # flip into another quantization cell
        changed_rows = (first_codes != second_codes).any(axis=1).sum()
        self.assertTrue(changed_rows < 10)
    else:
        self.assertTrue(np.all(first_codes == second_codes))

    second_decoded = codec.sa_decode(second_codes)

    if uses_ivf:
        row_l1_diff = np.abs(first_decoded - second_decoded).sum(axis=1)
        mean_l1_norm = np.abs(first_decoded).sum(axis=1).mean()
        ordered = np.sort(row_l1_diff)
        assert ordered[-10] < mean_l1_norm * 1e-5
    else:
        self.assertTrue(np.allclose(first_decoded, second_decoded))
def compare_accuracy(self, lowac, highac, max_errs=(1e10, 1e10)):
    """Check that one codec reconstructs better than another.

    Both codecs are trained on the same data, and the squared
    reconstruction error of encode + decode is compared.

    Args:
        lowac: index_factory string of the codec expected to be less
            accurate.
        highac: index_factory string of the codec expected to be more
            accurate.
        max_errs: pair of upper bounds on the squared error of
            ``lowac`` and ``highac`` respectively.
    """
    d = 96
    nb = 1000
    nq = 0
    nt = 2000

    xt, x, _ = get_dataset_2(d, nt, nb, nq)

    errs = []

    for factory_string in lowac, highac:
        codec = faiss.index_factory(d, factory_string)
        print('sa codec: code size %d' % codec.sa_code_size())
        codec.train(xt)

        codes = codec.sa_encode(x)
        x2 = codec.sa_decode(codes)

        err = ((x - x2) ** 2).sum()
        errs.append(err)

    print(errs)
    self.assertGreater(errs[0], errs[1])

    self.assertGreater(max_errs[0], errs[0])
    self.assertGreater(max_errs[1], errs[1])

    # just a small IndexLattice I/O test
    if 'Lattice' in highac:
        # At this point `codec` and `x2` belong to `highac` (last loop
        # iteration). Encode/decode with the round-tripped copy so the
        # serialization is actually exercised; the previous code built
        # codec2 but kept using the original codec.
        codec2 = faiss.deserialize_index(
            faiss.serialize_index(codec))
        codes = codec2.sa_encode(x)
        x3 = codec2.sa_decode(codes)
        self.assertTrue(np.all(x2 == x3))
def subtest(self, d, K, metric):
    """Build an NNDescent k-NN graph and compare it with brute-force k-NN.

    Args:
        d: vector dimension.
        K: number of graph neighbors per point.
        metric: one of the faiss metrics listed in ``metric_names``.

    The fraction of graph edges that also appear in the exact
    neighbor list (self excluded) must exceed 0.99.
    """
    metric_names = {faiss.METRIC_L1: 'L1',
                    faiss.METRIC_L2: 'L2',
                    faiss.METRIC_INNER_PRODUCT: 'IP'}

    nb = 1000
    _, xb, _ = get_dataset_2(d, 0, nb, 0)

    # exact neighbors; the first column is dropped to skip the point itself
    _, exact_knn = faiss.knn(xb, xb, K + 1, metric)
    exact_knn = exact_knn[:, 1:]

    index = faiss.IndexNNDescentFlat(d, K, metric)
    nnd = index.nndescent
    nnd.S = 10
    nnd.R = 32
    nnd.L = K + 20
    nnd.iter = 5
    index.verbose = True

    index.add(xb)
    graph = faiss.vector_to_array(index.nndescent.final_graph).reshape(nb, K)

    # an edge (i, j) counts once if graph[i, j] occurs anywhere in
    # exact_knn[i]
    edge_found = (graph[:, :, None] == exact_knn[:, None, :]).any(axis=2)
    recalls = int(edge_found.sum())

    recall = 1.0 * recalls / (nb * K)
    print('Metric: {}, knng accuracy: {}'.format(metric_names[metric], recall))
    assert recall > 0.99
def test_IndexIVFPQ(self):
    """Check IndexIVFPQ.search parameter validation and output shape.

    ``nprobe == 0`` is expected to raise RuntimeError, a negative
    ``k`` is expected to raise AssertionError, and a valid call must
    return a distance matrix of shape (nq, k).
    """
    d = 32
    nb = 1000
    nt = 1500
    nq = 200

    (xt, xb, xq) = get_dataset_2(d, nt, nb, nq)

    coarse_quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFPQ(coarse_quantizer, d, 32, 8, 8)
    index.cp.min_points_per_centroid = 5    # quiet warning
    index.train(xt)
    index.add(xb)

    # invalid nprobe
    index.nprobe = 0
    k = 10
    self.assertRaises(RuntimeError, index.search, xq, k)

    # invalid k
    index.nprobe = 4
    k = -10
    self.assertRaises(AssertionError, index.search, xq, k)

    # valid params
    index.nprobe = 4
    k = 10
    D, nns = index.search(xq, k)

    # assertEquals is a deprecated alias that was removed in Python 3.12
    self.assertEqual(D.shape[0], nq)
    self.assertEqual(D.shape[1], k)
def subtest(self, mt):
    """Exercise every scalar-quantizer flavor of IndexIVFScalarQuantizer.

    Args:
        mt: faiss metric type used for both the ground truth and the
            tested indexes.

    For each quantizer type this checks the k-NN intersection against
    ``self.ref_results``, the result of ``self.subtest_add2col``, the
    consistency of range search with k-NN, and that range search gives
    the same result sets under parallel modes 1 and 2.
    """
    d = 32
    xt, xb, xq = get_dataset_2(d, 1000, 2000, 200)
    nlist = 64
    is_ip = mt == faiss.METRIC_INNER_PRODUCT

    exact = faiss.IndexFlat(d, mt)
    exact.add(xb)
    gt_D, gt_I = exact.search(xq, 10)

    quantizer = faiss.IndexFlat(d, mt)
    for qname in '8bit 4bit 8bit_uniform 4bit_uniform fp16'.split():
        qtype = getattr(faiss.ScalarQuantizer, 'QT_' + qname)
        index = faiss.IndexIVFScalarQuantizer(
            quantizer, d, nlist, qtype, mt)
        index.train(xt)
        index.add(xb)
        index.nprobe = 4   # hopefully more robust than 1
        D, I = index.search(xq, 10)

        ninter = faiss.eval_intersection(I, gt_I)
        print('(%d, %s): %d, ' % (mt, repr(qname), ninter))
        assert abs(ninter - self.ref_results[(mt, qname)]) <= 9

        D2, I2 = self.subtest_add2col(xb, xq, index, qname)
        assert np.all(I2 == I)

        # also test range search
        kth_distance = D[:, -1]
        if is_ip:
            radius = float(kth_distance.max())
        else:
            radius = float(kth_distance.min())
        print('radius', radius)

        lims, D3, I3 = index.range_search(xq, radius)
        ntot = 0
        ndiff = 0
        for i in range(len(xq)):
            begin, end = lims[i], lims[i + 1]
            from_range = set(I3[begin:end])
            within = D2[i] > radius if is_ip else D2[i] < radius
            from_knn = set(I2[i, within])
            ndiff += len(from_range ^ from_knn)
            ntot += len(from_knn)
        print('ndiff %d / %d' % (ndiff, ntot))
        assert ndiff < ntot * 0.01

        for pm in (1, 2):
            print('parallel_mode=%d' % pm)
            index.parallel_mode = pm
            lims4, D4, I4 = index.range_search(xq, radius)
            print('sizes', lims4[1:] - lims4[:-1])
            for qno in range(len(lims) - 1):
                Iref = I3[lims[qno]:lims[qno + 1]]
                Inew = I4[lims4[qno]:lims4[qno + 1]]
                assert set(Iref) == set(Inew), \
                    "q %d ref %s new %s" % (qno, Iref, Inew)
def __init__(self, *args, **kwargs):
    """Create the test case and attach a base set, a query set and ``GK``."""
    unittest.TestCase.__init__(self, *args, **kwargs)
    dim = 32
    n_train, n_base, n_query = 0, 1500, 500
    self.GK = 32
    _train, base, queries = get_dataset_2(dim, n_train, n_base, n_query)
    self.xb = base
    self.xq = queries
def test_polysemous_OOM(self):
    """Polysemous training with many bits used to run out of memory.

    With ``max_memory`` capped at 128 MiB, training is expected to
    raise RuntimeError instead.
    """
    dim = 32
    train, _, _ = get_dataset_2(dim, 10000, 0, 0)

    # NOTE(review): `M` is not defined inside this method; presumably a
    # module-level name -- confirm it exists, otherwise this line raises
    # NameError before the assertion below is reached.
    pq_index = faiss.IndexPQ(dim, M, 13)
    pq_index.do_polysemous_training = True
    pq_index.pq.cp.niter = 0
    pq_index.polysemous_training.max_memory = 128 * 1024 * 1024

    self.assertRaises(RuntimeError, pq_index.train, train)
def test_init(self):
    """Kmeans started from trained centroids must begin with a lower objective."""
    dim, n_clusters = 32, 5
    train, _, _ = get_dataset_2(dim, 1000, 0, 0)

    cold_start = faiss.Kmeans(dim, n_clusters, niter=4)
    cold_start.train(train)

    warm_start = faiss.Kmeans(dim, n_clusters, niter=4)
    warm_start.train(train, init_centroids=cold_start.centroids)

    # check that the initial objective is better for the warm-started
    # run than for the cold-started one (by more than 1%)
    self.assertGreater(cold_start.obj[0], warm_start.obj[0] * 1.01)
def __init__(self, *args, **kwargs):
    """Create the test case with a dataset and its exact 1-NN labels.

    Sets ``self.xb``, ``self.xq`` and ``self.Iref`` (labels returned by
    a flat L2 index searched with k=1).
    """
    unittest.TestCase.__init__(self, *args, **kwargs)
    dim = 32
    n_train, n_base, n_query = 0, 1500, 500
    _train, base, queries = get_dataset_2(dim, n_train, n_base, n_query)
    self.xb = base
    self.xq = queries

    exact = faiss.IndexFlatL2(dim)
    exact.add(self.xb)
    _distances, labels = exact.search(self.xq, 1)
    self.Iref = labels
def test_bf_knn(self):
    """Compare knn_gpu with a flat L2 index for several input layouts.

    Labels must match exactly and distances to 4 decimals for plain
    inputs, transposed (non-contiguous) views, and a caller-supplied
    int32 label buffer.
    """
    d = 64
    k = 10
    _, xb, xq = get_dataset_2(d, 0, 10000, 100)

    exact = faiss.IndexFlatL2(d)
    exact.add(xb)
    Dref, Iref = exact.search(xq, k)

    res = faiss.StandardGpuResources()

    def check(database, queries):
        dist, labels = knn_gpu(res, database, queries, k)
        np.testing.assert_array_equal(Iref, labels)
        np.testing.assert_almost_equal(Dref, dist, decimal=4)

    check(xb, xq)

    # Test transpositions
    xbt = np.ascontiguousarray(xb.T)
    check(xbt.T, xq)

    xqt = np.ascontiguousarray(xq.T)
    check(xb, xqt.T)

    check(xbt.T, xqt.T)

    # Test f16 data types
    xb16 = xb.astype(np.float16)
    xq16 = xq.astype(np.float16)
    # NOTE(review): xb16 / xq16 are built but the call below still uses
    # the float32 arrays, so float16 input is not exercised. Passing the
    # float16 arrays may require looser assertions -- confirm intent.
    check(xb, xq)

    # Test i32 indices
    I32 = np.empty((xq.shape[0], k), dtype=np.int32)
    dist, _ = knn_gpu(res, xb, xq, k, I=I32)
    np.testing.assert_array_equal(Iref, I32)
    np.testing.assert_almost_equal(Dref, dist, decimal=4)
def test_4variants_ivf(self):
    """Rank IVF flat and IVF scalar-quantizer variants by top-1 accuracy.

    Each index shares the same coarse quantizer; the number of queries
    whose first result equals the exact nearest neighbor is recorded
    and the expected ordering between variants is asserted.
    """
    d = 32
    nt = 2500
    nq = 400
    nb = 5000
    ncent = 64

    (xt, xb, xq) = get_dataset_2(d, nt, nb, nq)

    # common quantizer
    quantizer = faiss.IndexFlatL2(d)

    exact = faiss.IndexFlatL2(d)
    exact.add(xb)
    _, I_ref = exact.search(xq, 10)

    def fit_and_score(candidate):
        candidate.nprobe = 4
        candidate.train(xt)
        candidate.add(xb)
        _, found = candidate.search(xq, 10)
        return (found[:, 0] == I_ref[:, 0]).sum()

    nok = {}

    flat = faiss.IndexIVFFlat(quantizer, d, ncent, faiss.METRIC_L2)
    flat.cp.min_points_per_centroid = 5    # quiet warning
    nok['flat'] = fit_and_score(flat)

    sq_names = "QT_4bit QT_4bit_uniform QT_8bit QT_8bit_uniform QT_fp16"
    for qname in sq_names.split():
        qtype = getattr(faiss.ScalarQuantizer, qname)
        sq_index = faiss.IndexIVFScalarQuantizer(
            quantizer, d, ncent, qtype, faiss.METRIC_L2)
        nok[qname] = fit_and_score(sq_index)

    print(nok, nq)

    self.assertGreaterEqual(nok['flat'], nq * 0.6)

    # The tests below are a bit fragile, it happens that the
    # ordering between uniform and non-uniform are reverted,
    # probably because the dataset is small, which introduces
    # jitter
    expected_order = [
        ('flat', 'QT_8bit'),
        ('QT_8bit', 'QT_4bit'),
        ('QT_8bit', 'QT_8bit_uniform'),
        ('QT_4bit', 'QT_4bit_uniform'),
        ('QT_fp16', 'QT_8bit'),
    ]
    for better, worse in expected_order:
        self.assertGreaterEqual(nok[better], nok[worse])
def subtest_8bit_direct(self, metric_type, d):
    """QT_8bit_direct must reproduce flat results on integer-valued data.

    Args:
        metric_type: faiss metric used by all indexes in the test.
        d: vector dimension.

    The data is rescaled to integers in [0, 255]; both the plain and
    the IVF scalar-quantizer index must then return exactly the same
    labels and distances as their flat counterparts.
    """
    xt, xb, xq = get_dataset_2(d, 500, 1000, 30)

    # rescale everything to get integer
    lowest, highest = xt.min(), xt.max()

    def rescale(x):
        scaled = np.floor((x - lowest) * 256 / (highest - lowest))
        return np.clip(scaled, 0, 255)

    xt, xb, xq = rescale(xt), rescale(xb), rescale(xq)

    flat = faiss.IndexFlat(d, metric_type)
    flat.add(xb)
    Dref, Iref = flat.search(xq, 10)

    sq = faiss.IndexScalarQuantizer(
        d, faiss.ScalarQuantizer.QT_8bit_direct, metric_type)
    sq.add(xb)
    D, I = sq.search(xq, 10)

    assert np.all(I == Iref)
    assert np.all(D == Dref)

    # same, with IVF
    nlist = 64
    quantizer = faiss.IndexFlat(d, metric_type)

    ivf_flat = faiss.IndexIVFFlat(quantizer, d, nlist, metric_type)
    ivf_flat.nprobe = 4
    ivf_flat.train(xt)
    ivf_flat.add(xb)
    Dref, Iref = ivf_flat.search(xq, 10)

    ivf_sq = faiss.IndexIVFScalarQuantizer(
        quantizer, d, nlist,
        faiss.ScalarQuantizer.QT_8bit_direct, metric_type)
    ivf_sq.nprobe = 4
    ivf_sq.by_residual = False
    ivf_sq.train(xt)
    ivf_sq.add(xb)
    D, I = ivf_sq.search(xq, 10)

    assert np.all(I == Iref)
    assert np.all(D == Dref)
def test_IMI(self):
    """IVFPQ over a MultiIndexQuantizer, then over an equivalent MultiIndexQuantizer2.

    In both configurations more than 165 entries of the 10-NN result
    must equal the exact nearest neighbor.
    """
    n_base, n_train, n_query = 1000, 1500, 200
    (xt, xb, xq) = get_dataset_2(32, n_train, n_base, n_query)
    d = xt.shape[1]

    exact = faiss.IndexFlatL2(d)
    exact.add(xb)
    _, gt_nns = exact.search(xq, 1)

    def count_hits(searcher):
        _, found = searcher.search(xq, 10)
        return (found == gt_nns).sum()

    nbits = 5
    coarse_quantizer = faiss.MultiIndexQuantizer(d, 2, nbits)
    index = faiss.IndexIVFPQ(coarse_quantizer, d, (1 << nbits) ** 2, 8, 8)
    index.quantizer_trains_alone = 1
    index.train(xt)
    index.add(xb)
    index.nprobe = 100

    # Should return 166 on mac, and 170 on linux.
    self.assertGreater(count_hits(index), 165)

    # replace with explicit assignment indexes
    nbits = 5
    pq = coarse_quantizer.pq
    centroids = faiss.vector_to_array(pq.centroids).reshape(
        pq.M, pq.ksub, pq.dsub)

    ai0 = faiss.IndexFlatL2(pq.dsub)
    ai0.add(centroids[0])
    ai1 = faiss.IndexFlatL2(pq.dsub)
    ai1.add(centroids[1])

    coarse_quantizer_2 = faiss.MultiIndexQuantizer2(d, nbits, ai0, ai1)
    coarse_quantizer_2.pq = pq
    coarse_quantizer_2.is_trained = True

    index.quantizer = coarse_quantizer_2

    index.reset()
    index.add(xb)

    # should return the same result
    self.assertGreater(count_hits(index), 165)
def subtest(self, mt):
    """Check IVFPQ accuracy with and without residual encoding.

    Args:
        mt: faiss metric type for the ground truth and tested indexes.

    The 10-NN intersection with a flat index is compared with
    ``self.ref_results``; searching without the precomputed table must
    give identical labels, and for the residual case a polysemous
    search with threshold 20 is checked as well.
    """
    d = 32
    xt, xb, xq = get_dataset_2(d, 1000, 2000, 200)
    nlist = 64

    exact = faiss.IndexFlat(d, mt)
    exact.add(xb)
    gt_D, gt_I = exact.search(xq, 10)

    quantizer = faiss.IndexFlat(d, mt)
    for by_residual in (True, False):
        index = faiss.IndexIVFPQ(quantizer, d, nlist, 4, 8)
        index.metric_type = mt
        index.by_residual = by_residual
        if by_residual:
            # perform cheap polysemous training
            index.do_polysemous_training = True
            pt = faiss.PolysemousTraining()
            pt.n_iter = 50000
            pt.n_redo = 1
            index.polysemous_training = pt

        index.train(xt)
        index.add(xb)
        index.nprobe = 4
        D, I = index.search(xq, 10)

        ninter = faiss.eval_intersection(I, gt_I)
        print('(%d, %s): %d, ' % (mt, by_residual, ninter))
        assert ninter >= self.ref_results[mt, by_residual] - 2

        index.use_precomputed_table = 0
        D2, I2 = index.search(xq, 10)
        assert np.all(I == I2)

        if not by_residual:
            continue

        index.use_precomputed_table = 1
        index.polysemous_ht = 20
        D, I = index.search(xq, 10)
        ninter = faiss.eval_intersection(I, gt_I)
        print('(%d, %s, %d): %d, ' % (
            mt, by_residual, index.polysemous_ht, ninter))

        # polysemous behaves bizarrely on ARM
        reference = self.ref_results[
            mt, by_residual, index.polysemous_ht]
        assert ninter >= reference - 4
def test_rename(self):
    """An on-disk IVF index moved to another directory needs a special flag.

    After moving both files, a plain ``read_index`` is expected to
    raise RuntimeError, while reading with
    ``faiss.IO_FLAG_ONDISK_SAME_DIR`` must give the same labels as
    before the move.
    """
    dim = 10
    n_base, n_query, n_train = 500, 100, 100
    train, base, queries = get_dataset_2(dim, n_train, n_base, n_query)

    coarse = faiss.IndexFlatL2(dim)
    original = faiss.IndexIVFFlat(coarse, dim, 20)
    original.train(train)

    dirname = tempfile.mkdtemp()
    try:
        # make an index with ondisk invlists
        disk_lists = faiss.OnDiskInvertedLists(
            original.nlist, original.code_size,
            dirname + '/aa.ondisk')
        original.replace_invlists(disk_lists)
        original.add(base)
        _, labels_before = original.search(queries, 10)
        faiss.write_index(original, dirname + '/aa.ivf')

        # move the index elsewhere
        os.mkdir(dirname + '/1')
        for fname in ('aa.ondisk', 'aa.ivf'):
            source = dirname + '/' + fname
            target = dirname + '/1/' + fname
            os.rename(source, target)

        moved_index_path = dirname + '/1/aa.ivf'

        # try to read it: fails!
        try:
            reloaded = faiss.read_index(moved_index_path)
        except RuntimeError:
            pass   # normal
        else:
            assert False

        # read it with magic flag
        reloaded = faiss.read_index(
            moved_index_path, faiss.IO_FLAG_ONDISK_SAME_DIR)
        _, labels_after = reloaded.search(queries, 10)
        assert np.all(labels_before == labels_after)
    finally:
        shutil.rmtree(dirname)
def test_parallel_mode(self):
    """Search results of an IVF64,SQ8 index must not depend on parallel_mode."""
    dim = 32
    train, base, queries = get_dataset_2(dim, 2000, 1000, 200)

    ivf = faiss.index_factory(dim, "IVF64,SQ8")
    ivf.train(train)
    ivf.add(base)
    ivf.nprobe = 4     # hopefully more robust than 1
    expected_d, expected_i = ivf.search(queries, 10)

    for mode in range(1, 4):
        ivf.parallel_mode = mode
        got_d, got_i = ivf.search(queries, 10)
        np.testing.assert_array_equal(expected_i, got_i)
        np.testing.assert_array_equal(expected_d, got_d)
def do_test(self, nq, metric_type=faiss.METRIC_L2, k=10):
    """Validate IndexFlat k-NN and range search against a numpy reference.

    Args:
        nq: number of query vectors.
        metric_type: ``faiss.METRIC_L2`` selects squared L2 distances;
            any other value is handled as inner product.
        k: number of neighbors for the k-NN part.
    """
    d = 32
    nb = 1000
    nt = 0
    use_l2 = metric_type == faiss.METRIC_L2

    (xt, xb, xq) = get_dataset_2(d, nt, nb, nq)
    index = faiss.IndexFlat(d, metric_type)

    # k-NN search
    index.add(xb)
    D1, I1 = index.search(xq, k)

    if use_l2:
        pairwise_diff = xq.reshape(nq, 1, d) - xb.reshape(1, nb, d)
        all_dis = (pairwise_diff ** 2).sum(2)
        Iref = all_dis.argsort(axis=1)[:, :k]
    else:
        all_dis = np.dot(xq, xb.T)
        # largest inner products first
        Iref = all_dis.argsort(axis=1)[:, ::-1][:, :k]

    row_ids = np.arange(nq)[:, None]
    Dref = all_dis[row_ids, Iref]

    # tolerate a tiny fraction of label mismatches
    self.assertLessEqual((Iref != I1).sum(), Iref.size * 0.0001)
    np.testing.assert_almost_equal(Dref, D1, decimal=5)

    # Range search
    radius = float(np.median(Dref[:, -1]))

    lims, D2, I2 = index.range_search(xq, radius)

    for i in range(nq):
        l0, l1 = lims[i:i + 2]
        found_ids = I2[l0:l1]
        if use_l2:
            expected_ids, = np.where(all_dis[i] < radius)
        else:
            expected_ids, = np.where(all_dis[i] > radius)
        # found_ids is a slice of I2, so this sorts that part in place
        found_ids.sort()
        expected_ids.sort()
        np.testing.assert_equal(found_ids, expected_ids)
        np.testing.assert_almost_equal(
            all_dis[i, expected_ids], D2[l0:l1],
            decimal=5
        )
def test_progressive_dim(self):
    """Progressive-dimension k-means on PCA data must beat plain k-means.

    Both runs use ``gpu=True``; the final objective of the
    progressive-dimension run has to be lower than the basic one.
    """
    dim = 32
    n_train = 10000
    n_clusters = 50
    train, _, _ = get_dataset_2(dim, n_train, 0, 0)

    # basic kmeans
    baseline = faiss.Kmeans(dim, n_clusters, gpu=True)
    baseline.train(train)

    pca = faiss.PCAMatrix(dim, dim)
    pca.train(train)
    train_pca = pca.apply(train)

    # same test w/ Kmeans wrapper
    progressive = faiss.Kmeans(
        dim, n_clusters, progressive_dim_steps=5, gpu=True)
    progressive.train(train_pca)

    self.assertLess(progressive.obj[-1], baseline.obj[-1])
def test_compute_GT(self):
    """knn_ground_truth fed in blocks must agree with a flat L2 index."""
    dim = 64
    topk = 10
    _, base, queries = get_dataset_2(dim, 0, 10000, 100)

    exact = faiss.IndexFlatL2(dim)
    exact.add(base)
    expected_d, expected_i = exact.search(queries, topk)

    # iterator function on the matrix
    def blocks(matrix, block_size):
        for start in range(0, matrix.shape[0], block_size):
            yield matrix[start:start + block_size]

    got_d, got_i = knn_ground_truth(queries, blocks(base, 1000), topk)

    np.testing.assert_array_equal(expected_i, got_i)
    np.testing.assert_almost_equal(expected_d, got_d, decimal=4)
def do_test_knn(self, mt):
    """IndexFlat search must agree with faiss.pairwise_distances.

    Args:
        mt: faiss metric type passed to both the index and
            ``pairwise_distances``.

    Labels are compared with the ascending argsort of the distance
    matrix, and returned distances with the matrix entries.
    """
    dim = 10
    n_base, n_query, n_train = 100, 50, 0
    topk = 10
    _, base, queries = get_dataset_2(dim, n_train, n_base, n_query)

    flat = faiss.IndexFlat(dim, mt)
    flat.add(base)
    D, I = flat.search(queries, topk)

    dis = faiss.pairwise_distances(queries, base, mt)
    ranking = dis.argsort(axis=1)
    assert np.all(I == ranking[:, :topk])

    for q in range(n_query):
        returned_ids = I[q]
        assert np.all(D[q] == dis[q, returned_ids])
def test_hnsw(self):
    """Distances returned by HNSW over an L1 flat index must be true L1 distances."""
    dim = 10
    n_base, n_query, n_train = 1000, 100, 0
    _, base, queries = get_dataset_2(dim, n_train, n_base, n_query)

    metric = faiss.METRIC_L1
    storage = faiss.IndexFlat(dim, metric)
    hnsw = faiss.IndexHNSW(storage)
    hnsw.add(base)
    D, I = hnsw.search(queries, 10)

    dis = faiss.pairwise_distances(queries, base, metric)

    for q in range(n_query):
        returned_ids = I[q]
        assert np.all(D[q] == dis[q, returned_ids])
def do_merge_then_remove(self, ondisk):
    """Merge a second IVF index into the first, then remove what was merged.

    After merging the second half of the base vectors (ids shifted by
    nb / 2) and removing that id range again, search results must be
    identical to those obtained before the merge.

    Args:
        ondisk: when true, the first index stores its inverted lists in
            an ``OnDiskInvertedLists`` backed by a temporary file.
    """
    d = 10
    nb = 1000
    nq = 200
    nt = 200
    half = nb // 2

    xt, xb, xq = get_dataset_2(d, nt, nb, nq)

    quantizer = faiss.IndexFlatL2(d)

    index1 = faiss.IndexIVFFlat(quantizer, d, 20)
    index1.train(xt)

    filename = None
    try:
        if ondisk:
            fd, filename = tempfile.mkstemp()
            # only the path is needed; close the descriptor that
            # mkstemp opened so it is not leaked
            os.close(fd)
            invlists = faiss.OnDiskInvertedLists(
                index1.nlist, index1.code_size,
                filename)
            index1.replace_invlists(invlists)

        index1.add(xb[:half])

        # shares the quantizer that index1 trained above
        index2 = faiss.IndexIVFFlat(quantizer, d, 20)
        assert index2.is_trained
        index2.add(xb[half:])

        Dref, Iref = index1.search(xq, 10)
        index1.merge_from(index2, half)

        assert index1.ntotal == nb

        index1.remove_ids(faiss.IDSelectorRange(half, nb))

        assert index1.ntotal == half
        Dnew, Inew = index1.search(xq, 10)

        assert np.all(Dnew == Dref)
        assert np.all(Inew == Iref)
    finally:
        # remove the temporary file even when an assertion above fails
        if filename is not None and os.path.exists(filename):
            os.unlink(filename)
def test_stop_words(self):
    """Check StopWordsInvertedLists before and after one list becomes oversized.

    With the threshold above the largest list size the wrapper must not
    change search results; after the list holding the best match is
    inflated past the threshold, that match must disappear.
    """
    d = 10
    nb = 1000
    nq = 1
    nt = 200

    xt, xb, xq = get_dataset_2(d, nt, nb, nq)

    index = faiss.index_factory(d, "IVF32,Flat")
    index.nprobe = 4
    index.train(xt)
    index.add(xb)

    Dref, Iref = index.search(xq, 10)

    il = index.invlists
    maxsz = max(il.list_size(i) for i in range(il.nlist))

    il2 = faiss.StopWordsInvertedLists(il, maxsz + 1)
    # (a bare `index.own_invlists` expression that did nothing was
    # dropped here)
    index.own_invlists = False

    index.replace_invlists(il2, False)
    D1, I1 = index.search(xq, 10)
    np.testing.assert_array_equal(Dref, D1)
    np.testing.assert_array_equal(Iref, I1)

    # cleanup to avoid segfault on exit
    index.replace_invlists(il, False)

    # voluntarily unbalance one invlist
    i = int(I1[0, 0])
    index.add(np.vstack([xb[i]] * (maxsz + 10)))

    # introduce stopwords again
    index.replace_invlists(il2, False)

    D2, I2 = index.search(xq, 10)
    self.assertNotIn(i, I2.ravel().tolist())

    # avoid mem leak
    index.replace_invlists(il, True)