def do_read_callback(self, bsz):
    d, n = 32, 1000
    x = np.random.uniform(size=(n, d)).astype('float32')
    index = faiss.IndexFlatL2(d)
    index.add(x)
    fd, fname = tempfile.mkstemp()
    os.close(fd)
    try:
        faiss.write_index(index, fname)
        with open(fname, 'rb') as f:
            reader = faiss.PyCallbackIOReader(f.read, 1234)
            if bsz > 0:
                reader = faiss.BufferedIOReader(reader, bsz)
            index2 = faiss.read_index(reader)
        self.assertEqual(index.d, index2.d)
        np.testing.assert_array_equal(
            faiss.vector_to_array(index.xb),
            faiss.vector_to_array(index2.xb))

        # This is not a callable function: should raise an exception
        reader = faiss.PyCallbackIOReader("blabla")
        self.assertRaises(Exception, faiss.read_index, reader)
    finally:
        if os.path.exists(fname):
            os.unlink(fname)
def run_kmeans(x, nmb_clusters, verbose=False, use_gpu=True):
    """Runs k-means on 1 GPU (or on CPU if use_gpu is False).
    Args:
        x: data
        nmb_clusters (int): number of clusters
    Returns:
        ids of data in each cluster, the final k-means loss,
        the (nmb_clusters, d) centroid matrix and the trained index
    """
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    if use_gpu:
        res = faiss.StandardGpuResources()
        flat_config = faiss.GpuIndexFlatConfig()
        flat_config.useFloat16 = False
        flat_config.device = 0
        index = faiss.GpuIndexFlatL2(res, d, flat_config)
    else:
        index = faiss.IndexFlatL2(d)

    # perform the training
    clus.train(x, index)
    _, I = index.search(x, 1)
    centroids = faiss.vector_to_array(clus.centroids).reshape(
        (nmb_clusters, d))  # Also return centroids!
    losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    return [int(n[0]) for n in I], losses[-1], centroids, index
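# Minimal usage sketch for run_kmeans above; not part of the original code.
# The feature matrix and cluster count are made-up placeholders, use_gpu=False
# keeps it runnable without a GPU build of faiss, and it assumes a faiss
# version that still exposes Clustering.obj (as the function above expects).
import numpy as np

features = np.random.rand(5000, 32).astype('float32')
assignments, final_loss, centroids, index = run_kmeans(
    features, nmb_clusters=10, verbose=True, use_gpu=False)
assert len(assignments) == features.shape[0]
assert centroids.shape == (10, features.shape[1])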
def test_8bit_equiv(self):
    rs = np.random.RandomState(123)
    for _it in range(20):
        for d in 13, 16, 24:
            x = np.floor(rs.rand(5, d) * 256).astype('float32')
            x[0] = 0
            x[1] = 255
            # make sure to test extreme cases
            x[2, 0] = 0
            x[3, 0] = 255
            x[2, 1] = 255
            x[3, 1] = 0

            ref_index = faiss.IndexScalarQuantizer(
                d, faiss.ScalarQuantizer.QT_8bit)
            ref_index.train(x[:2])
            ref_index.add(x[2:3])

            index = faiss.IndexScalarQuantizer(
                d, faiss.ScalarQuantizer.QT_8bit_direct)
            assert index.is_trained
            index.add(x[2:3])

            assert np.all(
                faiss.vector_to_array(ref_index.codes) ==
                faiss.vector_to_array(index.codes))

            # Note that distances are not the same because ref_index
            # reconstructs x as x + 0.5
            D, I = index.search(x[3:], 1)
            # assert D[0, 0] == Dref[0, 0]
            print(D[0, 0], ((x[3] - x[2]) ** 2).sum())
            assert D[0, 0] == ((x[3] - x[2]) ** 2).sum()
def preprocess_features(npdata, pca=256, pca_info=None):
    """Preprocess an array of features.
    Args:
        npdata (np.array N * ndim): features to preprocess
        pca (int): dim of output
        pca_info: optional (pca_A, pca_b) pair from a previous call, reused
            to apply the same transform instead of retraining the PCA
    Returns:
        np.array of dim N * pca (data PCA-reduced, whitened and
        L2-normalized) and the (pca_A, pca_b) pair
    """
    _, ndim = npdata.shape
    npdata = npdata.astype('float32')

    if pca_info is None:
        # Apply PCA-whitening with Faiss
        pca_matrix = faiss.PCAMatrix(ndim, pca, eigen_power=-0.5)
        pca_matrix.train(npdata)
        assert pca_matrix.is_trained
        npdata = pca_matrix.apply_py(npdata)

        pca_A = np.transpose(
            faiss.vector_to_array(pca_matrix.A).reshape((pca, ndim)))
        pca_b = faiss.vector_to_array(pca_matrix.b)
        pca_info = (pca_A, pca_b)
    else:
        npdata = np.dot(npdata, pca_info[0]) + pca_info[1]

    # L2 normalization
    row_sums = np.linalg.norm(npdata, axis=1)
    npdata = npdata / row_sums[:, np.newaxis]

    return npdata, pca_info
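# Hedged usage sketch for preprocess_features above; not part of the original
# code. The arrays are synthetic placeholders; the point is that the
# (pca_A, pca_b) pair fitted on the training features can be reused to project
# held-out features with the exact same transform.
import numpy as np

train_feats = np.random.rand(2000, 512).astype('float32')
val_feats = np.random.rand(500, 512).astype('float32')

train_proj, pca_info = preprocess_features(train_feats, pca=256)
val_proj, _ = preprocess_features(val_feats, pca=256, pca_info=pca_info)
assert train_proj.shape == (2000, 256)
assert val_proj.shape == (500, 256)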
def run_kmeans(x, nmb_clusters, verbose=False):
    """Runs k-means with a flat (CPU) faiss index.
    Args:
        x: data
        nmb_clusters (int): number of clusters
    Returns:
        cluster ids, distances to the assigned centroids, the final loss,
        the Clustering object, the [weight, bias] of the equivalent linear
        layer, and the centroids
    """
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    index = faiss.IndexFlatL2(d)

    # perform the training
    clus.train(x, index)
    Dist, Ind = index.search(x, 1)
    losses = faiss.vector_to_array(clus.obj)
    # if verbose:
    #     print('k-means loss evolution: {0}'.format(losses))

    centers = faiss.vector_to_array(clus.centroids).reshape(nmb_clusters, -1)
    kmeans_trans_weight = -2 * centers
    kmeans_trans_bias = (centers ** 2).sum(axis=1)  # (K,)

    return [int(n[0]) for n in Ind], [float(n[0]) for n in Dist], losses[-1], clus, \
        [kmeans_trans_weight, kmeans_trans_bias], centers
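# Hedged sketch (not in the original code) checking why run_kmeans above
# returns [-2 * centers, ||centers||^2] as a linear layer: since
# ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2 and ||x||^2 does not affect the
# argmin over clusters, argmin(x @ W.T + b) reproduces the k-means assignment.
import numpy as np

x = np.random.rand(100, 16).astype('float32')
centers = np.random.rand(8, 16).astype('float32')
weight, bias = -2 * centers, (centers ** 2).sum(axis=1)

scores = x @ weight.T + bias              # (100, 8), up to the ||x||^2 term
assign_linear = scores.argmin(axis=1)
assign_exact = ((x[:, None, :] - centers[None]) ** 2).sum(-1).argmin(axis=1)
assert np.array_equal(assign_linear, assign_exact)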
def test_encoded(self):
    d = 32
    k = 5
    xt, xb, xq = get_dataset_2(d, 1000, 0, 0)

    # make sure that training on a compressed then decompressed
    # dataset gives the same result as decompressing on-the-fly

    codec = faiss.IndexScalarQuantizer(d, faiss.ScalarQuantizer.QT_4bit)
    codec.train(xt)
    codes = codec.sa_encode(xt)
    xt2 = codec.sa_decode(codes)

    clus = faiss.Clustering(d, k)
    # clus.verbose = True
    clus.niter = 0
    index = faiss.IndexFlatL2(d)
    clus.train(xt2, index)
    ref_centroids = faiss.vector_to_array(clus.centroids).reshape(-1, d)
    _, ref_errs = index.search(xt2, 1)

    clus = faiss.Clustering(d, k)
    # clus.verbose = True
    clus.niter = 0
    clus.decode_block_size = 120
    index = faiss.IndexFlatL2(d)
    clus.train_encoded(codes, codec, index)
    new_centroids = faiss.vector_to_array(clus.centroids).reshape(-1, d)
    _, new_errs = index.search(xt2, 1)

    # It's the same operation, so should be bit-exact the same
    self.assertTrue(np.all(ref_centroids == new_centroids))
def do_write_callback(self, bsz):
    d, n = 32, 1000
    x = np.random.uniform(size=(n, d)).astype('float32')
    index = faiss.IndexFlatL2(d)
    index.add(x)

    f = io.BytesIO()
    # test with small block size
    writer = faiss.PyCallbackIOWriter(f.write, 1234)

    if bsz > 0:
        writer = faiss.BufferedIOWriter(writer, bsz)

    faiss.write_index(index, writer)
    del writer  # make sure all writes committed

    if sys.version_info[0] < 3:
        buf = f.getvalue()
    else:
        buf = f.getbuffer()

    index2 = faiss.deserialize_index(np.frombuffer(buf, dtype='uint8'))

    self.assertEqual(index.d, index2.d)
    self.assertTrue(
        np.all(
            faiss.vector_to_array(index.xb) ==
            faiss.vector_to_array(index2.xb)))

    # This is not a callable function: should raise an exception
    writer = faiss.PyCallbackIOWriter("blabla")
    self.assertRaises(Exception, faiss.write_index, index, writer)
def test_precomp(self):
    ds = datasets.SyntheticDataset(32, 1000, 1000, 0)

    # make sure it works with a varying number of bits
    nbits = faiss.UInt64Vector()
    nbits.push_back(5)
    nbits.push_back(6)
    nbits.push_back(7)

    rq = faiss.ResidualQuantizer(ds.d, nbits)
    rq.train_type = faiss.ResidualQuantizer.Train_default
    rq.train(ds.get_train())

    codebooks = get_additive_quantizer_codebooks(rq)
    precomp = precomp_codebooks(codebooks)
    codebook_cross_prods_ref, cent_norms_ref = precomp

    # check C++ precomp tables
    codebook_cross_prods_ref = np.hstack([
        np.vstack(c) for c in codebook_cross_prods_ref])

    rq.compute_codebook_tables()
    codebook_cross_prods = faiss.vector_to_array(
        rq.codebook_cross_products)
    codebook_cross_prods = codebook_cross_prods.reshape(
        rq.total_codebook_size, rq.total_codebook_size)
    cent_norms = faiss.vector_to_array(rq.cent_norms)

    np.testing.assert_array_almost_equal(
        codebook_cross_prods, codebook_cross_prods_ref, decimal=5)
    np.testing.assert_array_almost_equal(
        np.hstack(cent_norms_ref), cent_norms, decimal=5)

    # validate that the python tab-based encoding works
    xb = ds.get_database()
    ref_codes, _, _ = beam_search_encoding_ref(codebooks, xb, 7)
    new_codes, _ = beam_search_encoding_tab(codebooks, xb, 7, precomp)
    np.testing.assert_array_equal(ref_codes, new_codes)

    # validate the C++ beam_search_encode_step_tab function
    beam_search_encoding_tab(codebooks, xb, 7, precomp, implem="ref cpp")

    # check implem w/ residuals
    n = ref_codes.shape[0]
    sp = faiss.swig_ptr
    ref_codes_packed = np.zeros((n, rq.code_size), dtype='uint8')
    ref_codes_int32 = ref_codes.astype('int32')
    rq.pack_codes(
        n, sp(ref_codes_int32), sp(ref_codes_packed),
        rq.M * ref_codes.shape[1]
    )

    rq.max_beam_size = 7
    codes_ref_residuals = rq.compute_codes(xb)
    np.testing.assert_array_equal(ref_codes_packed, codes_ref_residuals)

    rq.use_beam_LUT = 1
    codes_new = rq.compute_codes(xb)
    np.testing.assert_array_equal(codes_ref_residuals, codes_new)
def do_test(self, d, dsub, nbit=8, metric=None):
    if metric is None:
        self.do_test(d, dsub, nbit, faiss.METRIC_INNER_PRODUCT)
        self.do_test(d, dsub, nbit, faiss.METRIC_L2)
        return
    # faiss.cvar.distance_compute_blas_threshold = 1000000

    M = d // dsub
    pq = faiss.ProductQuantizer(d, M, nbit)
    xt = faiss.randn((max(1000, pq.ksub * 50), d), 123)
    pq.cp.niter = 4  # to avoid timeouts in tests
    pq.train(xt)

    centroids = faiss.vector_to_array(pq.centroids)
    centroids = centroids.reshape(pq.M, pq.ksub, pq.dsub)

    nx = 100
    x = faiss.randn((nx, d), 555)

    ref_tab = np.zeros((nx, M, pq.ksub), "float32")

    # computation of tables in numpy
    for sq in range(M):
        i0, i1 = sq * dsub, (sq + 1) * dsub
        xsub = x[:, i0:i1]
        centsq = centroids[sq, :, :]
        if metric == faiss.METRIC_INNER_PRODUCT:
            ref_tab[:, sq, :] = xsub @ centsq.T
        elif metric == faiss.METRIC_L2:
            xsub3 = xsub.reshape(nx, 1, dsub)
            cent3 = centsq.reshape(1, pq.ksub, dsub)
            ref_tab[:, sq, :] = ((xsub3 - cent3) ** 2).sum(2)
        else:
            assert False

    sp = faiss.swig_ptr

    new_tab = np.zeros((nx, M, pq.ksub), "float32")
    if metric == faiss.METRIC_INNER_PRODUCT:
        pq.compute_inner_prod_tables(nx, sp(x), sp(new_tab))
    elif metric == faiss.METRIC_L2:
        pq.compute_distance_tables(nx, sp(x), sp(new_tab))
    else:
        assert False

    # compute sdc tables in numpy
    cent1 = np.expand_dims(centroids, axis=2)  # [M, ksub, 1, dsub]
    cent2 = np.expand_dims(centroids, axis=1)  # [M, 1, ksub, dsub]
    ref_sdc_tab = ((cent1 - cent2) ** 2).sum(3)

    pq.compute_sdc_table()
    new_sdc_tab = faiss.vector_to_array(pq.sdc_table)
    new_sdc_tab = new_sdc_tab.reshape(M, pq.ksub, pq.ksub)

    np.testing.assert_array_almost_equal(ref_tab, new_tab, decimal=5)
    np.testing.assert_array_almost_equal(ref_sdc_tab, new_sdc_tab, decimal=5)
def run_kmeans(x, nmb_clusters):
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)

    # Change faiss seed at each k-means so that the randomly picked
    # initialization centroids do not correspond to the same feature ids
    # from an epoch to another.
    clus.seed = np.random.randint(1234)

    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
    _, I = index.search(x, 1)
    losses = faiss.vector_to_array(clus.obj)
    print('k-means loss evolution: {0}'.format(losses))

    return [int(n[0]) for n in I], losses[-1]
def run_kmeans(x, nmb_clusters, verbose=False, seed=DEFAULT_KMEANS_SEED, gpu_device=0):
    """
    Runs kmeans on 1 GPU.

    Args:
    -----
    x: data
    nmb_clusters (int): number of clusters

    Returns:
    --------
    list: ids of data in each cluster
    """
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    clus.seed = seed
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = gpu_device
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
    _, I = index.search(x, 1)
    losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    return [int(n[0]) for n in I], losses[-1]
def preprocess_features(npdata, pca=32, eps=1e-5):
    """Preprocess an array of features.
    Args:
        npdata (np.array N * ndim): features to preprocess
        pca (int): initial dim of output
    Returns:
        np.array of dim N * pca: data PCA-reduced, whitened and L2-normalized
    """
    _, ndim = npdata.shape
    npdata = npdata.astype('float32')
    npdata = npdata - np.mean(npdata, axis=0)

    # Apply PCA-whitening with Faiss
    mat = faiss.PCAMatrix(ndim, pca, eigen_power=-0.5)
    mat.train(npdata)
    assert mat.is_trained
    eigs = faiss.vector_to_array(mat.eigenvalues)
    # keep enough dimensions to explain at least 95% of the variance
    pca = np.argwhere(
        np.cumsum(sorted(eigs / np.sum(eigs), reverse=True)) >= 0.95)[0, 0]
    mat = faiss.PCAMatrix(ndim, int(pca), eigen_power=-0.5)
    mat.train(npdata)
    assert mat.is_trained
    npdata = mat.apply_py(npdata)

    # L2 normalization
    row_sums = np.linalg.norm(npdata, axis=1)
    npdata = npdata / np.clip(row_sums[:, np.newaxis], eps, None)

    return npdata
def run_kmeans(x, nmb_clusters, verbose=False):
    """Runs kmeans on 1 GPU.
    Args:
        x: data
        nmb_clusters (int): number of clusters
    Returns:
        ids of data in each cluster, the final loss, and the distance of
        each point to its assigned centroid
    """
    n_data, d = x.shape
    # print(n_data, d)

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)

    # Change faiss seed at each k-means so that the randomly picked
    # initialization centroids do not correspond to the same feature ids
    # from an epoch to another.
    clus.seed = np.random.randint(1234)

    clus.niter = 20
    # clus.min_points_per_centroid = 5
    clus.max_points_per_centroid = 100000000
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    # flat_config = faiss.GpuIndexIVFFlatConfig()  # IVF
    flat_config.useFloat16 = False
    flat_config.device = 0
    # index = faiss.GpuIndexIVFFlat(res, d, nmb_clusters, faiss.METRIC_L2, flat_config)
    index = faiss.GpuIndexFlatL2(res, d, flat_config)
    # index = faiss.GpuIndexIP(res, d, flat_config)  # Inner product between samples

    # perform the training
    clus.train(x, index)
    D, I = index.search(x, 1)
    losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    return [int(n[0]) for n in I], losses[-1], np.array([(d[0]) for d in D])
def test_serialize_to_vector(self):
    d = 10
    nb = 1000
    nq = 200
    nt = 500
    xt, xb, xq = get_dataset_2(d, nt, nb, nq)

    index = faiss.IndexFlatL2(d)
    index.add(xb)

    Dref, Iref = index.search(xq, 5)

    writer = faiss.VectorIOWriter()
    faiss.write_index(index, writer)

    ar_data = faiss.vector_to_array(writer.data)

    # direct transfer of vector
    reader = faiss.VectorIOReader()
    reader.data.swap(writer.data)

    index2 = faiss.read_index(reader)

    Dnew, Inew = index2.search(xq, 5)
    assert np.all(Dnew == Dref) and np.all(Inew == Iref)

    # from intermediate numpy array
    reader = faiss.VectorIOReader()
    faiss.copy_array_to_vector(ar_data, reader.data)

    index3 = faiss.read_index(reader)

    Dnew, Inew = index3.search(xq, 5)
    assert np.all(Dnew == Dref) and np.all(Inew == Iref)
def test_rq3(self):
    index = faiss.index_factory(5, "RQ2x16_3x8_6x4")
    np.testing.assert_array_equal(
        faiss.vector_to_array(index.rq.nbits),
        np.array([16, 16, 8, 8, 8, 4, 4, 4, 4, 4, 4])
    )
def test_lut(self):
    """test compute_LUT function"""
    ds = datasets.SyntheticDataset(16, 1000, 0, 100)

    xt = ds.get_train()
    xq = ds.get_queries()

    nsplits = 2
    Msub = 2
    nbits = 4
    nq, d = xq.shape
    dsub = d // nsplits

    plsq = faiss.ProductLocalSearchQuantizer(ds.d, nsplits, Msub, nbits)
    plsq.train(xt)

    subcodebook_size = Msub * (1 << nbits)
    codebook_size = nsplits * subcodebook_size
    lut = np.zeros((nq, codebook_size), dtype=np.float32)
    plsq.compute_LUT(nq, sp(xq), sp(lut))

    codebooks = faiss.vector_to_array(plsq.codebooks)
    codebooks = codebooks.reshape(nsplits, subcodebook_size, dsub)
    xq = xq.reshape(nq, nsplits, dsub)
    lut_ref = np.zeros((nq, nsplits, subcodebook_size), dtype=np.float32)
    for i in range(nsplits):
        lut_ref[:, i] = xq[:, i] @ codebooks[i].T
    lut_ref = lut_ref.reshape(nq, codebook_size)

    # max rtol on OSX: 2.87e-6
    np.testing.assert_allclose(lut, lut_ref, rtol=5e-06)
def test_int64(self):
    # see https://github.com/facebookresearch/faiss/issues/1529
    v = faiss.Int64Vector()

    for i in range(10):
        v.push_back(i)
    a = faiss.vector_to_array(v)
    assert a.dtype == 'int64'
    np.testing.assert_array_equal(a, np.arange(10, dtype='int64'))

    # check if it works in an IDMap
    idx = faiss.IndexIDMap(faiss.IndexFlatL2(32))
    idx.add_with_ids(
        np.random.rand(10, 32).astype('float32'),
        np.random.randint(1000, size=10, dtype='int64'))
    faiss.vector_to_array(idx.id_map)
def subtest(self, d, K, metric):
    metric_names = {faiss.METRIC_L1: 'L1',
                    faiss.METRIC_L2: 'L2',
                    faiss.METRIC_INNER_PRODUCT: 'IP'}

    nb = 1000
    _, xb, _ = get_dataset_2(d, 0, nb, 0)

    _, knn = faiss.knn(xb, xb, K + 1, metric)
    knn = knn[:, 1:]

    index = faiss.IndexNNDescentFlat(d, K, metric)
    index.nndescent.S = 10
    index.nndescent.R = 32
    index.nndescent.L = K + 20
    index.nndescent.iter = 5
    index.verbose = True

    index.add(xb)
    graph = index.nndescent.final_graph
    graph = faiss.vector_to_array(graph)
    graph = graph.reshape(nb, K)

    recalls = 0
    for i in range(nb):
        for j in range(K):
            for k in range(K):
                if graph[i, j] == knn[i, k]:
                    recalls += 1
                    break

    recall = 1.0 * recalls / (nb * K)
    print('Metric: {}, knng accuracy: {}'.format(metric_names[metric], recall))
    assert recall > 0.99
def run_kmeans(x, num_clusters, temperature):
    """
    Args:
        x: data to be clustered
    """
    print('performing kmeans clustering')
    results = {'im2cluster': [], 'centroids': [], 'density': []}

    for seed, num_cluster in enumerate(num_clusters):
        # initialize faiss clustering parameters
        d = x.shape[1]
        k = int(num_cluster)
        clus = faiss.Clustering(d, k)
        clus.verbose = False
        clus.niter = 20
        clus.nredo = 5
        clus.seed = seed
        clus.max_points_per_centroid = 1000
        clus.min_points_per_centroid = 5

        res = faiss.StandardGpuResources()
        cfg = faiss.GpuIndexFlatConfig()
        cfg.useFloat16 = False
        cfg.device = 0
        index = faiss.GpuIndexFlatL2(res, d, cfg)

        clus.train(x, index)

        # for each sample, find cluster distance and assignments
        D, I = index.search(x, 1)
        im2cluster = [int(n[0]) for n in I]

        # get cluster centroids
        centroids = faiss.vector_to_array(clus.centroids).reshape(k, d)

        # sample-to-centroid distances for each cluster
        Dcluster = [[] for c in range(k)]
        for im, i in enumerate(im2cluster):
            Dcluster[i].append(D[im][0])

        # concentration estimation (phi)
        density = np.zeros(k)
        for i, dist in enumerate(Dcluster):
            if len(dist) > 1:
                d = (np.asarray(dist) ** 0.5).mean() / np.log(len(dist) + 10)
                density[i] = d

        # if cluster only has one point, use the max to estimate its concentration
        dmax = density.max()
        for i, dist in enumerate(Dcluster):
            if len(dist) <= 1:
                density[i] = dmax

        # clamp extreme values for stability
        density = density.clip(np.percentile(density, 10),
                               np.percentile(density, 90))
        # scale the mean to temperature
        density = temperature * density / density.mean()

        # convert to cuda Tensors for broadcast
        centroids = torch.Tensor(centroids).cuda()
        centroids = nn.functional.normalize(centroids, p=2, dim=1)

        im2cluster = torch.LongTensor(im2cluster).cuda()
        density = torch.Tensor(density).cuda()

        results['centroids'].append(centroids)
        results['density'].append(density)
        results['im2cluster'].append(im2cluster)

    return results
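# Small numpy illustration (not part of the original code) of the
# concentration estimate used above: phi = mean(sqrt(dist)) / log(n + 10).
# The distance lists are made-up placeholders; a compact cluster ends up with
# a smaller phi than a spread-out one, which is what the per-cluster
# temperature scaling relies on.
import numpy as np

tight = np.full(100, 0.01)   # squared distances of a compact cluster
loose = np.full(100, 1.00)   # squared distances of a spread-out cluster

def phi(dist):
    return (np.asarray(dist) ** 0.5).mean() / np.log(len(dist) + 10)

assert phi(tight) < phi(loose)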
def test_redo(self):
    d = 64
    n = 1000

    rs = np.random.RandomState(123)
    x = rs.uniform(size=(n, d)).astype('float32')

    clus = faiss.Clustering(d, 20)
    clus.nredo = 1
    clus.train(x, faiss.IndexFlatL2(d))
    obj1 = faiss.vector_to_array(clus.obj)

    clus = faiss.Clustering(d, 20)
    clus.nredo = 10
    clus.train(x, faiss.IndexFlatL2(d))
    obj10 = faiss.vector_to_array(clus.obj)

    self.assertGreater(obj1[-1], obj10[-1])
def train_and_get_centroids(override_kmeans_index):
    index = faiss.index_binary_factory(d, b"BIVF10")
    index.verbose = True

    if override_kmeans_index is not None:
        index.clustering_index = override_kmeans_index

    index.train(xt)

    centroids = faiss.downcast_IndexBinary(index.quantizer).xb
    return faiss.vector_to_array(centroids).reshape(-1, d // 8)
def test_read_buffer(self):
    d, n = 32, 1000
    x = np.random.uniform(size=(n, d)).astype('float32')
    index = faiss.IndexFlatL2(d)
    index.add(x)

    _, fname = tempfile.mkstemp()
    try:
        faiss.write_index(index, fname)

        reader = faiss.BufferedIOReader(faiss.FileIOReader(fname), 1234)

        index2 = faiss.read_index(reader)

        self.assertEqual(index.d, index2.d)
        np.testing.assert_array_equal(
            faiss.vector_to_array(index.xb),
            faiss.vector_to_array(index2.xb))
    finally:
        if os.path.exists(fname):
            os.unlink(fname)
def test_equiv_rq(self):
    """
    make sure that searching a flat RQ is equivalent to searching an IVF
    with an RCQ coarse quantizer + RQ that share the same codebooks.
    """
    ds = datasets.SyntheticDataset(32, 3000, 1000, 50)

    # make a flat RQ
    iflat = faiss.IndexResidualQuantizer(ds.d, 5, 4)
    iflat.rq.train_type = faiss.ResidualQuantizer.Train_default
    iflat.train(ds.get_train())
    iflat.add(ds.get_database())

    # ref search result
    Dref, Iref = iflat.search(ds.get_queries(), 10)

    # get its codebooks + encoded version of the dataset
    codebooks = get_additive_quantizer_codebooks(iflat.rq)
    codes = faiss.vector_to_array(iflat.codes).reshape(-1, iflat.code_size)

    # make an IVF with 2x4 + 3x4 = 5x4 bits
    ivf = faiss.index_factory(ds.d, "IVF256(RCQ2x4),RQ3x4")

    # initialize the codebooks
    rcq = faiss.downcast_index(ivf.quantizer)
    faiss.copy_array_to_vector(
        np.vstack(codebooks[:rcq.rq.M]).ravel(),
        rcq.rq.codebooks
    )
    rcq.rq.is_trained = True
    # translation of AdditiveCoarseQuantizer::train
    rcq.ntotal = 1 << rcq.rq.tot_bits
    rcq.centroid_norms.resize(rcq.ntotal)
    rcq.rq.compute_centroid_norms(rcq.centroid_norms.data())
    rcq.is_trained = True

    faiss.copy_array_to_vector(
        np.vstack(codebooks[rcq.rq.M:]).ravel(),
        ivf.rq.codebooks
    )
    ivf.rq.is_trained = True
    ivf.is_trained = True

    # add the codes (this works because 2x4 is a multiple of 8 bits)
    ivf.add_sa_codes(codes)

    # perform exhaustive search
    ivf.nprobe = ivf.nlist

    Dnew, Inew = ivf.search(ds.get_queries(), 10)

    np.testing.assert_array_equal(Iref, Inew)
    np.testing.assert_array_almost_equal(Dref, Dnew, decimal=5)
def main():
    parser = get_parser()
    args = parser.parse_args()

    print("Reading features")
    x = np.load(args.data, mmap_mode="r")

    print("Computing PCA")
    pca = faiss.PCAMatrix(x.shape[-1], args.dim, args.eigen_power)
    pca.train(x)
    b = faiss.vector_to_array(pca.b)
    A = faiss.vector_to_array(pca.A).reshape(pca.d_out, pca.d_in)

    os.makedirs(args.output, exist_ok=True)

    prefix = str(args.dim)
    if args.eigen_power != 0:
        prefix += f"_{args.eigen_power}"

    np.save(osp.join(args.output, f"{prefix}_pca_A"), A.T)
    np.save(osp.join(args.output, f"{prefix}_pca_b"), b)
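# Hedged follow-up sketch (not in the original script) showing how the saved
# matrices would typically be applied to new features. The directory, output
# dimension and the assumption that eigen_power was 0 (so the prefix is just
# the dimension) are placeholders; the saved A.T has shape (d_in, d_out).
import numpy as np
import os.path as osp

output_dir, dim = "pca_out", 512          # hypothetical values
A_T = np.load(osp.join(output_dir, f"{dim}_pca_A.npy"))   # (d_in, d_out)
b = np.load(osp.join(output_dir, f"{dim}_pca_b.npy"))     # (d_out,)

x = np.random.rand(10, A_T.shape[0]).astype('float32')
y = x @ A_T + b   # should match pca.apply_py(x) up to float precision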
def __init__(self, ds, indexfile):
    self.d = ds.d
    self.metric = ds.metric
    self.nq = ds.nq
    self.xq = ds.get_queries()

    # get the xb set
    src_index = faiss.read_index(indexfile)
    src_quant = faiss.downcast_index(src_index.quantizer)
    centroids = faiss.vector_to_array(src_quant.xb)
    self.xb = centroids.reshape(-1, self.d)
    self.nb = self.nt = len(self.xb)
def subtest_cluster1d(self, n, k):
    rs = np.random.RandomState(123)
    x = rs.uniform(size=(n, 1)).astype('float32')

    clus = faiss.Clustering1D(k)
    clus.train_exact(x)
    centroids = faiss.vector_to_array(clus.centroids).reshape((-1, 1))
    obj = self.evaluate_obj(centroids, x)

    clus2 = faiss.Kmeans(1, k)
    clus2.train(x)
    obj2 = self.evaluate_obj(clus2.centroids, x)

    self.assertLessEqual(obj, obj2)
def unpack_codes(rq, packed_codes):
    """Unpack bit-packed residual quantizer codes into one uint32 entry per
    quantization step, handling non-uniform bit widths."""
    nbits = faiss.vector_to_array(rq.nbits)
    if np.all(nbits == 8):
        return packed_codes.astype("uint32")
    nbits = [int(x) for x in nbits]
    nb = len(nbits)
    n, code_size = packed_codes.shape
    codes = np.zeros((n, nb), dtype="uint32")
    for i in range(n):
        br = faiss.BitstringReader(faiss.swig_ptr(packed_codes[i]), code_size)
        for j, nbi in enumerate(nbits):
            codes[i, j] = br.read(nbi)
    return codes
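# Hedged usage sketch for unpack_codes above; not from the original source.
# It builds a small ResidualQuantizer with non-uniform bit widths (mirroring
# the construction in the test_precomp snippet), encodes a few random vectors
# and unpacks the bit-packed codes into one integer per quantization step.
# The dimensions and training set are arbitrary placeholders.
import numpy as np
import faiss

d = 16
nbits = faiss.UInt64Vector()
for b in (5, 6, 7):
    nbits.push_back(b)
rq = faiss.ResidualQuantizer(d, nbits)
rq.train_type = faiss.ResidualQuantizer.Train_default
xt = np.random.rand(2000, d).astype('float32')
rq.train(xt)

packed = rq.compute_codes(xt[:10])   # shape (10, rq.code_size), uint8
codes = unpack_codes(rq, packed)     # shape (10, 3), one entry per step
assert codes.shape == (10, 3)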
def train(self, vecs: np.ndarray, *args, **kwargs) -> None:
    import faiss

    num_samples, num_dim = vecs.shape
    assert self.output_dim <= num_samples, \
        'training PCA requires at least %d points, but %d were given' % (
            self.output_dim, num_samples)
    assert self.output_dim < num_dim, \
        'PCA output dimension should be < data dimension, received (%d, %d)' % (
            self.output_dim, num_dim)

    pca = faiss.PCAMatrix(num_dim, self.output_dim)
    self.mean = np.mean(vecs, axis=0)  # 1 x 768
    pca.train(vecs)
    explained_variance_ratio = faiss.vector_to_array(
        pca.eigenvalues)[:self.output_dim]
    components = faiss.vector_to_array(pca.PCAMat).reshape(
        [-1, num_dim])[:self.output_dim]

    # permute eigenvectors according to variance
    opt_order = get_perm(explained_variance_ratio, self.num_locals)
    comp_tmp = np.reshape(components[opt_order], [self.output_dim, num_dim])

    self.pca_components = np.transpose(comp_tmp)  # 768 x 200
def test_int64(self):
    # see https://github.com/facebookresearch/faiss/issues/1529
    sizeof_long = array.array("l").itemsize
    if sizeof_long == 4:
        v = faiss.LongLongVector()
    elif sizeof_long == 8:
        v = faiss.LongVector()
    else:
        raise AssertionError("weird long size")

    for i in range(10):
        v.push_back(i)
    a = faiss.vector_to_array(v)
    assert a.dtype == 'int64'
    np.testing.assert_array_equal(a, np.arange(10, dtype='int64'))

    # check if it works in an IDMap
    idx = faiss.IndexIDMap(faiss.IndexFlatL2(32))
    idx.add_with_ids(
        np.random.rand(10, 32).astype('float32'),
        np.random.randint(1000, size=10, dtype='int64'))
    faiss.vector_to_array(idx.id_map)
def subtest_add2col(self, xb, xq, index, qname):
    """Test with 2 additional dimensions to take also the non-SIMD
    codepath. We don't retrain anything but add 2 dims to the queries,
    the centroids and the trained ScalarQuantizer.
    """
    nb, d = xb.shape

    d2 = d + 2
    xb2 = self.add2columns(xb)
    xq2 = self.add2columns(xq)

    nlist = index.nlist
    quantizer = faiss.downcast_index(index.quantizer)
    quantizer2 = faiss.IndexFlat(d2, index.metric_type)
    centroids = faiss.vector_to_array(quantizer.xb).reshape(nlist, d)
    centroids2 = self.add2columns(centroids)
    quantizer2.add(centroids2)
    index2 = faiss.IndexIVFScalarQuantizer(
        quantizer2, d2, index.nlist, index.sq.qtype,
        index.metric_type)
    index2.nprobe = 4
    if qname in ('8bit', '4bit'):
        trained = faiss.vector_to_array(index.sq.trained).reshape(2, -1)
        nt = trained.shape[1]  # 2 lines: vmins and vdiffs
        new_nt = int(nt * d2 / d)
        trained2 = np.hstack((
            trained,
            np.zeros((2, new_nt - nt), dtype='float32')
        ))
        trained2[1, nt:] = 1.0  # set vdiff to 1 to avoid div by 0
        faiss.copy_array_to_vector(trained2.ravel(), index2.sq.trained)
    else:
        index2.sq.trained = index.sq.trained

    index2.is_trained = True
    index2.add(xb2)
    return index2.search(xq2, 10)