def nanopq_to_faiss(pq_nanopq):
    """Build a `faiss.IndexPQ <https://github.com/facebookresearch/faiss/blob/master/IndexPQ.h>`_ from a :class:`nanopq.PQ`.

    The `faiss module must be installed
    <https://github.com/facebookresearch/faiss/blob/master/INSTALL.md>`_
    for this function to work.

    Args:
        pq_nanopq (nanopq.PQ): Source PQ instance; its ``codewords`` must
            already be set.

    Returns:
        faiss.IndexPQ: An index flagged as trained that holds the same
        codewords as the input.

    """
    assert isinstance(pq_nanopq, PQ), "Error. pq_nanopq must be nanopq.pq"
    assert pq_nanopq.codewords is not None, "Error. pq_nanopq.codewords must have been set beforehand"

    num_sub = pq_nanopq.M
    dim = pq_nanopq.Ds * num_sub
    # Only these code dtypes are mapped; anything else raises KeyError.
    bits_by_dtype = {np.uint8: 8, np.uint16: 16, np.uint32: 32}
    index = faiss.IndexPQ(dim, num_sub, bits_by_dtype[pq_nanopq.code_dtype])

    for sub in range(num_sub):
        # std::vector<float> receiving the sub-th codewords, flattened
        # from (Ks, Ds) to (Ks * Ds, ).
        buf = faiss.FloatVector()
        flat = pq_nanopq.codewords[sub].reshape(-1)
        faiss.copy_array_to_vector(flat, buf)

        # Hand the codewords to the ProductQuantizer inside the IndexPQ.
        index.pq.set_params(centroids=buf.data(), m=sub)

    index.is_trained = True
    return index
def do_test_codec(self, nbit):
    """Encode/decode vectors that are exact concatenations of centroids.

    A 16-dim, 2-sub-quantizer ProductQuantizer with ``nbit`` bits per code
    gets random centroids. Vectors built from those centroids must decode
    to themselves, and encoding through an external assign index must give
    the same codes as the built-in encoder.
    """
    ksub = 1 << nbit
    pq = faiss.ProductQuantizer(16, 2, nbit)

    # Stand-in for training: fill the centroid table with random values.
    rng = np.random.RandomState(123)
    centroids = rng.rand(2, ksub, 8).astype('float32')
    faiss.copy_array_to_vector(centroids.ravel(), pq.centroids)

    # Each row is one centroid per sub-quantizer, so it is encodable exactly.
    picks = rng.randint(ksub, size=(100, 2))
    x = np.hstack([centroids[m, picks[:, m]] for m in range(2)])

    # Plain encode / decode round trip.
    codes = pq.compute_codes(x)
    decoded = pq.decode(codes)
    assert (decoded == x).all()

    # Encode again through an external assignment index.
    # NOTE(review): the index is bound to a local name before assignment,
    # as in the original; presumably this keeps the Python wrapper alive
    # while pq points at it -- confirm before inlining.
    assign_index = faiss.IndexFlatL2(8)
    pq.assign_index = assign_index
    codes_ext = np.empty((100, pq.code_size), dtype='uint8')
    pq.compute_codes_with_assign_index(
        faiss.swig_ptr(x), faiss.swig_ptr(codes_ext), 100)
    assert (codes == codes_ext).all()
def test_serialize_to_vector(self):
    """Serialize a flat index to an in-memory vector and read it back.

    The index is restored twice: once by swapping the writer's buffer
    straight into a reader, and once via an intermediate numpy array.
    Both restored indexes must return exactly the reference results.
    """
    dim, n_train, n_base, n_query = 10, 500, 1000, 200
    xt, xb, xq = get_dataset_2(dim, n_train, n_base, n_query)

    index = faiss.IndexFlatL2(dim)
    index.add(xb)
    Dref, Iref = index.search(xq, 5)

    def check_matches_reference(restored):
        # Distances and ids must match the reference bit for bit.
        Dnew, Inew = restored.search(xq, 5)
        assert np.all(Dnew == Dref) and np.all(Inew == Iref)

    writer = faiss.VectorIOWriter()
    faiss.write_index(index, writer)
    # Take the numpy copy now: the swap below exchanges the writer's
    # buffer with the (new) reader's buffer.
    ar_data = faiss.vector_to_array(writer.data)

    # Route 1: hand the vector over directly.
    swap_reader = faiss.VectorIOReader()
    swap_reader.data.swap(writer.data)
    index2 = faiss.read_index(swap_reader)
    check_matches_reference(index2)

    # Route 2: go through the intermediate numpy array.
    array_reader = faiss.VectorIOReader()
    faiss.copy_array_to_vector(ar_data, array_reader.data)
    index3 = faiss.read_index(array_reader)
    check_matches_reference(index3)
def test_equiv_rq(self):
    """
    Searching a flat RQ index and searching an IVF index with an RCQ
    coarse quantizer + RQ codes must be equivalent when both share the
    same codebooks.
    """
    ds = datasets.SyntheticDataset(32, 3000, 1000, 50)

    # Reference: a flat residual quantizer, 5 codebooks of 4 bits.
    flat_rq = faiss.IndexResidualQuantizer(ds.d, 5, 4)
    flat_rq.rq.train_type = faiss.ResidualQuantizer.Train_default
    flat_rq.train(ds.get_train())
    flat_rq.add(ds.get_database())

    Dref, Iref = flat_rq.search(ds.get_queries(), 10)

    # Pull out the trained codebooks and the encoded database.
    codebooks = get_additive_quantizer_codebooks(flat_rq.rq)
    codes = faiss.vector_to_array(flat_rq.codes).reshape(
        -1, flat_rq.code_size)

    # IVF counterpart: 2x4 coarse bits + 3x4 fine bits = 5x4 bits.
    ivf = faiss.index_factory(ds.d, "IVF256(RCQ2x4),RQ3x4")

    # The coarse quantizer receives the leading codebooks.
    coarse = faiss.downcast_index(ivf.quantizer)
    n_coarse = coarse.rq.M
    coarse_books = np.vstack(codebooks[:n_coarse]).ravel()
    faiss.copy_array_to_vector(coarse_books, coarse.rq.codebooks)
    coarse.rq.is_trained = True

    # Python transcription of AdditiveCoarseQuantizer::train.
    coarse.ntotal = 1 << coarse.rq.tot_bits
    coarse.centroid_norms.resize(coarse.ntotal)
    coarse.rq.compute_centroid_norms(coarse.centroid_norms.data())
    coarse.is_trained = True

    # The remaining codebooks go to the IVF's own residual quantizer.
    fine_books = np.vstack(codebooks[n_coarse:]).ravel()
    faiss.copy_array_to_vector(fine_books, ivf.rq.codebooks)
    ivf.rq.is_trained = True
    ivf.is_trained = True

    # add the codes (this works because 2x4 is a multiple of 8 bits)
    ivf.add_sa_codes(codes)

    # Probe every list so that the IVF search is exhaustive.
    ivf.nprobe = ivf.nlist
    Dnew, Inew = ivf.search(ds.get_queries(), 10)

    np.testing.assert_array_equal(Iref, Inew)
    np.testing.assert_array_almost_equal(Dref, Dnew, decimal=5)
def test_encode_to_binary(self):
    """Use a hand-built PQ whose codes reproduce the binary input bytes.

    The PQ has one 8-bit sub-quantizer per input byte. The test checks
    that its codes equal the binary database, then that IndexPQ and
    IndexIVFPQ (without residual encoding) report distances equal to
    4 times the value returned by ``binary_dis``.
    """
    d = 256
    nt = 256
    nb = 1500
    nq = 500
    n_sub = d // 8

    xt, xb, xq = make_binary_dataset(d, nt, nb, nq)

    # One sub-quantizer per byte; centroids come from the byte values 0..255.
    pq = faiss.ProductQuantizer(d, n_sub, 8)
    byte_values = np.tile(np.arange(256), n_sub).astype('uint8').reshape(-1, 1)
    centroids = binary_to_float(byte_values)
    faiss.copy_array_to_vector(centroids.ravel(), pq.centroids)
    pq.is_trained = True

    codes = pq.compute_codes(binary_to_float(xb))
    assert np.all(codes == xb)

    def check_distances(D, I):
        # Every reported distance must be 4x the reference binary distance.
        for i in range(nq):
            for j, dj in zip(I[i], D[i]):
                ref_dis = binary_dis(xq[i], xb[j])
                assert 4 * ref_dis == dj

    # Flat PQ index sharing the hand-built quantizer.
    indexpq = faiss.IndexPQ(d, n_sub, 8)
    indexpq.pq = pq
    indexpq.is_trained = True
    indexpq.add(binary_to_float(xb))
    D, I = indexpq.search(binary_to_float(xq), 3)
    check_distances(D, I)

    # IVF variant: the coarse quantizer is trained through a throwaway
    # IndexIVFFlat wrapped around it.
    nlist = 32
    quantizer = faiss.IndexFlatL2(d)
    iflat = faiss.IndexIVFFlat(quantizer, d, nlist)
    iflat.train(binary_to_float(xt))

    indexivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, n_sub, 8)
    indexivfpq.pq = pq
    indexivfpq.is_trained = True
    indexivfpq.by_residual = False
    indexivfpq.add(binary_to_float(xb))
    indexivfpq.nprobe = 4
    D, I = indexivfpq.search(binary_to_float(xq), 3)
    check_distances(D, I)
def assign_beta_2(beta_centroids, x, rfn, Inn):
    """Return, for every row of ``x``, its code for sub-quantizer ``sq``.

    ``sq`` is the third element of ``Inn``. When ``rfn.k == 1`` all codes
    are zero. Otherwise ``beta_centroids`` and ``x`` are embedded in
    zero-filled arrays sized for the whole ``rfn`` object, ``rfn`` encodes
    them via ``add_codes``, and column ``sq`` of the codes is returned.

    Side effects: clears ``rfn.codes``, resets ``rfn.ntotal`` to 0 and
    overwrites ``rfn.codebook`` before encoding.
    """
    _first, _second, sq = Inn

    if rfn.k == 1:
        return np.zeros(x.shape[0], dtype=int)

    # Embed the centroids for this sub-quantizer in a full-size table.
    padded_centroids = np.zeros(
        (rfn.nsq, rfn.k, rfn.M + 1), dtype='float32')
    padded_centroids[sq] = beta_centroids

    # Embed x in the column range that belongs to sub-quantizer sq.
    lo = sq * rfn.dsub
    hi = (sq + 1) * rfn.dsub
    padded_x = np.zeros((len(x), rfn.d), dtype='float32')
    padded_x[:, lo:hi] = x

    rfn.codes.clear()
    rfn.ntotal = 0
    faiss.copy_array_to_vector(padded_centroids.ravel(), rfn.codebook)
    rfn.add_codes(len(x), faiss.swig_ptr(padded_x))

    all_codes = faiss.vector_to_array(rfn.codes).reshape(-1, rfn.nsq)
    return all_codes[:, sq]
def test_recons_orthogona_impossible(self):
    """reverse_transform must raise when the transform is not orthonormal.

    A 20 -> 10 LinearTransform is filled with random (hence non-orthonormal)
    coefficients. After ``set_is_orthonormal()`` the ``is_orthonormal`` flag
    must be false, and calling ``reverse_transform`` must raise.
    """
    lt = faiss.LinearTransform(20, 10, True)
    rs = np.random.RandomState(10)

    A = rs.randn(10 * 20).astype('float32')
    faiss.copy_array_to_vector(A.ravel(), lt.A)
    faiss.copy_array_to_vector(rs.randn(10).astype('float32'), lt.b)

    lt.set_is_orthonormal()
    assert not lt.is_orthonormal

    x = rs.rand(30, 20).astype('float32')
    xt = lt.apply_py(x)

    # assertRaises states the expectation directly; the previous
    # try/except/else used assertFalse on a non-empty string to force a
    # failure and left the unused result in a local.
    with self.assertRaises(Exception):
        lt.reverse_transform(xt)
def test_recons_orthonormal(self):
    """Check reverse_transform on a transform with orthonormal rows.

    The 10 rows of the 20 -> 10 transform come from a QR decomposition.
    Applying the transform, reversing it and applying it again must give
    back the first transformed vectors, up to a squared error below 1e-5.
    """
    lt = faiss.LinearTransform(20, 10, True)
    rng = np.random.RandomState(10)

    # First 10 rows of the Q factor of a random 20x20 matrix.
    q_full, _ = np.linalg.qr(rng.randn(20, 20))
    rows = q_full[:10].astype('float32')
    faiss.copy_array_to_vector(rows.ravel(), lt.A)

    bias = rng.randn(10).astype('float32')
    faiss.copy_array_to_vector(bias, lt.b)

    lt.set_is_orthonormal()
    assert lt.is_orthonormal

    x = rng.rand(30, 20).astype('float32')
    projected = lt.apply_py(x)
    reprojected = lt.apply_py(lt.reverse_transform(projected))

    residual = projected - reprojected
    err = (residual ** 2).sum()
    self.assertLess(err, 1e-5)
def subtest_add2col(self, xb, xq, index, qname):
    """Repeat the search with 2 extra dimensions (non-SIMD codepath).

    Nothing is retrained: two columns are appended to the database, the
    queries, the coarse centroids and, for the '8bit' / '4bit' quantizers,
    the trained ScalarQuantizer parameters. Returns the result of
    searching the widened queries with k=10 on the widened index.
    """
    _, d = xb.shape
    d2 = d + 2
    xb2 = self.add2columns(xb)
    xq2 = self.add2columns(xq)

    # Rebuild the coarse quantizer from the widened centroids.
    nlist = index.nlist
    quantizer = faiss.downcast_index(index.quantizer)
    quantizer2 = faiss.IndexFlat(d2, index.metric_type)
    centroids = faiss.vector_to_array(quantizer.xb).reshape(nlist, d)
    quantizer2.add(self.add2columns(centroids))

    index2 = faiss.IndexIVFScalarQuantizer(
        quantizer2, d2, index.nlist, index.sq.qtype, index.metric_type)
    index2.nprobe = 4

    if qname not in ('8bit', '4bit'):
        index2.sq.trained = index.sq.trained
    else:
        # The trained table has 2 lines: vmins and vdiffs.
        trained = faiss.vector_to_array(index.sq.trained).reshape(2, -1)
        nt = trained.shape[1]
        new_nt = int(nt * d2 / d)

        # Padding: vmin 0, vdiff 1 (a vdiff of 1 avoids a division by 0).
        pad = np.zeros((2, new_nt - nt), dtype='float32')
        pad[1] = 1.0
        trained2 = np.hstack((trained, pad))
        faiss.copy_array_to_vector(trained2.ravel(), index2.sq.trained)

    index2.is_trained = True
    index2.add(xb2)
    return index2.search(xq2, 10)
def resume(self, inc, resume_full_path):
    """
    Restore a previously saved REMIND model so training can continue.

    Loads the classifier and optimizer state into ``self.classifier_F`` and
    ``self.optimizer``, then rebuilds the product quantizer from the saved
    centroids.

    :param inc: increment number whose checkpoint should be loaded
    :param resume_full_path: directory holding the saved files
    :return: (classifier checkpoint dict, latent dict, rehearsal ixs list,
        class id to item ix dict, product quantizer)
    """
    print(f'\nResuming REMIND model from {resume_full_path}')

    # Classifier + optimizer checkpoint.
    classifier_file = os.path.join(
        resume_full_path, 'remind_classifier_F_%d.pth' % inc)
    state = torch.load(classifier_file)
    self.classifier_F.load_state_dict(state['model_state_dict'])
    self.optimizer.load_state_dict(state['optimizer_state_dict'])

    # Replay buffer and PQ centroids.
    # NOTE(review): pickle.load executes arbitrary code from the file;
    # only point this at checkpoints from a trusted source.
    buffer_file = os.path.join(
        resume_full_path, 'remind_buffer_%d.pkl' % inc)
    with open(buffer_file, 'rb') as f:
        d = pickle.load(f)

    # Rebuild the quantizer with the configured geometry and load the
    # saved centroids into it.
    nbits = int(np.log2(self.codebook_size))
    pq = faiss.ProductQuantizer(
        self.num_channels, self.num_codebooks, nbits)
    faiss.copy_array_to_vector(d['pq_centroids'].ravel(), pq.centroids)

    latent_dict = d['latent_dict']
    rehearsal_ixs = d['rehearsal_ixs']
    class_id_to_item_ix_dict = d['class_id_to_item_ix_dict']
    return state, latent_dict, rehearsal_ixs, class_id_to_item_ix_dict, pq
# NOTE(review): the scope that this first statement belongs to starts before
# this excerpt; it is kept as the first statement of the span.
print(nrun)

if parametersets == ['autotune']:

    ps.n_experiments = args.n_autotune
    ps.min_test_duration = args.min_test_duration

    # "name:max" entries: keep only the candidate values strictly below max.
    for kv in args.autotune_max:
        k, vmax = kv.split(':')
        vmax = float(vmax)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("limiting %s to %g" % (k, vmax))
        pr = ps.add_range(k)
        values = faiss.vector_to_array(pr.values)
        # A boolean mask keeps the array dtype even when nothing is left.
        values = values[values < vmax]
        faiss.copy_array_to_vector(values, pr.values)

    # "name:v1,v2,..." entries: replace the candidate values outright.
    for kv in args.autotune_range:
        k, vals = kv.split(':')
        vals = np.fromstring(vals, sep=',')
        print("setting %s to %s" % (k, vals))
        pr = ps.add_range(k)
        faiss.copy_array_to_vector(vals, pr.values)

    # setup the Criterion object: optimize for 1-R@1
    crit = faiss.OneRecallAtRCriterion(nq, 1)

    # by default, the criterion will request only 1 NN
    crit.nnn = 100
    crit.set_groundtruth(None, gt.astype('int64'))
rfn = faiss.ReconstructFromNeighbors(index_hnsw, k, nsq) else: print "train beta centroids" rfn = faiss.ReconstructFromNeighbors(index_hnsw, args.beta_k, args.beta_nsq) xb_full = vec_transform(sanitize(xb[:args.beta_ntrain])) beta_centroids = neighbor_codec.train_beta_codebook( rfn, xb_full, niter=args.beta_niter) print " storing", args.beta_centroids np.save(args.beta_centroids, beta_centroids) faiss.copy_array_to_vector(beta_centroids.ravel(), rfn.codebook) index_hnsw.reconstruct_from_neighbors = rfn if rfn.k == 1: pass # no codes to take care of elif os.path.exists(args.neigh_recons_codes): print "loading neigh codes", args.neigh_recons_codes codes = np.load(args.neigh_recons_codes) assert codes.size == rfn.code_size * index.ntotal faiss.copy_array_to_vector(codes.astype('uint8'), rfn.codes) rfn.ntotal = index.ntotal else: print "encoding neigh codes" t0 = time.time() bs = 1000000 if args.add_bs == -1 else args.add_bs
if ngpu > 0:
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("moving index to GPU")
    index = faiss.index_cpu_to_all_gpus(index)

clustering = faiss.Clustering(d, args.k)
clustering.verbose = True
clustering.seed = args.seed
clustering.max_points_per_centroid = 10**6
clustering.min_points_per_centroid = 1

# Train in slices of at most args.eval_freq iterations and report the
# quantization error after each slice.
for iter0 in range(0, args.niter, args.eval_freq):
    iter1 = min(args.niter, iter0 + args.eval_freq)
    clustering.niter = iter1 - iter0

    if iter0 > 0:
        # Resume from the centroids produced by the previous slice.
        faiss.copy_array_to_vector(centroids.ravel(), clustering.centroids)

    clustering.train(sanitize(xt), index)

    # Re-populate the index with the current centroids only, then assign
    # each point of xb to its nearest centroid.
    index.reset()
    centroids = faiss.vector_to_array(clustering.centroids).reshape(args.k, d)
    index.add(centroids)

    _, I = index.search(sanitize(xb), 1)

    error = ((xb - centroids[I.ravel()]) ** 2).sum()
    print("iter1=%d quantization error on test: %.4f" % (iter1, error))
def deserialize_index(data):
    """Read a faiss index back from an in-memory array.

    ``data`` is copied into the buffer of a ``faiss.VectorIOReader`` and
    the index is decoded from it with ``faiss.read_index``.

    NOTE(review): ``data`` is presumably the uint8 numpy array produced by
    the matching serialization routine -- confirm against the caller.
    """
    vec_reader = faiss.VectorIOReader()
    faiss.copy_array_to_vector(data, vec_reader.data)
    index = faiss.read_index(vec_reader)
    return index