# ---- IVF index: configure verbosity and choose the training-set size ----
# Identity transform; other index types install a PCA/OPQ mapping here.
vec_transform = lambda x: x

assert isinstance(index_ivf, faiss.IndexIVF)
index_ivf.verbose = True
index_ivf.quantizer.verbose = True
index_ivf.cp.verbose = True

maxtrain = args.maxtrain
if maxtrain == 0:
    # No explicit size given: derive one from the index geometry.
    if 'IMI' in args.indexkey:
        # IMI splits nlist over two sub-quantizers, so scale with sqrt(nlist).
        maxtrain = int(256 * 2 ** (np.log2(index_ivf.nlist) / 2))
    else:
        # Rule of thumb: ~50 training points per inverted list.
        maxtrain = 50 * index_ivf.nlist
    print("setting maxtrain to %d" % maxtrain)
    args.maxtrain = maxtrain

xt2 = sanitize(xt[:args.maxtrain])
assert np.all(np.isfinite(xt2))
print("train, size", xt2.shape)

if args.get_centroids_from == '':
    if args.clustering_niter >= 0:
        print("setting nb of clustering iterations to %d" %
              args.clustering_niter)
        index_ivf.cp.niter = args.clustering_niter
    if args.train_on_gpu:
        print("add a training index on GPU")
        # Run the k-means assignment step on a flat L2 index replicated
        # over all available GPUs.
        train_index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d))
        index_ivf.clustering_index = train_index
else: index_hnsw = index vec_transform = lambda x: x hnsw = index_hnsw.hnsw hnsw.efConstruction = args.efConstruction hnsw_stats = faiss.cvar.hnsw_stats index.verbose = True index_hnsw.verbose = True index_hnsw.storage.verbose = True if args.M0 != -1: print "set level 0 nb of neighbors to", args.M0 hnsw.set_nb_neighbors(0, args.M0) xt2 = sanitize(xt[:args.maxtrain]) assert np.all(np.isfinite(xt2)) print "train, size", xt.shape t0 = time.time() index.train(xt2) print " train in %.3f s" % (time.time() - t0) print "adding" t0 = time.time() if args.add_bs == -1: index.add(sanitize(xb)) else: for i0 in range(0, nb, args.add_bs): i1 = min(nb, i0 + args.add_bs) print " adding %d:%d / %d" % (i0, i1, nb)
# Seed the IVF quantizer with the precomputed centroids, then train the
# index and populate it with the database vectors.
if not index_ivf.quantizer.is_trained:
    print(" training quantizer")
    index_ivf.quantizer.train(centroids)
print(" add centroids to quantizer")
index_ivf.quantizer.add(centroids)
del src_index

t0 = time.time()
index.train(xt2)
print(" train in %.3f s" % (time.time() - t0))

print("adding")
t0 = time.time()
if args.add_bs == -1:
    # Single shot: add the whole database at once.
    index.add(sanitize(ds.get_database()))
else:
    # Stream the database in blocks of add_bs vectors to keep the
    # resident set bounded; log throughput and memory as we go.
    i0 = 0
    for xblock in ds.database_iterator(bs=args.add_bs):
        i1 = i0 + len(xblock)
        print(" adding %d:%d / %d [%.3f s, RSS %d kiB] " % (
            i0, i1, ds.nb, time.time() - t0,
            faiss.get_mem_usage_kb()))
        index.add(xblock)
        i0 = i1
print(" add in %.3f s" % (time.time() - t0))

if args.indexfile:
    print("storing", args.indexfile)
    faiss.write_index(index, args.indexfile)
xt_pca = xt[args.nt:args.nt + 10000] xt = xt[:args.nt] else: xt_pca = xt[args.nt_sample:args.nt_sample + 10000] rs = np.random.RandomState(args.seed) idx = rs.choice(args.nt_sample, size=args.nt, replace=False) xt = xt[idx] xb = xb[:args.nb] d = xb.shape[1] if args.pcadim != -1: print "training PCA: %d -> %d" % (d, args.pcadim) pca = faiss.PCAMatrix(d, args.pcadim) pca.train(sanitize(xt_pca)) xt = pca.apply_py(sanitize(xt)) xb = pca.apply_py(sanitize(xb)) d = xb.shape[1] ###################################################### # Run clustering ###################################################### index = faiss.IndexFlatL2(d) if ngpu > 0: print "moving index to GPU" index = faiss.index_cpu_to_all_gpus(index)