def test_interleaved(self):
    """Compare a GPU IVFPQ index built with ``alternativeLayout`` to a CPU one.

    Both indexes are trained on and filled with the same random vectors and
    queried with the same ``nprobe``. The returned ids may differ in at most
    10 positions and the distances must be ``np.allclose``; this is checked
    twice, before and after enabling precomputed codes on the GPU index.
    """
    resources = faiss.StandardGpuResources()

    dim = 128
    n_base = 5000
    n_query = 50
    top_k = 10

    rng = np.random.RandomState(123)
    base_vecs = rng.rand(n_base, dim).astype('float32')
    query_vecs = rng.rand(n_query, dim).astype('float32')

    n_lists = int(math.sqrt(n_base))
    n_subquantizers = 16
    code_bits = 8
    n_probe = 4

    gpu_config = faiss.GpuIndexIVFPQConfig()
    gpu_config.alternativeLayout = True
    gpu_index = faiss.GpuIndexIVFPQ(resources, dim, n_lists, n_subquantizers,
                                    code_bits, faiss.METRIC_L2, gpu_config)

    coarse = faiss.IndexFlatL2(dim)
    cpu_index = faiss.IndexIVFPQ(coarse, dim, n_lists, n_subquantizers,
                                 code_bits, faiss.METRIC_L2)

    # The GPU index is fully built first, then the CPU one.
    for ivf in (gpu_index, cpu_index):
        ivf.train(base_vecs)
        ivf.add(base_vecs)

    for ivf in (gpu_index, cpu_index):
        ivf.nprobe = n_probe

    def assert_gpu_matches_cpu():
        # Search both indexes and compare ids (with tolerance) and distances.
        gpu_dist, gpu_ids = gpu_index.search(query_vecs, top_k)
        cpu_dist, cpu_ids = cpu_index.search(query_vecs, top_k)
        self.assertGreaterEqual((gpu_ids == cpu_ids).sum(), gpu_ids.size - 10)
        self.assertTrue(np.allclose(gpu_dist, cpu_dist))

    # Without precomputed codes.
    assert_gpu_matches_cpu()

    # With precomputed codes (different kernel).
    gpu_index.setPrecomputedCodes(True)
    assert_gpu_matches_cpu()
def create_index(template_path, index_file=None, gallery_file=None):
    """Build an IVFPQ index over the gallery descriptors.

    The gallery (descriptors, finger ids, minutiae) is read from
    ``gallery_file`` when that ``.mat`` file exists; otherwise it is extracted
    from ``template_path`` with ``get_all_faetures(..., prefix='F')`` and
    cached to ``gallery_file``.

    Args:
        template_path: passed to ``get_all_faetures`` when no cache exists.
        index_file: if not None, the trained index is written to this path.
        gallery_file: path of the cached gallery ``.mat`` file. Defaults to
            the location that was previously hard-coded.

    Returns:
        The trained and populated ``faiss.IndexIVFPQ``. Earlier versions
        returned None and dropped the index when ``index_file`` was None.
    """
    if gallery_file is None:
        gallery_file = '/media/kaicao/data2/AutomatedLatentRecognition/Results/template/NISTSD14_F.mat'

    if os.path.exists(gallery_file):
        D_gallery = sio.loadmat(gallery_file)
    else:
        des, finger_ID, minutiae = get_all_faetures(
            template_path=template_path, prefix='F')
        D_gallery = {'des': des, 'finger_ID': finger_ID, 'minutiae': minutiae}
        sio.savemat(gallery_file, D_gallery)

    # astype() already returns a fresh array, so no separate copy is needed
    # before the gallery dict is released.
    des = D_gallery['des'].astype('float32')
    print(des.shape)
    del D_gallery

    dim = des.shape[1]  # feature dimension
    nlist = 100
    m = 16
    quantizer = faiss.IndexFlatL2(dim)
    index = faiss.IndexIVFPQ(quantizer, dim, nlist, m, 8)
    index.train(des)
    index.add(des)

    if index_file is not None:
        faiss.write_index(index, index_file)
    return index
def __init__(self, vec_dimension: int, transformation: Callable = None, metric: str = 'l2',
             num_clusters: int = None, num_probe: int = None, num_bytes: int = None):
    """Create an IVF index (PQ-compressed or flat) over ``vec_dimension`` vectors.

    Args:
        vec_dimension: dimensionality of the indexed vectors.
        transformation: stored on ``self.transformation``; not used here.
        metric: key into ``self.metrics`` selecting the FAISS metric type.
        num_clusters: number of inverted lists passed to the IVF index.
        num_probe: value for ``index.nprobe``; when None the FAISS default
            is left untouched.
        num_bytes: number of PQ sub-quantizers. When None an ``IndexIVFFlat``
            is built instead of an ``IndexIVFPQ``.
    """
    metric_type = self.metrics[metric]
    self.quantizer = faiss.IndexFlat(vec_dimension, metric_type)
    if num_bytes is not None:
        # IndexIVFPQ takes (quantizer, d, nlist, M, nbits, metric). The metric
        # used to be passed in the nbits slot, which set the code width to the
        # enum value and left the metric at its default.
        self.index = faiss.IndexIVFPQ(self.quantizer, vec_dimension, num_clusters,
                                      num_bytes, 8, metric_type)
    else:
        self.index = faiss.IndexIVFFlat(self.quantizer, vec_dimension, num_clusters,
                                        metric_type)
    if num_probe is not None:
        self.index.nprobe = num_probe
    self.index.make_direct_map()
    self.transformation = transformation
    self.mapper = {}
    self.inverted_mapper = {}
def create_faiss_indices(model_code="dsbert", quantizer="HNSWFlat"):
    """
    Creates indices for efficient similarity search

    Args:
        model_code (str): code for model to be used to create sentence embeddings
        quantizer (str): coarse quantizer to use; only "HNSWFlat" is supported
            as a string. A non-string value is handed to FAISS unchanged, as
            before.

    Raises:
        ValueError: if ``quantizer`` is a string other than "HNSWFlat".
    """
    use_hnsw = quantizer == "HNSWFlat"
    # Reject unknown names up front. Previously the raw string reached the
    # IndexIVFPQ constructor, after the embeddings had already been computed.
    if not use_hnsw and isinstance(quantizer, str):
        raise ValueError(
            "Unsupported quantizer %r; only 'HNSWFlat' is available." % (quantizer,))

    embeddings = create_sentence_transformer_embeddings(
        model_code, df.definition.to_list())
    dim = embeddings.shape[1]

    if use_hnsw:
        quantizer = faiss.IndexHNSWFlat(dim, 32)
    ivfpq = faiss.IndexIVFPQ(quantizer, dim, 3, 16, 8)

    # Wrap in an IndexIDMap so vectors can be stored under the dataframe ids.
    index = faiss.IndexIDMap(ivfpq)

    # Train, then add the vectors together with their ids.
    index.train(embeddings)
    index.add_with_ids(embeddings, df.id.values)
    faiss.write_index(index, "store/wordnet_" + model_code + ".index")
def init_faiss(self, matrix):
    """Train a fresh IVFPQ index on ``matrix`` and persist it.

    The training data is kept on ``self.train_data`` as a float32
    ``np.matrix``. The index is built and trained while holding
    ``self._lock``, then handed to ``self.save_model_to_disk``.

    Returns:
        The value returned by ``save_model_to_disk``, which is also stored in
        ``self.model_loaded`` and ``self.is_initiated_``.
    """
    training = np.matrix(matrix).astype('float32')
    self.train_data = training

    logging.debug('FAISS init quantizer')
    coarse = faiss.IndexFlatL2(self.dim)
    self.f_quantizer = coarse

    # Hold the lock so the index is not read or written until it is built.
    with self._lock:
        logging.debug('FAISS init index')
        ivfpq = faiss.IndexIVFPQ(coarse, self.dim, self.nlist,
                                 self.bytesPerVec, self.bytesPerSubVec)
        self.f_index = ivfpq

        logging.debug('FAISS train index')
        ivfpq.train(training)
        logging.debug('FAISS train index finished')

        # Write the trained index to disk.
        saved = self.save_model_to_disk(self.model_location, ivfpq)
        self.model_loaded = saved
        self.is_initiated_ = self.model_loaded

    return self.is_initiated_
def _post_process(self, dataset, resources_paths): if self.config.with_index: index_file = resources_paths["embeddings_index"] if os.path.exists(index_file): dataset.load_faiss_index("embeddings", index_file) else: if "embeddings" not in dataset.column_names: raise ValueError( "Couldn't build the index because there are no embeddings." ) import faiss d = 768 train_size = self.config.index_train_size logger.info("Building wiki_dpr faiss index") if self.config.index_name == "exact": index = faiss.IndexHNSWSQ(d, faiss.ScalarQuantizer.QT_8bit, 128, faiss.METRIC_INNER_PRODUCT) index.hnsw.efConstruction = 200 index.hnsw.efSearch = 128 dataset.add_faiss_index("embeddings", custom_index=index, train_size=train_size) else: quantizer = faiss.IndexHNSWFlat(d, 128, faiss.METRIC_INNER_PRODUCT) quantizer.hnsw.efConstruction = 200 quantizer.hnsw.efSearch = 128 ivf_index = faiss.IndexIVFPQ(quantizer, d, 4096, 128, 8, faiss.METRIC_INNER_PRODUCT) ivf_index.nprobe = 64 ivf_index.own_fields = True quantizer.this.disown() dataset.add_faiss_index( "embeddings", train_size=train_size, custom_index=ivf_index, ) logger.info("Saving wiki_dpr faiss index") dataset.save_faiss_index("embeddings", index_file) return dataset
def get_index(path_to_model):
    """Load a Word2Vec model and build the module-level IVFPQ word index.

    Sets the globals ``word2int`` / ``int2word`` (vocabulary <-> row id),
    ``xb`` (L2-normalised word vectors, one row per vocabulary entry) and
    ``index`` (trained and populated ``faiss.IndexIVFPQ`` with ``nprobe = 5``).

    Args:
        path_to_model: path handed to ``Word2Vec.load``.
    """
    global index
    global word2int
    global xb
    global int2word

    model = Word2Vec.load(path_to_model)

    # Take the vocabulary once so both maps and the matrix share one order.
    vocab = list(model.wv.vocab.keys())
    word2int = {word: i for i, word in enumerate(vocab)}
    int2word = dict(enumerate(vocab))
    xb = numpy.array([model.wv[word] for word in vocab])

    # Take the dimension from the data; it was previously hard-coded to 100.
    dim = xb.shape[1]
    nlist = 50  # number of clusters; tune for the vocabulary size
    m = dim  # one sub-quantizer per dimension (100 for 100-d vectors)

    # NOTE(review): the coarse quantizer uses inner product while IndexIVFPQ
    # is built with its default metric; confirm this mix is intended.
    quantizer = faiss.IndexFlatIP(dim)
    index = faiss.IndexIVFPQ(quantizer, dim, nlist, m, 8)  # reduced accuracy, fast

    faiss.normalize_L2(xb)
    index.train(xb)
    index.add(xb)
    index.nprobe = 5
def initFaiss(self, nlist, nprobe, bytesPerVec, bytesPerSubVec, dim, matrix):
    """Store the index parameters, train an IVFPQ index and save it.

    All arguments except ``matrix`` are copied onto ``self`` under the same
    names; ``matrix`` is stored as a float32 ``np.matrix`` in
    ``self.train_data`` and used as training data.

    Returns:
        The value returned by ``self.saveModelToDisk``, also stored in
        ``self.modelLoaded`` and ``self.is_initiated``.
    """
    self.nlist, self.nprobe = nlist, nprobe
    self.bytesPerVec, self.bytesPerSubVec = bytesPerVec, bytesPerSubVec
    self.dim = dim

    training = np.matrix(matrix).astype('float32')
    self.train_data = training
    print('FAISS init quantizer', training, training.shape)
    coarse = faiss.IndexFlatL2(dim)
    self.f_quantizer = coarse

    print('FAISS init index')
    ivfpq = faiss.IndexIVFPQ(coarse, dim, nlist, bytesPerVec, bytesPerSubVec)
    self.f_index = ivfpq

    print('FAISS train index')
    ivfpq.train(training)
    print('FAISS train index finished')

    # ``model_location`` comes from the enclosing module.
    saved = self.saveModelToDisk(model_location, ivfpq)
    self.modelLoaded = saved
    self.is_initiated = saved
    return self.is_initiated
def make_index_for_merge(self, quant, index_type, master_index):
    """Create an empty index of the requested flavour for the merge tests.

    Args:
        quant: coarse quantizer (or, for ``index_type == 4``, the index that
            is wrapped directly).
        index_type: 1 = IndexIVFFlat, 2 = IndexIVFPQ, 3 = IndexIVFPQR,
            4 = IndexIDMap around ``quant``.
        master_index: when truthy, its trained quantizers are copied and the
            new index is marked as trained instead of being trained again.

    Returns:
        The newly created index.

    Raises:
        ValueError: if ``index_type`` is not one of 1-4. This used to surface
            as an UnboundLocalError at the ``return``.
    """
    ncent = 40
    if index_type == 1:
        index = faiss.IndexIVFFlat(quant, d, ncent, faiss.METRIC_L2)
        if master_index:
            index.is_trained = True
    elif index_type == 2:
        index = faiss.IndexIVFPQ(quant, d, ncent, 4, 8)
        if master_index:
            index.pq = master_index.pq
            index.is_trained = True
    elif index_type == 3:
        index = faiss.IndexIVFPQR(quant, d, ncent, 4, 8, 8, 8)
        if master_index:
            index.pq = master_index.pq
            index.refine_pq = master_index.refine_pq
            index.is_trained = True
    elif index_type == 4:
        # quant used as the actual index
        index = faiss.IndexIDMap(quant)
    else:
        raise ValueError(
            "unsupported index_type %r (expected 1, 2, 3 or 4)" % (index_type,))
    return index
def train_index(data, quantizer_path, trained_index_path, fine_quant='SQ8', cuda=False):
    """Train an IVF index on top of a stored coarse quantizer and save it.

    Args:
        data: training vectors passed to ``train``.
        quantizer_path: path of the coarse quantizer index; its ``d`` and
            ``ntotal`` give the dimension and the number of inverted lists.
        trained_index_path: where the trained index is written.
        fine_quant: 'SQ8' for ``IndexIVFScalarQuantizer`` or 'PQ<m>' for
            ``IndexIVFPQ`` with ``m`` sub-quantizers of 8 bits.
        cuda: train on GPU 0 when possible. PQ is trained on the CPU
            regardless.

    Raises:
        ValueError: if ``fine_quant`` is neither 'SQ8' nor 'PQ<m>'.
    """
    quantizer = faiss.read_index(quantizer_path)
    is_pq = fine_quant.startswith('PQ')
    if fine_quant == 'SQ8':
        trained_index = faiss.IndexIVFScalarQuantizer(quantizer, quantizer.d, quantizer.ntotal, faiss.METRIC_L2)
    elif is_pq:
        m = int(fine_quant[2:])
        trained_index = faiss.IndexIVFPQ(quantizer, quantizer.d, quantizer.ntotal, m, 8)
    else:
        raise ValueError(fine_quant)

    train_on_gpu = cuda
    if train_on_gpu and is_pq:
        # Previously this branch only printed the message and skipped
        # training, so an untrained index was written to disk.
        print('PQ not supported on GPU; keeping CPU.')
        train_on_gpu = False

    if train_on_gpu:
        res = faiss.StandardGpuResources()
        gpu_index = faiss.index_cpu_to_gpu(res, 0, trained_index)
        gpu_index.train(data)
        trained_index = faiss.index_gpu_to_cpu(gpu_index)
    else:
        trained_index.train(data)

    faiss.write_index(trained_index, trained_index_path)
def prepare_trained_index(self, preproc, coarse_quantizer, xt):
    """Return a trained IVF index, loading it from the cache when present.

    If ``self.codes_cachefile`` exists it is read and returned. Otherwise an
    ``IndexIVFFlat`` (``self.pqflat_str == 'Flat'``) or an ``IndexIVFPQ``
    (``'PQ<m>'``) is created over ``coarse_quantizer``, trained on a
    preprocessed prefix of ``xt``, written to the cache file and returned.

    Args:
        preproc: preprocessing object; ``d_out`` gives the index dimension
            and ``apply_py`` transforms the training vectors.
        coarse_quantizer: coarse quantizer; ownership moves to the new index.
        xt: training vectors.
    """
    if os.path.exists(self.codes_cachefile):
        print("load pretrained codebook")
        return faiss.read_index(self.codes_cachefile)

    d = preproc.d_out
    if self.pqflat_str == 'Flat':
        print("making an IVFFlat index")
        idx_model = faiss.IndexIVFFlat(coarse_quantizer, d, self.ncent,
                                       faiss.METRIC_L2)
    else:
        m = int(self.pqflat_str[2:])
        assert m < 56 or self.use_float16, "PQ%d will work only with -float16" % m
        print("making an IVFPQ index, m = ", m)
        idx_model = faiss.IndexIVFPQ(coarse_quantizer, d, self.ncent, m, 8)

    # The index now owns the coarse quantizer, so the Python wrapper must
    # not free it.
    coarse_quantizer.this.disown()
    idx_model.own_fields = True

    # Finish training on CPU, using at most this many training vectors.
    # The unused ``t0`` timer that used to sit here was removed.
    max_train_vectors = 1000000
    x = preproc.apply_py(indexfunctions.sanitize(xt[:max_train_vectors]))
    idx_model.train(x)
    faiss.write_index(idx_model, self.codes_cachefile)
    return idx_model
def test_IndexIVFPQ(self):
    """IVFPQ must find the true nearest neighbour for over 66% of queries.

    Also checks that an ``Index2Layer`` trained separately on the same data
    reconstructs the database exactly like the IVFPQ index does.
    """
    requested_dim, n_base, n_train, n_query = 32, 1000, 1500, 200
    (xt, xb, xq) = get_dataset_2(requested_dim, n_base, n_train, n_query)
    dim = xt.shape[1]

    # Exact nearest neighbours as ground truth.
    exact = faiss.IndexFlatL2(dim)
    exact.add(xb)
    _, gt_nns = exact.search(xq, 1)

    coarse = faiss.IndexFlatL2(dim)
    ivfpq = faiss.IndexIVFPQ(coarse, dim, 32, 8, 8)
    ivfpq.cp.min_points_per_centroid = 5    # quiet warning
    ivfpq.train(xt)
    ivfpq.add(xb)
    ivfpq.nprobe = 4
    _, found = ivfpq.search(xq, 10)

    # gt_nns has one column, so this counts queries whose true neighbour
    # appears anywhere in the top 10.
    hits = (found == gt_nns).sum()
    self.assertGreater(hits, xq.shape[0] * 0.66)

    # Check that an Index2Layer gives the same reconstruction. This is a bit
    # fragile: it assumes 2 runs of training give the exact same result.
    # (The alternative of copying ``pq`` from the first index was dead code.)
    two_layer = faiss.Index2Layer(coarse, 32, 8)
    two_layer.train(xt)
    two_layer.add(xb)

    expected = ivfpq.reconstruct_n(0, n_base)
    actual = two_layer.reconstruct_n(0, n_base)
    self.assertTrue(np.all(expected == actual))
def _post_process(self, dataset, resources_paths): if self.config.with_index: index_file = resources_paths["embeddings_index"] if os.path.exists(index_file): dataset.load_faiss_index("embeddings", index_file) else: if "embedings" not in dataset.column_names: raise ValueError( "Couldn't build the index because there are no embeddings." ) import faiss train_size = self.config.index_train_size logging.info("Building wiki_dpr faiss index") if self.config.index_type == "exact": dataset.add_faiss_index( "embeddings", string_factory="Flat", metric_type=faiss.METRIC_INNER_PRODUCT, ) else: d = 768 quantizer = faiss.IndexHNSWFlat(d, 32, faiss.METRIC_INNER_PRODUCT) ivf_index = faiss.IndexIVFPQ(quantizer, d, 4096, 64, 8, faiss.METRIC_INNER_PRODUCT) ivf_index.own_fields = True quantizer.this.disown() dataset.add_faiss_index( "embeddings", train_size=train_size, faiss_verbose=logging.getLogger().level <= logging.DEBUG, custom_index=ivf_index, ) logging.info("Saving wiki_dpr faiss index") dataset.save_faiss_index("embeddings", index_file) return dataset
def test_IndexIVFPQ(self):
    """IVFPQ must find the true nearest neighbour for over 66% of queries."""
    (xt, xb, xq) = get_dataset_2(32, 1000, 1500, 200)
    dim = xt.shape[1]

    # Exact nearest neighbours as ground truth.
    exact = faiss.IndexFlatL2(dim)
    exact.add(xb)
    _, gt_nns = exact.search(xq, 1)

    coarse = faiss.IndexFlatL2(dim)
    ivfpq = faiss.IndexIVFPQ(coarse, dim, 32, 8, 8)
    ivfpq.train(xt)
    ivfpq.add(xb)
    ivfpq.nprobe = 4
    _, found = ivfpq.search(xq, 10)

    # gt_nns has one column, so this counts queries whose true neighbour
    # appears anywhere in the top 10.
    hits = (found == gt_nns).sum()
    n_queries = xq.shape[0]
    self.assertGreater(hits, n_queries * 0.66)
def test_IndexIVFPQ(self):
    """Check IVFPQ recall against a GPU flat index used as ground truth.

    The true neighbour (from ``GpuIndexFlatL2``) must appear in the top 10
    for more than 20% of the queries.
    """
    (xt, xb, xq) = self.get_dataset()
    d = xt.shape[1]

    dev_no = 0
    usePrecomputed = True

    res = faiss.StandardGpuResources()

    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.device = dev_no

    gt_index = faiss.GpuIndexFlatL2(res, d, flat_config)
    gt_index.add(xb)
    D, gt_nns = gt_index.search(xq, 1)

    coarse_quantizer = faiss.IndexFlatL2(d)
    ncentroids = int(np.sqrt(xb.shape[0])) * 4

    index = faiss.IndexIVFPQ(coarse_quantizer, d, ncentroids, 32, 8)
    # add implemented on GPU but not train
    index.train(xt)

    ivfpq_config = faiss.GpuIndexIVFPQConfig()
    ivfpq_config.device = dev_no
    ivfpq_config.usePrecomputedTables = usePrecomputed

    gpuIndex = faiss.GpuIndexIVFPQ(res, index, ivfpq_config)
    gpuIndex.setNumProbes(64)
    # NOTE(review): vectors are added to and searched on the CPU ``index``;
    # ``gpuIndex`` is built but never queried. Confirm whether the add and
    # search below were meant to go through ``gpuIndex``.
    index.add(xb)

    D, nns = index.search(xq, 10)
    n_ok = (nns == gt_nns).sum()
    nq = xq.shape[0]
    # Formatted print gives the same output on Python 2 and Python 3. The
    # old ``print a, b, c`` statement is a syntax error on Python 3.
    print("%s %s %s" % (ncentroids, n_ok, nq))

    self.assertGreater(n_ok, nq * 0.2)
def indexing(feats, pos, imgID):
    """Build an IVFPQ index over local features.

    Args:
        feats: sequence of 1-D feature arrays, all of the same length.
        pos: per-feature positions; ``pos[i]`` is written into a row of width 2.
        imgID: per-feature image ids; ``imgID[i]`` is written into a row of
            width 1.

    Returns:
        ``[index, pos_np, imgID_np]`` where ``index`` is a clone of the
        trained and populated index (``nprobe = 5``) and the two arrays hold
        the positions and image ids row-aligned with the features.
    """
    n_feats = len(feats)
    d = feats[0].shape[0]

    feats_np = np.zeros(shape=(n_feats, d))
    pos_np = np.zeros(shape=(n_feats, 2))
    imgID_np = np.zeros(shape=(n_feats, 1))
    for i, feat in enumerate(feats):
        feats_np[i, :] = feat
        pos_np[i, :] = pos[i]
        imgID_np[i, :] = imgID[i]

    # Convert once; training and adding previously made a float32 copy each.
    feats_f32 = feats_np.astype('float32')

    # Construct the visual vocabulary.
    voc_size = const_params.__voc_size__
    code_size = 8
    quantizer = faiss.IndexFlatL2(d)
    # The trailing 8 means each sub-vector is encoded as 8 bits.
    index_ = faiss.IndexIVFPQ(quantizer, d, voc_size, code_size, 8)
    index_.train(feats_f32)
    index_.add(feats_f32)
    index_.nprobe = 5

    return [faiss.clone_index(index_), pos_np, imgID_np]
def index_patches(patches, pca_dims=64):
    """PCA-reduce ``patches`` and index them with IVFPQ.

    Args:
        patches: 2-D torch tensor, one patch per row.
        pca_dims: positive int, target dimensionality after PCA. If it
            exceeds the input width it is lowered to the largest multiple of
            the sub-quantizer count that fits.

    Returns:
        ``(faiss_index, sub_index)``: the ``IndexPreTransform`` (PCA followed
        by IVFPQ) that was trained and filled, and the inner IVFPQ index.
    """
    # FAISS settings: inverted lists, sub-quantizers, bits per code.
    n_lists, n_subq, n_bits = 200, 16, 8

    assert torch.is_tensor(patches) and patches.dim() == 2
    assert type(pca_dims) == int and pca_dims > 0

    in_dims = patches.size(1)
    if pca_dims > in_dims:
        print('WARNING: Input dimension < %d. Using fewer PCA dimensions.' % pca_dims)
        pca_dims = in_dims - (in_dims % n_subq)

    # Build PCA followed by IVFPQ.
    coarse = faiss.IndexFlatL2(pca_dims)
    assert pca_dims % n_subq == 0
    sub_index = faiss.IndexIVFPQ(coarse, pca_dims, n_lists, n_subq, n_bits)
    pca_matrix = faiss.PCAMatrix(in_dims, pca_dims, 0, True)
    faiss_index = faiss.IndexPreTransform(pca_matrix, sub_index)

    # Train and fill the index.
    patch_array = patches.numpy()
    faiss_index.train(patch_array)
    faiss_index.add(patch_array)
    return faiss_index, sub_index
def _ready_faiss():
    """Load the vectors in 'inception_v3.pickle' and index them with IVFPQ.

    Returns:
        The trained and populated ``faiss.IndexIVFPQ``.
    """
    print("start indexing")
    with open('inception_v3.pickle', mode='rb') as f:
        datas = pickle.load(f)

    # Build the database array, one row per entry in iteration order.
    vectors = np.array([datas[key] for key in datas]).astype("float32")

    # Product quantization with FAISS.
    n_lists, n_subq, dim = 100, 8, 2048  # dim: length of the feature vectors
    coarse = faiss.IndexFlatL2(dim)
    index = faiss.IndexIVFPQ(coarse, dim, n_lists, n_subq, 8)
    print(vectors.shape)

    index.train(vectors)
    index.add(vectors)
    print("indexing is end")
    return index
def build_index(self, wv, d, index_type='fast', nlist=100):
    '''Build a FAISS index over the word vectors in ``wv``.

    Args:
        wv: dict mapping word -> vector. Vectors whose length is not ``d``
            are skipped.
        d: vector length.
        index_type: 'accurate' (IndexFlatL2), 'fast' (IndexIVFFlat) or
            'compress' (IndexIVFPQ, always 64 lists and 8 sub-quantizers).
        nlist: number of inverted lists for the 'fast' index.

    Sets ``self.int2word`` / ``self.word2int``, ``self.index_type``,
    ``self.index_`` and ``self.xb``; the IVF variants also set
    ``self.quantizer``.

    Raises:
        ValueError: if ``index_type`` is not one of the three known values.
    '''
    # Filter once and derive the id maps from the kept entries, so row i of
    # ``xb`` is int2word[i]. The maps previously enumerated all keys and
    # went out of step whenever a vector was skipped.
    kept = [(word, vec) for word, vec in wv.items() if len(vec) == d]
    self.int2word = {i: word for i, (word, _) in enumerate(kept)}
    self.word2int = {word: i for i, (word, _) in enumerate(kept)}
    # Cast to float32 explicitly; float16 is not supported.
    xb = numpy.array([vec for _, vec in kept], dtype='float32')

    if index_type == 'accurate':
        index = faiss.IndexFlatL2(d)
        index.add(xb)
    elif index_type == 'fast':
        quantizer = faiss.IndexFlatL2(d)
        index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
        print(xb.shape)
        index.train(xb)
        index.add(xb)
        # This reference is required: otherwise the quantizer gets garbage
        # collected and the index is silently corrupted.
        self.quantizer = quantizer
    elif index_type == 'compress':
        nlist = 64  # fixed for this mode; the argument is ignored
        m = 8  # number of bytes per vector
        quantizer = faiss.IndexFlatL2(d)
        # The trailing 8 means each sub-vector is encoded as 8 bits.
        index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8)
        index.train(xb)
        index.add(xb)
        self.quantizer = quantizer
    else:
        raise ValueError(
            "unknown index_type %r (expected 'accurate', 'fast' or 'compress')"
            % (index_type,))

    self.index_type = index_type
    self.index_ = index
    self.xb = xb
# Evaluate IVFPQ on SIFT: count how often the true nearest neighbour is found
# in the top 10. The prints use the function form, which gives the same output
# on Python 2 and Python 3 for a single argument.
print("loading database")

xb = fvecs_read(rootdir + '/sift_base.fvecs')
xt = fvecs_read(rootdir + '/sift_learn.fvecs')
xq = fvecs_read(rootdir + '/sift_query.fvecs')

d = xt.shape[1]

# Exact search gives the ground-truth nearest neighbour of every query.
gt_index = faiss.IndexFlatL2(d)
gt_index.add(xb)

D, gt_nns = gt_index.search(xq, 1)

coarse_quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(coarse_quantizer, d, 25, 16, 8)

print("train")
index.train(xt)

print("add")
index.add(xb)

print("search")
index.nprobe = 5
D, nns = index.search(xq, 10)
# gt_nns has one column, so this counts queries whose true neighbour appears
# anywhere in the 10 results.
n_ok = (nns == gt_nns).sum()
nq = xq.shape[0]

print("n_ok=%d/%d" % (n_ok, nq))
def train(self, vecs):
    """Create the coarse quantizer and IVFPQ index for ``vecs`` and train it.

    The dimension is taken from ``vecs.shape[1]``. The index uses
    ``self.nlist`` inverted lists and ``self.M`` sub-quantizers of 8 bits.
    The results are stored in ``self.quantizer`` and ``self.index``.
    """
    dim = vecs.shape[1]
    coarse = faiss.IndexFlatL2(dim)
    self.quantizer = coarse
    ivfpq = faiss.IndexIVFPQ(coarse, dim, self.nlist, self.M, 8)
    self.index = ivfpq
    ivfpq.train(vecs)
def _extract_features(db):
    """Run every image of ``db`` through the checkpointed model.

    Returns:
        ``(vecbase, dicbase)``: a float32 array with one flattened model
        output per image, and a DataFrame with columns ``cls`` and ``img``
        in the same row order.
    """
    base_model = load_model(CHECKPOINT, False)
    base_model.eval()
    print("load successfully!")
    if REMOVE_FC:
        base_model = nn.Sequential(*list(base_model.children())[:-1])
        print("Remove FC")
    if use_gpu:
        base_model = base_model.cuda()

    vecbase = []
    dicbase = []
    data = db.get_data()
    for count, row in enumerate(data.itertuples(), 1):
        d_img, d_cls = getattr(row, "img"), getattr(row, "cls")
        img = imageio.imread(d_img, pilmode="RGB")
        img = Image.fromarray(img)
        img = IMAGE_NORMALIZER(img)
        img = np.array(img)
        img = np.expand_dims(img, axis=0)  # add a leading batch axis

        if use_gpu:
            inputs = torch.autograd.Variable(torch.from_numpy(img).cuda().float())
        else:
            inputs = torch.from_numpy(img)
        d_hist = base_model(inputs).view(-1, )
        d_hist = d_hist.data.cpu().numpy()

        vecbase.append(d_hist)
        dicbase.append((d_cls, d_img))
        print(count)  # progress: number of images processed so far

    vecbase = np.array(vecbase).astype('float32')
    print(vecbase.shape)
    dicbase = pd.DataFrame(dicbase, columns=['cls', 'img'])
    return vecbase, dicbase


def make_samples(db, mode, is_Oxford=isOxford, verbose=True):
    """Return the retrieval index and its metadata, using the cache if possible.

    The cached label table, feature matrix and FAISS index are loaded from
    ``cache_dir``. If any of that fails, features are recomputed from ``db``,
    a new index is built according to ``mode`` and all three are written
    back to the cache.

    Args:
        db: database object providing ``get_data()``.
        mode: 'Linear' for an exact ``IndexFlatL2`` or 'IVFPQ'.
        is_Oxford: selects which set of cache file names is used.
        verbose: print which path (cache or rebuild) is taken.

    Returns:
        ``(index, dicbase, vecbase)``.

    Raises:
        ValueError: if the cache is rebuilt and ``mode`` is not supported.
    """
    if is_Oxford:
        dic_addr = Odic_addr
        vec_addr = Odic_addr
        index_addr = Oindex_addr
    else:
        dic_addr = Ddic_addr
        vec_addr = Ddic_addr
        index_addr = Dindex_addr
    # NOTE(review): vec_addr is set to the same name as dic_addr in both
    # branches, so the feature dump below overwrites the label dump and the
    # cache load returns one object for both. This looks like a copy/paste
    # slip; confirm the intended feature-cache name.

    dic_path = os.path.join(cache_dir, dic_addr)
    vec_path = os.path.join(cache_dir, vec_addr)
    index_path = os.path.join(cache_dir, index_addr)

    try:
        with open(dic_path, "rb") as f:
            dicbase = cPickle.load(f)
        with open(vec_path, "rb") as f:
            vecbase = cPickle.load(f)
        index = faiss.read_index(index_path)
        if verbose:
            print("Using cache..., config=%s, depth=%s" % (vec_addr, depth))
    except Exception:
        # Any failure to use the cache triggers a rebuild. This was a bare
        # ``except:``, which also caught KeyboardInterrupt and SystemExit.
        if verbose:
            print("Counting histogram..., config=%s, depth=%s" % (vec_addr, depth))

        vecbase, dicbase = _extract_features(db)
        d = vecbase.shape[1]

        if mode == 'Linear':
            index = faiss.IndexFlatL2(d)
            index.add(vecbase)
        elif mode == 'IVFPQ':
            n_list = 100
            n_bits = 8
            coarse_quantizer = faiss.IndexFlatL2(d)
            index = faiss.IndexIVFPQ(coarse_quantizer, d, n_list, 8, n_bits)
            index.nprobe = 10
            index.train(vecbase)
            index.add(vecbase)
        else:
            raise ValueError("you should choose a correct retrival mode")

        with open(dic_path, "wb") as f:
            cPickle.dump(dicbase, f)
        with open(vec_path, "wb") as f:
            cPickle.dump(vecbase, f)
        faiss.write_index(index, index_path)

    return index, dicbase, vecbase
import numpy as np
import faiss

# Problem size: vector dimension, database size, number of queries.
d, nb, nq = 64, 100000, 10000

np.random.seed(1234)             # make reproducible


def _shifted_random(count):
    """Return ``count`` random float32 vectors with a ramp added to column 0."""
    vecs = np.random.random((count, d)).astype('float32')
    vecs[:, 0] += np.arange(count) / 1000.
    return vecs


# The database must be drawn before the queries to keep the random stream.
xb = _shifted_random(nb)
xq = _shifted_random(nq)

nlist = 100
m = 8
k = 4
quantizer = faiss.IndexFlatL2(d)
# The trailing 8 means each sub-vector is encoded as 8 bits.
index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8)
index.train(xb)
index.add(xb)

# Sanity check: search the first five database vectors.
D, I = index.search(xb[:5], k)
print('In dex',I)
print('Distances',D)

# Real search, probing 10 lists per query.
index.nprobe = 10
D, I = index.search(xq, k)
print('I[-5:]',I[-5:])
def subtest(self, mt):
    """Check IndexIVFPQ accuracy for metric ``mt``, with and without residuals.

    For each ``by_residual`` setting an index is trained and searched, and
    the overlap of its results with an exact ``IndexFlat`` search is compared
    against ``self.ref_results``. With residual encoding, the polysemous
    filter (``polysemous_ht = 20``) is also checked. Finally
    ``range_search`` is compared against the k-NN results.

    Args:
        mt: FAISS metric type, used for the reference index, the coarse
            quantizer and ``index.metric_type``.
    """
    d = 32
    xt, xb, xq = get_dataset_2(d, 2000, 1000, 200)
    nlist = 64

    # Exact top-10 results used as ground truth.
    gt_index = faiss.IndexFlat(d, mt)
    gt_index.add(xb)
    gt_D, gt_I = gt_index.search(xq, 10)
    # The same quantizer object is reused by both loop iterations.
    quantizer = faiss.IndexFlat(d, mt)
    for by_residual in True, False:

        index = faiss.IndexIVFPQ(
            quantizer, d, nlist, 4, 8)
        index.metric_type = mt
        index.by_residual = by_residual
        if by_residual:
            # perform cheap polysemous training
            index.do_polysemous_training = True
            pt = faiss.PolysemousTraining()
            pt.n_iter = 50000
            pt.n_redo = 1
            index.polysemous_training = pt

        index.train(xt)
        index.add(xb)
        index.nprobe = 4
        D, I = index.search(xq, 10)

        ninter = faiss.eval_intersection(I, gt_I)
        print('(%d, %s): %d, ' % (mt, by_residual, ninter))

        # Reference values are keyed by (metric, by_residual).
        assert abs(ninter - self.ref_results[mt, by_residual]) <= 3

        # Turning the precomputed table off must not change the returned ids.
        index.use_precomputed_table = 0
        D2, I2 = index.search(xq, 10)
        assert np.all(I == I2)

        if by_residual:

            # D and I are overwritten here with the polysemous results.
            index.use_precomputed_table = 1
            index.polysemous_ht = 20
            D, I = index.search(xq, 10)

            ninter = faiss.eval_intersection(I, gt_I)
            print('(%d, %s, %d): %d, ' % (
                mt, by_residual, index.polysemous_ht, ninter))

            # polysemous behaves bizarrely on ARM
            assert (ninter >= self.ref_results[
                mt, by_residual, index.polysemous_ht] - 4)

        # also test range search

        # The radius comes from the last column of the most recent D.
        if mt == faiss.METRIC_INNER_PRODUCT:
            radius = float(D[:, -1].max())
        else:
            radius = float(D[:, -1].min())
        print('radius', radius)

        lims, D3, I3 = index.range_search(xq, radius)
        ntot = ndiff = 0
        for i in range(len(xq)):
            # Results for query i live in I3[lims[i]:lims[i + 1]].
            l0, l1 = lims[i], lims[i + 1]
            Inew = set(I3[l0:l1])
            # Reference set: the k-NN results (D2/I2) strictly inside the radius.
            if mt == faiss.METRIC_INNER_PRODUCT:
                mask = D2[i] > radius
            else:
                mask = D2[i] < radius
            Iref = set(I2[i, mask])
            # Count ids present in exactly one of the two sets.
            ndiff += len(Inew ^ Iref)
            ntot += len(Iref)
        print('ndiff %d / %d' % (ndiff, ntot))
        assert ndiff < ntot * 0.02
# %% Import from reverse_image_search.haltakov_clip import * import faiss # %% Create Index D = 512 M = 256 # number of subquantizers nbits = 8 nlist = 1500 # The number of cells (space partition). Typical value is sqrt(N) hnsw_m = 32 quantizer = faiss.IndexHNSWFlat(D, hnsw_m) index = faiss.IndexIVFPQ(quantizer, D, nlist, M, nbits) # this remains the same index.train(photo_features[:20000, :]) index.add(photo_features) faiss.write_index(index, "index.faiss") # %% index = faiss.read_index("index.faiss") # %% Using Index #!%%time index.nprobe = 200 search_query = "technical debt" text_features = encode_search_query(search_query) D, I = index.search(text_features, 10) best_photo_ids = photo_ids.iloc[I.tolist()[0]]["photo_id"].to_list() display_image_grid(best_photo_ids) best_photo_ids # %% True Result #!%%time
def _create_index(self):
    """Create an untrained IVFPQ index and its flat L2 coarse quantizer.

    The index has ``self.partitions`` inverted lists and 16 sub-quantizers
    of 8 bits each.

    Returns:
        ``(quantizer, index)``. The quantizer is returned alongside the index
        so the caller can hold a reference to it.
    """
    dim = self.dim
    # An HNSW coarse quantizer, faiss.IndexHNSWFlat(dim, 32), was considered
    # here as an alternative.
    coarse = faiss.IndexFlatL2(dim)
    ivfpq = faiss.IndexIVFPQ(coarse, dim, self.partitions, 16, 8)
    return coarse, ivfpq
mode='r', shape=(args.dstore_size, 1)) else: keys = np.memmap(args.dstore_mmap + '_keys.npy', dtype=np.float32, mode='r', shape=(args.dstore_size, args.dimension)) vals = np.memmap(args.dstore_mmap + '_vals.npy', dtype=np.int, mode='r', shape=(args.dstore_size, 1)) if not os.path.exists(args.faiss_index + ".trained"): # Initialize faiss index quantizer = faiss.IndexFlatL2(args.dimension) index = faiss.IndexIVFPQ(quantizer, args.dimension, args.ncentroids, args.code_size, 8) index.nprobe = args.probe print('Training Index') np.random.seed(args.seed) random_sample = np.random.choice(np.arange(vals.shape[0]), size=[min(1000000, vals.shape[0])], replace=False) start = time.time() # Faiss does not handle adding keys in fp16 as of writing this. index.train(keys[random_sample].astype(np.float32)) print('Training took {} s'.format(time.time() - start)) print('Writing index after training') start = time.time() faiss.write_index(index, args.faiss_index + ".trained")
def _owned_ivfpq(quantizer, out_d, ncentroids, cs, nbits, metric_type, check_metric):
    """Build an IndexIVFPQ that takes ownership of ``quantizer``."""
    index_ivfpq = faiss.IndexIVFPQ(quantizer, out_d, ncentroids, cs, nbits, metric_type)
    if check_metric:
        assert index_ivfpq.metric_type == metric_type
    index_ivfpq.own_fields = True
    quantizer.this.disown()  # pylint: disable = no-member
    return index_ivfpq


def index_factory(d: int, index_key: str, metric_type: int, ef_construction: Optional[int] = None):
    """
    custom index_factory that fix some issues of
    faiss.index_factory with inner product metrics.

    For any metric other than inner product this simply defers to
    ``faiss.index_factory``. For inner product, the OPQ/Pad + IVF(+HNSW) + PQ
    and plain HNSW keys are assembled by hand so the metric is applied to
    every component; "Flat" goes through ``faiss.index_factory``; any other
    key raises ``ValueError``.
    """
    if metric_type != faiss.METRIC_INNER_PRODUCT:
        return faiss.index_factory(d, index_key, metric_type)

    # Every integer that appears in the key, in order of appearance.
    params = [int(x) for x in re.findall(r"\d+", index_key)]

    def matches(pattern):
        return re.search(pattern, index_key) is not None

    if matches(r"OPQ\d+_\d+,IVF\d+,PQ\d+"):
        # OPQ{M_OPQ}_{out_d},IVF{ncentroids},PQ{cs}[x{nbits}]
        M_OPQ, out_d, ncentroids, cs = params[0], params[1], params[2], params[3]
        nbits = params[4] if len(params) == 5 else 8  # default value

        quantizer = faiss.index_factory(out_d, "Flat", metric_type)
        assert quantizer.metric_type == metric_type
        index_ivfpq = _owned_ivfpq(quantizer, out_d, ncentroids, cs, nbits, metric_type, True)
        opq_matrix = faiss.OPQMatrix(d, M=M_OPQ, d2=out_d)
        return faiss.IndexPreTransform(opq_matrix, index_ivfpq)

    if matches(r"OPQ\d+_\d+,IVF\d+_HNSW\d+,PQ\d+"):
        # OPQ{M_OPQ}_{out_d},IVF{ncentroids}_HNSW{M_HNSW},PQ{cs}[x{nbits}]
        M_OPQ, out_d, ncentroids = params[0], params[1], params[2]
        M_HNSW, cs = params[3], params[4]
        nbits = params[5] if len(params) == 6 else 8  # default value

        quantizer = faiss.IndexHNSWFlat(out_d, M_HNSW, metric_type)
        if ef_construction is not None and ef_construction >= 1:
            quantizer.hnsw.efConstruction = ef_construction
        assert quantizer.metric_type == metric_type
        index_ivfpq = _owned_ivfpq(quantizer, out_d, ncentroids, cs, nbits, metric_type, True)
        opq_matrix = faiss.OPQMatrix(d, M=M_OPQ, d2=out_d)
        return faiss.IndexPreTransform(opq_matrix, index_ivfpq)

    if matches(r"Pad\d+,IVF\d+_HNSW\d+,PQ\d+"):
        # Pad{out_d},IVF{ncentroids}_HNSW{M_HNSW},PQ{cs}[x{nbits}]
        out_d, ncentroids, M_HNSW, cs = params[0], params[1], params[2], params[3]
        nbits = params[4] if len(params) == 5 else 8  # default value

        remapper = faiss.RemapDimensionsTransform(d, out_d, True)
        quantizer = faiss.IndexHNSWFlat(out_d, M_HNSW, metric_type)
        if ef_construction is not None and ef_construction >= 1:
            quantizer.hnsw.efConstruction = ef_construction
        index_ivfpq = _owned_ivfpq(quantizer, out_d, ncentroids, cs, nbits, metric_type, False)
        return faiss.IndexPreTransform(remapper, index_ivfpq)

    if matches(r"HNSW\d+"):
        index = faiss.IndexHNSWFlat(d, params[0], metric_type)
        assert index.metric_type == metric_type
        return index

    if index_key == "Flat":
        return faiss.index_factory(d, index_key, metric_type)

    # Unknown key: the index is still built, so a malformed key fails inside
    # faiss first, and then the key is rejected explicitly.
    faiss.index_factory(d, index_key, metric_type)
    raise ValueError((
        "Be careful, faiss might not create what you expect when using the "
        "inner product similarity metric, remove this line to try it anyway."
        "Happened with index_key: " + str(index_key)))
datas = pickle.load(f) # databese配列の作成 face_image_names = [] face_vectors = [] for k in datas: face_image_names.append(k) face_vectors.append(datas[k]) face_vectors = np.array(face_vectors).astype("float32") # faissを用いたPQの準備 nlist = 100 m = 8 k = 8 # 類似顔7こほしいのでk=8 d = 128 # 顔特徴ベクトルの次元数 quantizer = faiss.IndexFlatL2(d) # this remains the same index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8) index.train(face_vectors) index.add(face_vectors) print("indexing is end") # マネージャの作成 with Manager() as manager: # マネージャーの作成 similar_paths_manager = manager.list() similar_distance_manager = manager.list() frame_manager = manager.list() face_rect_manager = manager.list() # プロセスの生成 recommend_process = Process( target=recommend_faces, args=[similar_paths_manager, frame_manager],
def build_database(chunk_len: int = 16, batch_size: int = 64, d_emb: int = 768, n_centeroids: int = 256,
                   code_size: int = 64, n_probe: int = 8, n_train: int = 50_000):
    r"""
    ## Build Database

    * `chunk_len` is the length of a chunk (number of characters)
    * `batch_size` is the batch size to use when calculating $\text{B\small{ERT}}(N)$
    * `d_emb` is the number of features in $\text{B\small{ERT}}(N)$ embeddings

    The remaining arguments configure the
    [FAISS index](https://faiss.ai/cpp_api/struct/structfaiss_1_1IndexIVFPQ.html):

    * `n_centeroids` is the number of lists in the index
    * `code_size` encoded vector size in the index
    * `n_probe` is the number of lists to probe
    * `n_train` is the number of keys to train the index on

    The index is written to `retro.index` in the lab data path; each chunk is
    stored under its character offset in the training text.
    """

    # Load the dataset text file
    dataset = TextFileDataset(
        lab.get_data_path() / 'tiny_shakespeare.txt',
        list,
        url='https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt')

    # Get training data (a string)
    text = dataset.train

    # Offsets of the chunks. A chunk is kept only when `2 * chunk_len`
    # characters remain after its start. Computed once so that chunks and
    # offsets cannot drift apart.
    offsets = [i for i in range(0, len(text), chunk_len) if i + chunk_len * 2 < len(text)]
    # Split the text into chunks of `chunk_len`
    chunks = [text[i:i + chunk_len] for i in offsets]
    # Get the offsets of each of the chunks
    chunk_offsets = np.array(offsets)
    # Number of chunks
    n_chunks = len(chunks)

    # Initialize BERT to get $\text{B\small{ERT}}(N)$
    bert = BERTChunkEmbeddings(torch.device('cuda:0'))

    # Get chunk embeddings by processing `batch_size` number of chunks on each iteration
    chunk_emb = []
    for i in monit.iterate('Get embeddings', range(0, n_chunks, batch_size)):
        chunk_emb.append(bert(chunks[i: i + batch_size]).cpu())
    # Merge them into a single tensor
    chunk_emb = torch.cat(chunk_emb, dim=0).numpy()

    # Create the [FAISS index](https://faiss.ai/cpp_api/struct/structfaiss_1_1IndexIVFPQ.html)
    quantizer = faiss.IndexFlatL2(d_emb)
    index = faiss.IndexIVFPQ(quantizer, d_emb, n_centeroids, code_size, 8)
    index.nprobe = n_probe

    # Get a random sample of the chunk indexes
    random_sample = np.random.choice(np.arange(n_chunks), size=[min(n_train, n_chunks)], replace=False)

    # Train the index to store the keys
    with monit.section('Train index'):
        index.train(chunk_emb[random_sample])

    # Add the chunks to the index in batches of size `1024`
    for s in monit.iterate('Index', range(0, n_chunks, 1024)):
        e = min(s + 1024, n_chunks)
        # Add to index
        index.add_with_ids(chunk_emb[s:e], chunk_offsets[s: e])

    # Save the index
    with monit.section('Save'):
        faiss.write_index(index, str(lab.get_data_path() / 'retro.index'))