def fit(self, X):
    if X.dtype != numpy.float32:
        X = X.astype(numpy.float32)
    f = X.shape[1]
    self.index = faiss.IndexLSH(f, self._n_bits)
    self.index.train(X)
    self.index.add(X)
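# Hedged sketch of the query() method that usually accompanies such a
# fit() in ANN wrapper classes. Only self.index comes from the snippet
# above; the method name, arguments, and the reshape are assumptions.
def query(self, v, n):
    v = numpy.asarray(v, dtype=numpy.float32).reshape(1, -1)
    _, ids = self.index.search(v, n)  # faiss returns (distances, ids)
    return ids[0]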
def test_override(self):
    d = 256
    nt = 3500
    nb = 10000
    nq = 500
    (xt, xb, xq) = make_binary_dataset(d, nb, nt, nq)

    def train_and_get_centroids(override_kmeans_index):
        index = faiss.index_binary_factory(d, b"BIVF10")
        index.verbose = True
        if override_kmeans_index is not None:
            index.clustering_index = override_kmeans_index
        index.train(xt)
        centroids = faiss.downcast_IndexBinary(index.quantizer).xb
        return faiss.vector_to_array(centroids).reshape(-1, d // 8)

    centroids_ref = train_and_get_centroids(None)
    # should do the exact same thing
    centroids_new = train_and_get_centroids(faiss.IndexFlatL2(d))
    assert np.all(centroids_ref == centroids_new)
    # will do less accurate assignment... sanity check that the
    # index is indeed used by kmeans
    centroids_new = train_and_get_centroids(faiss.IndexLSH(d, 16))
    assert not np.all(centroids_ref == centroids_new)
def getmAP(clf, X_base, y_base, X_query, y_query, id_label, y_label):
    y_base, y_query = y_base[:, 0], y_query[:, 0]
    oneh = OneHotEncoder(sparse=False)
    y_label_1h = oneh.fit_transform(y_label)
    activations = clf.predict_proba(X_base)
    activations[id_label] = y_label_1h
    if args.code == "onehot":
        argmax = activations.argmax(axis=1).reshape((-1, 1))
        activations = oneh.fit_transform(argmax)
    if args.code == "lsh":
        index = faiss.IndexLSH(y_label_1h.shape[1], args.nbits, True, True)
    else:
        index = faiss.IndexFlatIP(y_label_1h.shape[1])
    index.train(activations.astype(np.float32))
    index.add(activations.astype(np.float32))
    queryAct = clf.predict_proba(X_query).astype(np.float32)
    _, idc = index.search(queryAct, y_base.shape[0])
    predictions = y_base[idc]
    results = np.equal(predictions, np.expand_dims(y_query, axis=1))
    return computemAP(results)
def test_IndexLSH_32_48(self):
    # CHECK: the difference between 32 and 48 does not make much sense
    for nbits2 in 32, 48:
        q = faiss.IndexLSH(d, nbits2)
        res = ev.launch('LSH half size', q)
        e = ev.evalres(res)
        # should give 0.003 0.019 0.108
        assert e[10] > 0.018
def __init__(self, num_dimensions: int, **kwargs):
    super().__init__(num_dimensions)
    self.faiss_index_type = kwargs.get("faiss_index_type")
    self.faiss_lsh_num_bits = int(kwargs.get("faiss_lsh_num_bits", 4))
    if self.faiss_index_type == "IndexFlatL2":
        self.index = faiss.IndexFlatL2(self.num_dimensions)
    elif self.faiss_index_type == "IndexLSH":
        self.index = faiss.IndexLSH(self.num_dimensions, self.faiss_lsh_num_bits)
    else:
        raise NotImplementedError(
            f"Faiss index '{self.faiss_index_type}' not implemented")
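# Hedged note on the default above: 4 LSH bits yield only 2**4 = 16
# distinct hash codes, so realistic datasets usually want a much larger
# faiss_lsh_num_bits (snippets below use n_bits equal to d or 2 * d).
# Hypothetical usage, assuming a concrete subclass named VectorIndex:
#
#   index = VectorIndex(128, faiss_index_type="IndexLSH",
#                       faiss_lsh_num_bits=256)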
def build_lsh(xb, name, n_bits=None):
    cache_dir = 'LSH_index_cache'
    try:
        os.makedirs(cache_dir)
    except OSError:
        print('{} already existing'.format(cache_dir))
    dim = xb.shape[1]
    if n_bits is None:
        n_bits = dim
    index_cache_fname = os.path.join(
        cache_dir, 'index_{}_{}bits.idx'.format(name, n_bits))
    start_time = time.time()
    if os.path.isfile(index_cache_fname):
        print('Loading existing index...')
        cpuindex = faiss.read_index(index_cache_fname)
        res = faiss.StandardGpuResources()  # use a single GPU
        index = faiss.index_cpu_to_gpu(res, 0, cpuindex)
    else:
        # rotate_data=True and train_thresholds=True
        index = faiss.IndexLSH(dim, n_bits, True, True)
        print('Training index with: {} ...'.format(xb.shape))
        index.train(xb)
        print('Adding ...')
        index.add(xb)
        writable_index = faiss.index_gpu_to_cpu(index)
        faiss.write_index(writable_index, index_cache_fname)
    end_time = time.time()
    print('Done in: {}'.format(str(timedelta(seconds=(end_time - start_time)))))
    return index
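# Hedged usage example for build_lsh(); assumes numpy is imported as np,
# and the dataset shape and 'demo' cache name are made up. The returned
# distances are Hamming distances between the n_bits-bit LSH codes.
xb = np.random.rand(10000, 128).astype('float32')
index = build_lsh(xb, 'demo')    # n_bits defaults to dim = 128
D, I = index.search(xb[:5], 10)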
def test_sh(self):
    d = 32
    xt, xb, xq = get_dataset_2(d, 2000, 1000, 200)
    nlist, nprobe = 1, 1

    gt_index = faiss.IndexFlatL2(d)
    gt_index.add(xb)
    gt_D, gt_I = gt_index.search(xq, 10)

    for nbit in 32, 64, 128:
        quantizer = faiss.IndexFlatL2(d)

        index_lsh = faiss.IndexLSH(d, nbit, True)
        index_lsh.add(xb)
        D, I = index_lsh.search(xq, 10)
        ninter = faiss.eval_intersection(I, gt_I)
        print('LSH baseline: %d' % ninter)

        for period in 10.0, 1.0:
            for tt in 'global centroid centroid_half median'.split():
                index = faiss.IndexIVFSpectralHash(
                    quantizer, d, nlist, nbit, period)
                index.nprobe = nprobe
                index.threshold_type = getattr(
                    faiss.IndexIVFSpectralHash, 'Thresh_' + tt
                )
                index.train(xt)
                index.add(xb)
                D, I = index.search(xq, 10)

                ninter = faiss.eval_intersection(I, gt_I)
                key = (nbit, tt, period)
                print('(%d, %s, %g): %d, ' % (nbit, repr(tt), period, ninter))
                assert abs(ninter - self.ref_results[key]) <= 12
def test_indexLSH(self):
    q = faiss.IndexLSH(d, nbits)
    res = ev.launch('FLAT / LSH Cosine', q)
    e = ev.evalres(res)
    # should give 0.070 0.250 0.580
    assert e[10] > 0.2
# np.random.seed(1234)  # make reproducible
# xb = np.random.random((nb, d)).astype('float32')
# xb[:, 0] += np.arange(nb) / 1000.
# xq = np.random.random((nq, d)).astype('float32')
# xq[:, 0] += np.arange(nq) / 1000.

print('Generating queries')
# np.random.seed(666666)
# np.random.shuffle(trainDataset)
queries = testDataset[:nq]
npGroundTruth = np.array(neighbors)
groundTruth = npGroundTruth[:nq, :topk]
print('Done')

n_bits = 2 * d
index = faiss.IndexLSH(d, n_bits)   # build the index
print(index.is_trained)
index.add(trainDataset)             # add vectors to the index
print(index.ntotal)

# we want to see the topk nearest neighbors
t1 = timeit.default_timer()
D, I = index.search(queries, topk)  # sanity check
t2 = timeit.default_timer()
# print(I)
# print(D)

# D, I = index.search(xq, k)  # actual search
# print(I[:5])                # neighbors of the 5 first queries
# print(I[-5:])
score = 0.0
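# Hedged guess at how the truncated scoring loop continues: accumulate
# the recall of the topk results against groundTruth, then average over
# the nq queries. This continuation is an assumption, not the original
# code.
for res_row, gt_row in zip(I, groundTruth):
    score += len(set(res_row) & set(gt_row)) / float(topk)
print('average recall@%d: %.4f' % (topk, score / nq))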
import faiss
import numpy as np

objects = np.array([[1, 1, 2, 1],
                    [5, 4, 6, 5],
                    [1, 2, 1, 2]], dtype=np.float32)

index = faiss.IndexLSH(4, 2)
index.add(objects)
distances, ids = index.search(objects, 3)
print(distances)
print(ids)
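# For contrast, a hedged side-by-side with an exact index: IndexLSH
# reports Hamming distances between the 2-bit codes above, while
# IndexFlatL2 returns true squared L2 distances for the same toy vectors.
exact = faiss.IndexFlatL2(4)
exact.add(objects)
print(exact.search(objects, 3))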
## Load Features
start = time.time()
final_vids, features, vid2features, id_index = load_features(
    '/home/camp/FIVR/features/vcms_v1', is_gv=False)
print('Read time: %.2f' % (time.time() - start))
features = np.asarray(features, np.float32)
print(features.shape)
final_vids = np.array(final_vids)

## Load K-Means Model
# kmeans = joblib.load('./kmeans/kmeans.pkl')
# lables = np.load('./kmeans/kmeans_lables_group.npy', allow_pickle=True)

print('lshash start:')
start = time.time()
index = faiss.IndexLSH(512, 512)
index.add(features)
print('build time: %.2f' % (time.time() - start))

# load the vid2name mapping
with open('/home/camp/FIVR/vid2name.pk', 'rb') as pk_file:
    vid2names = pk.load(pk_file)

# start the evaluation
annotation_dir = '/home/camp/FIVR/annotation'
final_names = np.array([vid2names[vid][0] for vid in final_vids])
query_names = None
results = None
for task_name in ['DSVR', 'CSVR', 'ISVR']:
    annotation_path = os.path.join(annotation_dir, task_name + '.json')
    with open(annotation_path, 'r') as annotation_file:
def fit(self, X):
    X = X.astype(numpy.float32)
    self._index = faiss.IndexLSH(len(X[0]), self._n_bits)
    self._index.train(X)
    self._index.add(X)
import time

import faiss
import numpy
from recall_data import recall_data

# basic parameters
d = 300            # vector dimensionality
data_size = 10000  # database size
k = 50
nbits = 64         # length of the generated hash codes

# generate test data
numpy.random.seed(13)
data = numpy.random.random(size=(data_size, d)).astype('float32')
test_data = recall_data

# create the index model and add the vectors
index = faiss.IndexLSH(d, nbits)  # LSH needs no training
# add the data
start_time = time.time()
index.add(data)  # adding to the index may be a little slow
print("Add vector Used %.2f sec." % (time.time() - start_time))

start_time = time.time()
D, I = index.search(data[:50], k)  # search the k nearest neighbors of each vector
# print the results
print("Used %.2f ms" % ((time.time() - start_time) * 1000))

recall_1_count = 0
recall_50_count = 0
for (search_vec, test_vec) in zip(I, test_data):
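    # Hedged sketch of the missing loop body, inferred from the counter
    # names above: recall@1 checks whether the true nearest neighbor is
    # ranked first, recall@50 whether it appears anywhere in the top k.
    # This is an assumption, not the original code.
    if search_vec[0] == test_vec[0]:
        recall_1_count += 1
    if test_vec[0] in search_vec:
        recall_50_count += 1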
def build(self, num_bits=8):
    self.index = faiss.IndexLSH(self.dimension, num_bits)
    if not self.index.is_trained:
        self.index.train(self.vectors)
    self.index.add(self.vectors)
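# Hedged note on the is_trained guard above: with the two-argument
# constructor faiss.IndexLSH(d, nbits), faiss leaves train_thresholds
# off and the index is created already trained, so the branch never
# fires; it only matters when the index is built with
# train_thresholds=True.
idx = faiss.IndexLSH(64, 8)
print(idx.is_trained)        # True: no training needed
idx = faiss.IndexLSH(64, 8, True, True)
print(idx.is_trained)        # False until train() is called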