Example #1
 def fit(self, X):
     if X.dtype != numpy.float32:
         X = X.astype(numpy.float32)
     f = X.shape[1]
     self.index = faiss.IndexLSH(f, self._n_bits)
     self.index.train(X)
     self.index.add(X)
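Example #1 only builds the index. For completeness, a minimal sketch of how such a wrapper is typically queried (the class scaffolding and the query signature are assumptions, not part of the original snippet):

import numpy
import faiss

class LSHWrapper:
    def __init__(self, n_bits):
        self._n_bits = n_bits

    def fit(self, X):
        if X.dtype != numpy.float32:
            X = X.astype(numpy.float32)
        self.index = faiss.IndexLSH(X.shape[1], self._n_bits)
        self.index.train(X)
        self.index.add(X)

    def query(self, v, n):
        # search() returns (distances, ids); for IndexLSH the distances are
        # Hamming distances between the binary codes, not L2 distances
        _, ids = self.index.search(v.reshape(1, -1).astype(numpy.float32), n)
        return ids[0]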
Example #2
    def test_override(self):
        d = 256
        nt = 3500
        nb = 10000
        nq = 500
        (xt, xb, xq) = make_binary_dataset(d, nb, nt, nq)

        def train_and_get_centroids(override_kmeans_index):
            index = faiss.index_binary_factory(d, b"BIVF10")
            index.verbose = True

            if override_kmeans_index is not None:
                index.clustering_index = override_kmeans_index

            index.train(xt)

            centroids = faiss.downcast_IndexBinary(index.quantizer).xb
            return faiss.vector_to_array(centroids).reshape(-1, d // 8)

        centroids_ref = train_and_get_centroids(None)

        # should do the exact same thing
        centroids_new = train_and_get_centroids(faiss.IndexFlatL2(d))

        assert np.all(centroids_ref == centroids_new)

        # will do less accurate assignment... Sanity check that the
        # index is indeed used by kmeans
        centroids_new = train_and_get_centroids(faiss.IndexLSH(d, 16))

        assert not np.all(centroids_ref == centroids_new)
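The same clustering_index override also applies to float IVF indexes; a minimal sketch (dimensions and sizes are arbitrary):

import faiss
import numpy as np

d = 64
xt = np.random.random((5000, d)).astype('float32')

index = faiss.index_factory(d, "IVF32,Flat")
# route the k-means assignment step through an LSH index instead of the
# default exact (flat) search; this trades accuracy for speed, exactly as
# the sanity check in the test above demonstrates
index.clustering_index = faiss.IndexLSH(d, 16)
index.train(xt)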
Example #3
def getmAP(clf, X_base, y_base, X_query, y_query, id_label, y_label):
    y_base, y_query = y_base[:, 0], y_query[:, 0]

    oneh = OneHotEncoder(sparse=False)
    y_label_1h = oneh.fit_transform(y_label)
    activations = clf.predict_proba(X_base)
    activations[id_label] = y_label_1h

    if args.code == "onehot":
        argmax = activations.argmax(axis=1).reshape((-1, 1))
        activations = oneh.fit_transform(argmax)

    if args.code == "lsh":
        index = faiss.IndexLSH(y_label_1h.shape[1], args.nbits, True, True)
    else:
        index = faiss.IndexFlatIP(y_label_1h.shape[1])

    index.train(activations.astype(np.float32))
    index.add(activations.astype(np.float32))

    queryAct = clf.predict_proba(X_query).astype(np.float32)

    _, idc = index.search(queryAct, y_base.shape[0])
    predictions = y_base[idc]
    results = np.equal(predictions, np.expand_dims(y_query, axis=1))

    return computemAP(results)
Example #4
 def test_IndexLSH_32_48(self):
     # CHECK: the difference between 32 and 48 does not make much sense
     for nbits2 in 32, 48:
         q = faiss.IndexLSH(d, nbits2)
         res = ev.launch('LSH half size', q)
         e = ev.evalres(res)
         # should give 0.003  0.019  0.108
         assert e[10] > 0.018
Example #5
 def __init__(self, num_dimensions: int, **kwargs):
     super().__init__(num_dimensions)
     self.faiss_index_type = kwargs.get("faiss_index_type")
     self.faiss_lsh_num_bits = int(kwargs.get("faiss_lsh_num_bits", 4))
     if self.faiss_index_type == "IndexFlatL2":
         self.index = faiss.IndexFlatL2(self.num_dimensions)
     elif self.faiss_index_type == "IndexLSH":
         self.index = faiss.IndexLSH(self.num_dimensions,
                                     self.faiss_lsh_num_bits)
     else:
         raise NotImplementedError(
             f"Faiss index '{self.index_type}' not implemented'")
Example #6
def build_lsh(xb, name, n_bits=None):

    cache_dir = 'LSH_index_cache'
    try:
        os.makedirs(cache_dir)
    except FileExistsError:
        print('{} already exists'.format(cache_dir))

    dim = xb.shape[1]

    if n_bits is None:
        n_bits = dim

    index_cache_fname = os.path.join(
        cache_dir, 'index_{}_{}bits.idx'.format(name, n_bits))

    start_time = time.time()
    if os.path.isfile(index_cache_fname):
        print('Loading existing index...')
        cpuindex = faiss.read_index(index_cache_fname)

        res = faiss.StandardGpuResources()  # use a single GPU
        index = faiss.index_cpu_to_gpu(res, 0, cpuindex)
    else:

        # rotate_data=True and train_thresholds=True
        index = faiss.IndexLSH(dim, n_bits, True, True)

        print('Training index with: {} ...'.format(xb.shape))
        index.train(xb)
        print('Adding ...')
        index.add(xb)

        # the index was built on the CPU here, so it can be written out
        # directly (index_gpu_to_cpu only applies to GPU indexes)
        faiss.write_index(index, index_cache_fname)

    end_time = time.time()
    print('Done in: {}'.format(str(timedelta(seconds=(end_time -
                                                      start_time)))))

    return index
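A rough usage sketch for the helper above, assuming xb is any float32 (n, d) matrix (the data here is synthetic, purely for illustration):

import numpy as np

xb = np.random.random((10000, 128)).astype('float32')
index = build_lsh(xb, 'demo')       # trains, adds and caches on first call
_, ids = index.search(xb[:5], 10)   # Hamming-space neighbors of 5 vectors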
Example #7
    def test_sh(self):
        d = 32
        xt, xb, xq = get_dataset_2(d, 2000, 1000, 200)
        nlist, nprobe = 1, 1

        gt_index = faiss.IndexFlatL2(d)
        gt_index.add(xb)
        gt_D, gt_I = gt_index.search(xq, 10)

        for nbit in 32, 64, 128:
            quantizer = faiss.IndexFlatL2(d)

            index_lsh = faiss.IndexLSH(d, nbit, True)
            index_lsh.add(xb)
            D, I = index_lsh.search(xq, 10)
            ninter = faiss.eval_intersection(I, gt_I)

            print('LSH baseline: %d' % ninter)

            for period in 10.0, 1.0:

                for tt in 'global centroid centroid_half median'.split():
                    index = faiss.IndexIVFSpectralHash(quantizer, d, nlist,
                                                       nbit, period)
                    index.nprobe = nprobe
                    index.threshold_type = getattr(
                        faiss.IndexIVFSpectralHash,
                        'Thresh_' + tt
                    )

                    index.train(xt)
                    index.add(xb)
                    D, I = index.search(xq, 10)

                    ninter = faiss.eval_intersection(I, gt_I)
                    key = (nbit, tt, period)

                    print('(%d, %s, %g): %d, ' % (nbit, repr(tt), period, ninter))
                    assert abs(ninter - self.ref_results[key]) <= 12
Example #8
 def test_indexLSH(self):
     q = faiss.IndexLSH(d, nbits)
     res = ev.launch('FLAT / LSH Cosine', q)
     e = ev.evalres(res)
     # should give 0.070  0.250  0.580
     assert e[10] > 0.2
Example #9
    # np.random.seed(1234)  # make reproducible
    # xb = np.random.random((nb, d)).astype('float32')
    # xb[:, 0] += np.arange(nb) / 1000.
    # xq = np.random.random((nq, d)).astype('float32')
    # xq[:, 0] += np.arange(nq) / 1000.

    print('Generating queries')
    # np.random.seed(666666)
    # np.random.shuffle(trainDataset)
    queries = testDataset[:nq]
    npGroundTruth = np.array(neighbors)
    groundTruth = npGroundTruth[:nq, :topk]
    print('Done')

    n_bits = 2 * d
    index = faiss.IndexLSH(d, n_bits)  # build the index
    print(index.is_trained)
    index.add(trainDataset)  # add vectors to the index
    print(index.ntotal)
    # we want to see the topk nearest neighbors
    t1 = timeit.default_timer()
    D, I = index.search(queries, topk)  # sanity check
    t2 = timeit.default_timer()
    # print(I)
    # print(D)
    # D, I = index.search(xq, k)  # actual search
    # print(I[:5])  # neighbors of the 5 first queries
    # print(I[-5:])

    score = 0.0
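    # The snippet is cut off here. A sketch of a typical recall computation
    # over the results above (an assumption, not the original code):
    for res_row, gt_row in zip(I, groundTruth):
        score += len(set(res_row) & set(gt_row)) / float(topk)
    print('recall@%d: %.4f' % (topk, score / len(queries)))
    print('search time: %.3f s' % (t2 - t1))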
Example #10
import faiss
import numpy as np

objects = np.array([[1, 1, 2, 1], [5, 4, 6, 5], [1, 2, 1, 2]],
                   dtype=np.float32)

index = faiss.IndexLSH(4, 2)
index.add(objects)
distances, ids = index.search(objects, 3)

print(distances)
print(ids)
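With nbits=2 every vector is reduced to a 2-bit code, so the printed distances are Hamming distances in {0, 1, 2} rather than L2 distances. A flat index over the same three vectors makes the contrast visible (a sketch, not part of the original example):

import faiss
import numpy as np

objects = np.array([[1, 1, 2, 1], [5, 4, 6, 5], [1, 2, 1, 2]],
                   dtype=np.float32)

# exact L2 search over the same data, for comparison with the 2-bit codes
flat = faiss.IndexFlatL2(4)
flat.add(objects)
distances, ids = flat.search(objects, 3)

print(distances)  # squared L2 distances
print(ids)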
Example #11
    ## Load Features
    start = time.time()
    final_vids, features, vid2features, id_index = load_features('/home/camp/FIVR/features/vcms_v1', is_gv=False)
    print('Read time: %.2f' % (time.time() - start))
    features = np.asarray(features, np.float32)
    print(features.shape)
    final_vids = np.array(final_vids)

    ## Load K-Means Model
    # kmeans = joblib.load('./kmeans/kmeans.pkl')
    # lables = np.load('./kmeans/kmeans_lables_group.npy', allow_pickle=True)

    print('lshash start:')
    start = time.time()
    index = faiss.IndexLSH(512, 512)
    index.add(features)
    print('build time: %.2f' % (time.time() - start))

    # Load vid2name
    with open('/home/camp/FIVR/vid2name.pk', 'rb') as pk_file:
        vid2names = pk.load(pk_file)

    # Start evaluation
    annotation_dir = '/home/camp/FIVR/annotation'
    final_names = np.array([vid2names[vid][0] for vid in final_vids])
    query_names = None
    results = None
    for task_name in ['DSVR', 'CSVR', 'ISVR']:
        annotation_path = os.path.join(annotation_dir, task_name + '.json')
        with open(annotation_path, 'r') as annotation_file:
Example #12
 def fit(self, X):
     X = X.astype(numpy.float32)
     self._index = faiss.IndexLSH(len(X[0]), self._n_bits)
     self._index.train(X)
     self._index.add(X)
Example #13
import time

import faiss
import numpy
from recall_data import recall_data

# Basic parameters
d = 300             # vector dimensionality
data_size = 10000   # database size
k = 50
nbits = 64          # length of the generated hash codes

# Generate test data
numpy.random.seed(13)
data = numpy.random.random(size=(data_size, d)).astype('float32')
test_data = recall_data

# Create the index and add the vectors
index = faiss.IndexLSH(d, nbits)
# LSH needs no training

# Add the data
start_time = time.time()
index.add(data)  # adding to the index may be a little slow
print("Add vectors: used %.2f sec." % (time.time() - start_time))

start_time = time.time()
D, I = index.search(data[:50], k)  # search the k nearest neighbors of each vector

# Print the results
print("Used %.2f ms" % ((time.time() - start_time) * 1000))
recall_1_count = 0
recall_50_count = 0
for (search_vec, test_vec) in zip(I, test_data):
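    # The snippet is cut off here. A sketch of the missing loop body,
    # assuming recall_data holds the exact neighbor ids for each query:
    if search_vec[0] == test_vec[0]:
        recall_1_count += 1
    recall_50_count += len(set(search_vec) & set(test_vec))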
Example #14
 def build(self, num_bits=8):
     self.index = faiss.IndexLSH(self.dimension, num_bits)
     if not self.index.is_trained:
         self.index.train(self.vectors)
     self.index.add(self.vectors)
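The build method never shows a query path. A self-contained sketch pairing it with a search method (the class name, constructor and search signature are all assumptions):

import faiss
import numpy as np

class VectorStore:
    # hypothetical stand-in for the class build() belongs to
    def __init__(self, vectors):
        self.vectors = np.asarray(vectors, dtype=np.float32)
        self.dimension = self.vectors.shape[1]

    def build(self, num_bits=8):
        self.index = faiss.IndexLSH(self.dimension, num_bits)
        if not self.index.is_trained:
            self.index.train(self.vectors)
        self.index.add(self.vectors)

    def search(self, queries, k=10):
        queries = np.atleast_2d(np.asarray(queries, dtype=np.float32))
        # distances are Hamming distances between num_bits-wide codes
        return self.index.search(queries, k)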