Example #1
    def test_exception_2(self):

        try:
            faiss.index_factory(12, 'IVF256,Flat,PQ8')
        except RuntimeError as e:
            assert 'could not parse' in str(e)
        else:
            assert False, 'exception did not fire???'
Example #2
    def test_factory_3(self):

        index = faiss.index_factory(12, "IVF10,PQ4")
        faiss.ParameterSpace().set_index_parameter(index, "nprobe", 3)
        assert index.nprobe == 3

        index = faiss.index_factory(12, "PCAR8,IVF10,PQ4")
        faiss.ParameterSpace().set_index_parameter(index, "nprobe", 3)
        assert faiss.downcast_index(index.index).nprobe == 3
Example #3
    def test_white(self):

        # generate data
        d = 4
        nt = 1000
        nb = 200
        nq = 200

        # normal distribution
        x = faiss.randn((nt + nb + nq) * d, 1234).reshape(nt + nb + nq, d)

        index = faiss.index_factory(d, 'Flat')

        xt = x[:nt]
        xb = x[nt:-nq]
        xq = x[-nq:]

        # NN search on normal distribution
        index.add(xb)
        Do, Io = index.search(xq, 5)

        # make distribution very skewed
        x *= [10, 4, 1, 0.5]
        rr, _ = np.linalg.qr(faiss.randn(d * d).reshape(d, d))
        x = np.dot(x, rr).astype('float32')

        xt = x[:nt]
        xb = x[nt:-nq]
        xq = x[-nq:]

        # L2 search on skewed distribution
        index = faiss.index_factory(d, 'Flat')

        index.add(xb)
        Dl2, Il2 = index.search(xq, 5)

        # whiten + L2 search on L2 distribution
        index = faiss.index_factory(d, 'PCAW%d,Flat' % d)
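        # PCAW<d> applies a whitening PCA down to d dimensions before the exact Flat index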

        index.train(xt)
        index.add(xb)
        Dw, Iw = index.search(xq, 5)

        # make sure correlation of whitened results with original
        # results is much better than simple L2 distances
        # should be 961 vs. 264
        assert (faiss.eval_intersection(Io, Iw) >
                2 * faiss.eval_intersection(Io, Il2))
Example #4
def get_trained_index():
    filename = "%s/%s_%s_trained.index" % (
        tmpdir, dbname, index_key)

    if not os.path.exists(filename):
        index = faiss.index_factory(d, index_key)

        n_train = choose_train_size(index_key)

        xtsub = xt[:n_train]
        print "Keeping %d train vectors" % xtsub.shape[0]
        # make sure the data is actually in RAM and in float
        xtsub = xtsub.astype('float32').copy()
        index.verbose = True

        t0 = time.time()
        index.train(xtsub)
        index.verbose = False
        print "train done in %.3f s" % (time.time() - t0)
        print "storing", filename
        faiss.write_index(index, filename)
    else:
        print "loading", filename
        index = faiss.read_index(filename)
    return index
Example #5
 def fit(self, X):
     X = X.astype(numpy.float32)
     self._index = faiss.index_factory(len(X[0]), "IVF%d,PQ64" % self._n_bits)
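     # IVF index with self._n_bits inverted lists; PQ64 = 64-byte product-quantizer codes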
     co = faiss.GpuClonerOptions()
     co.useFloat16 = True
     self._index = faiss.index_cpu_to_gpu(self._res, 0, self._index, co)
     self._index.train(X)
     self._index.add(X)
     self._index.setNumProbes(self._n_probes)
Example #6
    def do_cpu_to_gpu(self, index_key):
        ts = []
        ts.append(time.time())
        (xt, xb, xq) = self.get_dataset(small_one=True)
        nb, d = xb.shape

        index = faiss.index_factory(d, index_key)
        if index.__class__ == faiss.IndexIVFPQ:
            # speed up test
            index.pq.cp.niter = 2
            index.do_polysemous_training = False
        ts.append(time.time())

        index.train(xt)
        ts.append(time.time())

        # adding some ids because there was a bug in this case
        index.add_with_ids(xb, np.arange(nb) * 3 + 12345)
        ts.append(time.time())

        index.nprobe = 4
        D, Iref = index.search(xq, 10)
        ts.append(time.time())

        res = faiss.StandardGpuResources()
        gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
        ts.append(time.time())

        gpu_index.setNumProbes(4)

        D, Inew = gpu_index.search(xq, 10)
        ts.append(time.time())
        print('times:', [t - ts[0] for t in ts])

        self.assertGreaterEqual((Iref == Inew).sum(), Iref.size)

        if faiss.get_num_gpus() == 1:
            return

        for shard in False, True:

            # test on just 2 GPUs
            res = [faiss.StandardGpuResources() for i in range(2)]
            co = faiss.GpuMultipleClonerOptions()
            co.shard = shard

            gpu_index = faiss.index_cpu_to_gpu_multiple_py(res, index, co)

            faiss.GpuParameterSpace().set_index_parameter(
                gpu_index, 'nprobe', 4)

            D, Inew = gpu_index.search(xq, 10)

            self.assertGreaterEqual((Iref == Inew).sum(), Iref.size)
Example #7
    def test_factory_1(self):

        index = faiss.index_factory(12, "IVF10,PQ4")
        assert index.do_polysemous_training

        index = faiss.index_factory(12, "IVF10,PQ4np")
        assert not index.do_polysemous_training

        index = faiss.index_factory(12, "PQ4")
        assert index.do_polysemous_training

        index = faiss.index_factory(12, "PQ4np")
        assert not index.do_polysemous_training

        try:
            index = faiss.index_factory(10, "PQ4")
        except RuntimeError:
            pass
        else:
            assert False, "should do a runtime error"
Example #8
    def test_chain(self):

        # generate data
        d = 4
        nt = 1000
        nb = 200
        nq = 200

        # normal distribution
        x = faiss.randn((nt + nb + nq) * d, 1234).reshape(nt + nb + nq, d)

        # make distribution very skewed
        x *= [10, 4, 1, 0.5]
        rr, _ = np.linalg.qr(faiss.randn(d * d).reshape(d, d))
        x = np.dot(x, rr).astype('float32')

        xt = x[:nt]
        xb = x[nt:-nq]
        xq = x[-nq:]

        index = faiss.index_factory(d, "L2norm,PCA2,L2norm,Flat")

        assert index.chain.size() == 3
        l2_1 = faiss.downcast_VectorTransform(index.chain.at(0))
        assert l2_1.norm == 2
        pca = faiss.downcast_VectorTransform(index.chain.at(1))
        assert not pca.is_trained
        index.train(xt)
        assert pca.is_trained

        index.add(xb)
        D, I = index.search(xq, 5)

        # do the computation manually and check if we get the same result
        def manual_trans(x):
            x = x.copy()
            faiss.normalize_L2(x)
            x = pca.apply_py(x)
            faiss.normalize_L2(x)
            return x

        index2 = faiss.IndexFlatL2(2)
        index2.add(manual_trans(xb))
        D2, I2 = index2.search(manual_trans(xq), 5)

        assert np.all(I == I2)
Example #9
    def test_remove_id_map_2(self):
        # from https://github.com/facebookresearch/faiss/issues/255
        rs = np.random.RandomState(1234)
        X = rs.randn(10, 10).astype(np.float32)
        idx = np.array([0, 10, 20, 30, 40, 5, 15, 25, 35, 45], np.int64)
        remove_set = np.array([10, 30], dtype=np.int64)
        index = faiss.index_factory(10, 'IDMap,Flat')
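        # IDMap wraps the Flat index so arbitrary 64-bit ids can be added and removed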
        index.add_with_ids(X[:5, :], idx[:5])
        index.remove_ids(remove_set)
        index.add_with_ids(X[5:, :], idx[5:])

        print (index.search(X, 1))

        for i in range(10):
            _, searchres = index.search(X[i:i + 1, :], 1)
            if idx[i] in remove_set:
                assert searchres[0] != idx[i]
            else:
                assert searchres[0] == idx[i]
Example #10
    def test_update(self):
        d = 64
        nb = 1000
        nt = 1500
        nq = 100
        np.random.seed(123)
        xb = np.random.random(size=(nb, d)).astype('float32')
        xt = np.random.random(size=(nt, d)).astype('float32')
        xq = np.random.random(size=(nq, d)).astype('float32')

        index = faiss.index_factory(d, "IVF64,Flat")
        index.train(xt)
        index.add(xb)
        index.nprobe = 32
        D, I = index.search(xq, 5)

        index.make_direct_map()
        recons_before = np.vstack([index.reconstruct(i) for i in range(nb)])

        # reverse the order of the first 200 vectors
        nu = 200
        index.update_vectors(np.arange(nu), xb[nu - 1::-1].copy())

        recons_after = np.vstack([index.reconstruct(i) for i in range(nb)])

        # make sure reconstructions remain the same
        diff_recons = recons_before[:nu] - recons_after[nu - 1::-1]
        assert np.abs(diff_recons).max() == 0

        D2, I2 = index.search(xq, 5)

        assert np.all(D == D2)

        gt_map = np.arange(nb)
        gt_map[:nu] = np.arange(nu, 0, -1) - 1
        eqs = I.ravel() == gt_map[I2.ravel()]

        assert np.all(eqs)
Example #11
    if isinstance(index, faiss.IndexPreTransform):
        index_hnsw = faiss.downcast_index(index.index)
        vec_transform = index.chain.at(0).apply_py
    else:
        index_hnsw = index
        vec_transform = lambda x: x

    hnsw = index_hnsw.hnsw
    hnsw_stats = faiss.cvar.hnsw_stats

else:

    print "build index, key=", args.indexkey

    index = faiss.index_factory(d, args.indexkey)

    if isinstance(index, faiss.IndexPreTransform):
        index_hnsw = faiss.downcast_index(index.index)
        vec_transform = index.chain.at(0).apply_py
    else:
        index_hnsw = index
        vec_transform = lambda x: x

    hnsw = index_hnsw.hnsw
    hnsw.efConstruction = args.efConstruction
    hnsw_stats = faiss.cvar.hnsw_stats
    index.verbose = True
    index_hnsw.verbose = True
    index_hnsw.storage.verbose = True
Example #12
    def test_factory_2(self):

        index = faiss.index_factory(12, "SQ8")
        assert index.code_size == 12
Example #13
    D, I = index.search(xq, k)
    t1 = time.time()

    # the recall should be 1 at all times
    recall_at_1 = (I[:, :1] == gt[:, :1]).sum() / float(nq)
    print "k=%d %.3f s, R@1 %.4f" % (
        k, t1 - t0, recall_at_1)


#################################################################
#  Approximate search experiment
#################################################################

print "============ Approximate search"

index = faiss.index_factory(d, "IVF4096,PQ64")

# faster, uses more memory
# index = faiss.index_factory(d, "IVF16384,Flat")

co = faiss.GpuClonerOptions()

# here we are using a 64-byte PQ, so we must set the lookup tables to
# 16 bit float (this is due to the limited temporary memory).
co.useFloat16 = True

index = faiss.index_cpu_to_gpu(res, 0, index, co)

print "train"

index.train(xt)
Example #14
    def test_factory_2(self):

        index = faiss.index_factory(12, "SQ8")
        assert index.code_size == 12
Example #15
def fvecs_read(fname):
    return ivecs_read(fname).view('float32')


#################################################################
#  Main program
#################################################################

stage = int(sys.argv[1])

tmpdir = '/tmp/'

if stage == 0:
    # train the index
    xt = fvecs_read("sift1M/sift_learn.fvecs")
    index = faiss.index_factory(xt.shape[1], "IVF4096,Flat")
    print("training index")
    index.train(xt)
    print("write " + tmpdir + "trained.index")
    faiss.write_index(index, tmpdir + "trained.index")

if 1 <= stage <= 4:
    # add 1/4 of the database to 4 independent indexes
    bno = stage - 1
    xb = fvecs_read("sift1M/sift_base.fvecs")
    i0, i1 = int(bno * xb.shape[0] / 4), int((bno + 1) * xb.shape[0] / 4)
    index = faiss.read_index(tmpdir + "trained.index")
    print("adding vectors %d:%d" % (i0, i1))
    index.add_with_ids(xb[i0:i1], np.arange(i0, i1))
    print("write " + tmpdir + "block_%d.index" % bno)
    faiss.write_index(index, tmpdir + "block_%d.index" % bno)
Example #16
 def __init__(self, view):
     self.view = view
     d = self.view.shape[-1]
     self.index = faiss.index_factory(d, "Flat", faiss.METRIC_INNER_PRODUCT)
     faiss.normalize_L2(self.view)
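     # inner-product search on L2-normalized vectors is equivalent to cosine similarity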
     self.index.add(view)
Example #17
    k = 1 << lk
    t0 = time.time()
    D, I = index.search(xq, k)
    t1 = time.time()

    # the recall should be 1 at all times
    recall_at_1 = (I[:, :1] == gt[:, :1]).sum() / float(nq)
    print "k=%d %.3f s, R@1 %.4f" % (k, t1 - t0, recall_at_1)

#################################################################
#  Approximate search experiment
#################################################################

print "============ Approximate search"

index = faiss.index_factory(d, "IVF4096,PQ64")

# faster, uses more memory
# index = faiss.index_factory(d, "IVF16384,Flat")

co = faiss.GpuClonerOptions()

# here we are using a 64-byte PQ, so we must set the lookup tables to
# 16 bit float (this is due to the limited temporary memory).
co.useFloat16 = True

index = faiss.index_cpu_to_gpu(res, 0, index, co)

print "train"

index.train(xt)
Example #18
    def do_cpu_to_gpu(self, index_key):
        ts = []
        ts.append(time.time())
        (xt, xb, xq) = self.get_dataset(small_one=True)
        nb, d = xb.shape

        index = faiss.index_factory(d, index_key)
        if index.__class__ == faiss.IndexIVFPQ:
            # speed up test
            index.pq.cp.niter = 2
            index.do_polysemous_training = False
        ts.append(time.time())

        index.train(xt)
        ts.append(time.time())

        # adding some ids because there was a bug in this case;
        # those need to be cast to idx_t(= int64_t), because
        # on windows the numpy int default is int32
        ids = (np.arange(nb) * 3 + 12345).astype('int64')
        index.add_with_ids(xb, ids)
        ts.append(time.time())

        index.nprobe = 4
        Dref, Iref = index.search(xq, 10)
        ts.append(time.time())

        res = faiss.StandardGpuResources()
        gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
        ts.append(time.time())

        # Validate the layout of the memory info
        mem_info = res.getMemoryInfo()

        assert type(mem_info) == dict
        assert type(mem_info[0]['FlatData']) == tuple
        assert type(mem_info[0]['FlatData'][0]) == int
        assert type(mem_info[0]['FlatData'][1]) == int

        gpu_index.setNumProbes(4)

        Dnew, Inew = gpu_index.search(xq, 10)
        ts.append(time.time())
        print('times:', [t - ts[0] for t in ts])

        # Give us some margin of error
        self.assertGreaterEqual((Iref == Inew).sum(), Iref.size - 50)

        if faiss.get_num_gpus() == 1:
            return

        for shard in False, True:

            # test on just 2 GPUs
            res = [faiss.StandardGpuResources() for i in range(2)]
            co = faiss.GpuMultipleClonerOptions()
            co.shard = shard

            gpu_index = faiss.index_cpu_to_gpu_multiple_py(res, index, co)

            faiss.GpuParameterSpace().set_index_parameter(
                gpu_index, 'nprobe', 4)

            Dnew, Inew = gpu_index.search(xq, 10)

            # 0.99: allow some tolerance in results otherwise test
            # fails occasionally (not reproducible)
            self.assertGreaterEqual((Iref == Inew).sum(), Iref.size * 0.99)
Example #19
 def test_sq_cpu_to_gpu(self):
     res = faiss.StandardGpuResources()
     index = faiss.index_factory(32, "SQfp16")
     index.add(np.random.rand(1000, 32).astype(np.float32))
     gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
     self.assertIsInstance(gpu_index, faiss.GpuIndexFlat)
        "FAISS was not compiled with GPU support, or loading _swigfaiss_gpu.so failed"
    res = faiss.StandardGpuResources()
    dev_no = 0

# remember results from other index types
op_per_key = []

# keep track of optimal operating points seen so far
op = faiss.OperatingPoints()

for index_key in keys_to_test:

    print "============ key", index_key

    # make the index described by the key
    index = faiss.index_factory(d, index_key)

    if use_gpu:
        # transfer to GPU (may be partial)
        index = faiss.index_cpu_to_gpu(res, dev_no, index)
        params = faiss.GpuParameterSpace()
    else:
        params = faiss.ParameterSpace()

    params.initialize(index)

    print "[%.3f s] train & add" % (time.time() - t0)

    index.train(xt)
    index.add(xb)
Example #21
def run(spark_session, cfg):
    score_vector_table = cfg['score_vector_rebucketing'][
        'score_vector_alpha_table']
    similarity_table = cfg['top_n_similarity']['similarity_table']
    top_n_value = cfg['top_n_similarity']['top_n']
    aid_bucket_size = cfg['top_n_similarity']['aid_bucket_size']
    load_bucket_step = cfg['top_n_similarity']['load_bucket_step']
    search_bucket_step = cfg['top_n_similarity']['search_bucket_step']
    index_factory_string = cfg['top_n_similarity']['index_factory_string']

    # If the number of GPUs is 0, faiss-cpu is probably installed instead of faiss-gpu.
    num_gpus = faiss.get_num_gpus()
    assert num_gpus != 0
    print('Number of GPUs available: {}'.format(num_gpus))

    start_time = time.time()

    # Load the score vectors into the index.
    aid_list = []
    for aid_bucket in range(0, aid_bucket_size, load_bucket_step):
        print('Loading alpha buckets {} - {} of {}'.format(
            aid_bucket, aid_bucket + load_bucket_step - 1, aid_bucket_size))
        (aids, score_vectors,
         _) = load_score_vectors(spark_session, score_vector_table, aid_bucket,
                                 load_bucket_step, aid_bucket_size)

        # Keep track of the aids.
        if aid_bucket == 0:
            aid_list = aids
        else:
            aid_list = np.concatenate((aid_list, aids))

        # Create the FAISS index on the first iteration.
        if aid_bucket == 0:
            cpu_index = faiss.index_factory(score_vectors.shape[1],
                                            index_factory_string)
            gpu_index = faiss.index_cpu_to_all_gpus(cpu_index)

        # Train the index only if it has not been trained yet.
        if not gpu_index.is_trained:
            gpu_index.train(score_vectors)

        # Add the vectors to the index.
        gpu_index.add(score_vectors)

    load_time = time.time()

    # Find the top N by bucket step.
    start_load = time.time()
    mode = 'overwrite'
    total_search_time = 0
    total_load_time = 0
    total_format_time = 0
    total_write_time = 0
    for aid_bucket in range(0, aid_bucket_size, search_bucket_step):
        print('Searching alpha buckets {} - {} of {}'.format(
            aid_bucket, aid_bucket + search_bucket_step - 1, aid_bucket_size))

        # Load the users to perform the search with.
        print('Loading users from Hive')
        (aids, score_vectors,
         buckets) = load_score_vectors(spark_session, score_vector_table,
                                       aid_bucket, search_bucket_step,
                                       aid_bucket_size)
        end_load = time.time()
        total_load_time += end_load - start_load

        # Search for the top N similar users for bucket.
        print('Performing the search')
        top_n_distances, top_n_indices = gpu_index.search(
            score_vectors, top_n_value)
        end_search = time.time()
        total_search_time += end_search - end_load

        # Get the top N aids from the top N indexes.
        top_n_aids = aid_list[top_n_indices]

        # Format and write the result back to Hive.
        # Format the data for a Spark dataframe in order to write to Hive.
        #  [ ('0000001', [{'aid':'0000001', 'score':1.73205081}, {'aid':'0000003', 'score':1.73205081}, {'aid':'0000004', 'score':0.88532267}, {'aid':'0000002', 'score':0.66903623}], 0),
        #    ('0000002', [{'aid':'0000002', 'score':1.73205081}, {'aid':'0000004', 'score':1.50844401}, {'aid':'0000001', 'score':0.66903623}, {'aid':'0000003', 'score':0.66903623}], 0),
        #    ... ]
        print('Formatting the output')
        data = [(str(aid), [(str(n_aid), float(distance))
                            for n_aid, distance in zip(top_aid, top_distances)
                            ], int(bucket))
                for aid, top_aid, top_distances, bucket in zip(
                    aids, top_n_aids, top_n_distances, buckets)]

        # Output dataframe schema.
        schema = StructType([
            StructField("aid", StringType(), True),
            StructField(
                "top_n_similar_user",
                ArrayType(
                    StructType([
                        StructField('aid', StringType(), False),
                        StructField('score', FloatType(), False)
                    ]), True)),
            StructField("aid_bucket", IntegerType(), True)
        ])

        # Create the output dataframe with the similar users for each user.
        df = spark_session.createDataFrame(
            spark_session.sparkContext.parallelize(data), schema)
        end_format = time.time()
        total_format_time += end_format - end_search

        # Write the output dataframe to Hive.
        print('Writing output to Hive')
        write_to_table_with_partition(df,
                                      similarity_table,
                                      partition=('aid_bucket'),
                                      mode=mode)
        mode = 'append'
        end_write = time.time()
        total_write_time += end_write - end_format
        start_load = end_write

    search_time = time.time()
    print('Index size:', gpu_index.ntotal)
    print(gpu_index.d)
    print(4 * gpu_index.d * gpu_index.ntotal, 'bytes (uncompressed)')
    print('Total time:           ',
          str(datetime.timedelta(seconds=search_time - start_time)))
    print(' Index load time:     ',
          str(datetime.timedelta(seconds=load_time - start_time)))
    print(' Overall search time: ',
          str(datetime.timedelta(seconds=search_time - load_time)))
    print('   Total load time:   ',
          str(datetime.timedelta(seconds=total_load_time)))
    print('   Total search time: ',
          str(datetime.timedelta(seconds=total_search_time)))
    print('   Total format time: ',
          str(datetime.timedelta(seconds=total_format_time)))
    print('   Total write time:  ',
          str(datetime.timedelta(seconds=total_write_time)))
Example #22
    def test_exception_2(self):

        try:
            faiss.index_factory(12, 'IVF256,Flat,PQ8')
        except RuntimeError as e:
            assert 'could not parse' in str(e)
Example #23
faiss library - large-scale high-dimensional domain-name similarity computation
author  :   h-j-13
time    :   2018-6-25
"""

import time
import numpy
import faiss

# basic parameters
d = 300  # vector dimensionality
data_size = 500000  # database size
k = 50

# build the index
index = faiss.index_factory(d, "OPQ8_64,IVF2000,PQ8")

# generate test data
numpy.random.seed(13)

data = numpy.random.random(size=(data_size, d)).astype('float32')

# train the index
start_time = time.time()
index.train(data)
print("Train Index Used %.2f sec." % (time.time() - start_time))

for i in range(250):
    # add data
    data = numpy.random.random(size=(data_size, d)).astype('float32')
    start_time = time.time()
Example #24
 def test_factory_4(self):
     index = faiss.index_factory(12, "IVF10,FlatDedup")
     assert index.instances is not None
Example #25
 def test_set_gpu_param(self):
     index = faiss.index_factory(12, "PCAR8,IVF10,PQ4")
     res = faiss.StandardGpuResources()
     gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
     faiss.GpuParameterSpace().set_index_parameter(gpu_index, "nprobe", 3)
Example #26
    def build(self, config):
        '''
            build index from scratch
        '''
        operation_method = config.get("index_operation", "new").lower()

        gallery_images, gallery_docs = split_datafile(
            config['data_file'], config['image_root'], config['delimiter'])

        # when removing data from the index, there is no need to extract features
        if operation_method != "remove":
            gallery_features = self._extract_features(gallery_images, config)
        assert operation_method in [
            "new", "remove", "append"
        ], "Only append, remove and new operation are supported"

        # vector.index: faiss index file
        # id_map.pkl: use this file to map id to image_doc
        if operation_method in ["remove", "append"]:
            # if remove or append, vector.index and id_map.pkl must exist
            assert os.path.exists(
                os.path.join(config["index_dir"], "vector.index")
            ), "The vector.index does not exist in {} when 'index_operation' is not None".format(
                config["index_dir"])
            assert os.path.exists(
                os.path.join(config["index_dir"], "id_map.pkl")
            ), "The id_map.pkl does not exist in {} when 'index_operation' is not None".format(
                config["index_dir"])
            index = faiss.read_index(
                os.path.join(config["index_dir"], "vector.index"))
            with open(os.path.join(config["index_dir"], "id_map.pkl"),
                      'rb') as fd:
                ids = pickle.load(fd)
            assert index.ntotal == len(ids.keys()), \
                "number of vectors in the index does not match id_map"
        else:
            if not os.path.exists(config["index_dir"]):
                os.makedirs(config["index_dir"], exist_ok=True)
            index_method = config.get("index_method", "HNSW32")

            # if IVF method, compute the number of IVF lists automatically
            if index_method == "IVF":
                index_method = index_method + str(
                    min(int(len(gallery_images) // 8), 65536)) + ",Flat"

            # for a binary index, prefix index_method with "B"
            if config["dist_type"] == "hamming":
                index_method = "B" + index_method

            # distance type
            dist_type = faiss.METRIC_INNER_PRODUCT if config[
                "dist_type"] == "IP" else faiss.METRIC_L2

            # build the index
            if config["dist_type"] == "hamming":
                index = faiss.index_binary_factory(config["embedding_size"],
                                                   index_method)
            else:
                index = faiss.index_factory(config["embedding_size"],
                                            index_method, dist_type)
                index = faiss.IndexIDMap2(index)
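                # IDMap2 keeps an explicit int64 id mapping so vectors can be added, removed and reconstructed by id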
            ids = {}

        if config["index_method"] == "HNSW32":
            logger.warning(
                "The HNSW32 method dose not support 'remove' operation")

        if operation_method != "remove":
            # calculate id for new data
            start_id = max(ids.keys()) + 1 if ids else 0
            ids_now = (
                np.arange(0, len(gallery_images)) + start_id).astype(np.int64)

            # only train when building a new index file
            if operation_method == "new":
                if config["dist_type"] == "hamming":
                    index.add(gallery_features)
                else:
                    index.train(gallery_features)

            if not config["dist_type"] == "hamming":
                index.add_with_ids(gallery_features, ids_now)

            for i, d in zip(list(ids_now), gallery_docs):
                ids[i] = d
        else:
            if config["index_method"] == "HNSW32":
                raise RuntimeError(
                    "The index_method: HNSW32 dose not support 'remove' operation"
                )
            # remove ids from id_map and remove the corresponding vectors from the faiss index
            remove_ids = list(
                filter(lambda k: ids.get(k) in gallery_docs, ids.keys()))
            remove_ids = np.asarray(remove_ids)
            index.remove_ids(remove_ids)
            for k in remove_ids:
                del ids[k]

        # store faiss index file and id_map file
        if config["dist_type"] == "hamming":
            faiss.write_index_binary(
                index, os.path.join(config["index_dir"], "vector.index"))
        else:
            faiss.write_index(
                index, os.path.join(config["index_dir"], "vector.index"))

        with open(os.path.join(config["index_dir"], "id_map.pkl"), 'wb') as fd:
            pickle.dump(ids, fd)
Example #27
 def test_set_gpu_param(self):
     index = faiss.index_factory(12, "PCAR8,IVF10,PQ4")
     res = faiss.StandardGpuResources()
     gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
     faiss.GpuParameterSpace().set_index_parameter(gpu_index, "nprobe", 3)
Example #28
def fvecs_read(fname):
    return ivecs_read(fname).view('float32')


#################################################################
#  Main program
#################################################################

stage = int(sys.argv[1])

tmpdir = '/tmp/'

if stage == 0:
    # train the index
    xt = fvecs_read("sift1M/sift_learn.fvecs")
    index = faiss.index_factory(xt.shape[1], "IVF4096,Flat")
    print("training index")
    index.train(xt)
    print("write " + tmpdir + "trained.index")
    faiss.write_index(index, tmpdir + "trained.index")


if 1 <= stage <= 4:
    # add 1/4 of the database to 4 independent indexes
    bno = stage - 1
    xb = fvecs_read("sift1M/sift_base.fvecs")
    i0, i1 = int(bno * xb.shape[0] / 4), int((bno + 1) * xb.shape[0] / 4)
    index = faiss.read_index(tmpdir + "trained.index")
    print("adding vectors %d:%d" % (i0, i1))
    index.add(xb[i0:i1])
    print("write " + tmpdir + "block_%d.index" % bno)
Example #29
    dev_no = 0

# remember results from other index types
op_per_key = []


# keep track of optimal operating points seen so far
op = faiss.OperatingPoints()


for index_key in keys_to_test:

    print "============ key", index_key

    # make the index described by the key
    index = faiss.index_factory(d, index_key)


    if use_gpu:
        # transfer to GPU (may be partial)
        index = faiss.index_cpu_to_gpu(res, dev_no, index)
        params = faiss.GpuParameterSpace()
    else:
        params = faiss.ParameterSpace()

    params.initialize(index)

    print "[%.3f s] train & add" % (time.time() - t0)

    index.train(xt)
    index.add(xb)
Example #30
bowDiction = cv2.BOWImgDescriptorExtractor(sift, cv2.BFMatcher(cv2.NORM_L2))
bowDiction.setVocabulary(dictionary)
print "bow dictionary", np.shape(dictionary)


# returns descriptor of image at pth
def feature_extract(pth):
    im = cv2.imread(pth, 1)
    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    return bowDiction.compute(gray, sift.detect(gray))


# ------------------- train faiss index
# prepare index
index = faiss.index_factory(bow_num_words, INDEX_KEY)
# index = faiss.IndexIDMap(index)
if USE_GPU:
    print("Use GPU...")
    res = faiss.StandardGpuResources()
    index = faiss.index_cpu_to_gpu(res, 0, index)

# prepare ids
ids_count = 1
index_dict = {}
ids = []
features = np.matrix([])

for file_name in images_list:
    print(ids_count)
    dsc = feature_extract(file_name)