Example 1
    def test_interleaved(self):
        res = faiss.StandardGpuResources()
        d = 128
        nb = 5000
        nq = 50

        rs = np.random.RandomState(123)
        xb = rs.rand(nb, d).astype('float32')
        xq = rs.rand(nq, d).astype('float32')

        nlist = int(math.sqrt(nb))
        sub_q = 16
        bits_per_code = 8
        nprobe = 4

        config = faiss.GpuIndexIVFPQConfig()
        config.alternativeLayout = True
        idx_gpu = faiss.GpuIndexIVFPQ(res, d, nlist, sub_q, bits_per_code,
                                      faiss.METRIC_L2, config)
        q = faiss.IndexFlatL2(d)
        idx_cpu = faiss.IndexIVFPQ(q, d, nlist, sub_q, bits_per_code,
                                   faiss.METRIC_L2)

        idx_gpu.train(xb)
        idx_gpu.add(xb)
        idx_cpu.train(xb)
        idx_cpu.add(xb)

        idx_gpu.nprobe = nprobe
        idx_cpu.nprobe = nprobe

        # Try without precomputed codes
        d_g, i_g = idx_gpu.search(xq, 10)
        d_c, i_c = idx_cpu.search(xq, 10)
        self.assertGreaterEqual((i_g == i_c).sum(), i_g.size - 10)
        self.assertTrue(np.allclose(d_g, d_c))

        # Try with precomputed codes (different kernel)
        idx_gpu.setPrecomputedCodes(True)
        d_g, i_g = idx_gpu.search(xq, 10)
        d_c, i_c = idx_cpu.search(xq, 10)
        self.assertGreaterEqual((i_g == i_c).sum(), i_g.size - 10)
        self.assertTrue(np.allclose(d_g, d_c))
Example 2
def create_index(template_path, index_file=None):

    gallery_file = '/media/kaicao/data2/AutomatedLatentRecognition/Results/template/NISTSD14_F.mat'
    if os.path.exists(gallery_file):
        D_gallery = sio.loadmat(gallery_file)
    else:
        des, finger_ID, minutiae = get_all_faetures(
            template_path=template_path, prefix='F')
        D_gallery = {}
        D_gallery['des'] = des
        D_gallery['finger_ID'] = finger_ID
        D_gallery['minutiae'] = minutiae
        sio.savemat(gallery_file, D_gallery)

# query_file = '/media/kaicao/data2/AutomatedLatentRecognition/Results/template/NISTSD14_S.mat'
# if os.path.exists(query_file):
#     D_query = sio.loadmat(query_file)
# else:
#     des, finger_ID, minutiae = get_all_faetures(template_path=template_path, prefix='S')
#     D_query = {}
#     D_query['des'] = des
#     D_query['finger_ID'] = finger_ID
#     D_query['minutiae'] = minutiae
#     sio.savemat(query_file, D_query)

    finger_ID = D_gallery['finger_ID']
    minutiae = D_gallery['minutiae']
    des = D_gallery['des'].copy().astype('float32')
    #des = des[:1280,:32]
    print(des.shape)
    del D_gallery
    dim = des.shape[1]  # feature dimension

    nlist = 100
    m = 16
    quantizer = faiss.IndexFlatL2(dim)  # this remains the same
    index = faiss.IndexIVFPQ(quantizer, dim, nlist, m, 8)
    #pdb.set_trace()
    index.train(des)
    index.add(des)
    if index_file is not None:
        faiss.write_index(index, index_file)
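
A minimal sketch of loading and querying the saved index afterwards; the file name, nprobe value, and query batch are assumptions, not part of the original:

import numpy as np
import faiss

index = faiss.read_index('gallery.index')    # whatever path was passed as index_file
index.nprobe = 10                             # search 10 of the nlist=100 inverted lists
queries = np.random.rand(5, index.d).astype('float32')  # stand-in query descriptors
distances, ids = index.search(queries, 10)    # top-10 gallery matches per query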
Example 3
 def __init__(self,
              vec_dimension: int,
              transformation: Callable = None,
              metric: str = 'l2',
              num_clusters: int = None,
              num_probe: int = None,
              num_bytes: int = None):
     self.quantizer = faiss.IndexFlat(vec_dimension, self.metrics[metric])
     if num_bytes is not None:
         # the metric is the sixth constructor argument; the fifth is bits
         # per sub-quantizer code, so pass 8 explicitly
         self.index = faiss.IndexIVFPQ(self.quantizer, vec_dimension,
                                       num_clusters, num_bytes, 8,
                                       self.metrics[metric])
     else:
         self.index = faiss.IndexIVFFlat(self.quantizer, vec_dimension,
                                         num_clusters, self.metrics[metric])
     self.index.nprobe = num_probe
     self.index.make_direct_map()
     self.transformation = transformation
     self.mapper = {}
     self.inverted_mapper = {}
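
The constructor relies on a self.metrics lookup that is not part of this excerpt. A plausible class-level mapping, reconstructed as an assumption from the metric: str = 'l2' default:

import faiss

class VectorIndex:  # hypothetical enclosing class
    metrics = {
        'l2': faiss.METRIC_L2,
        'ip': faiss.METRIC_INNER_PRODUCT,
    }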
Example 4
def create_faiss_indices(model_code="dsbert", quantizer="HNSWFlat"):
    """
    Creates indices for efficient similarity search.
    Args:
        model_code (str): code for the model used to create sentence embeddings
        quantizer (str): quantizer for compressing the indices for efficient search
    """

    embeddings = create_sentence_transformer_embeddings(
        model_code, df.definition.to_list())
    if quantizer == "HNSWFlat":
        quantizer = faiss.IndexHNSWFlat(embeddings.shape[1], 32)
        index = faiss.IndexIVFPQ(quantizer, embeddings.shape[1], 3, 16, 8)
        # Step 3: Pass the index to IndexIDMap
        index = faiss.IndexIDMap(index)

        # Step 4: Add vectors and their IDs
        index.train(embeddings)
        index.add_with_ids(embeddings, df.id.values)
        faiss.write_index(index, "store/wordnet_" + model_code + ".index")
Example 5
    def init_faiss(self, matrix):
        self.train_data = np.matrix(matrix).astype('float32')
        logging.debug('FAISS init quantizer')
        self.f_quantizer = faiss.IndexFlatL2(self.dim)
        # Lock index read / write until it is built
        with self._lock:
            logging.debug('FAISS init index')
            self.f_index = faiss.IndexIVFPQ(self.f_quantizer, self.dim,
                                            self.nlist, self.bytesPerVec,
                                            self.bytesPerSubVec)
            logging.debug('FAISS train index')
            self.f_index.train(self.train_data)
            logging.debug('FAISS train index finished')

            # write index to disk
            self.model_loaded = self.save_model_to_disk(
                self.model_location, self.f_index)
        self.is_initiated_ = self.model_loaded

        return self.is_initiated_
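
init_faiss trains and persists the index but never adds vectors. A minimal follow-up sketch; the method name and return value are assumptions:

    def add_vectors(self, matrix):
        vecs = np.matrix(matrix).astype('float32')
        with self._lock:
            self.f_index.add(vecs)         # add only after training succeeded
        return int(self.f_index.ntotal)    # number of stored vectors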
Example 6
    def _post_process(self, dataset, resources_paths):
        if self.config.with_index:
            index_file = resources_paths["embeddings_index"]
            if os.path.exists(index_file):
                dataset.load_faiss_index("embeddings", index_file)
            else:
                if "embeddings" not in dataset.column_names:
                    raise ValueError(
                        "Couldn't build the index because there are no embeddings."
                    )
                import faiss

                d = 768
                train_size = self.config.index_train_size
                logger.info("Building wiki_dpr faiss index")
                if self.config.index_name == "exact":
                    index = faiss.IndexHNSWSQ(d, faiss.ScalarQuantizer.QT_8bit,
                                              128, faiss.METRIC_INNER_PRODUCT)
                    index.hnsw.efConstruction = 200
                    index.hnsw.efSearch = 128
                    dataset.add_faiss_index("embeddings",
                                            custom_index=index,
                                            train_size=train_size)
                else:
                    quantizer = faiss.IndexHNSWFlat(d, 128,
                                                    faiss.METRIC_INNER_PRODUCT)
                    quantizer.hnsw.efConstruction = 200
                    quantizer.hnsw.efSearch = 128
                    ivf_index = faiss.IndexIVFPQ(quantizer, d, 4096, 128, 8,
                                                 faiss.METRIC_INNER_PRODUCT)
                    ivf_index.nprobe = 64
                    ivf_index.own_fields = True
                    quantizer.this.disown()
                    dataset.add_faiss_index(
                        "embeddings",
                        train_size=train_size,
                        custom_index=ivf_index,
                    )
                logger.info("Saving wiki_dpr faiss index")
                dataset.save_faiss_index("embeddings", index_file)
        return dataset
Example 7
def get_index(path_to_model):
    global index
    global word2int
    global xb
    global int2word
    #model = generate_model()
    model = Word2Vec.load(path_to_model)
    word2int = {key: k for k, key in enumerate(model.wv.vocab.keys())}
    int2word = {k: key for k, key in enumerate(model.wv.vocab.keys())}
    xb = numpy.array([model.wv[word] for word in model.wv.vocab.keys()])
    quantizer = faiss.IndexFlatIP(100)  # inner product; equals cosine similarity after L2 normalization
    nlist = 50  # Finetune this number of clusters
    m = 100  # bytes per vector
    index = faiss.IndexIVFPQ(quantizer, 100, nlist, m,
                             8)  # reduced accuracy, fast
    #index = faiss.IndexIVFFlat(quantizer, 100, nlist, faiss.METRIC_INNER_PRODUCT)
    faiss.normalize_L2(xb)
    #print(xb.shape)
    index.train(xb)
    index.add(xb)
    index.nprobe = 5
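
get_index only fills the module-level globals and returns nothing. A sketch of a lookup helper built on those globals; the helper itself is not in the original:

def most_similar(word, k=5):
    row = word2int[word]
    q = xb[row:row + 1]              # rows of xb were L2-normalized above
    D, I = index.search(q, k + 1)    # +1 because the word itself usually ranks first
    return [int2word[i] for i in I[0] if i >= 0 and i != row][:k]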
Example 8
    def initFaiss(self, nlist, nprobe, bytesPerVec, bytesPerSubVec, dim,
                  matrix):
        self.nlist = nlist
        self.nprobe = nprobe
        self.bytesPerVec = bytesPerVec
        self.bytesPerSubVec = bytesPerSubVec
        self.dim = dim

        self.train_data = np.matrix(matrix).astype('float32')
        print('FAISS init quantizer', self.train_data, self.train_data.shape)
        self.f_quantizer = faiss.IndexFlatL2(self.dim)
        print('FAISS init index')
        self.f_index = faiss.IndexIVFPQ(self.f_quantizer, self.dim, self.nlist,
                                        self.bytesPerVec, self.bytesPerSubVec)
        print('FAISS train index')
        self.f_index.train(self.train_data)
        print('FAISS train index finished')

        self.modelLoaded = self.saveModelToDisk(model_location, self.f_index)
        self.is_initiated = self.modelLoaded
        return self.is_initiated
Example 9
 def make_index_for_merge(self, quant, index_type, master_index):
     ncent = 40
     if index_type == 1:
         index = faiss.IndexIVFFlat(quant, d, ncent, faiss.METRIC_L2)
         if master_index:
             index.is_trained = True
     elif index_type == 2:
         index = faiss.IndexIVFPQ(quant, d, ncent, 4, 8)
         if master_index:
             index.pq = master_index.pq
             index.is_trained = True
     elif index_type == 3:
         index = faiss.IndexIVFPQR(quant, d, ncent, 4, 8, 8, 8)
         if master_index:
             index.pq = master_index.pq
             index.refine_pq = master_index.refine_pq
             index.is_trained = True
     elif index_type == 4:
         # quant used as the actual index
         index = faiss.IndexIDMap(quant)
     return index
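
These indexes are built so that several shards can be merged into one. A standalone sketch of the pattern under assumed sizes; like the original test, it reuses a single trained coarse quantizer:

import numpy as np
import faiss

d, ncent = 32, 40
xt = np.random.rand(2000, d).astype('float32')

quant = faiss.IndexFlatL2(d)                      # shared coarse quantizer
master = faiss.IndexIVFPQ(quant, d, ncent, 4, 8)
master.train(xt)                                  # trains quant and master.pq
master.add(xt[:1000])

shard = faiss.IndexIVFPQ(quant, d, ncent, 4, 8)   # same, now-trained quantizer
shard.pq = master.pq                              # share the trained PQ (index_type == 2)
shard.is_trained = True
shard.add(xt[1000:])

master.merge_from(shard, master.ntotal)           # move the shard's entries into master
assert master.ntotal == 2000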
Example 10
def train_index(data, quantizer_path, trained_index_path, fine_quant='SQ8', cuda=False):
    quantizer = faiss.read_index(quantizer_path)
    if fine_quant == 'SQ8':
        trained_index = faiss.IndexIVFScalarQuantizer(
            quantizer, quantizer.d, quantizer.ntotal,
            faiss.ScalarQuantizer.QT_8bit, faiss.METRIC_L2)  # qtype must precede the metric
    elif fine_quant.startswith('PQ'):
        m = int(fine_quant[2:])
        trained_index = faiss.IndexIVFPQ(quantizer, quantizer.d, quantizer.ntotal, m, 8)
    else:
        raise ValueError(fine_quant)

    if cuda:
        if fine_quant.startswith('PQ'):
            print('PQ not supported on GPU; keeping CPU.')
        else:
            res = faiss.StandardGpuResources()
            gpu_index = faiss.index_cpu_to_gpu(res, 0, trained_index)
            gpu_index.train(data)
            trained_index = faiss.index_gpu_to_cpu(gpu_index)
    else:
        trained_index.train(data)
    faiss.write_index(trained_index, trained_index_path)
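
train_index stops after training. A sketch of the step that usually follows, continuing with the function's variables; the nprobe value and output path suffix are assumptions:

trained_index.add(data)                # populate the index after training
trained_index.nprobe = 64              # query-time speed/recall trade-off
faiss.write_index(trained_index, trained_index_path + '.populated')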
Example 11
 def prepare_trained_index(self, preproc, coarse_quantizer, xt):
     if os.path.exists(self.codes_cachefile):
         print("load pretrained codebook")
         return faiss.read_index(self.codes_cachefile)
     d = preproc.d_out
     if self.pqflat_str == 'Flat':
         print("making an IVFFlat index")
         idx_model = faiss.IndexIVFFlat(coarse_quantizer, d, self.ncent,
                                        faiss.METRIC_L2)
     else:
         m = int(self.pqflat_str[2:])
         assert m < 56 or self.use_float16, "PQ%d will work only with -float16" % m
         print("making an IVFPQ index, m = ", m)
         idx_model = faiss.IndexIVFPQ(coarse_quantizer, d, self.ncent, m, 8)
     coarse_quantizer.this.disown()
     idx_model.own_fields = True
     # finish training on CPU
     t0 = time.time()
     x = preproc.apply_py(indexfunctions.sanitize(xt[:1000000]))
     idx_model.train(x)
     faiss.write_index(idx_model, self.codes_cachefile)
     return idx_model
Example 12
    def test_IndexIVFPQ(self):
        d = 32
        nb = 1000
        nt = 1500
        nq = 200

        (xt, xb, xq) = get_dataset_2(d, nb, nt, nq)
        d = xt.shape[1]

        gt_index = faiss.IndexFlatL2(d)
        gt_index.add(xb)
        D, gt_nns = gt_index.search(xq, 1)

        coarse_quantizer = faiss.IndexFlatL2(d)
        index = faiss.IndexIVFPQ(coarse_quantizer, d, 32, 8, 8)
        index.cp.min_points_per_centroid = 5  # quiet warning
        index.train(xt)
        index.add(xb)
        index.nprobe = 4
        D, nns = index.search(xq, 10)
        n_ok = (nns == gt_nns).sum()
        nq = xq.shape[0]

        self.assertGreater(n_ok, nq * 0.66)

        # check that an Index2Layer gives the same reconstruction
        # this is a bit fragile: it assumes 2 runs of training give
        # the exact same result.
        index2 = faiss.Index2Layer(coarse_quantizer, 32, 8)
        if True:
            index2.train(xt)
        else:
            index2.pq = index.pq
            index2.is_trained = True
        index2.add(xb)
        ref_recons = index.reconstruct_n(0, nb)
        new_recons = index2.reconstruct_n(0, nb)
        self.assertTrue(np.all(ref_recons == new_recons))
Example 13
    def _post_process(self, dataset, resources_paths):
        if self.config.with_index:
            index_file = resources_paths["embeddings_index"]
            if os.path.exists(index_file):
                dataset.load_faiss_index("embeddings", index_file)
            else:
                if "embedings" not in dataset.column_names:
                    raise ValueError(
                        "Couldn't build the index because there are no embeddings."
                    )
                import faiss

                train_size = self.config.index_train_size
                logging.info("Building wiki_dpr faiss index")
                if self.config.index_type == "exact":
                    dataset.add_faiss_index(
                        "embeddings",
                        string_factory="Flat",
                        metric_type=faiss.METRIC_INNER_PRODUCT,
                    )
                else:
                    d = 768
                    quantizer = faiss.IndexHNSWFlat(d, 32,
                                                    faiss.METRIC_INNER_PRODUCT)
                    ivf_index = faiss.IndexIVFPQ(quantizer, d, 4096, 64, 8,
                                                 faiss.METRIC_INNER_PRODUCT)
                    ivf_index.own_fields = True
                    quantizer.this.disown()
                    dataset.add_faiss_index(
                        "embeddings",
                        train_size=train_size,
                        faiss_verbose=logging.getLogger().level <=
                        logging.DEBUG,
                        custom_index=ivf_index,
                    )
                logging.info("Saving wiki_dpr faiss index")
                dataset.save_faiss_index("embeddings", index_file)
        return dataset
Example 14
    def test_IndexIVFPQ(self):
        d = 32
        nb = 1000
        nt = 1500
        nq = 200

        (xt, xb, xq) = get_dataset_2(d, nb, nt, nq)
        d = xt.shape[1]

        gt_index = faiss.IndexFlatL2(d)
        gt_index.add(xb)
        D, gt_nns = gt_index.search(xq, 1)

        coarse_quantizer = faiss.IndexFlatL2(d)
        index = faiss.IndexIVFPQ(coarse_quantizer, d, 32, 8, 8)
        index.train(xt)
        index.add(xb)
        index.nprobe = 4
        D, nns = index.search(xq, 10)
        n_ok = (nns == gt_nns).sum()
        nq = xq.shape[0]

        self.assertGreater(n_ok, nq * 0.66)
Example 15
    def test_IndexIVFPQ(self):
        (xt, xb, xq) = self.get_dataset()
        d = xt.shape[1]

        dev_no = 0
        usePrecomputed = True

        res = faiss.StandardGpuResources()

        flat_config = faiss.GpuIndexFlatConfig()
        flat_config.device = dev_no

        gt_index = faiss.GpuIndexFlatL2(res, d, flat_config)
        gt_index.add(xb)
        D, gt_nns = gt_index.search(xq, 1)

        coarse_quantizer = faiss.IndexFlatL2(d)
        ncentroids = int(np.sqrt(xb.shape[0])) * 4

        index = faiss.IndexIVFPQ(coarse_quantizer, d, ncentroids, 32, 8)
        # add is implemented on the GPU, but train is not
        index.train(xt)

        ivfpq_config = faiss.GpuIndexIVFPQConfig()
        ivfpq_config.device = dev_no
        ivfpq_config.usePrecomputedTables = usePrecomputed

        gpuIndex = faiss.GpuIndexIVFPQ(res, index, ivfpq_config)
        gpuIndex.setNumProbes(64)
        index.add(xb)

        D, nns = index.search(xq, 10)
        n_ok = (nns == gt_nns).sum()
        nq = xq.shape[0]
        print(ncentroids, n_ok, nq)

        self.assertGreater(n_ok, nq * 0.2)
Example 16
def indexing(feats, pos, imgID):

    feats_np = np.zeros(shape=(len(feats), feats[0].shape[0]))
    pos_np = np.zeros(shape=(len(feats), 2))
    imgID_np = np.zeros(shape=(len(feats), 1))

    for i in range(len(feats)):
        feats_np[i, :] = feats[i]
        pos_np[i, :] = pos[i]
        imgID_np[i, :] = imgID[i]

    # construct the visual vocabulary
    voc_size = const_params.__voc_size__

    niter = 20
    verbose = False
    d = feats[0].shape[0]

    code_size = 8
    quantizer = faiss.IndexFlatL2(d)  # this remains the same
    index_ = faiss.IndexIVFPQ(quantizer, d, voc_size, code_size, 8)
    # 8 specifies that each sub-vector is encoded as 8 bits
    index_.train(feats_np.astype('float32'))
    index_.add(feats_np.astype('float32'))
    index_.nprobe = 5

    #faiss.write_index(faiss.clone_index(index_), '../index.faiss')

    #fp = open('../query.pkl', 'r')
    #des, pt = pickle.load(fp)
    #fp.close()

    #q = np.asarray(des).astype('float32')
    #D, I = index_.search(q, 10)

    return [faiss.clone_index(index_), pos_np, imgID_np]
Example 17
def index_patches(patches, pca_dims=64):

    # settings for faiss:
    num_lists, M, num_bits = 200, 16, 8

    # assertions:
    assert torch.is_tensor(patches) and patches.dim() == 2
    assert type(pca_dims) == int and pca_dims > 0
    if pca_dims > patches.size(1):
        print('WARNING: Input dimension < %d. Using fewer PCA dimensions.' % pca_dims)
        pca_dims = patches.size(1) - (patches.size(1) % M)

    # construct faiss index:
    quantizer = faiss.IndexFlatL2(pca_dims)
    assert pca_dims % M == 0
    sub_index = faiss.IndexIVFPQ(quantizer, pca_dims, num_lists, M, num_bits)
    pca_matrix = faiss.PCAMatrix(patches.size(1), pca_dims, 0, True)
    faiss_index = faiss.IndexPreTransform(pca_matrix, sub_index)

    # train faiss index:
    patches = patches.numpy()
    faiss_index.train(patches)
    faiss_index.add(patches)
    return faiss_index, sub_index
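
A hypothetical call; the patch count and dimensionality are assumptions (PCA keeps 64 of the 128 input dimensions here):

import torch

patches = torch.rand(10000, 128)
faiss_index, sub_index = index_patches(patches, pca_dims=64)
sub_index.nprobe = 8                               # probe more of the 200 lists
D, I = faiss_index.search(patches[:5].numpy(), 10)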
Example 18
def _ready_faiss():
    print("start indexing")
    datas = {}
    with open('inception_v3.pickle', mode='rb') as f:
        datas = pickle.load(f)
    # build the database arrays
    image_names = []
    vectors = []
    for k in datas:
        image_names.append(k)
        vectors.append(datas[k])
    vectors = np.array(vectors).astype("float32")

    # PQ with faiss
    nlist = 100
    m = 8
    d = 2048  # dimensionality of the face feature vectors
    quantizer = faiss.IndexFlatL2(d)  # this remains the same
    index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8)
    print(vectors.shape)
    index.train(vectors)
    index.add(vectors)
    print("indexing is end")
    return index
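
A sketch of querying the returned index; the random query vector is a stand-in:

import numpy as np

index = _ready_faiss()
index.nprobe = 10                         # default is 1; raise it for better recall
query = np.random.rand(1, 2048).astype("float32")
D, I = index.search(query, 5)             # I indexes into the order vectors were added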
Example 19
    def build_index(self, wv, d, index_type='fast', nlist=100):
        '''
        wv is a dict; d is the vector length.
        '''

        self.int2word = {k: key for k, key in enumerate(wv.keys())}
        self.word2int = {key: k for k, key in enumerate(wv.keys())}

        xb = numpy.array([v for v in wv.values() if len(v) == d])  # float16 is not supported

        if index_type == 'accurate':
            index = faiss.IndexFlatL2(d)
            index.add(xb)
        elif index_type == 'fast':
            # nlist = 100# ?
            quantizer = faiss.IndexFlatL2(d)
            index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
            print(xb.shape)
            index.train(xb)

            index.add(xb)
            #index.own_fields = False
            self.quantizer = quantizer  # must keep a reference, otherwise it gets garbage-collected and self.index is silently corrupted! (took 3 hours to find; confirmed with the author)
        elif index_type == 'compress':
            nlist = 64  #?
            m = 8  # number of bytes per vector
            quantizer = faiss.IndexFlatL2(d)  # this remains the same
            index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8)
            # 8 specifies that each sub-vector is encoded as 8 bits
            index.train(xb)
            index.add(xb)
            #index.own_fields = True
            self.quantizer = quantizer
        self.index_type = index_type
        self.index_ = index
        self.xb = xb
Example 20
print "loading database"

xb = fvecs_read(rootdir + '/sift_base.fvecs')
xt = fvecs_read(rootdir + '/sift_learn.fvecs')
xq = fvecs_read(rootdir + '/sift_query.fvecs')

d = xt.shape[1]

gt_index = faiss.IndexFlatL2(d)
gt_index.add(xb)

D, gt_nns = gt_index.search(xq, 1)

coarse_quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(coarse_quantizer, d, 25, 16, 8)

print "train"
index.train(xt)

print "add"
index.add(xb)

print "search"
index.nprobe = 5
D, nns = index.search(xq, 10)
n_ok = (nns == gt_nns).sum()
nq = xq.shape[0]

print "n_ok=%d/%d" % (n_ok, nq)
Example 21
 def train(self, vecs):
     D = vecs.shape[1]
     self.quantizer = faiss.IndexFlatL2(D)
     self.index = faiss.IndexIVFPQ(self.quantizer, D, self.nlist, self.M, 8)
     self.index.train(vecs)
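
Only training is shown here. Plausible companion methods, written as assumptions that match the attributes used above:

 def add(self, vecs):
     self.index.add(vecs)          # vecs must be float32 with shape (n, D)

 def search(self, queries, k=10):
     return self.index.search(queries, k)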
Example 22
    def make_samples(db, mode, is_Oxford=isOxford, verbose=True):
        if is_Oxford:
            dic_addr = Odic_addr
            vec_addr = Odic_addr
            index_addr = Oindex_addr
        else:
            dic_addr = Ddic_addr
            vec_addr = Ddic_addr
            index_addr = Dindex_addr
        try:
            dicbase = cPickle.load(open(os.path.join(cache_dir, dic_addr), "rb", True))
            # print(dicbase)
            vecbase = cPickle.load(open(os.path.join(cache_dir, vec_addr), "rb", True))
            index = faiss.read_index(os.path.join(cache_dir, index_addr))
            if verbose:
                print("Using cache..., config=%s, depth=%s" % (vec_addr, depth))

        except:
            if verbose:
                print("Counting histogram..., config=%s, depth=%s" % (vec_addr, depth))

            # base_model = VGGNet(load_features_path=LOAD_MODEL_PATH, requires_grad=False)
            # base_model = Res101(load_features_path=LOAD_MODEL_PATH,
            #                     use_Gem_whiten=True, load_whiten_path=LOAD_WHITEN_PATH)
            # base_model =
            base_model = load_model(CHECKPOINT, False)
            base_model.eval()
            print("load successfully!")
            if REMOVE_FC:
                base_model = nn.Sequential(*list(base_model.children())[:-1])
                print("Remove FC")

            if use_gpu:
                base_model = base_model.cuda()

            vecbase = []
            dicbase = []
            data = db.get_data()
            count = 1
            for d in data.itertuples():
                # if count == 5:
                #     break
                d_img, d_cls = getattr(d, "img"), getattr(d, "cls")

                img = imageio.imread(d_img, pilmode="RGB")

                img = Image.fromarray(img)
                img = IMAGE_NORMALIZER(img)
                img = np.array(img)

                img = np.expand_dims(img, axis=0)

                if use_gpu:
                    inputs = torch.autograd.Variable(torch.from_numpy(img).cuda().float())
                else:
                    inputs = torch.from_numpy(img)

                d_hist = base_model(inputs).view(-1, )
                d_hist = d_hist.data.cpu().numpy()

                vecbase.append(d_hist)
                dicbase.append((d_cls, d_img))

                print(count)
                count += 1

            vecbase = np.array(vecbase).astype('float32')
            print(vecbase.shape)
            d = vecbase.shape[1]
            dicbase = pd.DataFrame(dicbase, columns=['cls', 'img'])
            if mode == 'Linear':
                index = faiss.IndexFlatL2(d)
                index.add(vecbase)
            elif mode == 'IVFPQ':
                n_list = 100
                n_bits = 8
                coarse_quantizer = faiss.IndexFlatL2(d)
                index = faiss.IndexIVFPQ(coarse_quantizer, d, n_list, 8, n_bits)
                index.nprobe = 10
                index.train(vecbase)
                index.add(vecbase)
            else:
                raise ValueError("you should choose a correct retrival mode")
            cPickle.dump(dicbase, open(os.path.join(cache_dir, dic_addr), "wb", True))
            cPickle.dump(vecbase, open(os.path.join(cache_dir, vec_addr), "wb", True))
            faiss.write_index(index, os.path.join(cache_dir, index_addr))

        return index, dicbase, vecbase
Example 23
import numpy as np
d = 64                           # dimension
nb = 100000                      # database size
nq = 10000                       # nb of queries
np.random.seed(1234)             # make reproducible
xb = np.random.random((nb, d)).astype('float32')
xb[:, 0] += np.arange(nb) / 1000.
xq = np.random.random((nq, d)).astype('float32')
xq[:, 0] += np.arange(nq) / 1000.
import faiss

nlist = 100
m = 8
k = 4
quantizer = faiss.IndexFlatL2(d)  # this remains the same
index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8)  # 8 specifies that each sub-vector is encoded as 8 bits
index.train(xb)
index.add(xb)
D, I = index.search(xb[:5], k) # sanity check
print('Index', I)
print('Distances',D)
index.nprobe = 10           # make comparable with experiment above
D, I = index.search(xq, k)     # search
print('I[-5:]',I[-5:])
# n_points_per_cluster_total = 1000
# size_colum = 100
# centers = np.random.randint(-20, 20, size=(size_colum,size_colum))
# # train
# X, y = make_blobs(n_samples=n_points_per_cluster_total,
#                     centers=centers,
#                     n_features=size_colum,
Example 24
    def subtest(self, mt):
        d = 32
        xt, xb, xq = get_dataset_2(d, 2000, 1000, 200)
        nlist = 64

        gt_index = faiss.IndexFlat(d, mt)
        gt_index.add(xb)
        gt_D, gt_I = gt_index.search(xq, 10)
        quantizer = faiss.IndexFlat(d, mt)
        for by_residual in True, False:

            index = faiss.IndexIVFPQ(
                quantizer, d, nlist, 4, 8)
            index.metric_type = mt
            index.by_residual = by_residual
            if by_residual:
                # perform cheap polysemous training
                index.do_polysemous_training = True
                pt = faiss.PolysemousTraining()
                pt.n_iter = 50000
                pt.n_redo = 1
                index.polysemous_training = pt

            index.train(xt)
            index.add(xb)
            index.nprobe = 4
            D, I = index.search(xq, 10)

            ninter = faiss.eval_intersection(I, gt_I)
            print('(%d, %s): %d, ' % (mt, by_residual, ninter))

            assert abs(ninter - self.ref_results[mt, by_residual]) <= 3

            index.use_precomputed_table = 0
            D2, I2 = index.search(xq, 10)
            assert np.all(I == I2)

            if by_residual:

                index.use_precomputed_table = 1
                index.polysemous_ht = 20
                D, I = index.search(xq, 10)
                ninter = faiss.eval_intersection(I, gt_I)
                print('(%d, %s, %d): %d, ' % (
                    mt, by_residual, index.polysemous_ht, ninter))

                # polysemous behaves bizarrely on ARM
                assert (ninter >= self.ref_results[
                    mt, by_residual, index.polysemous_ht] - 4)

            # also test range search

            if mt == faiss.METRIC_INNER_PRODUCT:
                radius = float(D[:, -1].max())
            else:
                radius = float(D[:, -1].min())
            print('radius', radius)

            lims, D3, I3 = index.range_search(xq, radius)
            ntot = ndiff = 0
            for i in range(len(xq)):
                l0, l1 = lims[i], lims[i + 1]
                Inew = set(I3[l0:l1])
                if mt == faiss.METRIC_INNER_PRODUCT:
                    mask = D2[i] > radius
                else:
                    mask = D2[i] < radius
                Iref = set(I2[i, mask])
                ndiff += len(Inew ^ Iref)
                ntot += len(Iref)
            print('ndiff %d / %d' % (ndiff, ntot))
            assert ndiff < ntot * 0.02
Example 25
# %% Import
from reverse_image_search.haltakov_clip import *
import faiss

# %% Create Index
D = 512
M = 256  # number of subquantizers
nbits = 8
nlist = 1500  # The number of cells (space partition). Typical value is sqrt(N)
hnsw_m = 32
quantizer = faiss.IndexHNSWFlat(D, hnsw_m)
index = faiss.IndexIVFPQ(quantizer, D, nlist, M,
                         nbits)  # this remains the same
index.train(photo_features[:20000, :])
index.add(photo_features)
faiss.write_index(index, "index.faiss")
# %%

index = faiss.read_index("index.faiss")

# %% Using Index
#!%%time
index.nprobe = 200
search_query = "technical debt"
text_features = encode_search_query(search_query)
D, I = index.search(text_features, 10)
best_photo_ids = photo_ids.iloc[I.tolist()[0]]["photo_id"].to_list()
display_image_grid(best_photo_ids)
best_photo_ids
# %% True Result
#!%%time
Example 26
    def _create_index(self):
        quantizer = faiss.IndexFlatL2(self.dim)  # faiss.IndexHNSWFlat(dim, 32)
        index = faiss.IndexIVFPQ(quantizer, self.dim, self.partitions, 16, 8)

        return quantizer, index
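
_create_index only constructs the pair. A sketch of typical use inside another method of the same class; the vectors variable is an assumption:

        quantizer, index = self._create_index()
        index.train(vectors)    # float32 array with self.dim columns
        index.add(vectors)
        index.nprobe = 8        # probe several of the self.partitions lists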
Example 27
                     mode='r',
                     shape=(args.dstore_size, 1))
else:
    keys = np.memmap(args.dstore_mmap + '_keys.npy',
                     dtype=np.float32,
                     mode='r',
                     shape=(args.dstore_size, args.dimension))
    vals = np.memmap(args.dstore_mmap + '_vals.npy',
                     dtype=np.int64,  # np.int is removed in recent NumPy
                     mode='r',
                     shape=(args.dstore_size, 1))

if not os.path.exists(args.faiss_index + ".trained"):
    # Initialize faiss index
    quantizer = faiss.IndexFlatL2(args.dimension)
    index = faiss.IndexIVFPQ(quantizer, args.dimension, args.ncentroids,
                             args.code_size, 8)
    index.nprobe = args.probe

    print('Training Index')
    np.random.seed(args.seed)
    random_sample = np.random.choice(np.arange(vals.shape[0]),
                                     size=[min(1000000, vals.shape[0])],
                                     replace=False)
    start = time.time()
    # Faiss does not handle adding keys in fp16 as of writing this.
    index.train(keys[random_sample].astype(np.float32))
    print('Training took {} s'.format(time.time() - start))

    print('Writing index after training')
    start = time.time()
    faiss.write_index(index, args.faiss_index + ".trained")
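
The script above only trains and saves an empty ".trained" index. A sketch of the usual follow-up, adding the stored keys in batches; the batch size is an assumption:

index = faiss.read_index(args.faiss_index + ".trained")
start = 0
while start < args.dstore_size:
    end = min(start + 500000, args.dstore_size)
    index.add_with_ids(keys[start:end].astype(np.float32),
                       np.arange(start, end))
    start = end
faiss.write_index(index, args.faiss_index)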
Example 28
def index_factory(d: int,
                  index_key: str,
                  metric_type: int,
                  ef_construction: Optional[int] = None):
    """
    custom index_factory that fix some issues of
    faiss.index_factory with inner product metrics.
    """

    if metric_type == faiss.METRIC_INNER_PRODUCT:

        # make the index described by the key
        if any(re.findall(r"OPQ\d+_\d+,IVF\d+,PQ\d+", index_key)):
            params = [int(x) for x in re.findall(r"\d+", index_key)]

            cs = params[3]  # code size (in Bytes if nbits=8)
            nbits = params[4] if len(params) == 5 else 8  # default value
            ncentroids = params[2]
            out_d = params[1]
            M_OPQ = params[0]

            quantizer = faiss.index_factory(out_d, "Flat", metric_type)
            assert quantizer.metric_type == metric_type
            index_ivfpq = faiss.IndexIVFPQ(quantizer, out_d, ncentroids, cs,
                                           nbits, metric_type)
            assert index_ivfpq.metric_type == metric_type
            index_ivfpq.own_fields = True
            quantizer.this.disown()  # pylint: disable = no-member
            opq_matrix = faiss.OPQMatrix(d, M=M_OPQ, d2=out_d)
            # opq_matrix.niter = 50 # Same as default value
            index = faiss.IndexPreTransform(opq_matrix, index_ivfpq)
        elif any(re.findall(r"OPQ\d+_\d+,IVF\d+_HNSW\d+,PQ\d+", index_key)):
            params = [int(x) for x in re.findall(r"\d+", index_key)]

            M_HNSW = params[3]
            cs = params[4]  # code size (in Bytes if nbits=8)
            nbits = params[5] if len(params) == 6 else 8  # default value
            ncentroids = params[2]
            out_d = params[1]
            M_OPQ = params[0]

            quantizer = faiss.IndexHNSWFlat(out_d, M_HNSW, metric_type)
            if ef_construction is not None and ef_construction >= 1:
                quantizer.hnsw.efConstruction = ef_construction
            assert quantizer.metric_type == metric_type
            index_ivfpq = faiss.IndexIVFPQ(quantizer, out_d, ncentroids, cs,
                                           nbits, metric_type)
            assert index_ivfpq.metric_type == metric_type
            index_ivfpq.own_fields = True
            quantizer.this.disown()  # pylint: disable = no-member
            opq_matrix = faiss.OPQMatrix(d, M=M_OPQ, d2=out_d)
            # opq_matrix.niter = 50 # Same as default value
            index = faiss.IndexPreTransform(opq_matrix, index_ivfpq)

        elif any(re.findall(r"Pad\d+,IVF\d+_HNSW\d+,PQ\d+", index_key)):
            params = [int(x) for x in re.findall(r"\d+", index_key)]

            out_d = params[0]
            M_HNSW = params[2]
            cs = params[3]  # code size (in Bytes if nbits=8)
            nbits = params[4] if len(params) == 5 else 8  # default value
            ncentroids = params[1]

            remapper = faiss.RemapDimensionsTransform(d, out_d, True)

            quantizer = faiss.IndexHNSWFlat(out_d, M_HNSW, metric_type)
            if ef_construction is not None and ef_construction >= 1:
                quantizer.hnsw.efConstruction = ef_construction
            index_ivfpq = faiss.IndexIVFPQ(quantizer, out_d, ncentroids, cs,
                                           nbits, metric_type)
            index_ivfpq.own_fields = True
            quantizer.this.disown()  # pylint: disable = no-member

            index = faiss.IndexPreTransform(remapper, index_ivfpq)
        elif any(re.findall(r"HNSW\d+", index_key)):
            params = [int(x) for x in re.findall(r"\d+", index_key)]
            M_HNSW = params[0]
            index = faiss.IndexHNSWFlat(d, M_HNSW, metric_type)
            assert index.metric_type == metric_type
        elif index_key == "Flat":
            index = faiss.index_factory(d, index_key, metric_type)
        else:
            index = faiss.index_factory(d, index_key, metric_type)
            raise ValueError((
                "Be careful, faiss might not create what you expect when using the "
                "inner product similarity metric, remove this line to try it anyway."
                "Happened with index_key: " + str(index_key)))

    else:
        index = faiss.index_factory(d, index_key, metric_type)

    return index
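
For instance, a hypothetical call that exercises the first branch:

index = index_factory(256, "OPQ16_64,IVF4096,PQ16", faiss.METRIC_INNER_PRODUCT)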
Example 29
        datas = pickle.load(f)
    # build the database arrays
    face_image_names = []
    face_vectors = []
    for k in datas:
        face_image_names.append(k)
        face_vectors.append(datas[k])
    face_vectors = np.array(face_vectors).astype("float32")

    # prepare PQ with faiss
    nlist = 100
    m = 8
    k = 8  # we want 7 similar faces, so k = 8
    d = 128  # dimensionality of the face feature vectors
    quantizer = faiss.IndexFlatL2(d)  # this remains the same
    index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8)
    index.train(face_vectors)
    index.add(face_vectors)
    print("indexing is end")

    # create the manager
    with Manager() as manager:
        # create the managed result lists
        similar_paths_manager = manager.list()
        similar_distance_manager = manager.list()
        frame_manager = manager.list()
        face_rect_manager = manager.list()
        # spawn the process
        recommend_process = Process(
            target=recommend_faces,
            args=[similar_paths_manager, frame_manager],
Example 30
def build_database(chunk_len: int = 16, batch_size: int = 64, d_emb: int = 768, n_centeroids: int = 256,
                   code_size: int = 64, n_probe: int = 8, n_train: int = 50_000):
    """
    ## Build Database

    * `chunk_len` is the length of a chunk (number of characters)
    * `batch_size` is the batch size to use when calculating $\text{B\small{ERT}}(N)$
    * `d_emb` is the number of features in $\text{B\small{ERT}}(N)$ embeddings
    * `n_centeroids` is the number of lists in the index
    * `code_size` is the encoded vector size in the index
    * `n_probe` is the number of
        [lists to select in FAISS index](https://faiss.ai/cpp_api/struct/structfaiss_1_1IndexIVFPQ.html)
    * `n_train` is the number of keys to train the index on
    """

    # Load the dataset text file
    dataset = TextFileDataset(
        lab.get_data_path() / 'tiny_shakespeare.txt',
        list,
        url='https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt')

    # Get training data (a string)
    text = dataset.train

    # Split the text into chunks of `chunk_length`
    chunks = [text[i:i + chunk_len] for i in range(0, len(text), chunk_len) if i + chunk_len * 2 < len(text)]
    # Get the offsets of each of the chunks
    chunk_offsets = np.array([i for i in range(0, len(text), chunk_len) if i + chunk_len * 2 < len(text)])
    # Number of chunks
    n_chunks = len(chunks)

    # Initialize BERT to get $\text{B\small{ERT}}(N)$
    bert = BERTChunkEmbeddings(torch.device('cuda:0'))

    # Get chunk embeddings by processing `batch_size` number of chunks on each iteration
    chunk_emb = []
    for i in monit.iterate('Get embeddings', range(0, n_chunks, batch_size)):
        chunk_emb.append(bert(chunks[i: i + batch_size]).cpu())
    # Merge them into a single tensor
    chunk_emb = torch.cat(chunk_emb, dim=0).numpy()

    # Create the [FAISS index](https://faiss.ai/cpp_api/struct/structfaiss_1_1IndexIVFPQ.html)
    quantizer = faiss.IndexFlatL2(d_emb)
    index = faiss.IndexIVFPQ(quantizer, d_emb, n_centeroids, code_size, 8)
    index.nprobe = n_probe

    # Get a random sample of the chunk indexes
    random_sample = np.random.choice(np.arange(n_chunks), size=[min(n_train, n_chunks)], replace=False)

    # Train the index to store the keys
    with monit.section('Train index'):
        index.train(chunk_emb[random_sample])

    # Add the chunks to the index in batches of size `1024`
    for s in monit.iterate('Index', range(0, n_chunks, 1024)):
        e = min(s + 1024, n_chunks)
        # Add to index
        index.add_with_ids(chunk_emb[s:e], chunk_offsets[s: e])

    # Save the index
    with monit.section('Save'):
        faiss.write_index(index, str(lab.get_data_path() / 'retro.index'))