Example #1
0
    def test_ivf_flat(self):
        """Check that a binary IVF index gives identical results after I/O.

        The index is trained, filled and searched, then written to a
        temporary file, read back and searched again; the ids and distances
        of both searches must match exactly.
        """
        d = self.xq.shape[1] * 8

        quantizer = faiss.IndexBinaryFlat(d)
        index = faiss.IndexBinaryIVF(quantizer, d, 8)
        index.cp.min_points_per_centroid = 5  # quiet warning
        index.nprobe = 4
        index.train(self.xt)
        index.add(self.xb)
        D, I = index.search(self.xq, 3)

        # mkstemp hands back an already-open descriptor; close it right away
        # so it is not leaked (only the file name is used below).
        fd, tmpnam = tempfile.mkstemp()
        os.close(fd)

        try:
            faiss.write_index_binary(index, tmpnam)

            index2 = faiss.read_index_binary(tmpnam)

            D2, I2 = index2.search(self.xq, 3)

            assert (I2 == I).all()
            assert (D2 == D).all()

        finally:
            os.remove(tmpnam)
Example #2
0
    def test_remove_id_map_binary(self):
        """Check id removal on an IndexBinaryIDMap2, before and after I/O.

        Ten vectors are stored under ids 1000..1009, id 1003 is removed, and
        the test verifies that 1004 is still reconstructible while 1003 is
        not. The same two checks are repeated after the index has been
        written to a temporary file and read back.
        """

        def assert_not_reconstructible(idx, key):
            # Reconstructing a removed id has to fail with an exception.
            try:
                idx.reconstruct(key)
            except Exception:
                # Expected. Deliberately not a bare ``except`` so that
                # KeyboardInterrupt / SystemExit still propagate.
                pass
            else:
                assert False, 'should have raised an exception'

        sub_index = faiss.IndexBinaryFlat(40)
        xb = np.zeros((10, 5), dtype='uint8')
        # first byte of each vector identifies it: 100, 101, ..., 109
        xb[:, 0] = np.arange(10) + 100
        index = faiss.IndexBinaryIDMap2(sub_index)
        index.add_with_ids(xb, np.arange(10) + 1000)
        assert index.reconstruct(1004)[0] == 104
        index.remove_ids(np.array([1003]))
        assert index.reconstruct(1004)[0] == 104
        assert_not_reconstructible(index, 1003)

        # while we are there, let's test I/O as well...
        # mkstemp returns an open descriptor; close it so it is not leaked.
        fd, tmpnam = tempfile.mkstemp()
        os.close(fd)
        try:
            faiss.write_index_binary(index, tmpnam)
            index = faiss.read_index_binary(tmpnam)
        finally:
            os.remove(tmpnam)

        assert index.reconstruct(1004)[0] == 104
        assert_not_reconstructible(index, 1003)
Example #3
0
 def save(self, path1, path2):
     '''Write the searcher index to path1 and the corpus to path2.

     Only faiss-backed searchers need this procedure. The binary or the
     regular faiss writer is chosen according to ``self.binary``.
     '''
     write_fn = faiss.write_index_binary if self.binary else faiss.write_index
     write_fn(self.searcher, path1)
     # the text corpus is dumped separately with joblib
     with open(path2, 'wb') as corpus_file:
         joblib.dump(self.corpus, corpus_file)
Example #4
0
 def test_write_580M(self):
     '''Write and re-read a multi-hash binary index holding 580M codes.

     Random one-byte codes are added to an IndexBinaryMultiHash, which is
     then written to /tmp/tmp.faiss and read back.
     '''
     dim = 8
     nhash = 1
     num_million = 580 # changing to 570 works
     index_path = "/tmp/tmp.faiss"
     index1 = faiss.IndexBinaryMultiHash(dim, nhash, int(dim/nhash))
     # one row per code, dim/8 bytes per row
     code_shape = (num_million * int(1e6), int(dim/8))
     random_hash_codes = np.random.randint(0, 256, code_shape).astype("uint8")
     index1.add(random_hash_codes)
     faiss.write_index_binary(index1, index_path)
     index2 = faiss.read_index_binary(index_path)
Example #5
0
def train():
    """Train a binary IVF index on all available data and save it.

    The number of IVF lists is the rounded square root of the number of
    rows. The trained (still empty) index is written to
    ./trained_import.index. Exits the process when there is no data.
    """
    samples = np.array(get_all_data())
    if len(samples) == 0:
        print("No images. exit()")
        exit()

    n_bits = 32 * 8
    n_lists = round(sqrt(samples.shape[0]))
    print(f'centroids: {n_lists}')

    coarse = faiss.IndexBinaryFlat(n_bits)
    ivf_index = faiss.IndexBinaryIVF(coarse, n_bits, n_lists)
    ivf_index.nprobe = 8
    ivf_index.train(samples)

    out_path = "./" + "trained_import.index"
    faiss.write_index_binary(ivf_index, out_path)
Example #6
0
 def build_index(cls, feature_file, index_file):
     '''
     Build a flat binary faiss index from a feature file and write it out.

     :params feature_file: a npy file generated by using utils.build_mol_features
     :params index_file: path the resulting index is written to
     :return: the populated index
     '''
     logging.info("rebuild index from {}".format(feature_file))
     fingerprints = np.load(feature_file)
     # convert every row with vec2bytes, showing progress via tqdm
     packed_rows = [cls.vec2bytes(row) for row in tqdm.tqdm(fingerprints)]
     # index dimension: feature width rounded up to a multiple of 8
     n_bits = int(np.ceil(fingerprints.shape[1] / 8) * 8)
     flat_index = faiss.IndexBinaryFlat(n_bits)
     codes = np.array(packed_rows).astype("uint8")
     flat_index.add(codes)
     faiss.write_index_binary(flat_index, index_file)
     return flat_index
Example #7
0
    def test_read_index_ownership(self):
        """Check that an index loaded with read_index_binary has thisown set."""
        d = self.xq.shape[1] * 8

        index = faiss.IndexBinaryFlat(d)
        index.add(self.xb)

        # mkstemp returns an open descriptor; close it so it is not leaked
        # (only the file name is used below).
        fd, tmpnam = tempfile.mkstemp()
        os.close(fd)
        try:
            faiss.write_index_binary(index, tmpnam)

            index2 = faiss.read_index_binary(tmpnam)

            assert index2.thisown
        finally:
            os.remove(tmpnam)
Example #8
0
def train():
    """Train a binary IVF index on all imported descriptors and save it.

    Element 1 of every record returned by import_get_all_data() is taken as
    that record's descriptor array; all arrays are concatenated along axis 0.
    The number of IVF lists is the rounded square root of the row count and
    the trained index is written to ./trained_import.index. Exits the
    process when there are no records.
    """
    records = import_get_all_data()
    if len(records) == 0:
        print("No images. exit()")
        exit()

    descriptors = np.concatenate([record[1] for record in records], axis=0)

    n_bits = 61 * 8
    n_lists = round(sqrt(descriptors.shape[0]))
    print(f'centroids: {n_lists}')

    coarse = faiss.IndexBinaryFlat(n_bits)
    ivf_index = faiss.IndexBinaryIVF(coarse, n_bits, n_lists)
    ivf_index.nprobe = 8
    ivf_index.train(descriptors)

    out_path = "./" + "trained_import.index"
    faiss.write_index_binary(ivf_index, out_path)
Example #9
0
def train():
    """Train a binary IVF index on the AKAZE features of all ids and save it.

    Features are fetched per id, converted with convert_array and
    concatenated along axis 0. The number of IVF lists is the rounded square
    root of the row count and the trained index is written to
    ./trained.index. Exits the process when there are no ids.
    """
    image_ids = get_all_ids()
    if len(image_ids) == 0:
        print("No images. exit()")
        exit()

    per_image = [
        convert_array(get_akaze_features_by_id(image_id))
        for image_id in image_ids
    ]
    descriptors = np.concatenate(per_image, axis=0)

    n_bits = 61 * 8
    n_lists = round(sqrt(descriptors.shape[0]))
    print(f'centroids: {n_lists}')

    coarse = faiss.IndexBinaryFlat(n_bits)
    ivf_index = faiss.IndexBinaryIVF(coarse, n_bits, n_lists)
    ivf_index.nprobe = 8
    ivf_index.train(descriptors)

    out_path = "./" + "trained.index"
    faiss.write_index_binary(ivf_index, out_path)
Example #10
0
    def test_flat(self):
        """Check that a flat binary index gives identical results after I/O.

        The index is searched, written to a temporary file, read back and
        searched again; ids and distances of both searches must match.
        """
        d = self.xq.shape[1] * 8

        index = faiss.IndexBinaryFlat(d)
        index.add(self.xb)
        D, I = index.search(self.xq, 3)

        # mkstemp returns an open descriptor; close it so it is not leaked
        # (only the file name is used below).
        fd, tmpnam = tempfile.mkstemp()
        os.close(fd)
        try:
            faiss.write_index_binary(index, tmpnam)

            index2 = faiss.read_index_binary(tmpnam)

            D2, I2 = index2.search(self.xq, 3)

            assert (I2 == I).all()
            assert (D2 == D).all()

        finally:
            os.remove(tmpnam)
Example #11
0
    def test_binary_from_float(self):
        """Round-trip an IndexBinaryFromFloat (wrapping HNSW) through a file.

        Search results obtained before writing must equal the results
        obtained from the index that was read back.
        """
        n_bits = self.xq.shape[1] * 8
        k = 3

        wrapped = faiss.IndexHNSWFlat(n_bits, 16)
        index = faiss.IndexBinaryFromFloat(wrapped)
        index.add(self.xb)
        ref_D, ref_I = index.search(self.xq, k)

        # only the path is needed, so the descriptor is closed immediately
        handle, tmp_path = tempfile.mkstemp()
        os.close(handle)
        try:
            faiss.write_index_binary(index, tmp_path)
            reloaded = faiss.read_index_binary(tmp_path)
            new_D, new_I = reloaded.search(self.xq, k)

            assert (new_I == ref_I).all()
            assert (new_D == ref_D).all()
        finally:
            os.remove(tmp_path)
Example #12
0
def registry_index(way_index):
    """Build a faiss index from the training images and save it with its id map.

    Features are extracted from every file returned by
    ``iterate_files(train_image_dir)`` using the extractor selected by
    ``way_index``. Consecutive files with the same parsed class name share one
    integer id; every feature row of a file is added to the index under that
    id.

    Args:
        way_index: position in ``DIMENSIONS`` selecting the feature type.
            The value 3 builds a float index from ``INDEX_KEY``; every other
            value builds an ``IndexBinaryHash``.

    Side effects:
        Writes the index to ``index_path`` and, only if ``ids_path`` does not
        exist yet, pickles the id -> file names mapping to ``ids_path``.
    """
    # prepare index
    dimensions = DIMENSIONS[way_index]
    if isAddPhash:
        dimensions += PHASH_X * PHASH_Y
    # https://github.com/facebookresearch/faiss/wiki/Binary-indexes
    # https://github.com/facebookresearch/faiss/blob/22b7876ef5540b85feee173aa3182a2f37dc98f6/tests/test_index_binary.py#L213
    use_float_index = way_index == 3
    if not use_float_index:
        # nbits/8 https://github.com/facebookresearch/faiss/wiki/Faiss-indexes#relationship-with-lsh
        index = faiss.IndexBinaryHash(dimensions * 8, 1)
    else:
        index = faiss.index_factory(dimensions, INDEX_KEY)
    if USE_GPU:
        print("Use GPU...")
        res = faiss.StandardGpuResources()
        # NOTE(review): a GPU index may have to be converted back to CPU
        # before it can be written below -- confirm against the faiss docs.
        index = faiss.index_cpu_to_gpu(res, 0, index)

    # collect features and ids
    images_list = iterate_files(train_image_dir)  # may change
    ids_count = 0
    index_defaultdict = defaultdict(list)
    features = []
    ids = []
    cla_name_temp = parser_name(images_list[0])
    way = get_way(w_index=way_index)  # ORB , surf, and so on
    for file_name in images_list:
        cla_name = parser_name(file_name)
        ret, feature = way_feature(way, file_name)
        if ret != 0:
            # This file was never registered for a non-zero ret; skip it
            # before touching ``feature``.
            continue

        numf = feature.shape[0]
        if use_float_index and FEATURE_CLIP:
            # keep at most FEATURE_CLIP randomly chosen feature rows
            numf = min(FEATURE_CLIP, feature.shape[0])
            choosed_fea = sample(range(feature.shape[0]), numf)
            feature = feature[choosed_fea, :]

        if not feature.any():
            # all-zero features carry no information; do not register them
            continue

        if cla_name != cla_name_temp:
            ids_count += 1  # change when same img not only one
            cla_name_temp = cla_name
        # record id and path; one id may map to more than one image
        index_defaultdict[ids_count].append(file_name)
        # every feature row of this file is stored under the same id
        ids_list = np.full(numf, ids_count, dtype="int64")
        print(feature.shape, ids_count, len(ids_list), ids_list.shape)
        features.append(feature)
        ids.append(ids_list)

    features = np.vstack(features)
    ids = np.hstack(ids)
    print(features.shape, ids.shape)

    if features.any():
        if not index.is_trained:
            index.train(features)
        # https://github.com/facebookresearch/faiss/issues/856
        index.add_with_ids(features, ids)

    # save index; the writer must match the index type chosen above, so the
    # decision is based on the ``way_index`` argument, not on a global
    if use_float_index:
        faiss.write_index(index, index_path)
    else:
        faiss.write_index_binary(index, index_path)

    # save ids (an existing ids file is left untouched)
    if not os.path.exists(ids_path):
        with open(ids_path, 'wb+') as f:
            try:
                pickle.dump(index_defaultdict, f, True)
            except (EnvironmentError, RuntimeError) as e:
                logging.error('Failed to save index file error:[{}]'.format(e))

    print('Registry completed')
Example #13
0
import argparse

import faiss
import joblib
import numpy as np
from tqdm import trange

if __name__ == "__main__":
    # Build a flat binary faiss index (with explicit ids) from a joblib file
    # holding "ids" and "embeddings", and write it to --output_file.
    parser = argparse.ArgumentParser()

    parser.add_argument("--embedding_file", type=str, required=True)
    parser.add_argument("--output_file", type=str, required=True)
    parser.add_argument("--batch_size", type=int, default=256)
    parser.add_argument("--hash_num_bits", type=int, default=768)

    args = parser.parse_args()

    embedding_data = joblib.load(args.embedding_file, mmap_mode="r")

    # ``np.int`` was removed from NumPy; use an explicit 64-bit integer type
    # for the ids passed to add_with_ids.
    ids = np.array(embedding_data["ids"], dtype=np.int64)
    embeddings = embedding_data["embeddings"]
    # each embedding column is one byte, i.e. 8 bits of the binary code
    dim_size = embeddings.shape[1] * 8

    index = faiss.IndexBinaryIDMap(faiss.IndexBinaryFlat(dim_size))
    # add in batches so progress can be reported
    for start in trange(0, ids.size, args.batch_size):
        stop = start + args.batch_size
        index.add_with_ids(embeddings[start:stop], ids[start:stop])

    faiss.write_index_binary(index, args.output_file)
Example #14
0
        base_index.hnsw.efSearch = args.hnsw_ef_search
        base_index.hnsw.efConstruction = args.hnsw_ef_construction
        index = FaissHNSWIndex.build(ids, embeddings, base_index)

    else:
        base_index = faiss.IndexFlatIP(dim_size)
        index = FaissIndex.build(ids, embeddings, base_index)
        if args.index_device == "cuda":
            index = index.to_gpu()

    del ids
    del embeddings

    with tempfile.NamedTemporaryFile() as f:
        if isinstance(index, FaissBinaryIndex):
            faiss.write_index_binary(index.index, f.name)
        else:
            faiss.write_index(index.index, f.name)

        logger.info("Index size: %d bytes", os.path.getsize(f.name))

    logger.info("Loading BiEncoder...")
    biencoder = BiEncoder.load_from_checkpoint(args.biencoder_file,
                                               map_location="cpu")
    biencoder = biencoder.to(args.biencoder_device)
    biencoder.eval()
    biencoder.freeze()

    if args.parallel:
        biencoder.query_encoder = DataParallel(biencoder.query_encoder)
Example #15
0
    def build(self, config):
        '''
            Build, extend or shrink the faiss index described by ``config``.

            ``config["index_operation"]`` (default "new") selects the mode:

            * "new":    create an index from the gallery data
            * "append": load the existing index and add the gallery data
            * "remove": load the existing index and remove the entries whose
                        doc appears in the gallery data

            Two files are written to ``config["index_dir"]``:
            ``vector.index`` (the faiss index) and ``id_map.pkl`` (a pickled
            dict mapping id to image doc).

            Raises:
                AssertionError: unsupported operation, missing index files
                    for append/remove, or index/id_map size mismatch.
                RuntimeError: "remove" requested with index method HNSW32.
        '''
        operation_method = config.get("index_operation", "new").lower()
        # validate first so an invalid operation fails before the feature
        # extraction below is run
        assert operation_method in [
            "new", "remove", "append"
        ], "Only append, remove and new operation are supported"

        gallery_images, gallery_docs = split_datafile(
            config['data_file'], config['image_root'], config['delimiter'])

        # when remove data in index, do not need extract features
        if operation_method != "remove":
            gallery_features = self._extract_features(gallery_images, config)

        index_dir = config["index_dir"]
        # vector.index: faiss index file
        # id_map.pkl: use this file to map id to image_doc
        index_path = os.path.join(index_dir, "vector.index")
        id_map_path = os.path.join(index_dir, "id_map.pkl")
        is_hamming = config["dist_type"] == "hamming"
        # same default everywhere; the raw config key may be absent
        configured_method = config.get("index_method", "HNSW32")

        if operation_method in ["remove", "append"]:
            # if remove or append, vector.index and id_map.pkl must exist
            assert os.path.exists(
                index_path
            ), "The vector.index dose not exist in {} when 'index_operation' is not None".format(
                index_dir)
            assert os.path.exists(
                id_map_path
            ), "The id_map.pkl dose not exist in {} when 'index_operation' is not None".format(
                index_dir)
            # NOTE(review): hamming indexes are written with
            # write_index_binary below but re-read here with read_index, and
            # no vectors are added for hamming on "append" -- confirm whether
            # append/remove is meant to be supported for binary indexes.
            index = faiss.read_index(index_path)
            with open(id_map_path, 'rb') as fd:
                ids = pickle.load(fd)
            assert index.ntotal == len(ids.keys(
            )), "data number in index is not equal in in id_map"
        else:
            os.makedirs(index_dir, exist_ok=True)
            index_method = configured_method

            # if IVF method, cal ivf number automatically
            if index_method == "IVF":
                index_method = index_method + str(
                    min(int(len(gallery_images) // 8), 65536)) + ",Flat"

            # build index
            if is_hamming:
                # for binary index, add B at head of index_method
                index = faiss.index_binary_factory(config["embedding_size"],
                                                   "B" + index_method)
            else:
                dist_type = faiss.METRIC_INNER_PRODUCT if config[
                    "dist_type"] == "IP" else faiss.METRIC_L2
                index = faiss.index_factory(config["embedding_size"],
                                            index_method, dist_type)
                index = faiss.IndexIDMap2(index)
            ids = {}

        if configured_method == "HNSW32":
            logger.warning(
                "The HNSW32 method dose not support 'remove' operation")

        if operation_method != "remove":
            # calculate id for new data
            start_id = max(ids.keys()) + 1 if ids else 0
            ids_now = (
                np.arange(0, len(gallery_images)) + start_id).astype(np.int64)

            # only train when new index file
            if operation_method == "new":
                if is_hamming:
                    index.add(gallery_features)
                else:
                    index.train(gallery_features)

            if not is_hamming:
                index.add_with_ids(gallery_features, ids_now)

            for i, d in zip(list(ids_now), gallery_docs):
                ids[i] = d
        else:
            if configured_method == "HNSW32":
                raise RuntimeError(
                    "The index_method: HNSW32 dose not support 'remove' operation"
                )
            # remove ids in id_map, remove index data in faiss index
            stale_keys = [k for k, doc in ids.items() if doc in gallery_docs]
            # explicit dtype so an empty selection is still an integer array
            index.remove_ids(np.asarray(stale_keys, dtype=np.int64))
            for k in stale_keys:
                del ids[k]

        # store faiss index file and id_map file
        if is_hamming:
            faiss.write_index_binary(index, index_path)
        else:
            faiss.write_index(index, index_path)

        with open(id_map_path, 'wb') as fd:
            pickle.dump(ids, fd)