Example #1
0
def dedup():
    """Delete imported images that match an already indexed image.

    Steps:
      1. Load a binary faiss index from ``trained.index``, falling back to
         ``trained_import.index`` when the first file cannot be read.
      2. Add every row of ``get_all_data()`` to it, using ``row[0]`` as the
         int64 id and ``row[1]`` as the vector (presumably a perceptual
         hash, judging by the names -- confirm against the data layer).
      3. For every row of ``import_get_all_data()``, call
         ``phash_reverse_search`` on ``row[1]``; a non-empty result is
         treated as "duplicate" and the file ``IMAGE_PATH/row[0]`` is removed.

    Side effects: rebinds the module-level ``index``, prints progress,
    deletes files, and calls ``exit()`` when either data source is empty.
    """
    global index
    all_data = get_all_data()
    if len(all_data) == 0:
        print("all_data no images. exit()")
        exit()
    try:
        index = faiss.read_index_binary("trained.index")
    except Exception:
        # Narrowed from a bare ``except`` so that Ctrl-C and SystemExit are
        # not silently turned into "use the fallback index".
        index = faiss.read_index_binary("trained_import.index")

    image_ids = np.array([np.int64(x[0]) for x in all_data])
    phashes = np.array([x[1] for x in all_data])
    index.add_with_ids(phashes, image_ids)

    print("Index is ready")
    import_all_data = import_get_all_data()
    if len(import_all_data) == 0:
        print("import_all_data no images. exit()")
        exit()
    for x in tqdm(import_all_data):
        filename = x[0]
        features = x[1]
        res = phash_reverse_search(features)
        if len(res) != 0:
            print(f'duplicate {filename} - {res}')
            print(f'deleting {filename}')
            remove(f'{IMAGE_PATH}/{filename}')
Example #2
0
    def test_remove_id_map_binary(self):
        """Removing an id from IndexBinaryIDMap2 must not disturb the others.

        Ten 40-bit vectors are stored with ids 1000..1009; byte 0 of vector
        ``i`` is ``100 + i`` so a reconstructed vector identifies itself.
        After removing id 1003, id 1004 must still reconstruct correctly and
        id 1003 must raise. The same checks are repeated after a
        write/read round trip through a temporary file.
        """
        sub_index = faiss.IndexBinaryFlat(40)
        xb = np.zeros((10, 5), dtype='uint8')
        xb[:, 0] = np.arange(10) + 100
        index = faiss.IndexBinaryIDMap2(sub_index)
        index.add_with_ids(xb, np.arange(10) + 1000)
        assert index.reconstruct(1004)[0] == 104
        index.remove_ids(np.array([1003]))
        assert index.reconstruct(1004)[0] == 104
        try:
            index.reconstruct(1003)
        except Exception:
            # Expected: the id was removed. ``Exception`` rather than a bare
            # ``except`` so that KeyboardInterrupt is not counted as a pass.
            pass
        else:
            assert False, 'should have raised an exception'

        # while we are there, let's test I/O as well...
        fd, tmpnam = tempfile.mkstemp()
        # mkstemp returns an already-open descriptor; close it so it is not
        # leaked (faiss reopens the file by name).
        os.close(fd)
        try:
            faiss.write_index_binary(index, tmpnam)
            index = faiss.read_index_binary(tmpnam)
        finally:
            os.remove(tmpnam)

        assert index.reconstruct(1004)[0] == 104
        try:
            index.reconstruct(1003)
        except Exception:
            pass
        else:
            assert False, 'should have raised an exception'
Example #3
0
    def test_ivf_flat(self):
        """An IndexBinaryIVF must return identical results after a disk round trip.

        Builds an 8-list binary IVF index over ``self.xb`` (trained on
        ``self.xt``), searches ``self.xq`` for 3 neighbours, writes the index
        to a temporary file, reads it back and checks that both the ids and
        the distances of the same search are unchanged.
        """
        # Binary index dimensions are expressed in bits; the arrays hold bytes.
        d = self.xq.shape[1] * 8

        quantizer = faiss.IndexBinaryFlat(d)
        index = faiss.IndexBinaryIVF(quantizer, d, 8)
        index.cp.min_points_per_centroid = 5  # quiet warning
        index.nprobe = 4
        index.train(self.xt)
        index.add(self.xb)
        D, I = index.search(self.xq, 3)

        fd, tmpnam = tempfile.mkstemp()
        # mkstemp returns an already-open descriptor; close it so it is not
        # leaked (faiss reopens the file by name).
        os.close(fd)

        try:
            faiss.write_index_binary(index, tmpnam)

            index2 = faiss.read_index_binary(tmpnam)

            D2, I2 = index2.search(self.xq, 3)

            assert (I2 == I).all()
            assert (D2 == D).all()

        finally:
            os.remove(tmpnam)
Example #4
0
def dedup():
    """Report duplicate images among the imported set using local features.

    Loads ``trained_import.index`` (exiting if it cannot be read), then adds
    every feature row of every object returned by ``import_get_all_data()``
    to the index. Each feature row gets a fresh consecutive int64 point id
    starting at the module-level ``POINT_ID``; the point-id <-> image-id
    relation is recorded in ``point_id_to_image_id_map`` and
    ``image_id_to_point_ids_map``.

    Afterwards each object is looked up with ``akaze_reverse_search``. When
    the best hit (``res[0]``) is a different image, both names are printed
    and all hits are remembered so they are skipped later in the loop.
    No file is deleted by this function.

    Side effects: rebinds the module-level ``index``, advances ``POINT_ID``,
    fills the two mapping dicts and prints to stdout.
    """
    global index, POINT_ID
    try:
        index = faiss.read_index_binary("trained_import.index")
    except Exception:
        # Narrowed from a bare ``except`` so that Ctrl-C is not reported as
        # "index not found".
        print("trained_import.index not found. exit()")
        exit()
    all_data = import_get_all_data()

    for obj in tqdm(all_data):
        image_id = obj[0]
        features = obj[1]
        # One id per feature row, continuing from the global counter.
        point_ids = np.arange(start=POINT_ID,
                              stop=POINT_ID + len(features),
                              dtype=np.int64)
        for point_id in point_ids:
            point_id_to_image_id_map[point_id] = image_id
        image_id_to_point_ids_map[image_id] = point_ids
        POINT_ID += len(features)
        index.add_with_ids(features, point_ids)

    print("Index is ready")
    deleted = []
    for x in tqdm(all_data):
        filename = x[0]
        if filename in deleted:
            # Already reported as a duplicate of an earlier image.
            continue
        features = x[1]
        res = akaze_reverse_search(features, filename)
        if len(res) > 0 and res[0] != filename:
            print("=============")
            print(filename)
            print(res[0])
            print("=============")
            deleted.extend(res)
Example #5
0
 def load(self, path1, path2):
     """Restore the faiss searcher and the corpus from disk.

     ``path1`` is the faiss index file; it is read with the binary reader
     when ``self.binary`` is truthy and with the regular reader otherwise.
     ``path2`` is a file loaded with ``joblib.load`` into ``self.corpus``.
     """
     reader = faiss.read_index_binary if self.binary else faiss.read_index
     self.searcher = reader(path1)
     with open(path2, 'rb') as corpus_file:
         self.corpus = joblib.load(corpus_file)
Example #6
0
 def test_write_580M(self):
     """Write and re-read an IndexBinaryMultiHash holding 580 million codes.

     The codes are random single-byte (8-bit) hashes. The index is written
     to ``/tmp/tmp.faiss`` and read back; nothing is asserted beyond the
     calls completing.
     """
     dim = 8
     nhash = 1
     num_million = 580 # changing to 570 works
     n_codes = num_million * 10 ** 6
     bytes_per_code = dim // 8
     index1 = faiss.IndexBinaryMultiHash(dim, nhash, dim // nhash)
     codes = np.random.randint(0, 256, (n_codes, bytes_per_code))
     random_hash_codes = codes.astype("uint8")
     index1.add(random_hash_codes)
     faiss.write_index_binary(index1, "/tmp/tmp.faiss")
     index2 = faiss.read_index_binary("/tmp/tmp.faiss")
Example #7
0
    def test_read_index_ownership(self):
        """An index returned by ``read_index_binary`` must be owned by Python.

        Writes a flat binary index over ``self.xb`` to a temporary file,
        reads it back and checks the SWIG ``thisown`` flag of the result.
        """
        # Binary index dimensions are expressed in bits; the arrays hold bytes.
        d = self.xq.shape[1] * 8

        index = faiss.IndexBinaryFlat(d)
        index.add(self.xb)

        fd, tmpnam = tempfile.mkstemp()
        # mkstemp returns an already-open descriptor; close it so it is not
        # leaked (faiss reopens the file by name).
        os.close(fd)
        try:
            faiss.write_index_binary(index, tmpnam)

            index2 = faiss.read_index_binary(tmpnam)

            assert index2.thisown
        finally:
            os.remove(tmpnam)
Example #8
0
def init_index():
    """Create the module-level ``index`` and fill it from ``get_all_data()``.

    ``trained.index`` is loaded when it can be read. Otherwise a stand-in
    256-bit ``IndexBinaryIVF`` with a single inverted list is built and
    trained on one all-zero vector. Every row of ``get_all_data()`` is then
    added with ``row[0]`` as the int64 id and ``row[1]`` as the vector.

    Side effects: rebinds the module-level ``index`` and prints a message.
    """
    global index
    try:
        index = faiss.read_index_binary("trained.index")
    except Exception:
        # Narrowed from a bare ``except`` so that Ctrl-C / SystemExit are not
        # swallowed and turned into "build the fallback index".
        d = 32 * 8
        quantizer = faiss.IndexBinaryFlat(d)
        index = faiss.IndexBinaryIVF(quantizer, d, 1)
        index.nprobe = 1
        index.train(np.zeros((1, 32), dtype=np.uint8))
    all_data = get_all_data()
    if len(all_data) != 0:
        # Only build the arrays when there is something to add.
        image_ids = np.array([np.int64(x[0]) for x in all_data])
        phashes = np.array([x[1] for x in all_data])
        index.add_with_ids(phashes, image_ids)
    print("Index is ready")
def dedup():
    """Delete duplicate images, keeping the one with the most pixels.

    Every row of ``get_all_data()`` is added to ``trained_import.index``
    under its positional id (0..n-1); ``file_id_to_file_name_map`` records
    which file name (``row[0]``) each id stands for. Each row is then looked
    up with ``phash_reverse_search``. When the result contains anything
    other than the queried image alone, the hits are ranked by
    ``width * height`` and all but the largest are removed from the index,
    from the descriptor store (``delete_descriptor_by_id``) and from
    ``IMAGE_PATH``.

    Side effects: rebinds the module-level ``index``, fills
    ``file_id_to_file_name_map``, prints progress, deletes files, and calls
    ``exit()`` when there is no data.
    """
    global index
    all_data = get_all_data()
    if len(all_data) == 0:
        print("all_data no images. exit()")
        exit()

    index = faiss.read_index_binary("trained_import.index")

    image_ids = np.arange(len(all_data), dtype=np.int64)
    for i, row in enumerate(all_data):
        file_id_to_file_name_map[i] = row[0]
    phashes = np.array([row[1] for row in all_data])
    index.add_with_ids(phashes, image_ids)
    print("Index is ready")

    # Names of files already removed; a set keeps the per-row check O(1).
    # NOTE(review): assumes file names are hashable (they are used as path
    # components below) -- confirm against get_all_data().
    deleted = set()
    for row in tqdm(all_data):
        file_name = row[0]
        if file_name in deleted:
            continue

        res = phash_reverse_search(row[1])
        # A result that contains only the queried image is not a duplicate.
        only_self = (len(res) == 1 and
                     file_id_to_file_name_map[res[0]] == file_name)
        if len(res) == 0 or only_self:
            continue

        images_id_res = []
        for img_id in res:
            width, height = imagesize.get(
                f'{IMAGE_PATH}/{file_id_to_file_name_map[img_id]}')
            images_id_res.append((img_id, width * height))
        print("===============")
        print(f"duplicates of {file_name}:")
        # Distinct loop names: the original reused ``x`` here and clobbered
        # the outer loop variable.
        for dup_id, pixels in images_id_res:
            print(f"{file_id_to_file_name_map[dup_id]} - {pixels} pixels")
        print("===============")
        images_id_res.sort(key=lambda item: item[1], reverse=True)
        # keep img with biggest resolution (skip image with most pixels)
        for img_id, _ in images_id_res[1:]:
            dup_name = file_id_to_file_name_map[img_id]
            print(f'deleting {dup_name}')
            index.remove_ids(np.int64([img_id]))
            deleted.add(dup_name)
            delete_descriptor_by_id(dup_name)
            remove(f'{IMAGE_PATH}/{dup_name}')
Example #10
0
    def test_flat(self):
        """An IndexBinaryFlat must return identical results after a disk round trip.

        Searches ``self.xq`` for 3 neighbours in a flat binary index over
        ``self.xb``, writes the index to a temporary file, reads it back and
        checks that the same search yields the same ids and distances.
        """
        # Binary index dimensions are expressed in bits; the arrays hold bytes.
        d = self.xq.shape[1] * 8

        index = faiss.IndexBinaryFlat(d)
        index.add(self.xb)
        D, I = index.search(self.xq, 3)

        fd, tmpnam = tempfile.mkstemp()
        # mkstemp returns an already-open descriptor; close it so it is not
        # leaked (faiss reopens the file by name).
        os.close(fd)
        try:
            faiss.write_index_binary(index, tmpnam)

            index2 = faiss.read_index_binary(tmpnam)

            D2, I2 = index2.search(self.xq, 3)

            assert (I2 == I).all()
            assert (D2 == D).all()

        finally:
            os.remove(tmpnam)
Example #11
0
    def test_binary_from_float(self):
        """Round-trip an IndexBinaryFromFloat through a file and compare searches.

        The binary index wraps an ``IndexHNSWFlat``; a 3-nearest-neighbour
        search over ``self.xq`` must give the same ids and distances before
        and after ``write_index_binary`` / ``read_index_binary``.
        """
        n_bits = self.xq.shape[1] * 8
        k = 3

        # Keep a named reference to the wrapped float index while it is used.
        hnsw = faiss.IndexHNSWFlat(n_bits, 16)
        original = faiss.IndexBinaryFromFloat(hnsw)
        original.add(self.xb)
        expected_dist, expected_ids = original.search(self.xq, k)

        handle, path = tempfile.mkstemp()
        os.close(handle)
        try:
            faiss.write_index_binary(original, path)
            restored = faiss.read_index_binary(path)
            got_dist, got_ids = restored.search(self.xq, k)

            assert (got_ids == expected_ids).all()
            assert (got_dist == expected_dist).all()
        finally:
            os.remove(path)
def init_index():
    """Create the module-level ``index`` and fill it with per-image features.

    ``trained.index`` is loaded when it can be read. Otherwise a temporary
    488-bit (61-byte) ``IndexBinaryIVF`` with a single inverted list is built
    and trained on one all-zero vector.

    For every id from ``get_all_ids()`` the stored features are fetched with
    ``get_akaze_features_by_id`` and converted with ``convert_array``. Each
    feature row receives a fresh consecutive int64 point id starting at the
    module-level ``POINT_ID``; the relation between point ids and image ids
    is recorded in ``point_id_to_image_id_map`` and
    ``image_id_to_point_ids_map``.

    Side effects: rebinds ``index``, advances ``POINT_ID``, fills the two
    mapping dicts and prints a message.
    """
    global index, POINT_ID
    try:
        index = faiss.read_index_binary("trained.index")
    except Exception:  # temporary index
        # Narrowed from a bare ``except`` so that Ctrl-C / SystemExit are not
        # swallowed and turned into "build the fallback index".
        d = 61 * 8
        quantizer = faiss.IndexBinaryFlat(d)
        index = faiss.IndexBinaryIVF(quantizer, d, 1)
        index.nprobe = 1
        index.train(np.zeros((1, 61), dtype=np.uint8))
    all_ids = get_all_ids()
    for image_id in tqdm(all_ids):
        features = convert_array(get_akaze_features_by_id(image_id))
        # One id per feature row, continuing from the global counter.
        point_ids = np.arange(start=POINT_ID,
                              stop=POINT_ID + len(features),
                              dtype=np.int64)
        for point_id in point_ids:
            point_id_to_image_id_map[point_id] = image_id
        image_id_to_point_ids_map[image_id] = point_ids
        POINT_ID += len(features)
        index.add_with_ids(features, point_ids)
    print("Index is ready")
Example #13
0
    def __init__(self, config):
        """Build the predictors and load the search index described by *config*.

        ``config['IndexProcess']`` must provide ``return_k`` and
        ``index_dir``; the directory must contain ``vector.index`` and
        ``id_map.pkl``. When ``binary_index`` is set (default ``False``) the
        index is read with the binary reader, otherwise with the regular one.

        Raises:
            AssertionError: if the ``IndexProcess`` section or either index
                file is missing.
        """
        self.config = config
        self.rec_predictor = RecPredictor(config)
        self.det_predictor = DetPredictor(config)

        assert 'IndexProcess' in config.keys(), "Index config not found ... "
        index_cfg = self.config['IndexProcess']
        self.return_k = index_cfg['return_k']

        index_dir = index_cfg["index_dir"]
        vector_path = os.path.join(index_dir, "vector.index")
        id_map_path = os.path.join(index_dir, "id_map.pkl")
        assert os.path.exists(vector_path), "vector.index not found ..."
        assert os.path.exists(id_map_path), "id_map.pkl not found ... "

        use_binary = config['IndexProcess'].get("binary_index", False)
        read = faiss.read_index_binary if use_binary else faiss.read_index
        self.Searcher = read(vector_path)

        # id_map.pkl is unpickled as-is; it is expected to come from the
        # same trusted index directory as vector.index.
        with open(id_map_path, "rb") as id_map_file:
            self.id_map = pickle.load(id_map_file)
Example #14
0
 def load_index(self, index_file):
     """Log the path, then read and return the binary faiss index stored there."""
     message = "load index from {}".format(index_file)
     logging.info(message)
     loaded = faiss.read_index_binary(index_file)
     return loaded
Example #15
0
def double_test():
    """Evaluate two indexes side by side on every image under ``test_img_dir``.

    If ``index_path`` does not exist, both index variants (ways 2 and 3) are
    first built with ``registry_index``. Each non-CSV file found by walking
    ``test_img_dir`` is searched in both indexes; a file counts as correct
    when ``show_results`` accepts the result of either index. Misses on files
    whose name contains ``rotate_gamma`` are counted separately per index and
    for the combined verdict.

    Prints the combined accuracy, the number of images, the three
    ``rotate_gamma`` miss counters, and the average wall-clock time per image
    (which includes index building and loading). When no image is found, a
    message is printed instead of dividing by zero.
    """
    way_index1 = 2
    way_index2 = 3
    start = time.time()
    if not os.path.exists(index_path):
        print('begin registry')
        registry_index(way_index1)
        registry_index(way_index2)
        print('registry complete')
    # NOTE(review): both index paths are empty strings here -- they look
    # like placeholders and must be filled in before this can run.
    index2 = faiss.read_index('')

    index1 = faiss.read_index_binary('')
    print('get index')
    FI1 = FaissIndex(index1, way_index1, False)
    FI2 = FaissIndex(index2, way_index2, False)

    print('begin serach')
    pred_bools = []
    i = 0
    rotated_gamma1 = 0
    rotated_gamma2 = 0
    rotated_gamma = 0
    # Names of non-rotated images that both indexes got wrong; collected so
    # they can be dumped to a file when debugging.
    wrong_results = []
    for d, _, fs in os.walk(test_img_dir):

        for f in fs:
            if f.endswith('csv'):
                continue
            img_path = join(d, f)
            is_rotated = 'rotate_gamma' in f
            results1 = FI1.search_by_image(img_path, TOP_N)
            results2 = FI2.search_by_image(img_path, TOP_N)
            print(f)
            print(results2[0]['neighbors'], results1[0]['neighbors'])
            pred_bool1 = show_results(f, results1, save_wrong=False)
            pred_bool2 = show_results(f, results2, save_wrong=False)
            if is_rotated and not pred_bool1:
                rotated_gamma1 += 1
            if is_rotated and not pred_bool2:
                rotated_gamma2 += 1

            # An image is counted as correct when either index finds it.
            pred_bool = pred_bool1 or pred_bool2
            pred_bools.append(pred_bool)
            if is_rotated and not pred_bool:
                rotated_gamma += 1
            i += 1
            if not pred_bool and not is_rotated:
                wrong_results.append('{}\n'.format(f))

    # with open('./wrong_results_merge_adddata.json', 'a+') as f:
    #     f.writelines(wrong_results)

    if not pred_bools:
        # Nothing was evaluated; the ratios below would divide by zero.
        print('no test images found')
        return

    print('acc is: ',
          sum(pred_bools) / len(pred_bools), len(pred_bools), rotated_gamma1,
          rotated_gamma2, rotated_gamma)
    end = time.time()
    avg_img = (end - start) / i
    print(avg_img, i)
Example #16
0
 def load_index(self, index_dir: str):
     """Read the binary faiss index stored as ``<index_dir>/index``.

     Returns a 2-tuple ``(index, None)``.
     """
     loaded = faiss.read_index_binary(os.path.join(index_dir, 'index'))
     return loaded, None