def dedup():
    global index
    all_data = get_all_data()
    if len(all_data) == 0:
        print("all_data no images. exit()")
        exit()
    # prefer the main index; fall back to the import index if it is missing
    try:
        index = faiss.read_index_binary("trained.index")
    except Exception:
        index = faiss.read_index_binary("trained_import.index")
    image_ids = np.array([np.int64(x[0]) for x in all_data])
    phashes = np.array([x[1] for x in all_data])
    index.add_with_ids(phashes, image_ids)
    print("Index is ready")
    import_all_data = import_get_all_data()
    if len(import_all_data) == 0:
        print("import_all_data no images. exit()")
        exit()
    # delete every imported image whose pHash already matches an indexed image
    for x in tqdm(import_all_data):
        filename = x[0]
        features = x[1]
        res = phash_reverse_search(features)
        if len(res) != 0:
            print(f'duplicate {filename} - {res}')
            print(f'deleting {filename}')
            remove(f'{IMAGE_PATH}/{filename}')
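# Hypothetical sketch of the phash_reverse_search helper used above, which is
# not shown in the original. Assuming the module-level `index` and imports from
# the surrounding snippets, it queries the binary index with one packed pHash
# and keeps neighbors within a Hamming-distance cutoff; k and the threshold
# value are illustrative assumptions, not the author's settings.
def phash_reverse_search(features, k=5, distance_threshold=8):
    query = np.asarray(features, dtype=np.uint8).reshape(1, -1)
    distances, ids = index.search(query, k)  # Hamming distances on packed bits
    # ids of -1 mark empty result slots; drop them and apply the cutoff
    return [int(i) for d, i in zip(distances[0], ids[0])
            if i != -1 and d <= distance_threshold]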
def test_remove_id_map_binary(self):
    sub_index = faiss.IndexBinaryFlat(40)
    xb = np.zeros((10, 5), dtype='uint8')
    xb[:, 0] = np.arange(10) + 100
    index = faiss.IndexBinaryIDMap2(sub_index)
    index.add_with_ids(xb, np.arange(10) + 1000)
    assert index.reconstruct(1004)[0] == 104
    index.remove_ids(np.array([1003]))
    assert index.reconstruct(1004)[0] == 104
    try:
        index.reconstruct(1003)
    except:
        pass
    else:
        assert False, 'should have raised an exception'

    # while we are there, let's test I/O as well...
    _, tmpnam = tempfile.mkstemp()
    try:
        faiss.write_index_binary(index, tmpnam)
        index = faiss.read_index_binary(tmpnam)
    finally:
        os.remove(tmpnam)

    assert index.reconstruct(1004)[0] == 104
    try:
        index.reconstruct(1003)
    except:
        pass
    else:
        assert False, 'should have raised an exception'
def test_ivf_flat(self):
    d = self.xq.shape[1] * 8
    quantizer = faiss.IndexBinaryFlat(d)
    index = faiss.IndexBinaryIVF(quantizer, d, 8)
    index.cp.min_points_per_centroid = 5  # quiet warning
    index.nprobe = 4
    index.train(self.xt)
    index.add(self.xb)
    D, I = index.search(self.xq, 3)

    _, tmpnam = tempfile.mkstemp()
    try:
        faiss.write_index_binary(index, tmpnam)
        index2 = faiss.read_index_binary(tmpnam)
        D2, I2 = index2.search(self.xq, 3)
        assert (I2 == I).all()
        assert (D2 == D).all()
    finally:
        os.remove(tmpnam)
def dedup():
    global index, POINT_ID
    try:
        index = faiss.read_index_binary("trained_import.index")
    except Exception:
        print("trained_import.index not found. exit()")
        exit()
    all_data = import_get_all_data()
    # add every AKAZE descriptor under a fresh point id and remember which
    # image each point belongs to
    for obj in tqdm(all_data):
        image_id = obj[0]
        features = obj[1]
        point_ids = np.arange(start=POINT_ID, stop=POINT_ID + len(features),
                              dtype=np.int64)
        for point_id in point_ids:
            point_id_to_image_id_map[point_id] = image_id
        image_id_to_point_ids_map[image_id] = point_ids
        POINT_ID += len(features)
        index.add_with_ids(features, point_ids)
    print("Index is ready")
    deleted = []
    for x in tqdm(all_data):
        filename = x[0]
        if filename in deleted:
            continue
        features = x[1]
        res = akaze_reverse_search(features, filename)
        if len(res) > 0 and res[0] != filename:
            print("=============")
            print(filename)
            print(res[0])
            print("=============")
            deleted.extend(res)
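# Hypothetical sketch of the akaze_reverse_search helper used above, which is
# not shown in the original. Assuming the `index` and point_id_to_image_id_map
# globals built in these snippets, it searches every AKAZE descriptor of the
# query image, maps matching point ids back to image ids, and returns images
# with enough descriptor votes; k, the distance cutoff, and min_votes are
# illustrative assumptions.
def akaze_reverse_search(features, filename, k=2, distance_threshold=64,
                         min_votes=10):
    votes = {}
    distances, ids = index.search(np.asarray(features, dtype=np.uint8), k)
    for d_row, i_row in zip(distances, ids):
        for d, point_id in zip(d_row, i_row):
            if point_id == -1 or d > distance_threshold:
                continue  # empty slot or too far in Hamming space
            image_id = point_id_to_image_id_map[int(point_id)]
            votes[image_id] = votes.get(image_id, 0) + 1
    # strongest match first
    return [img for img, v in sorted(votes.items(), key=lambda kv: kv[1],
                                     reverse=True) if v >= min_votes]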
def load(self, path1, path2):
    '''only faiss needs this procedure'''
    if self.binary:
        self.searcher = faiss.read_index_binary(path1)
    else:
        self.searcher = faiss.read_index(path1)
    with open(path2, 'rb') as f:
        self.corpus = joblib.load(f)
def test_write_580M(self):
    dim = 8
    nhash = 1
    num_million = 580  # changing to 570 works
    index1 = faiss.IndexBinaryMultiHash(dim, nhash, int(dim / nhash))
    random_hash_codes = np.random.randint(
        0, 256, (num_million * int(1e6), int(dim / 8))).astype("uint8")
    index1.add(random_hash_codes)
    faiss.write_index_binary(index1, "/tmp/tmp.faiss")
    index2 = faiss.read_index_binary("/tmp/tmp.faiss")
def test_read_index_ownership(self):
    d = self.xq.shape[1] * 8
    index = faiss.IndexBinaryFlat(d)
    index.add(self.xb)

    _, tmpnam = tempfile.mkstemp()
    try:
        faiss.write_index_binary(index, tmpnam)
        index2 = faiss.read_index_binary(tmpnam)
        assert index2.thisown
    finally:
        os.remove(tmpnam)
def init_index():
    global index
    try:
        index = faiss.read_index_binary("trained.index")
    except Exception:
        # no trained index on disk: build a minimal single-cluster IVF index
        # so the rest of the pipeline still works
        d = 32 * 8
        quantizer = faiss.IndexBinaryFlat(d)
        index = faiss.IndexBinaryIVF(quantizer, d, 1)
        index.nprobe = 1
        index.train(np.array([np.zeros(32)], dtype=np.uint8))
    all_data = get_all_data()
    if len(all_data) != 0:
        image_ids = np.array([np.int64(x[0]) for x in all_data])
        phashes = np.array([x[1] for x in all_data])
        index.add_with_ids(phashes, image_ids)
    print("Index is ready")
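# A minimal sketch of how a file like "trained.index" could be produced, so
# the read_index_binary call above has something to load. The nlist value and
# the existence of a training sample of real 32-byte pHashes are assumptions;
# this is not the original project's training code.
def train_and_save(training_phashes, nlist=64):
    d = 32 * 8
    quantizer = faiss.IndexBinaryFlat(d)
    ivf = faiss.IndexBinaryIVF(quantizer, d, nlist)
    ivf.train(np.asarray(training_phashes, dtype=np.uint8))  # learn centroids
    faiss.write_index_binary(ivf, "trained.index")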
def dedup():
    global index
    all_data = get_all_data()
    if len(all_data) == 0:
        print("all_data no images. exit()")
        exit()
    index = faiss.read_index_binary("trained_import.index")
    image_ids = np.arange(len(all_data), dtype=np.int64)
    for i in range(len(all_data)):
        file_id_to_file_name_map[i] = all_data[i][0]
    phashes = np.array([x[1] for x in all_data])
    index.add_with_ids(phashes, image_ids)
    print("Index is ready")
    deleted = []
    for x in tqdm(all_data):
        if x[0] in deleted:
            continue
        res = phash_reverse_search(x[1])
        # ignore the trivial case where the only hit is the image itself
        if len(res) != 0 and not (len(res) == 1 and file_id_to_file_name_map[res[0]] == x[0]):
            images_id_res = []
            for img_id in res:
                width, height = imagesize.get(
                    f'{IMAGE_PATH}/{file_id_to_file_name_map[img_id]}')
                images_id_res.append((img_id, width * height))
            print("===============")
            print(f"duplicates of {x[0]}:")
            for img_id, pixels in images_id_res:
                print(f"{file_id_to_file_name_map[img_id]} - {pixels} pixels")
            print("===============")
            images_id_res.sort(key=lambda pair: pair[1], reverse=True)
            # keep the image with the most pixels (index 0), delete the rest
            for i in range(1, len(images_id_res)):
                img_id = images_id_res[i][0]
                print(f'deleting {file_id_to_file_name_map[img_id]}')
                index.remove_ids(np.int64([img_id]))
                deleted.append(file_id_to_file_name_map[img_id])
                delete_descriptor_by_id(file_id_to_file_name_map[img_id])
                remove(f'{IMAGE_PATH}/{file_id_to_file_name_map[img_id]}')
def test_flat(self):
    d = self.xq.shape[1] * 8
    index = faiss.IndexBinaryFlat(d)
    index.add(self.xb)
    D, I = index.search(self.xq, 3)

    _, tmpnam = tempfile.mkstemp()
    try:
        faiss.write_index_binary(index, tmpnam)
        index2 = faiss.read_index_binary(tmpnam)
        D2, I2 = index2.search(self.xq, 3)
        assert (I2 == I).all()
        assert (D2 == D).all()
    finally:
        os.remove(tmpnam)
def test_binary_from_float(self):
    d = self.xq.shape[1] * 8
    float_index = faiss.IndexHNSWFlat(d, 16)
    index = faiss.IndexBinaryFromFloat(float_index)
    index.add(self.xb)
    D, I = index.search(self.xq, 3)

    fd, tmpnam = tempfile.mkstemp()
    os.close(fd)
    try:
        faiss.write_index_binary(index, tmpnam)
        index2 = faiss.read_index_binary(tmpnam)
        D2, I2 = index2.search(self.xq, 3)
        assert (I2 == I).all()
        assert (D2 == D).all()
    finally:
        os.remove(tmpnam)
def init_index():
    global index, POINT_ID
    try:
        index = faiss.read_index_binary("trained.index")
    except Exception:
        # temporary single-cluster index used until a trained one exists
        d = 61 * 8
        quantizer = faiss.IndexBinaryFlat(d)
        index = faiss.IndexBinaryIVF(quantizer, d, 1)
        index.nprobe = 1
        index.train(np.array([np.zeros(61)], dtype=np.uint8))
    all_ids = get_all_ids()
    for image_id in tqdm(all_ids):
        features = convert_array(get_akaze_features_by_id(image_id))
        point_ids = np.arange(start=POINT_ID, stop=POINT_ID + len(features),
                              dtype=np.int64)
        for point_id in point_ids:
            point_id_to_image_id_map[point_id] = image_id
        image_id_to_point_ids_map[image_id] = point_ids
        POINT_ID += len(features)
        index.add_with_ids(features, point_ids)
    print("Index is ready")
def __init__(self, config):
    self.config = config
    self.rec_predictor = RecPredictor(config)
    self.det_predictor = DetPredictor(config)

    assert 'IndexProcess' in config.keys(), "Index config not found ..."
    self.return_k = self.config['IndexProcess']['return_k']
    index_dir = self.config["IndexProcess"]["index_dir"]

    assert os.path.exists(os.path.join(
        index_dir, "vector.index")), "vector.index not found ..."
    assert os.path.exists(os.path.join(
        index_dir, "id_map.pkl")), "id_map.pkl not found ..."

    if config['IndexProcess'].get("binary_index", False):
        self.Searcher = faiss.read_index_binary(
            os.path.join(index_dir, "vector.index"))
    else:
        self.Searcher = faiss.read_index(
            os.path.join(index_dir, "vector.index"))

    with open(os.path.join(index_dir, "id_map.pkl"), "rb") as fd:
        self.id_map = pickle.load(fd)
def load_index(self, index_file):
    logging.info("load index from {}".format(index_file))
    return faiss.read_index_binary(index_file)
def double_test():
    way_index1 = 2
    way_index2 = 3
    start = time.time()
    if not os.path.exists(index_path):
        print('begin registry')
        registry_index(way_index1)
        registry_index(way_index2)
        print('registry complete')
    index2 = faiss.read_index('')
    index1 = faiss.read_index_binary('')
    print('get index')
    FI1 = FaissIndex(index1, way_index1, False)
    FI2 = FaissIndex(index2, way_index2, False)
    print('begin search')
    pred_bools = []
    i = 0
    rotated_gamma1 = 0
    rotated_gamma2 = 0
    rotated_gamma = 0
    wrong_results = ''
    for d, _, fs in os.walk(test_img_dir):
        for f in fs:
            if f.endswith('csv'):
                continue
            results1 = FI1.search_by_image(join(d, f), TOP_N)
            results2 = FI2.search_by_image(join(d, f), TOP_N)
            print(f)
            print(results2[0]['neighbors'], results1[0]['neighbors'])
            pred_bool1 = show_results(f, results1, save_wrong=False)
            pred_bool2 = show_results(f, results2, save_wrong=False)
            if 'rotate_gamma' in f and not pred_bool1:
                rotated_gamma1 += 1
            if 'rotate_gamma' in f and not pred_bool2:
                rotated_gamma2 += 1
            # a query counts as correct if either index finds it
            pred_bool = pred_bool1 or pred_bool2
            pred_bools.append(pred_bool)
            if 'rotate_gamma' in f and not pred_bool:
                rotated_gamma += 1
            i += 1
            # if not pred_bool and 'rotate_gamma' in f:
            #     wrong_results += '{}\n'.format(f)
            if not pred_bool and 'rotate_gamma' not in f:
                wrong_results += '{}\n'.format(f)
            # i += 1  # dir
            # if i == 20000:
            #     break
    # with open('./wrong_results_merge_adddata.json', 'a+') as f:
    #     f.writelines(wrong_results)
    print('acc is: ', sum(pred_bools) / len(pred_bools), len(pred_bools),
          rotated_gamma1, rotated_gamma2, rotated_gamma)
    end = time.time()
    avg_img = (end - start) / i
    print(avg_img, i)
def load_index(self, index_dir: str):
    index_path = os.path.join(index_dir, 'index')
    index = faiss.read_index_binary(index_path)
    return index, None
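# A minimal self-contained round-trip of the API exercised throughout these
# examples: build a binary index over packed 64-bit codes, persist it with
# write_index_binary, and load it back with read_index_binary. The sizes and
# the temp-file name are illustrative.
import os
import tempfile

import faiss
import numpy as np

d = 64                                            # dimension in bits (multiple of 8)
codes = np.random.randint(0, 256, size=(1000, d // 8)).astype('uint8')

index = faiss.IndexBinaryFlat(d)
index.add(codes)

fd, path = tempfile.mkstemp(suffix='.index')
os.close(fd)
try:
    faiss.write_index_binary(index, path)
    restored = faiss.read_index_binary(path)
    D, I = restored.search(codes[:5], 3)          # Hamming distances, neighbor ids
    assert (D[:, 0] == 0).all()                   # each vector matches itself exactly
finally:
    os.remove(path)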