def test_remove_id_map(self):
    sub_index = faiss.IndexFlat(5)
    xb = np.zeros((10, 5), dtype='float32')
    xb[:, 0] = np.arange(10) + 1000
    index = faiss.IndexIDMap2(sub_index)
    index.add_with_ids(xb, np.arange(10) + 100)
    assert index.reconstruct(104)[0] == 1004
    index.remove_ids(np.array([103]))
    # vectors other than the removed one must still be reconstructible
    assert index.reconstruct(104)[0] == 1004
    try:
        index.reconstruct(103)
    except RuntimeError:
        pass
    else:
        assert False, 'should have raised an exception'
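
The test relies on IndexIDMap2 rather than plain IndexIDMap: the plain wrapper keeps the id mapping for search but does not keep an id-to-vector map, so reconstruct() is unavailable there. A minimal sketch of the difference (assumed behavior, not taken from the test suite):

import numpy as np
import faiss

d = 5
xb = np.random.rand(3, d).astype('float32')
ids = np.array([100, 101, 102], dtype=np.int64)

plain = faiss.IndexIDMap(faiss.IndexFlat(d))
plain.add_with_ids(xb, ids)
# plain.reconstruct(101) would raise: no id -> vector map is stored

with_map = faiss.IndexIDMap2(faiss.IndexFlat(d))
with_map.add_with_ids(xb, ids)
assert np.allclose(with_map.reconstruct(101), xb[1])
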
def __init__(self, dim: int, save_path: str, num_threads: int = None):
    """
    Constructor.

    :param dim: dimensionality of the indexed vectors
    :param save_path: path to a previously saved index; restored if the file exists
    :param num_threads: number of OpenMP threads faiss may use
    """
    self.dim = dim
    if num_threads is not None and num_threads > 0:
        faiss.omp_set_num_threads(num_threads)
    if isfile(save_path):
        logging.debug("restore: %s", save_path)
        self._index = faiss.read_index(save_path)
    else:
        self._sub_index = faiss.IndexFlat(dim)
        self._index = faiss.IndexIDMap2(self._sub_index)
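
The constructor only restores; the matching persistence step is faiss.write_index. A sketch of a hypothetical save method (the snippet does not store save_path on the instance, so it is taken as an argument here):

def save(self, save_path: str) -> None:
    # persist the id-mapped index so __init__ can restore it next time
    faiss.write_index(self._index, save_path)
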
def __init__(self, d, k, use_gpu=False, add_with_ids=False):
    """
    Initialize the index with the dimension of the vectors.

    :param d: dimension of the database and query vectors
    :param k: number of neighbors to search
    """
    self.d = d
    # brute-force L2 nearest-neighbor search
    self.index = faiss.IndexFlatL2(self.d)
    self.add_with_ids = add_with_ids
    if self.add_with_ids:
        # wrap the flat index so vectors can be added with explicit ids
        self.index = faiss.IndexIDMap2(self.index)
    self.use_gpu = use_gpu
    if self.use_gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = "0"
        self.convert_to_gpu()
    self.k = k
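
convert_to_gpu() is referenced but not shown. A minimal sketch using faiss's standard CPU-to-GPU cloner, assuming faiss-gpu is installed and add_with_ids is False (the GPU cloner does not support every wrapper type):

def convert_to_gpu(self):
    res = faiss.StandardGpuResources()  # scratch memory for a single GPU
    self.index = faiss.index_cpu_to_gpu(res, 0, self.index)  # device 0
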
def create_faiss_model(item_embedding, item_list, faiss_path, size=128, mode="train"):
    item_embedding = np.array(item_embedding, dtype=np.float32)
    ids = np.array(item_list).astype("int64")  # add_with_ids requires int64 ids
    if mode == "train":
        index = faiss.index_factory(size, "IVF100,Flat", faiss.METRIC_INNER_PRODUCT)
        index.nprobe = 20
        index.train(item_embedding)
        # build the direct map so reconstruct() can rebuild vectors later
        index.make_direct_map()
        index_id = faiss.IndexIDMap2(index)
    elif mode == "update":
        index_id = faiss.read_index(faiss_path)
    index_id.add_with_ids(item_embedding, ids)
    # save the index
    faiss.write_index(index_id, faiss_path)
    return index_id
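
A hypothetical round trip through create_faiss_model() above; the embeddings and ids are made up. reconstruct() on the reloaded index works because make_direct_map() was called on the IVF index before wrapping:

import numpy as np
import faiss

emb = np.random.rand(1000, 128).astype(np.float32)
create_faiss_model(emb, list(range(1000)), "items.index", size=128, mode="train")

index = faiss.read_index("items.index")
ivf = faiss.downcast_index(index.index)  # unwrap the IDMap to reach the IVF index
ivf.nprobe = 20
D, I = index.search(emb[:1], 10)       # I holds the external item ids
vec = index.reconstruct(int(I[0][0]))  # rebuild a stored embedding by id
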
def __init__(self, d=64, index_path='/workspace/zhiyi/data/faiss.index'):
    self.faiss_sub_index = faiss.IndexFlatL2(d)
    self.faiss_index = faiss.IndexIDMap2(self.faiss_sub_index)
    self.index_path = index_path
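
One property of IndexIDMap2 that suits this setup: an existing id can be updated by removing and re-adding it. A sketch of a hypothetical upsert method against the attributes defined above (the method name is illustrative):

import numpy as np

def upsert(self, vec, vec_id):
    # drop any previous vector stored under this id, then add the new one
    self.faiss_index.remove_ids(np.array([vec_id], dtype=np.int64))
    self.faiss_index.add_with_ids(np.asarray(vec, dtype='float32').reshape(1, -1),
                                  np.array([vec_id], dtype=np.int64))
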
def __init__(self, num_dimensions):
    self.num_dimensions = num_dimensions
    self.index = faiss.IndexFlatL2(num_dimensions)
    self.index_id_map = faiss.IndexIDMap2(self.index)
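
Hypothetical add and search helpers for the wrapper above; the method names and dtype coercion are illustrative, not part of the original class:

import numpy as np

def add(self, vectors, ids):
    # faiss expects float32 vectors and int64 ids
    self.index_id_map.add_with_ids(np.ascontiguousarray(vectors, dtype='float32'),
                                   np.ascontiguousarray(ids, dtype='int64'))

def search(self, queries, k=5):
    # returns (distances, external_ids); missing neighbors come back with id -1
    return self.index_id_map.search(np.ascontiguousarray(queries, dtype='float32'), k)
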
import numpy as np
import faiss

d = 64
nb = 100
np.random.seed(1234)
xb = np.random.random((nb, d)).astype('float32')
xb[:, 0] += np.arange(nb) / 1000.

_sub_index = faiss.IndexFlatL2(d)
index = faiss.IndexIDMap2(_sub_index)
print(index.is_trained)
index.add_with_ids(xb, np.arange(start=10, stop=10 + nb))
print(index.ntotal)

k = 4
D, I = index.search(xb[:5], k)  # I contains the external ids, not positions

# faiss has no get_ids(); read the stored ids from the id_map vector instead
ids = faiss.vector_to_array(index.id_map)
print(ids)
print(ids[0])
def delete_products(current_products, products_to_update):
    """
    Remove products that are absent from the new xml or whose files were
    deleted from the directory with additional images, and remove the
    corresponding vectors from the faiss index.

    :param current_products: dataframe built from the new xml file
    :param products_to_update: the current dataframe
    :return: products_to_update with the deleted products dropped
    """
    logging.info('Deleting products')
    idx_to_remove = []

    # drop products that are absent from the new xml: collect vendor codes of
    # the current model that no longer appear in the new xml
    products_to_update_vendor_code = set(products_to_update['vendor_code'])
    current_products_vendor_code = set(current_products['vendor_code'])
    products_to_update_vendor_code.difference_update(current_products_vendor_code)
    if products_to_update_vendor_code:
        for index, row in products_to_update.iterrows():
            if row['vendor_code'] in products_to_update_vendor_code:
                idx_to_remove.append(index)
                products_to_update.drop(index, inplace=True)

    # drop rows whose image files no longer exist in the directory
    current_product_files = set(glob.glob(config.PATH_TO_PRODUCT_FOLDER + '/*/*/*'))
    product_files_to_update = set(
        products_to_update[~products_to_update['picture'].str.startswith('http')]
        ['picture'].values)
    product_files_to_update.difference_update(current_product_files)
    if product_files_to_update:
        for file in product_files_to_update:
            index = products_to_update[products_to_update['picture'] == file].index[0]
            products_to_update.drop(index, inplace=True)
            idx_to_remove.append(index)
    products_to_update.reset_index(inplace=True, drop=True)

    # if nothing was deleted, return the dataframe unchanged
    if not idx_to_remove:
        logging.info('No products to delete')
        return products_to_update

    # remove the corresponding vectors from faiss
    logging.info(f'Removing vectors from faiss [{len(idx_to_remove)} rows]')
    index = faiss.read_index(config.PATH_TO_FAISS_INDEX)
    idx_to_remove_set = set(idx_to_remove)
    vectors = [index.reconstruct(i) for i in range(index.ntotal)]
    vectors_without_removed = [
        vectors[i] for i in range(len(vectors)) if i not in idx_to_remove_set
    ]
    updated_index = faiss.IndexFlatL2(2048)
    updated_index = faiss.IndexIDMap2(updated_index)
    updated_index.add_with_ids(
        np.vstack(vectors_without_removed),
        np.arange(len(vectors_without_removed), dtype='int64'))
    logging.info('Writing')
    faiss.write_index(updated_index, config.PATH_TO_FAISS_INDEX)
    products_to_update.to_pickle(config.PATH_TO_PRODUCT_DATASET)
    assert updated_index.ntotal == products_to_update.shape[0]
    logging.info(f'Deleted {len(idx_to_remove)} rows')
    return products_to_update
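
Because the dataframe rows are renumbered by reset_index, the function above rebuilds the whole index so that faiss ids stay aligned with row positions. If contiguous ids were not required, IndexIDMap2.remove_ids would avoid reconstructing every vector; a minimal sketch under that assumption:

import numpy as np
import faiss

index = faiss.read_index(config.PATH_TO_FAISS_INDEX)
index.remove_ids(np.asarray(idx_to_remove, dtype='int64'))  # leaves gaps in the id space
faiss.write_index(index, config.PATH_TO_FAISS_INDEX)
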
from pydantic import BaseModel
from fastapi import FastAPI, File, Form, HTTPException, Response, status
import faiss
from os import listdir
import numpy as np
from tqdm import tqdm
import cv2
import sqlite3
import io

conn = sqlite3.connect('rgb_histograms.db')
IMAGE_PATH = "./../../../public/images"

# L1 distance over 4096-dim RGB histograms, wrapped so image ids can be attached
sub_index = faiss.IndexFlat(4096, faiss.METRIC_L1)
index_id_map = faiss.IndexIDMap2(sub_index)


def init_index():
    all_ids = get_all_ids()
    for image_id in tqdm(all_ids):
        features = convert_array(get_rgb_histogram_by_id(image_id))
        index_id_map.add_with_ids(np.array([features]), np.int64([image_id]))
    print("Index is ready")


def read_img_file(image_data):
    # np.fromstring is deprecated for binary input; frombuffer is the replacement
    return np.frombuffer(image_data, np.uint8)
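
A hypothetical search helper on top of the index populated in init_index(); the function name and default k are illustrative:

def search_similar(features, k=10):
    query = np.asarray(features, dtype='float32').reshape(1, -1)  # 4096-dim histogram
    distances, image_ids = index_id_map.search(query, k)
    # pair each returned image id with its L1 distance
    return list(zip(image_ids[0].tolist(), distances[0].tolist()))
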
def build(self, config):
    '''
    build index from scratch
    '''
    operation_method = config.get("index_operation", "new").lower()

    gallery_images, gallery_docs = split_datafile(
        config['data_file'], config['image_root'], config['delimiter'])

    # when removing data from the index, there is no need to extract features
    if operation_method != "remove":
        gallery_features = self._extract_features(gallery_images, config)
    assert operation_method in [
        "new", "remove", "append"
    ], "Only append, remove and new operation are supported"

    # vector.index: faiss index file
    # id_map.pkl: maps id to image_doc
    if operation_method in ["remove", "append"]:
        # if remove or append, vector.index and id_map.pkl must exist
        assert os.path.exists(
            os.path.join(config["index_dir"], "vector.index")
        ), "The vector.index does not exist in {} when 'index_operation' is not None".format(
            config["index_dir"])
        assert os.path.exists(
            os.path.join(config["index_dir"], "id_map.pkl")
        ), "The id_map.pkl does not exist in {} when 'index_operation' is not None".format(
            config["index_dir"])
        index = faiss.read_index(
            os.path.join(config["index_dir"], "vector.index"))

        with open(os.path.join(config["index_dir"], "id_map.pkl"), 'rb') as fd:
            ids = pickle.load(fd)
        assert index.ntotal == len(ids.keys(
        )), "data number in index is not equal to data number in id_map"
    else:
        if not os.path.exists(config["index_dir"]):
            os.makedirs(config["index_dir"], exist_ok=True)
        index_method = config.get("index_method", "HNSW32")

        # if IVF method, compute the number of inverted lists automatically
        if index_method == "IVF":
            index_method = index_method + str(
                min(int(len(gallery_images) // 8), 65536)) + ",Flat"

        # for a binary index, prepend B to index_method
        if config["dist_type"] == "hamming":
            index_method = "B" + index_method

        # dist_type
        dist_type = faiss.METRIC_INNER_PRODUCT if config[
            "dist_type"] == "IP" else faiss.METRIC_L2

        # build index
        if config["dist_type"] == "hamming":
            index = faiss.index_binary_factory(config["embedding_size"],
                                               index_method)
        else:
            index = faiss.index_factory(config["embedding_size"],
                                        index_method, dist_type)
            index = faiss.IndexIDMap2(index)
        ids = {}

    if config["index_method"] == "HNSW32":
        logger.warning(
            "The HNSW32 method does not support the 'remove' operation")

    if operation_method != "remove":
        # calculate ids for the new data
        start_id = max(ids.keys()) + 1 if ids else 0
        ids_now = (
            np.arange(0, len(gallery_images)) + start_id).astype(np.int64)

        # only train when building a new index file
        if operation_method == "new":
            if config["dist_type"] == "hamming":
                index.add(gallery_features)
            else:
                index.train(gallery_features)

        if not config["dist_type"] == "hamming":
            index.add_with_ids(gallery_features, ids_now)

        for i, d in zip(list(ids_now), gallery_docs):
            ids[i] = d
    else:
        if config["index_method"] == "HNSW32":
            raise RuntimeError(
                "The index_method HNSW32 does not support the 'remove' operation")
        # remove entries from id_map and the corresponding vectors from the faiss index
        remove_ids = list(
            filter(lambda k: ids.get(k) in gallery_docs, ids.keys()))
        remove_ids = np.asarray(remove_ids)
        index.remove_ids(remove_ids)
        for k in remove_ids:
            del ids[k]

    # store the faiss index file and the id_map file
    if config["dist_type"] == "hamming":
        faiss.write_index_binary(
            index, os.path.join(config["index_dir"], "vector.index"))
    else:
        faiss.write_index(
            index, os.path.join(config["index_dir"], "vector.index"))

    with open(os.path.join(config["index_dir"], "id_map.pkl"), 'wb') as fd:
        pickle.dump(ids, fd)
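
A hypothetical query-side counterpart to build(); it assumes the same index_dir layout (vector.index plus id_map.pkl) and a float (non-hamming) index:

import os
import pickle

import faiss
import numpy as np

def load_and_search(index_dir, query_features, k=5):
    index = faiss.read_index(os.path.join(index_dir, "vector.index"))
    with open(os.path.join(index_dir, "id_map.pkl"), 'rb') as fd:
        id_map = pickle.load(fd)
    distances, ids = index.search(np.asarray(query_features, dtype='float32'), k)
    # map faiss ids back to the stored image docs; -1 marks an empty slot
    return [(id_map[int(i)], float(d))
            for i, d in zip(ids[0], distances[0]) if int(i) != -1]
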