def do_mmappedIO(self, sparse):
    d = 10
    nb = 1000
    nq = 200
    nt = 200
    xt, xb, xq = get_dataset_2(d, nb, nt, nq)

    if sparse:
        fname = "/tmp/faiss_test_rareio_sparse.faissindex"
    else:
        fname = "/tmp/faiss_test_rareio_full.faissindex"

    quantizer = faiss.IndexFlatL2(d)
    index1 = faiss.IndexIVFFlat(quantizer, d, 20)
    if sparse:
        # makes the inverted lists sparse because all elements get
        # assigned to the same invlist
        xt += (np.ones(10) * 1000).astype('float32')
    index1.train(xt)
    index1.add(xb)
    faiss.write_index(index1, fname)

    index2 = faiss.read_index(fname)
    self.compare_results(index1, index2, xq)

    index3 = faiss.read_index(fname, faiss.IO_FLAG_MMAP)
    self.compare_results(index1, index3, xq)
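# For quick reference, the same mmap round trip as a standalone sketch,
# outside the test harness (the path and the random data are placeholders):
import faiss
import numpy as np

d = 10
xb = np.random.rand(1000, d).astype('float32')

quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, 20)
index.train(xb)
index.add(xb)
faiss.write_index(index, "/tmp/demo.faissindex")

# memory-map the file instead of loading the inverted lists into RAM
index_mmap = faiss.read_index("/tmp/demo.faissindex", faiss.IO_FLAG_MMAP)
D, I = index_mmap.search(xb[:5], 4)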
def get_trained_index():
    filename = "%s/%s_%s_trained.index" % (
        tmpdir, dbname, index_key)

    if not os.path.exists(filename):
        index = faiss.index_factory(d, index_key)

        n_train = choose_train_size(index_key)

        xtsub = xt[:n_train]
        print("Keeping %d train vectors" % xtsub.shape[0])
        # make sure the data is actually in RAM and in float
        xtsub = xtsub.astype('float32').copy()
        index.verbose = True

        t0 = time.time()
        index.train(xtsub)
        index.verbose = False
        print("train done in %.3f s" % (time.time() - t0))
        print("storing", filename)
        faiss.write_index(index, filename)
    else:
        print("loading", filename)
        index = faiss.read_index(filename)
    return index
def get_populated_index(preproc):

    if not index_cachefile or not os.path.exists(index_cachefile):
        if not altadd:
            gpu_index, indexall = compute_populated_index(preproc)
        else:
            gpu_index, indexall = compute_populated_index_2(preproc)
        if index_cachefile:
            print("store", index_cachefile)
            faiss.write_index(indexall, index_cachefile)
    else:
        print("load", index_cachefile)
        indexall = faiss.read_index(index_cachefile)
        gpu_index = None

    co = faiss.GpuMultipleClonerOptions()
    co.useFloat16 = use_float16
    co.useFloat16CoarseQuantizer = False
    co.usePrecomputed = use_precomputed_tables
    co.indicesOptions = 0
    co.verbose = 10
    co.shard = True  # the replicas will be made "manually"
    t0 = time.time()
    print("CPU index contains %d vectors, move to GPU" % indexall.ntotal)

    if replicas == 1:
        if not gpu_index:
            print("copying loaded index to GPUs")
            vres, vdev = make_vres_vdev()
            index = faiss.index_cpu_to_gpu_multiple(
                vres, vdev, indexall, co)
        else:
            index = gpu_index
    else:
        del gpu_index  # we override the GPU index

        print("Copy CPU index to %d sharded GPU indexes" % replicas)

        index = faiss.IndexProxy()

        for i in range(replicas):
            gpu0 = ngpu * i // replicas
            gpu1 = ngpu * (i + 1) // replicas
            vres, vdev = make_vres_vdev(gpu0, gpu1)

            print("   dispatch to GPUs %d:%d" % (gpu0, gpu1))

            index1 = faiss.index_cpu_to_gpu_multiple(
                vres, vdev, indexall, co)
            index1.this.disown()
            index.addIndex(index1)
        index.own_fields = True

    del indexall
    print("move to GPU done in %.3f s" % (time.time() - t0))
    return index
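# Note: when the default cloner options suffice, the multi-GPU copy above can
# be collapsed into one call; a minimal sketch (assumes `indexall` is a
# trained CPU index and at least one GPU is visible):
co = faiss.GpuMultipleClonerOptions()
co.shard = True
index_gpu = faiss.index_cpu_to_all_gpus(indexall, co)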
def get_populated_index():
    filename = "%s/%s_%s_populated.index" % (
        tmpdir, dbname, index_key)

    if not os.path.exists(filename):
        index = get_trained_index()
        i0 = 0
        t0 = time.time()
        for xs in matrix_slice_iterator(xb, 100000):
            i1 = i0 + xs.shape[0]
            print('\radd %d:%d, %.3f s' % (i0, i1, time.time() - t0), end=' ')
            sys.stdout.flush()
            index.add(xs)
            i0 = i1
        print()
        print("Add done in %.3f s" % (time.time() - t0))
        print("storing", filename)
        faiss.write_index(index, filename)
    else:
        print("loading", filename)
        index = faiss.read_index(filename)
    return index
def test_dedup(self):
    d = 10
    nb = 1000
    nq = 200
    nt = 500
    xt, xb, xq = get_dataset_2(d, nt, nb, nq)

    # introduce duplicates
    xb[500:900:2] = xb[501:901:2]
    xb[901::4] = xb[900::4]
    xb[902::4] = xb[900::4]
    xb[903::4] = xb[900::4]

    # also in the train set
    xt[201::2] = xt[200::2]

    quantizer = faiss.IndexFlatL2(d)
    index_new = faiss.IndexIVFFlatDedup(quantizer, d, 20)

    index_new.verbose = True
    # should display
    # IndexIVFFlatDedup::train: train on 350 points after dedup (was 500 points)
    index_new.train(xt)

    index_ref = faiss.IndexIVFFlat(quantizer, d, 20)
    assert index_ref.is_trained

    index_ref.nprobe = 5
    index_ref.add(xb)
    index_new.nprobe = 5
    index_new.add(xb)

    Dref, Iref = index_ref.search(xq, 20)
    Dnew, Inew = index_new.search(xq, 20)

    for i in range(nq):
        ref = self.normalize_res(Dref[i], Iref[i])
        new = self.normalize_res(Dnew[i], Inew[i])
        assert ref == new

    # test I/O
    _, tmpfile = tempfile.mkstemp()
    try:
        faiss.write_index(index_new, tmpfile)
        index_st = faiss.read_index(tmpfile)
    finally:
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
    Dst, Ist = index_st.search(xq, 20)

    for i in range(nq):
        new = self.normalize_res(Dnew[i], Inew[i])
        st = self.normalize_res(Dst[i], Ist[i])
        assert st == new

    # test remove
    toremove = np.hstack((np.arange(3, 1000, 5), np.arange(850, 950)))
    index_ref.remove_ids(toremove)
    index_new.remove_ids(toremove)

    Dref, Iref = index_ref.search(xq, 20)
    Dnew, Inew = index_new.search(xq, 20)

    for i in range(nq):
        ref = self.normalize_res(Dref[i], Iref[i])
        new = self.normalize_res(Dnew[i], Inew[i])
        assert ref == new
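# A condensed illustration of the dedup behavior exercised above; the sizes
# and the random data are arbitrary placeholders, not the test's fixtures:
import faiss
import numpy as np

d = 10
xb = np.random.rand(100, d).astype('float32')
xb[50:] = xb[:50]                  # make half the vectors duplicates

quantizer = faiss.IndexFlatL2(d)
dedup = faiss.IndexIVFFlatDedup(quantizer, d, 4)
dedup.train(xb)
dedup.add(xb)                      # duplicate codes are stored only once
D, I = dedup.search(xb[:3], 5)     # duplicated ids still appear in results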
similarities = scores[0]
neighbor_ids = scores[1]
margin_scores = scores[2]
current_embeddings_ids = scores[3]
with open(output_file, 'a') as output:
    for idx in range(similarities.shape[0]):
        result = str(similarities[idx].tolist()) + "\t" \
            + str(neighbor_ids[idx].tolist()) + "\t" \
            + str(margin_scores[idx].tolist()) + "\t" \
            + str(current_embeddings_ids[idx])
        output.write(result)
        output.write("\n")


if __name__ == "__main__":
    argument_parser = argparse.ArgumentParser()
    argument_parser.add_argument("--index", help="Searchable index file")
    argument_parser.add_argument("--embeddings", help="Embeddings for sentences")
    argument_parser.add_argument("--batch-size", help="Batch size for searching")
    argument_parser.add_argument("--output", help="Output file")
    argument_parser.add_argument("--neighbors", help="Number of nearest neighbors")
    arguments = argument_parser.parse_args()

    index = faiss.read_index(arguments.index)
    index = faiss.index_cpu_to_all_gpus(index)

    k = int(arguments.neighbors)
    write_results_to_file(arguments.output, index, arguments.embeddings,
                          int(arguments.batch_size), k)
def __init_indexes(self):
    for fname in self.base_dir.glob(self.pattern):
        print(fname)
        idx = fname.stem.split('_')[-1]
        self.indexes[int(idx)] = faiss.read_index(str(fname))
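# A hedged sketch of producing shard files that __init_indexes above can pick
# up; the directory, the glob pattern, and the "_<id>" naming are assumptions:
import faiss
import numpy as np
from pathlib import Path

base_dir = Path("/tmp/shards")
base_dir.mkdir(exist_ok=True)
d = 64
for shard_id in range(4):
    xb = np.random.rand(1000, d).astype('float32')
    shard = faiss.IndexFlatL2(d)
    shard.add(xb)
    # the file stem ends in "_<id>", matching the split('_')[-1] parsing above
    faiss.write_index(shard, str(base_dir / ("index_%d.faiss" % shard_id)))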
def merge_indexes(subindex_dir, trained_index_path, target_index_path,
                  target_idx2id_path, target_inv_path):
    # target_inv_path = merged_index.ivfdata
    names = os.listdir(subindex_dir)
    idx2id_paths = [
        os.path.join(subindex_dir, name) for name in names
        if name.endswith('.hdf5')
    ]
    index_paths = [
        os.path.join(subindex_dir, name) for name in names
        if name.endswith('.faiss')
    ]
    print(len(idx2id_paths))
    print(len(index_paths))

    print('copying idx2id')
    with h5py.File(target_idx2id_path, 'w') as out:
        for idx2id_path in tqdm(idx2id_paths, desc='copying idx2id'):
            with h5py.File(idx2id_path, 'r') as in_:
                for key, g in in_.items():
                    offset = str(g.attrs['offset'])
                    assert key == offset
                    group = out.create_group(offset)
                    group.create_dataset('doc', data=in_[key]['doc'])
                    group.create_dataset('word', data=in_[key]['word'])

    print('loading invlists')
    ivfs = []
    for index_path in tqdm(index_paths, desc='loading invlists'):
        # the IO_FLAG_MMAP is to avoid actually loading the data thus
        # the total size of the inverted lists can exceed the
        # available RAM
        index = faiss.read_index(index_path, faiss.IO_FLAG_MMAP)
        ivfs.append(index.invlists)

        # avoid that the invlists get deallocated with the index
        index.own_invlists = False

    # construct the output index
    index = faiss.read_index(trained_index_path)

    # prepare the output inverted lists; they will be written
    # to merged_index.ivfdata
    invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size,
                                         target_inv_path)

    # merge all the inverted lists
    print('merging')
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in tqdm(ivfs):
        ivf_vector.push_back(ivf)

    print("merge %d inverted lists " % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
    print(ntotal)

    # now replace the inverted lists in the output index
    index.ntotal = ntotal
    index.replace_invlists(invlists)

    print('writing index')
    faiss.write_index(index, target_index_path)
def setup_faiss(self, args):
    if not args.dstore_filename:
        raise ValueError('Cannot build a datastore without the data.')

    start = time.time()
    index = faiss.read_index(args.indexfile, faiss.IO_FLAG_ONDISK_SAME_DIR)
    print('Reading datastore took {} s'.format(time.time() - start))
    index.nprobe = args.probe

    # dtypes must match the on-disk datastore format
    key_dtype = np.float16 if args.dstore_fp16 else np.float32
    val_dtype = np.int16 if args.dstore_fp16 else np.int64
    print('Keys are fp16 and vals are int16' if args.dstore_fp16
          else 'Keys are fp32 and vals are int64')

    # Load the keys even if they are not being used.
    # if not args.no_load_keys:
    self.keys = np.memmap(args.dstore_filename + '_keys.npy',
                          dtype=key_dtype, mode='r',
                          shape=(self.dstore_size, self.dimension))
    self.vals = np.memmap(args.dstore_filename + '_vals.npy',
                          dtype=val_dtype, mode='r',
                          shape=(self.dstore_size, 1))

    # If you wish to load all the keys into memory
    # CAUTION: Only do this if your RAM can handle it!
    if args.move_dstore_to_mem:
        print('Loading to memory...')
        start = time.time()

        if not args.no_load_keys:
            del self.keys
            self.keys_from_memmap = np.memmap(
                args.dstore_filename + '_keys.npy',
                dtype=key_dtype, mode='r',
                shape=(self.dstore_size, self.dimension))
            self.keys = np.array(self.keys_from_memmap, dtype=key_dtype)

        del self.vals
        self.vals_from_memmap = np.memmap(
            args.dstore_filename + '_vals.npy',
            dtype=val_dtype, mode='r',
            shape=(self.dstore_size, 1))
        self.vals = np.array(self.vals_from_memmap, dtype=val_dtype)
        print('Loading to memory took {} s'.format(time.time() - start))

    return index
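# A hedged, self-contained sketch of the key/value lookup pattern such a
# datastore supports; the sizes and data are synthetic placeholders:
import faiss
import numpy as np

dimension, dstore_size = 64, 1000
keys = np.random.rand(dstore_size, dimension).astype('float32')
vals = np.random.randint(0, 10000, size=(dstore_size, 1), dtype=np.int64)

index = faiss.IndexFlatL2(dimension)
index.add(keys)

queries = np.random.rand(8, dimension).astype('float32')
dists, knns = index.search(queries, 16)        # neighbor ids into the datastore
token_ids = vals[knns.ravel()].reshape(8, 16)  # map neighbors to stored values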
origins = [
    "http://clipis.co",
    "https://clipis.co",
    "http://*****:*****",
]


@app.post("/encode/image", summary="Index one image")
async def encode_image(data: Image):
    print(data)
    img_name = data.name
    data = data.base64img.encode('ascii')
    index_one_image(data, img_name, model, preprocess, device)
    return Response("Successful", status_code=200)


@app.post("/encode/images", summary="Index multiple images at once")
async def encode_images(data: ImageBatch):
    image_base64_list = data.img_list
######################################################

xt, xb, xq, gt = datasets.load_data(dataset=args.db,
                                    compute_gt=args.compute_gt)

nq, d = xq.shape
nb, d = xb.shape

######################################################
# Make index
######################################################

if os.path.exists(args.indexfile):
    print("reading", args.indexfile)
    index = faiss.read_index(args.indexfile)

    if isinstance(index, faiss.IndexPreTransform):
        index_hnsw = faiss.downcast_index(index.index)
        vec_transform = index.chain.at(0).apply_py
    else:
        index_hnsw = index
        vec_transform = lambda x: x

    hnsw = index_hnsw.hnsw
    hnsw_stats = faiss.cvar.hnsw_stats
else:
    print("build index, key=", args.indexkey)
if stage == 0:
    # train the index
    xt = fvecs_read("sift1M/sift_learn.fvecs")
    index = faiss.index_factory(xt.shape[1], "IVF4096,Flat")
    print("training index")
    index.train(xt)
    print("write " + tmpdir + "trained.index")
    faiss.write_index(index, tmpdir + "trained.index")

if 1 <= stage <= 4:
    # add 1/4 of the database to 4 independent indexes
    bno = stage - 1
    xb = fvecs_read("sift1M/sift_base.fvecs")
    i0, i1 = int(bno * xb.shape[0] / 4), int((bno + 1) * xb.shape[0] / 4)
    index = faiss.read_index(tmpdir + "trained.index")
    print("adding vectors %d:%d" % (i0, i1))
    index.add_with_ids(xb[i0:i1], np.arange(i0, i1))
    print("write " + tmpdir + "block_%d.index" % bno)
    faiss.write_index(index, tmpdir + "block_%d.index" % bno)

machine_ports = [
    ('localhost', 12010),
    ('localhost', 12011),
    ('localhost', 12012),
    ('localhost', 12013),
]
v6 = False

if 5 <= stage <= 8:
    # load an index slice and launch index
import face_recognition
import faiss
import numpy as np
from pathlib import Path
from milvus import Milvus, IndexType, MetricType, Status

from face_recognition_app.models.images import Images
from face_recognition_app import app

index = faiss.read_index(app.config['INDEXES_LOCATION'] + 'faiss.index')
embeddings = np.load(app.config['EMBEDDINGS_LOCATION'] + 'embeddings.npy',
                     allow_pickle=True).tolist()
milvus = Milvus(host=app.config['MILVUS_HOST'], port=app.config['MILVUS_PORT'])
MAX_RESULT = 64


class Embeddings():

    @staticmethod
    def is_similar(current_image_encoded, unknown_face_encodings):
        # compare the known encoding against the unknown faces and
        # report whether any of them matches
        result = face_recognition.compare_faces(unknown_face_encodings,
                                                current_image_encoded,
                                                tolerance=0.6)
        return bool(result[0])

    @staticmethod
    def distance(current_image_encoded, unknown_face_encodings):
def build(self, config):
    '''build index from scratch'''
    operation_method = config.get("index_operation", "new").lower()

    gallery_images, gallery_docs = split_datafile(
        config['data_file'], config['image_root'], config['delimiter'])

    # when removing data from the index, there is no need to extract features
    if operation_method != "remove":
        gallery_features = self._extract_features(gallery_images, config)
    assert operation_method in [
        "new", "remove", "append"
    ], "Only append, remove and new operation are supported"

    # vector.index: faiss index file
    # id_map.pkl: use this file to map id to image_doc
    if operation_method in ["remove", "append"]:
        # if remove or append, vector.index and id_map.pkl must exist
        assert os.path.exists(
            os.path.join(config["index_dir"], "vector.index")
        ), "The vector.index does not exist in {} when 'index_operation' is not None".format(
            config["index_dir"])
        assert os.path.exists(
            os.path.join(config["index_dir"], "id_map.pkl")
        ), "The id_map.pkl does not exist in {} when 'index_operation' is not None".format(
            config["index_dir"])
        index = faiss.read_index(
            os.path.join(config["index_dir"], "vector.index"))
        with open(os.path.join(config["index_dir"], "id_map.pkl"),
                  'rb') as fd:
            ids = pickle.load(fd)
        assert index.ntotal == len(ids.keys(
        )), "data number in index is not equal to the number in id_map"
    else:
        if not os.path.exists(config["index_dir"]):
            os.makedirs(config["index_dir"], exist_ok=True)
        index_method = config.get("index_method", "HNSW32")

        # if IVF method, calculate the ivf list count automatically
        if index_method == "IVF":
            index_method = index_method + str(
                min(int(len(gallery_images) // 8), 65536)) + ",Flat"

        # for binary index, add B at the head of index_method
        if config["dist_type"] == "hamming":
            index_method = "B" + index_method

        # dist_type
        dist_type = faiss.METRIC_INNER_PRODUCT if config[
            "dist_type"] == "IP" else faiss.METRIC_L2

        # build index
        if config["dist_type"] == "hamming":
            index = faiss.index_binary_factory(config["embedding_size"],
                                               index_method)
        else:
            index = faiss.index_factory(config["embedding_size"],
                                        index_method, dist_type)
            index = faiss.IndexIDMap2(index)
        ids = {}

    if config["index_method"] == "HNSW32":
        logger.warning(
            "The HNSW32 method does not support the 'remove' operation")

    if operation_method != "remove":
        # calculate ids for the new data
        start_id = max(ids.keys()) + 1 if ids else 0
        ids_now = (
            np.arange(0, len(gallery_images)) + start_id).astype(np.int64)

        # only train when creating a new index file
        if operation_method == "new":
            if config["dist_type"] == "hamming":
                index.add(gallery_features)
            else:
                index.train(gallery_features)

        if not config["dist_type"] == "hamming":
            index.add_with_ids(gallery_features, ids_now)

        for i, d in zip(list(ids_now), gallery_docs):
            ids[i] = d
    else:
        if config["index_method"] == "HNSW32":
            raise RuntimeError(
                "The index_method: HNSW32 does not support the 'remove' operation"
            )
        # remove ids from id_map, and remove the index data from the faiss index
        remove_ids = list(
            filter(lambda k: ids.get(k) in gallery_docs, ids.keys()))
        remove_ids = np.asarray(remove_ids)
        index.remove_ids(remove_ids)
        for k in remove_ids:
            del ids[k]

    # store the faiss index file and the id_map file
    if config["dist_type"] == "hamming":
        faiss.write_index_binary(
            index, os.path.join(config["index_dir"], "vector.index"))
    else:
        faiss.write_index(
            index, os.path.join(config["index_dir"], "vector.index"))

    with open(os.path.join(config["index_dir"], "id_map.pkl"), 'wb') as fd:
        pickle.dump(ids, fd)
def __init__(
    self,
    model: str = "paraphrase-multilingual-MiniLM-L12-v2",
    idx_path: str = os.path.join(
        os.path.expanduser('~'),
        ".dialobot",
        "intent/",
    ),
    idx_file: str = "intent.idx",
    dataset_file: str = "dataset.pkl",
    fallback_threshold: float = 0.6,
    topk: int = 5,
    labeling_count: int = 20,
    device="cpu",
) -> None:
    """
    IntentRetriever using USE and faiss.
    Dialobot performs fallback checking through vector retrieval.

    Args:
        model (str): model name for sentence-transformers
        idx_path (str): path to save the dataset
        idx_file (str): file name of the trained faiss index
        dataset_file (str): file name of the dataset
        fallback_threshold (float): threshold for fallback checking
        topk (int): number of distances to return
        labeling_count (int): minimum labeling count

    References:
        Universal Sentence Encoder (Cer et al., 2018)
        https://arxiv.org/abs/1803.11175
        Billion-scale similarity search with GPUs (Johnson et al., 2017)
        https://arxiv.org/abs/1702.08734

    Notes:
        If the number of data points is smaller than labeling_count,
        it is classified with as many labels as there are data points;
        otherwise it is classified with int(number of data points / topk)
        labels.

    Examples:
        >>> # 1. create retriever
        >>> retriever = IntentRetriever()
        >>> # 2. add data, batch data
        >>> retriever.add(("What time is it now?", "time"))
        >>> retriever.add(("Tell me today's weather", "weather"))
        >>> retriever.add([("What time do we meet tomorrow?", "time"),
        ...                ("How will the weather be tomorrow?", "weather")])
        >>> # 3. remove data
        >>> retriever.remove(("What time is it now?", "time"))
        >>> # 4. recognize intent
        >>> retriever.recognize("Tell me tomorrow's weather")
        'weather'
        >>> # 5. set param `detail` to `True` if you want more information
        >>> retriever.recognize("Tell me tomorrow's weather", detail=True)
        {'intent': 'weather', 'scores': {'weather': 0.98, 'greeting': 0.69, ...}}
    """

    assert model in self.available_models(), \
        "param `retriever_model` must be one of {}".format(
            str(list(self.available_models())))

    self.device = device
    self.model = SentenceTransformer(model).to(self.device)
    self.dim = RETRIEVER_MODELS_DIMENSION[model]
    self.topk = topk
    self.labeling_count = labeling_count
    self.quantizer = faiss.IndexFlatIP(self.dim)
    self.idx_path = idx_path
    self.idx_file = idx_file
    self.dataset_file = dataset_file
    self.fallback_threshold = fallback_threshold

    if os.path.exists(idx_path + idx_file):
        self.index = faiss.read_index(idx_path + idx_file)
        self.nlist = int(self.index.ntotal / self.topk)
    else:
        self.nlist = 1
        self.index = faiss.IndexIVFFlat(
            self.quantizer,
            self.dim,
            self.nlist,
            faiss.METRIC_INNER_PRODUCT,
        )

    if os.path.exists(idx_path + dataset_file):
        with open(idx_path + dataset_file, mode="rb") as f:
            self.dataset: List[Tuple[str, np.ndarray, str]] = pickle.load(f)
    else:
        os.makedirs(idx_path, exist_ok=True)
        self.dataset: List[Tuple[str, np.ndarray, str]] = []
def __init__(self, path, nprobe, k=1024):
    index = faiss.read_index(path)
    index.nprobe = nprobe
    self.index = index
    self.k = k
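# Hedged usage sketch for the wrapper above; the class name `Searcher` and
# the index path are placeholders for whatever the enclosing code defines:
import numpy as np

searcher = Searcher("/path/to/ivf.index", nprobe=32, k=10)
xq = np.random.rand(4, searcher.index.d).astype('float32')
D, I = searcher.index.search(xq, searcher.k)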
import os.path

import faiss
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import umap
from sklearn.datasets import fetch_openml

mnist = fetch_openml("mnist_784", version=1)

sns.set(context="paper", style="white")

index = faiss.IndexHNSWSQ(784, faiss.ScalarQuantizer.QT_8bit, 32)
index.verbose = True
faiss_index_file = 'faiss.index'
if os.path.exists(faiss_index_file):
    print('load existing index from %s' % faiss_index_file)
    index = faiss.read_index(faiss_index_file, faiss.IO_FLAG_MMAP)
    index.hnsw.efSearch = 256
else:
    # build lossy faiss index
    print('build new index and save to %s' % faiss_index_file)
    index.hnsw.efConstruction = 40
    data = np.ascontiguousarray(mnist.data, dtype=np.float32)
    # we no longer need mnist data in its original form
    print('train index...')
    index.train(data)
    print('add vectors to index...')
    index.add(data)
    print('save...')
    faiss.write_index(index, faiss_index_file)

reducer = umap.UMAP(random_state=42, init="random", verbose=True,
                    n_epochs=200)
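# The (possibly mmapped) index can then be queried as usual; a short sketch
# with an arbitrary k:
queries = np.ascontiguousarray(mnist.data[:10], dtype=np.float32)
D, I = index.search(queries, 5)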
def run_index(args):
    dump_names = os.listdir(os.path.join(args.dump_dir, args.phrase_dir))
    dump_paths = sorted([
        os.path.join(args.dump_dir, args.phrase_dir, name)
        for name in dump_names if name.endswith('.hdf5')
    ])

    start_data = None  # so the 'fine' stage can tell whether sampling already ran
    if args.stage in ['all', 'coarse']:
        if args.replace or not os.path.exists(args.quantizer_path):
            if not os.path.exists(args.index_dir):
                os.makedirs(args.index_dir)
            start_data, avg_vec, std_vec = sample_data(
                dump_paths,
                doc_sample_ratio=args.doc_sample_ratio,
                vec_sample_ratio=args.vec_sample_ratio,
                norm_th=args.norm_th)
            with open(os.path.join(args.index_dir, 'avg_vec.pkl'), 'wb') as fp:
                pickle.dump(avg_vec, fp)
            with open(os.path.join(args.index_dir, 'std_vec.pkl'), 'wb') as fp:
                pickle.dump(std_vec, fp)

    if args.stage in ['all', 'fine']:
        if args.replace or not os.path.exists(args.trained_index_path):
            if start_data is None:
                start_data, avg_vec, std_vec = sample_data(
                    dump_paths,
                    doc_sample_ratio=args.doc_sample_ratio,
                    vec_sample_ratio=args.vec_sample_ratio,
                    norm_th=args.norm_th,
                    hnsw=args.hnsw)
            train_index(start_data, args.quantizer_path,
                        args.trained_index_path, args.num_clusters,
                        fine_quant=args.fine_quant, cuda=args.cuda,
                        hnsw=args.hnsw)

    if args.stage in ['all', 'add']:
        if args.replace or not os.path.exists(args.index_path):
            avg_vec = None
            std_vec = None
            # with open(os.path.join(args.index_dir, 'avg_vec.pkl'), 'rb') as fp:
            #     avg_vec = pickle.load(fp)
            # with open(os.path.join(args.index_dir, 'std_vec.pkl'), 'rb') as fp:
            #     std_vec = pickle.load(fp)
            if args.dump_paths is not None:
                dump_paths = args.dump_paths
                if not os.path.exists(args.subindex_dir):
                    os.makedirs(args.subindex_dir)
            add_to_index(
                dump_paths,
                args.trained_index_path,
                args.index_path,
                args.idx2id_path,
                cuda=args.cuda,
                num_docs_per_add=args.num_docs_per_add,
                offset=args.offset,
                norm_th=args.norm_th,
                fine_quant=args.fine_quant,
                avg_vec=avg_vec,
                std_vec=std_vec,
                first_passage=args.first_passage,
                index_filter=args.index_filter,
            )

    if args.stage == 'merge':
        if args.replace or not os.path.exists(args.index_path):
            merge_indexes(args.subindex_dir, args.trained_index_path,
                          args.index_path, args.idx2id_path, args.inv_path)

    if args.stage == 'move':
        index = faiss.read_index(args.trained_index_path)
        invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size,
                                             args.inv_path)
        index.replace_invlists(invlists)
        faiss.write_index(index, args.index_path)
def __init__(self, phrase_dump_dir, tfidf_dump_dir, start_index_path,
             idx2id_path, max_norm_path, doc_rank_fn, cuda=False,
             dump_only=False):

    # If dump dir is a file, use it as a dump.
    if os.path.isdir(phrase_dump_dir):
        self.phrase_dump_paths = sorted([
            os.path.join(phrase_dump_dir, name)
            for name in os.listdir(phrase_dump_dir) if 'hdf5' in name
        ])
        dump_names = [
            os.path.splitext(os.path.basename(path))[0]
            for path in self.phrase_dump_paths
        ]
        self.dump_ranges = [
            list(map(int, name.split('-'))) for name in dump_names
        ]
    else:
        self.phrase_dump_paths = [phrase_dump_dir]
    self.phrase_dumps = [
        h5py.File(path, 'r') for path in self.phrase_dump_paths
    ]

    # Load tfidf dump
    assert os.path.isdir(tfidf_dump_dir), tfidf_dump_dir
    self.tfidf_dump_paths = sorted([
        os.path.join(tfidf_dump_dir, name)
        for name in os.listdir(tfidf_dump_dir) if 'hdf5' in name
    ])
    tfidf_dump_names = [
        os.path.splitext(os.path.basename(path))[0]
        for path in self.tfidf_dump_paths
    ]
    if '-' in tfidf_dump_names[0]:  # Range check
        tfidf_dump_ranges = [
            list(map(int, name.split('_')[0].split('-')))
            for name in tfidf_dump_names
        ]
        assert tfidf_dump_ranges == self.dump_ranges
    self.tfidf_dumps = [
        h5py.File(path, 'r') for path in self.tfidf_dump_paths
    ]
    logger.info(f'using doc ranker functions: {doc_rank_fn["index"]}')
    self.doc_rank_fn = doc_rank_fn
    if dump_only:
        return

    # Read index
    logger.info(f'Reading {start_index_path}')
    self.start_index = faiss.read_index(start_index_path,
                                        faiss.IO_FLAG_ONDISK_SAME_DIR)
    self.idx_f = self.load_idx_f(idx2id_path)
    with open(max_norm_path, 'r') as fp:
        self.max_norm = json.load(fp)

    # Options
    self.num_docs_list = []
    self.cuda = cuda
    if self.cuda:
        assert torch.cuda.is_available(), \
            f"Cuda availability {torch.cuda.is_available()}"
        self.device = torch.device('cuda')
    else:
        self.device = torch.device("cpu")
data = np.array(data).astype('float32')
printTime()

log_instance.info('normalize_L2')
normalize_L2(data)

log_instance.info("start upload data faiss!")
printTime()
# index = faiss.IndexFlatIP(data_dimension)
# index.train(data)
# log_instance.info(index.is_trained)
# index.add(data)
print("loading", "jiadian_populated.index")
index = faiss.read_index("jiadian_populated.index")
log_instance.info(index.ntotal)
log_instance.info("end upload data faiss!")
printTime()

query = np.array(search_lst).astype('float32')
normalize_L2(query)
topN = 3
log_instance.info('start query!')
dis, ind = index.search(query, topN)
printTime()
log_instance.info('end query!')
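# Note: because both the stored vectors and the queries are L2-normalized,
# inner-product scores from this kind of index are cosine similarities.
# A compact, self-contained restatement of the query pattern (the data is a
# placeholder and the index is assumed to use METRIC_INNER_PRODUCT):
import faiss
import numpy as np

xb = np.random.rand(1000, 128).astype('float32')
faiss.normalize_L2(xb)
ip_index = faiss.IndexFlatIP(128)
ip_index.add(xb)

query = np.random.rand(5, 128).astype('float32')
faiss.normalize_L2(query)
dis, ind = ip_index.search(query, 3)  # dis holds cosine similarities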
p.add_argument('--port', type=int, default=8080, help='Port to listen on')
p.add_argument('--index', default=[], required=True, action="append",
               help="Location of FAISS index file.")
p.add_argument(
    '--map', type=str, default='',
    help='Location of file mapping index IDs to other IDs as needed.')
args = p.parse_args()

# Import the index.
# Just taking the first element for this example, but the index flag now
# supports multiple index files. Use them as you need.
idx = faiss.read_index(args.index[0])

# Create the ID map
id_map = {}
if args.map:
    with open(args.map, 'r') as f:
        for line in f:
            line = line.strip()
            if line.startswith('#'):
                continue
            k, v = line.split(':')
            id_map[int(k.strip())] = v.strip()

idxsvc = IndexSvc(__name__, host="::", port=args.port)
idxsvc.RegisterQuery(idx)
idxsvc.set_map(id_map)
idxsvc.run()
def loadIndex(filename):
    return faiss.read_index(filename)
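# A matching save helper for symmetry; a trivial sketch, and the name
# `saveIndex` is an assumption rather than part of the original module:
def saveIndex(index, filename):
    faiss.write_index(index, filename)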
if stage == 0:
    # train the index
    xt = fvecs_read("sift1M/sift_learn.fvecs")
    index = faiss.index_factory(xt.shape[1], "IVF4096,Flat")
    print("training index")
    index.train(xt)
    print("write " + tmpdir + "trained.index")
    faiss.write_index(index, tmpdir + "trained.index")

if 1 <= stage <= 4:
    # add 1/4 of the database to 4 independent indexes
    bno = stage - 1
    xb = fvecs_read("sift1M/sift_base.fvecs")
    i0, i1 = int(bno * xb.shape[0] / 4), int((bno + 1) * xb.shape[0] / 4)
    index = faiss.read_index(tmpdir + "trained.index")
    print("adding vectors %d:%d" % (i0, i1))
    index.add(xb[i0:i1])
    print("write " + tmpdir + "block_%d.index" % bno)
    faiss.write_index(index, tmpdir + "block_%d.index" % bno)

if stage == 5:
    # merge the images into an on-disk index
    # first load the inverted lists
    ivfs = []
    for bno in range(4):
        # the IO_FLAG_MMAP is to avoid actually loading the data thus
        # the total size of the inverted lists can exceed the
        # available RAM
        print("read " + tmpdir + "block_%d.index" % bno)
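# The stage-5 body is truncated above; it follows the same on-disk merge
# pattern shown in merge_indexes earlier. A hedged sketch of how such a
# merge typically completes (assumes the block files and trained.index
# exist in tmpdir):
#
#         index = faiss.read_index(tmpdir + "block_%d.index" % bno,
#                                  faiss.IO_FLAG_MMAP)
#         ivfs.append(index.invlists)
#         index.own_invlists = False  # keep the invlists alive past `index`
#
#     index = faiss.read_index(tmpdir + "trained.index")
#     invlists = faiss.OnDiskInvertedLists(
#         index.nlist, index.code_size, tmpdir + "merged_index.ivfdata")
#     ivf_vector = faiss.InvertedListsPtrVector()
#     for ivf in ivfs:
#         ivf_vector.push_back(ivf)
#     index.ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
#     index.replace_invlists(invlists)
#     faiss.write_index(index, tmpdir + "populated.index")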
def pdd_research_idx(input_idx_file, output_test_result, query_num,
                     sku2vector_dir, jd_check_file):
    data = []
    sku_lst = []
    error_sku = []
    idx = 0

    log_instance.info("start upload data sku part!")
    printTime()

    f_name_lst = []
    for root, dirs, files in os.walk(sku2vector_dir):
        for f_name in files:
            if not f_name.startswith('part_'):
                continue
            f_name = os.path.join(root, f_name)
            f_name_lst.append(f_name)
    f_name_lst = sorted(f_name_lst)

    for f_name in f_name_lst:
        with open(f_name) as f1:
            for line in f1:
                line = line.strip()
                if line == '':
                    continue
                lst1 = line.split('\t')
                if len(lst1) != 2:
                    continue
                sku, vec_str = lst1
                lst2 = vec_str.split("|")
                lst2 = [float(d1) for d1 in lst2]
                data.append(lst2)
                sku_lst.append(sku)
                if idx % 5000 == 0:
                    log_instance.info("idx: %s" % idx)
                idx += 1

    log_instance.info("end upload data sku part!")
    printTime()
    log_instance.info(len(error_sku))
    log_instance.info(len(data))

    search_lst = data[:query_num]
    query = np.array(search_lst).astype('float32')
    normalize_L2(query)
    topN = 3

    log_instance.info('loading index from %s!' % (input_idx_file))
    printTime()
    index = faiss.read_index(input_idx_file)
    log_instance.info('ended loading index from %s!' % (input_idx_file))
    printTime()

    log_instance.info('start query!')
    faiss.omp_set_num_threads(10)
    dis, ind = index.search(query, topN)
    printTime()
    log_instance.info('end query!')

    def result_dealing(dis, ind, sku_lst, result_warehouse_list):
        score_lst = dis.tolist()
        idx_lst = ind.tolist()
        r_lst = []
        for j in range(len(score_lst)):
            s1, s2, s3 = score_lst[j]
            i1, i2, i3 = idx_lst[j]
            sku1_pdd = sku_lst[j]
            # if sku1 not in sku_name_dict:
            #     n1 = "unk"
            #     log_instance.info(sku1)
            # else:
            #     n1 = sku_name_dict[sku1]
            # if sku2 not in sku_name_dict:
            #     n2 = "unk"
            #     log_instance.info(sku2)
            # else:
            #     n2 = sku_name_dict[sku2]
            # if sku3 not in sku_name_dict:
            #     n3 = "unk"
            #     log_instance.info(sku3)
            # else:
            #     n3 = sku_name_dict[sku3]
            r_lst.append(
                "%s\t%s:%s\t%s:%s\t%s:%s" %
                (sku1_pdd, result_warehouse_list[i1], s1,
                 result_warehouse_list[i2], s2,
                 result_warehouse_list[i3], s3))
        with open(output_test_result, "w") as f2:
            f2.write("\n".join(r_lst))
            f2.flush()

    with open(jd_check_file, "r", encoding="utf-8") as f3:
        result_warehouse_list = json.load(f3)

    result_dealing(dis, ind, sku_lst, result_warehouse_list)
    log_instance.info("finished!")
def initialize(self):
    self.index = faiss.read_index(self.faiss_filepath)
def add_to_index(dump_paths, trained_index_path, target_index_path,
                 idx2id_path, num_docs_per_add=1000, cuda=False,
                 fine_quant='SQ4', offset=0, norm_th=999, ignore_ids=None,
                 avg_vec=None, std_vec=None, first_passage=False,
                 index_filter=-1e8):
    sidx2doc_id = []
    sidx2word_id = []
    dumps = [h5py.File(dump_path, 'r') for dump_path in dump_paths]

    # filter dumps
    if index_filter != -1e8:
        f_dumps = [
            h5py.File(dump_path.replace('/phrase/', '/filter/'), 'r')
            for dump_path in dump_paths
        ]

    print('reading %s' % trained_index_path)
    start_index = faiss.read_index(trained_index_path)
    if 'none' not in fine_quant:
        index_ivf = faiss.extract_index_ivf(start_index)
        index_ivf.make_direct_map()
        index_ivf.set_direct_map_type(faiss.DirectMap.Hashtable)

    if cuda:
        if 'PQ' in fine_quant:
            index_ivf = faiss.extract_index_ivf(start_index)
            quantizer = index_ivf.quantizer
            quantizer_gpu = faiss.index_cpu_to_all_gpus(quantizer)
            index_ivf.quantizer = quantizer_gpu
        else:
            res = faiss.StandardGpuResources()
            co = faiss.GpuClonerOptions()
            co.useFloat16 = True
            start_index = faiss.index_cpu_to_gpu(res, 0, start_index, co)

    print('adding following dumps:')
    for dump_path in dump_paths:
        print(dump_path)

    start_total = 0
    start_total_prev = 0
    cnt = 0
    for di, phrase_dump in enumerate(tqdm(dumps, desc='dumps')):
        starts = []
        start_valids = []
        dump_length = len(phrase_dump)
        for i, (doc_idx, doc_group) in enumerate(
                tqdm(phrase_dump.items(), desc='adding %d' % di)):
            if ignore_ids is not None and doc_idx in ignore_ids:
                continue
            num_start = doc_group['start'].shape[0]
            if num_start == 0:
                continue
            cnt += 1

            # First passage only
            if first_passage:
                f2o_start = doc_group['f2o_start'][:]
                cut = sum(f2o_start < doc_group['len_per_para'][0])
                start = int8_to_float(doc_group['start'][:cut],
                                      doc_group.attrs['offset'],
                                      doc_group.attrs['scale'])
                num_start = start.shape[0]
            # Apply index filter
            elif index_filter != -1e8:
                o2f_start = {
                    orig: ft
                    for ft, orig in enumerate(doc_group['f2o_start'][:])
                }
                filter_start = f_dumps[di][doc_idx]['filter_start'][:]
                filter_end = f_dumps[di][doc_idx]['filter_end'][:]
                start_idxs, = np.where(filter_start > index_filter)
                end_idxs, = np.where(filter_end > index_filter)
                save_idx = set(np.concatenate([start_idxs, end_idxs]))
                save_idx = sorted(
                    [o2f_start[si] for si in save_idx if si in o2f_start])
                start = int8_to_float(doc_group['start'][save_idx],
                                      doc_group.attrs['offset'],
                                      doc_group.attrs['scale'])
                num_start = start.shape[0]
            else:
                start = int8_to_float(doc_group['start'][:],
                                      doc_group.attrs['offset'],
                                      doc_group.attrs['scale'])
            start_valid = np.linalg.norm(start, axis=1) <= norm_th

            starts.append(start)
            start_valids.append(start_valid)
            sidx2doc_id.extend([int(doc_idx)] * num_start)
            if index_filter == -1e8:
                sidx2word_id.extend(range(num_start))
            else:
                sidx2word_id.extend(save_idx)
            start_total += num_start

            if len(starts) > 0 and ((i % num_docs_per_add == 0) or
                                    (i == dump_length - 1)):
                print('adding at %d' % (i + 1))
                add_with_offset(
                    start_index,
                    concat_vectors(starts),
                    concat_vectors(start_valids),
                    start_total_prev,
                    offset,
                    fine_quant,
                )
                start_total_prev = start_total
                starts = []
                start_valids = []
        if len(starts) > 0:
            print('final adding at %d' % (i + 1))
            add_with_offset(
                start_index,
                concat_vectors(starts),
                concat_vectors(start_valids),
                start_total_prev,
                offset,
                fine_quant,
            )
            start_total_prev = start_total
    print('number of docs', cnt)

    for dump in dumps:
        dump.close()

    if cuda:
        print('moving back to cpu')
        if 'PQ' in fine_quant:
            index_ivf.quantizer = quantizer
            del quantizer_gpu
        else:
            start_index = faiss.index_gpu_to_cpu(start_index)
    print('start_index ntotal: %d' % start_index.ntotal)
    print(start_total)
    sidx2doc_id = np.array(sidx2doc_id, dtype=np.int32)
    sidx2word_id = np.array(sidx2word_id, dtype=np.int32)

    print('writing index and metadata')
    with h5py.File(idx2id_path, 'w') as f:
        g = f.create_group(str(offset))
        g.create_dataset('doc', data=sidx2doc_id)
        g.create_dataset('word', data=sidx2word_id)
        g.attrs['offset'] = offset

    faiss.write_index(start_index, target_index_path)
    print('done')
def load_index(self, index_dir: str):
    index_path = os.path.join(index_dir, 'index')
    docid_path = os.path.join(index_dir, 'docid')
    index = faiss.read_index(index_path)
    docids = self.load_docids(docid_path)
    return index, docids
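# Hedged usage sketch for the (index, docids) pair returned above; the
# `searcher` object, the directory, and the hit formatting are placeholders:
import numpy as np

index, docids = searcher.load_index("indexes/my-index")
q = np.random.rand(1, index.d).astype('float32')
scores, ids = index.search(q, 10)
hits = [(docids[i], s) for i, s in zip(ids[0], scores[0]) if i != -1]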
                                     compute_gt=args.compute_gt)

print("dataset sizes: train %s base %s query %s GT %s" % (
    xt.shape, xb.shape, xq.shape, gt.shape))

nq, d = xq.shape
nb, d = xb.shape

######################################################
# Make index
######################################################

if args.indexfile and os.path.exists(args.indexfile):
    print("reading", args.indexfile)
    index = faiss.read_index(args.indexfile)

    if isinstance(index, faiss.IndexPreTransform):
        index_ivf = faiss.downcast_index(index.index)
    else:
        index_ivf = index
    assert isinstance(index_ivf, faiss.IndexIVF)
    vec_transform = lambda x: x
else:
    print("build index, key=", args.indexkey)

    index = faiss.index_factory(d, args.indexkey)