def _post_process(self, dataset, resources_paths):
    if self.config.with_index:
        index_file = resources_paths["embeddings_index"]
        if os.path.exists(index_file):
            dataset.load_faiss_index("embeddings", index_file)
        else:
            if "embeddings" not in dataset.column_names:
                raise ValueError("Couldn't build the index because there are no embeddings.")
            import faiss

            train_size = self.config.index_train_size
            logging.info("Building wiki_dpr faiss index")
            if self.config.index_name == "exact":
                d = 768
                index = faiss.IndexHNSWFlat(d, 512, faiss.METRIC_INNER_PRODUCT)
                dataset.add_faiss_index("embeddings", custom_index=index)
            else:
                d = 768
                quantizer = faiss.IndexHNSWFlat(d, 32, faiss.METRIC_INNER_PRODUCT)
                ivf_index = faiss.IndexIVFPQ(quantizer, d, 4096, 64, 8, faiss.METRIC_INNER_PRODUCT)
                ivf_index.own_fields = True
                quantizer.this.disown()
                dataset.add_faiss_index(
                    "embeddings",
                    train_size=train_size,
                    custom_index=ivf_index,
                )
            logging.info("Saving wiki_dpr faiss index")
            dataset.save_faiss_index("embeddings", index_file)
    return dataset
def build_hnsw(self, to_file, ef=2000, m=64):
    '''
    @description: Train/build the HNSW index.
    @param {type} to_file: path to save the index to
    @return: the built faiss index
    '''
    logging.info('Building hnsw index.')
    vecs = np.stack(self.data['custom_vec'].values).reshape(-1, 300)
    vecs = vecs.astype('float32')
    dim = self.w2v_model.vector_size

    # Declaring index
    index = faiss.IndexHNSWFlat(dim, m)  # build the index
    # NOTE: the original code created faiss.StandardGpuResources() and called
    # faiss.index_cpu_to_gpu(res, 0, index) here, but discarded the return value;
    # faiss implements HNSW on the CPU only, so the index stays on the CPU.
    index.hnsw.efConstruction = ef
    print("add")
    index.verbose = True  # to see progress
    print('xb: ', vecs.shape)
    print('dtype: ', vecs.dtype)
    index.add(vecs)  # add vectors to the index
    print("total: ", index.ntotal)
    self.evaluate(vecs[:10000])
    faiss.write_index(index, to_file)
    return index
def fit(x_train):
    from benchmark_utils import timer

    D = x_train.shape[1]
    if metric in ["euclidean", "angular"]:
        index = faiss.IndexHNSWFlat(D, M)
        index.hnsw.efConstruction = 500
    else:
        raise NotImplementedError(f"The '{metric}' distance is not supported.")

    # Pre-processing:
    start = timer(use_torch=False)
    index.add(x_train)
    elapsed = timer(use_torch=False) - start

    # Return an operator for actual KNN queries:
    def f(x_test, efSearch=10):
        faiss.ParameterSpace().set_index_parameter(index, "efSearch", efSearch)
        start = timer(use_torch=False)
        distances, indices = index.search(x_test, K)
        elapsed = timer(use_torch=False) - start
        return indices, elapsed

    return f, elapsed
def train_coarse_quantizer(data, quantizer_path, num_clusters, hnsw=False, niter=10, cuda=False):
    d = data.shape[1]
    index_flat = faiss.IndexFlatL2(d)
    # make it into a gpu index
    if cuda:
        res = faiss.StandardGpuResources()
        index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)

    clus = faiss.Clustering(d, num_clusters)
    clus.verbose = True
    clus.niter = niter
    clus.train(data, index_flat)
    centroids = faiss.vector_float_to_array(clus.centroids)
    centroids = centroids.reshape(num_clusters, d)

    if hnsw:
        quantizer = faiss.IndexHNSWFlat(d, 32)
        quantizer.hnsw.efSearch = 128
        quantizer.train(centroids)
        quantizer.add(centroids)
    else:
        quantizer = faiss.IndexFlatL2(d)
        quantizer.add(centroids)

    faiss.write_index(quantizer, quantizer_path)
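# Hedged usage sketch for train_coarse_quantizer above: synthetic float32 data and a
# hypothetical output path (both illustrative assumptions, not from the original code).
# Reading the saved quantizer back gives nearest-centroid lookups.
import numpy as np
import faiss

data = np.random.rand(50_000, 128).astype("float32")
train_coarse_quantizer(data, "/tmp/quantizer.index", num_clusters=1024, hnsw=True)
quantizer = faiss.read_index("/tmp/quantizer.index")
D, I = quantizer.search(data[:5], 1)  # I[i] is the nearest centroid id for data[i]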
def _post_process(self, dataset, resources_paths):
    if self.config.with_index:
        index_file = resources_paths["embeddings_index"]
        if os.path.exists(index_file):
            dataset.load_faiss_index("embeddings", index_file)
        else:
            import faiss

            train_size = self.config.index_train_size
            logging.info("Building wiki_dpr faiss index")
            if self.config.dummy:
                dataset.add_faiss_index(
                    "embeddings",
                    string_factory="Flat",
                    metric_type=faiss.METRIC_INNER_PRODUCT,
                    train_size=train_size,
                )
            else:
                d = 768
                quantizer = faiss.IndexHNSWFlat(d, 32, faiss.METRIC_INNER_PRODUCT)
                ivf_index = faiss.IndexIVFPQ(quantizer, d, 4096, 64, 8, faiss.METRIC_INNER_PRODUCT)
                ivf_index.own_fields = True
                quantizer.this.disown()
                dataset.add_faiss_index(
                    "embeddings",
                    train_size=train_size,
                    faiss_verbose=logging.getLogger().level <= logging.DEBUG,
                    custom_index=ivf_index,
                )
            logging.info("Saving wiki_dpr faiss index")
            dataset.save_faiss_index("embeddings", index_file)
    return dataset
def init_index(self, vector_sz: int):
    # IndexHNSWFlat supports L2 similarity only, so we apply a DOT -> L2 similarity
    # space conversion with the help of an extra dimension.
    index = faiss.IndexHNSWFlat(vector_sz + 1, self.store_n)
    index.hnsw.efSearch = self.ef_search
    index.hnsw.efConstruction = self.ef_construction
    self.index = index
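# A minimal, self-contained sketch of the DOT -> L2 conversion referenced above
# (this helper is an illustration, not part of the original code): append
# sqrt(phi - ||v||^2) to every corpus vector, where phi is the max squared norm,
# and append 0 to every query. Then ||v' - q'||^2 = phi + ||q||^2 - 2 <v, q>,
# so minimizing L2 distance is equivalent to maximizing the inner product.
import numpy as np

def augment_for_l2(corpus: np.ndarray, queries: np.ndarray):
    norms = (corpus ** 2).sum(axis=1)
    phi = norms.max()
    aux = np.sqrt(phi - norms).reshape(-1, 1)
    corpus_aug = np.hstack([corpus, aux]).astype("float32")
    queries_aug = np.hstack([queries, np.zeros((len(queries), 1))]).astype("float32")
    return corpus_aug, queries_aug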
def add_faiss_index_to_dataset(dataset):
    import faiss

    index = faiss.IndexHNSWFlat(768, 128, faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)
    return dataset
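# Hedged usage sketch for add_faiss_index_to_dataset above: a tiny in-memory
# `datasets.Dataset` with random embeddings (all names and sizes are illustrative
# assumptions; real embeddings would come from a DPR context encoder).
import numpy as np
from datasets import Dataset

ds = Dataset.from_dict({
    "text": [f"passage {i}" for i in range(100)],
    "embeddings": np.random.rand(100, 768).astype("float32").tolist(),
})
ds = add_faiss_index_to_dataset(ds)
query = np.random.rand(768).astype("float32")
scores, examples = ds.get_nearest_examples("embeddings", query, k=5)
print(examples["text"])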
def save_faiss_index(self):
    try:
        xb, ids = [], []
        with open("./diary_cover_pic_face_vec.txt", "r") as diary_cover_pic_face_vec_fd:
            for line in diary_cover_pic_face_vec_fd:
                line_term_list = line.split("\t")
                diary_id = line_term_list[0]
                face_feature = json.loads(line_term_list[1])
                face_feature_vec = np.array(face_feature)
                xb.append(face_feature_vec)
                ids.append(diary_id)

        xb_np = np.array(xb).astype('float32')
        ids_np = np.array(ids).astype('int')

        # Wrap the HNSW index in an IDMap so vectors can be added with explicit ids.
        index = faiss.IndexHNSWFlat(128, 32)
        index = faiss.IndexIDMap(index)
        index.add_with_ids(xb_np, ids_np)
        faiss.write_index(index, settings.INDEX_PATH)
    except Exception:
        logging.error("catch exception, err_msg:%s" % traceback.format_exc())
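# Hedged search sketch for the id-mapped index written above (assumes the same
# `settings` module is importable): searching through the IndexIDMap wrapper
# returns the original diary ids rather than row positions.
import faiss
import numpy as np

index = faiss.read_index(settings.INDEX_PATH)
query = np.random.rand(1, 128).astype("float32")
D, I = index.search(query, 5)  # I contains diary ids, D the L2 distances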
def init_index(self):
    index = faiss.IndexHNSWFlat(self.vector_size + 1, 512)
    index.hnsw.efSearch = 128
    index.hnsw.efConstruction = 200
    self.index = index
    self._deserialize_index()
    self._index_initialized = True
def _post_process(self, dataset, resources_paths):
    if self.config.with_index:
        index_file = resources_paths["embeddings_index"]
        if os.path.exists(index_file):
            dataset.load_faiss_index("embeddings", index_file)
        else:
            if "embeddings" not in dataset.column_names:
                raise ValueError("Couldn't build the index because there are no embeddings.")
            import faiss

            d = 768
            train_size = self.config.index_train_size
            logger.info("Building wiki_dpr faiss index")
            if self.config.index_name == "exact":
                index = faiss.IndexHNSWSQ(d, faiss.ScalarQuantizer.QT_8bit, 128, faiss.METRIC_INNER_PRODUCT)
                index.hnsw.efConstruction = 200
                index.hnsw.efSearch = 128
                dataset.add_faiss_index("embeddings", custom_index=index, train_size=train_size)
            else:
                quantizer = faiss.IndexHNSWFlat(d, 128, faiss.METRIC_INNER_PRODUCT)
                quantizer.hnsw.efConstruction = 200
                quantizer.hnsw.efSearch = 128
                ivf_index = faiss.IndexIVFPQ(quantizer, d, 4096, 128, 8, faiss.METRIC_INNER_PRODUCT)
                ivf_index.nprobe = 64
                ivf_index.own_fields = True
                quantizer.this.disown()
                dataset.add_faiss_index(
                    "embeddings",
                    train_size=train_size,
                    custom_index=ivf_index,
                )
            logger.info("Saving wiki_dpr faiss index")
            dataset.save_faiss_index("embeddings", index_file)
    return dataset
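# A minimal, self-contained sketch (synthetic data, smaller hypothetical sizes) of
# the IVF-PQ-with-HNSW-coarse-quantizer pattern used in the snippets above, outside
# the `datasets` wrapper: train, add, then search while visiting nprobe lists.
import faiss
import numpy as np

d, nb, nlist = 768, 20_000, 256
xb = np.random.rand(nb, d).astype("float32")

quantizer = faiss.IndexHNSWFlat(d, 32, faiss.METRIC_INNER_PRODUCT)
ivf_index = faiss.IndexIVFPQ(quantizer, d, nlist, 64, 8, faiss.METRIC_INNER_PRODUCT)
ivf_index.own_fields = True   # the IVF index takes ownership of the quantizer ...
quantizer.this.disown()       # ... so the Python wrapper must give up its ownership

ivf_index.train(xb)           # trains the coarse centroids and the PQ codebooks
ivf_index.add(xb)
ivf_index.nprobe = 64         # number of inverted lists visited per query
D, I = ivf_index.search(xb[:5], 10)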
def __init__(self, image, target_mask, source_mask,
             patch_size=(11, 11), patch_weight=None, num_neighbors=1):
    im_h, im_w, im_ch = image.shape

    if patch_weight is None:
        self.patch_weight = np.ones(patch_size, dtype=_im_dtype)
    else:
        self.patch_weight = patch_weight
    self.patch_size = patch_size
    self.num_neighb = num_neighbors

    print("Build NNF index: ", end=" ")
    start = time.time()

    if _NN_algorithm != "PatchMatch":
        self.source_ind = op.masked_indices(source_mask)
        self.target_ind = op.masked_indices(target_mask)

        # convert array indices to patch indices
        pad = patch_size[0] // 2
        ind_y, ind_x = np.divmod(self.source_ind, im_w)
        self.source_ind = (ind_x - pad) + (ind_y - pad) * (im_w - 2 * pad)

        source_point_cloud = extract_patches_2d(image, patch_size=patch_size)[self.source_ind] \
            .reshape((self.source_ind.size, -1)) \
            * np.repeat(np.sqrt(self.patch_weight), im_ch)

        # need this because of FLANN bug (?) with memory release
        self.target_point_cloud = np.zeros(
            (self.target_ind.size, source_point_cloud.shape[-1]), dtype=_im_dtype)

    if _NN_algorithm == "FLANN":
        self.nn = flann.FLANN()
        self.nn.build_index(source_point_cloud, algorithm="kdtree", trees=1)  # , log_level="info")
    elif _NN_algorithm == "Sklearn":
        self.nn = NearestNeighbors(
            n_neighbors=num_neighbors, algorithm='kd_tree',
            metric='minkowski', n_jobs=-1)  # , metric_params={'w': self.patch_weight})
        self.nn.fit(X=source_point_cloud)
    elif _NN_algorithm == "FAISS":
        self.nn = faiss.IndexHNSWFlat(source_point_cloud.shape[1], 50)
        self.nn.add(source_point_cloud)

    if _NN_algorithm == "PatchMatch":
        self.nn = pm.PatchMatch(target_mask, source_mask, patch_size=patch_size,
                                lambdas=np.ones_like(image, dtype=_im_dtype))

    print('%f sec' % (time.time() - start))
def fit(self, X):
    self.index = faiss.IndexHNSWFlat(len(X[0]), self.method_param["M"])
    self.index.hnsw.efConstruction = self.method_param["efConstruction"]
    self.index.verbose = True

    if self.metric == 'angular':
        X = X / np.linalg.norm(X, axis=1)[:, np.newaxis]

    self.index.add(X)
    faiss.omp_set_num_threads(1)
def test_size_t_ptr(self):
    # issue 1064
    index = faiss.IndexHNSWFlat(10, 32)
    hnsw = index.hnsw
    index.add(np.random.rand(100, 10).astype('float32'))

    be = np.empty(2, 'uint64')
    hnsw.neighbor_range(23, 0, faiss.swig_ptr(be), faiss.swig_ptr(be[1:]))
def test_wrapped_quantizer_HNSW(self):
    faiss.omp_set_num_threads(1)

    def bin2float(v):
        def byte2float(byte):
            return np.array(
                [-1.0 + 2.0 * (byte & (1 << b) != 0) for b in range(0, 8)])

        return np.hstack([byte2float(byte) for byte in v]).astype('float32')

    def floatvec2nparray(v):
        return np.array([np.float32(v.at(i)) for i in range(0, v.size())]) \
            .reshape(-1, d)

    d = 256
    nt = 12800
    nb = 10000
    nq = 500
    (xt, xb, xq) = make_binary_dataset(d, nb, nt, nq)

    index_ref = faiss.IndexBinaryFlat(d)
    index_ref.add(xb)

    nlist = 256
    clus = faiss.Clustering(d, nlist)
    clus_index = faiss.IndexFlatL2(d)

    xt_f = np.array([bin2float(v) for v in xt])
    clus.train(xt_f, clus_index)

    centroids = floatvec2nparray(clus.centroids)
    hnsw_quantizer = faiss.IndexHNSWFlat(d, 32)
    hnsw_quantizer.add(centroids)
    hnsw_quantizer.is_trained = True
    wrapped_quantizer = faiss.IndexBinaryFromFloat(hnsw_quantizer)

    assert nlist == hnsw_quantizer.ntotal
    assert nlist == wrapped_quantizer.ntotal
    assert wrapped_quantizer.is_trained

    index = faiss.IndexBinaryIVF(wrapped_quantizer, d, hnsw_quantizer.ntotal)
    index.nprobe = 128

    assert index.is_trained
    index.add(xb)

    D_ref, I_ref = index_ref.search(xq, 10)
    D, I = index.search(xq, 10)

    recall = sum(gti[0] in Di[:10] for gti, Di in zip(D_ref, D)) \
        / float(D_ref.shape[0])

    assert recall > 0.77, "recall = %g" % recall
def indexed_function(text):
    print("Building Index")
    passages_path = '/home/gsir059/Music/faisis_index_test/my_kb'
    dataset = load_from_disk(passages_path)
    print("dataset loaded")
    index = faiss.IndexHNSWFlat(768, 128, faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)
    print("Building Done")
def main(
    rag_example_args: "RagExampleArguments",
    processing_args: "ProcessingArguments",
    index_hnsw_args: "IndexHnswArguments",
):
    ######################################
    logger.info("Step 1 - Create the dataset")
    ######################################

    # The dataset needed for RAG must have three columns:
    # - title (string): title of the document
    # - text (string): text of a passage of the document
    # - embeddings (array of dimension d): DPR representation of the passage

    # Let's say you have documents in tab-separated csv files with columns "title" and "text"
    assert os.path.isfile(rag_example_args.csv_path), "Please provide a valid path to a csv file"

    # You can load a Dataset object this way
    dataset = load_dataset(
        "csv", data_files=[rag_example_args.csv_path], split="train", delimiter="\t", column_names=["title", "text"]
    )

    # More info about loading csv files in the documentation:
    # https://huggingface.co/docs/datasets/loading_datasets.html?highlight=csv#csv-files

    # Then split the documents into passages of 100 words
    dataset = dataset.map(split_documents, batched=True, num_proc=processing_args.num_proc)

    # And compute the embeddings
    ctx_encoder = DPRContextEncoder.from_pretrained(rag_example_args.dpr_ctx_encoder_model_name).to(device=device)
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(rag_example_args.dpr_ctx_encoder_model_name)
    new_features = Features(
        {"text": Value("string"), "title": Value("string"), "embeddings": Sequence(Value("float32"))}
    )  # optional, save as float32 instead of float64 to save space
    dataset = dataset.map(
        partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer),
        batched=True,
        batch_size=processing_args.batch_size,
        features=new_features,
    )

    # And finally save your dataset
    passages_path = os.path.join(rag_example_args.output_dir, "my_knowledge_dataset")
    dataset.save_to_disk(passages_path)
    # from datasets import load_from_disk
    # dataset = load_from_disk(passages_path)  # to reload the dataset

    ######################################
    logger.info("Step 2 - Index the dataset")
    ######################################

    # Let's use the Faiss implementation of HNSW for fast approximate nearest neighbor search
    index = faiss.IndexHNSWFlat(index_hnsw_args.d, index_hnsw_args.m, faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    # And save the index
    index_path = os.path.join(rag_example_args.output_dir, "my_knowledge_dataset_hnsw_index.faiss")
    dataset.get_index("embeddings").save(index_path)
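# Hedged follow-up sketch: consuming the dataset and index built in main() above
# with a RAG model, as in the companion Hugging Face example script; the model
# name and question are illustrative assumptions.
from transformers import RagRetriever, RagSequenceForGeneration, RagTokenizer

retriever = RagRetriever.from_pretrained(
    "facebook/rag-sequence-nq", index_name="custom", indexed_dataset=dataset
)
model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)
tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")

input_ids = tokenizer.question_encoder("where is the Eiffel Tower?", return_tensors="pt")["input_ids"]
generated = model.generate(input_ids)
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])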
def test_hnsw(self):
    d = self.xq.shape[1]

    index = faiss.IndexHNSWFlat(d, 16)
    index.add(self.xb)
    Dhnsw, Ihnsw = index.search(self.xq, 1)

    self.assertGreaterEqual((self.Iref == Ihnsw).sum(), 460)

    self.io_and_retest(index, Dhnsw, Ihnsw)
def get_index(index_type, dim):
    if index_type == 'hnsw':
        m = 48
        index = faiss.IndexHNSWFlat(dim, m)
        index.hnsw.efConstruction = 128
        return index
    elif index_type == 'l2':
        return faiss.IndexFlatL2(dim)
    raise ValueError(f"Unknown index type: {index_type!r}")
def generate_faiss_index_dataset(data, ctx_encoder_name, args, device):
    """
    Adapted from Huggingface example script at
    https://github.com/huggingface/transformers/blob/master/examples/research_projects/rag/use_own_knowledge_dataset.py
    """
    import faiss

    if isinstance(data, str):
        dataset = load_dataset("csv", data_files=data, delimiter="\t", column_names=["title", "text"])
    else:
        dataset = HFDataset.from_pandas(data)

    dataset = dataset.map(
        partial(split_documents, split_text_n=args.split_text_n, split_text_character=args.split_text_character),
        batched=True,
        num_proc=args.process_count,
    )

    ctx_encoder = DPRContextEncoder.from_pretrained(ctx_encoder_name).to(device=device)
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(ctx_encoder_name)

    new_features = Features(
        {"text": Value("string"), "title": Value("string"), "embeddings": Sequence(Value("float32"))}
    )  # optional, save as float32 instead of float64 to save space
    dataset = dataset.map(
        partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer, device=device),
        batched=True,
        batch_size=args.rag_embed_batch_size,
        features=new_features,
    )

    if isinstance(data, str):
        dataset = dataset["train"]

    if args.save_knowledge_dataset:
        output_dataset_directory = os.path.join(args.output_dir, "knowledge_dataset")
        os.makedirs(output_dataset_directory, exist_ok=True)
        dataset.save_to_disk(output_dataset_directory)

    index = faiss.IndexHNSWFlat(args.faiss_d, args.faiss_m, faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    return dataset
def __init__(self, vector_sz: int, buffer_size: int = 50000, store_n: int = 512,
             ef_search: int = 128, ef_construction: int = 200):
    super(DenseHNSWFlatIndexer, self).__init__(buffer_size=buffer_size)

    # IndexHNSWFlat supports L2 similarity only, so we apply a DOT -> L2 similarity
    # space conversion with the help of an extra dimension.
    index = faiss.IndexHNSWFlat(vector_sz + 1, store_n)
    index.hnsw.efSearch = ef_search
    index.hnsw.efConstruction = ef_construction
    self.index = index
    self.phi = 0
def __init__(self, dim, num_documents, num_subvectors=16, hnsw_m=32, nbits=8):
    super().__init__()
    nlist = math.floor(math.sqrt(num_documents))
    quantizer = faiss.IndexHNSWFlat(dim, hnsw_m)
    index = faiss.IndexIVFPQ(quantizer, dim, nlist, num_subvectors, nbits)
    self.index = faiss.index_cpu_to_all_gpus(index)
    self.num_training = max(nlist * 10, 256)
def _create_new_index(self, vector_dim: int, index_factory: str = "Flat",
                      metric_type=faiss.METRIC_INNER_PRODUCT, **kwargs):
    if index_factory == "HNSW" and metric_type == faiss.METRIC_INNER_PRODUCT:
        # faiss index factory doesn't give the same results for HNSW IP, therefore direct init.
        # defaults here are similar to DPR codebase (good accuracy, but very high RAM consumption)
        n_links = kwargs.get("n_links", 128)
        index = faiss.IndexHNSWFlat(vector_dim, n_links, metric_type)
        index.hnsw.efSearch = kwargs.get("efSearch", 20)
        index.hnsw.efConstruction = kwargs.get("efConstruction", 80)
        logger.info(
            f"HNSW params: n_links: {n_links}, "
            f"efSearch: {index.hnsw.efSearch}, efConstruction: {index.hnsw.efConstruction}"
        )
    else:
        index = faiss.index_factory(vector_dim, index_factory, metric_type)
    return index
def add_index(shard_dir, index_path):
    data_shard_list = []
    for shard_address in glob(str(shard_dir) + "/*/"):
        data_shard_list.append(load_from_disk(shard_address))

    concat = concatenate_datasets(data_shard_list)
    faiss.omp_set_num_threads(96)

    index = faiss.IndexHNSWFlat(768, 128, faiss.METRIC_INNER_PRODUCT)
    concat.add_faiss_index("embeddings", custom_index=index)

    # since we load the index into memory, we can directly update the index on disk
    concat.get_index("embeddings").save(index_path)
def run(args):
    csv_file = args.knowledge_file
    k_col = args.k_col
    title_col = args.title_col
    out_dir = args.out_dir
    os.makedirs(out_dir, exist_ok=True)
    passages_path = os.path.join(out_dir, KNOWLEGE_DIR)
    index_path = os.path.join(out_dir, INDEX_FILE)
    context_model = args.context_model
    device = "cuda" if (args.device == "cuda") and torch.cuda.is_available() else "cpu"
    batch_size = args.batch_size
    hnsw_m = args.hnsw_m

    # load the pretrained context model
    ctx_encoder = transformers.DPRContextEncoder.from_pretrained(context_model).to(device)
    ctx_tokenizer = transformers.tokenization_bert_japanese.BertJapaneseTokenizer.from_pretrained(context_model)
    embedding_dim = ctx_encoder.config.hidden_size

    # convert the csv file to an indexed dataset
    dataset = load_dataset("csv", data_files=[csv_file], split="train")
    column_names = dataset.column_names[:]

    def set_text_title(doc):
        return {
            "text": doc[k_col],
            "title": doc[title_col] if title_col is not None else ""
        }

    dataset = dataset.map(set_text_title)
    dataset.remove_columns_(column_names)
    print(dataset[0])
    dataset = dataset.map(
        partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer, device=device),
        batched=True,
        batch_size=batch_size,
    )

    # save the index files
    dataset.save_to_disk(passages_path)
    index = faiss.IndexHNSWFlat(embedding_dim, hnsw_m, faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)
    dataset.get_index("embeddings").save(index_path)
def __init__(
    self,
    vector_sz: int,
    buffer_size: int = 50000,
    store_n: int = 128,
    ef_search: int = 256,
    ef_construction: int = 200,
):
    super(DenseHNSWFlatIndexer, self).__init__(buffer_size=buffer_size)

    index = faiss.IndexHNSWFlat(vector_sz, store_n, faiss.METRIC_INNER_PRODUCT)
    index.hnsw.efSearch = ef_search
    index.hnsw.efConstruction = ef_construction
    self.index = index
def _build_index(self, xb):
    d = xb.shape[-1]
    # res = faiss.StandardGpuResources()
    # index = faiss.GpuIndexFlatIP(res, d)

    # brute-force
    # index = faiss.IndexFlatL2(d)

    # HNSW
    index = faiss.IndexHNSWFlat(d, 32)
    index.add(xb)
    return index
def test_wrapped_quantizer_HNSW(self):

    def bin2float2d(v):
        n, d = v.shape
        vf = ((v.reshape(-1, 1) >> np.arange(8)) & 1).astype("float32")
        vf *= 2
        vf -= 1
        return vf.reshape(n, d * 8)

    d = 256
    nt = 12800
    nb = 10000
    nq = 500
    (xt, xb, xq) = make_binary_dataset(d, nb, nt, nq)

    index_ref = faiss.IndexBinaryFlat(d)
    index_ref.add(xb)

    nlist = 256
    clus = faiss.Clustering(d, nlist)
    clus_index = faiss.IndexFlatL2(d)

    xt_f = bin2float2d(xt)
    clus.train(xt_f, clus_index)

    centroids = faiss.vector_to_array(clus.centroids).reshape(-1, clus.d)
    hnsw_quantizer = faiss.IndexHNSWFlat(d, 32)
    hnsw_quantizer.add(centroids)
    hnsw_quantizer.is_trained = True
    wrapped_quantizer = faiss.IndexBinaryFromFloat(hnsw_quantizer)

    assert nlist == hnsw_quantizer.ntotal
    assert nlist == wrapped_quantizer.ntotal
    assert wrapped_quantizer.is_trained

    index = faiss.IndexBinaryIVF(wrapped_quantizer, d, hnsw_quantizer.ntotal)
    index.nprobe = 128

    assert index.is_trained
    index.add(xb)

    D_ref, I_ref = index_ref.search(xq, 10)
    D, I = index.search(xq, 10)

    recall = sum(gti[0] in Di[:10] for gti, Di in zip(D_ref, D)) \
        / float(D_ref.shape[0])

    assert recall >= 0.77, "recall = %g" % recall
def _post_process(self, dataset, resources_paths, dl_manager):
    if self.config.with_index:
        index_file = resources_paths["embeddings_index"]
        if os.path.exists(index_file):
            print(">>> E")
            dataset.load_faiss_index("embeddings", index_file)
        else:
            try:
                downloaded_resources = dl_manager.download_and_extract({
                    "embeddings_index": os.path.join(
                        _INDEX_URL, self.config.index_file.format(split=dataset.split))
                })
                dataset.load_faiss_index("embeddings", downloaded_resources["embeddings_index"])
                print(">>> D")
                dataset.save_faiss_index("embeddings", index_file)
            except ConnectionError:  # if the index of the specified split doesn't exist
                print(">>> F")
                import faiss

                train_size = self.config.index_train_size
                logging.info("Building wiki_dpr faiss index")
                if self.config.index_type == "exact":
                    dataset.add_faiss_index(
                        "embeddings",
                        string_factory="Flat",
                        metric_type=faiss.METRIC_INNER_PRODUCT,
                    )
                else:
                    d = 768
                    quantizer = faiss.IndexHNSWFlat(d, 32, faiss.METRIC_INNER_PRODUCT)
                    ivf_index = faiss.IndexIVFPQ(quantizer, d, 4096, 64, 8, faiss.METRIC_INNER_PRODUCT)
                    ivf_index.own_fields = True
                    quantizer.this.disown()
                    dataset.add_faiss_index(
                        "embeddings",
                        train_size=train_size,
                        faiss_verbose=logging.getLogger().level <= logging.DEBUG,
                        custom_index=ivf_index,
                    )
                logging.info("Saving wiki_dpr faiss index")
                dataset.save_faiss_index("embeddings", index_file)
    return dataset
def build(self, vectors):
    t0 = time.time()
    if self.add_noise:
        vectors += np.random.randn(vectors.shape[0], vectors.shape[1]) * self.noise_amount
    self.num_points += vectors.shape[0]
    if not self.assume_unit_normed:
        vectors = unit_norm(vectors)
    self.index = faiss.IndexHNSWFlat(vectors.shape[1], self.max_degree)
    self.index.hnsw.efConstruction = self.efConstruction
    self.index.hnsw.efSearch = self.efSearch
    self.index.metric_type = faiss.METRIC_L2
    self.index.add(vectors)
    t1 = time.time()
    self.total_insert_time += t1 - t0
def test_hnsw_IP(self):
    d = self.xq.shape[1]

    index_IP = faiss.IndexFlatIP(d)
    index_IP.add(self.xb)
    Dref, Iref = index_IP.search(self.xq, 1)

    index = faiss.IndexHNSWFlat(d, 16, faiss.METRIC_INNER_PRODUCT)
    index.add(self.xb)
    Dhnsw, Ihnsw = index.search(self.xq, 1)

    print('nb equal: ', (Iref == Ihnsw).sum())

    self.assertGreaterEqual((Iref == Ihnsw).sum(), 480)

    mask = Iref[:, 0] == Ihnsw[:, 0]
    assert np.allclose(Dref[mask, 0], Dhnsw[mask, 0])