def create_index(words, index_path, vocab_path, cache_dir, batch_size=64):
    tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-bert")
    model = AutoModel.from_pretrained("ai4bharat/indic-bert",
                                      cache_dir=cache_dir, return_dict=True)
    model.to('cuda')
    index = faiss.IndexFlatIP(model.config.hidden_size)
    i = 0
    while i < len(words):
        batch = words[i:i + batch_size]
        tokens = tokenizer(batch, truncation=True, padding=True,
                           max_length=10, return_tensors="pt")
        tokens = tokens.to('cuda')
        outputs = model(**tokens)
        # mean-pool token embeddings into one vector per word
        embeddings = torch.mean(outputs.last_hidden_state, 1).detach().cpu().numpy()
        # L2-normalize so that inner product equals cosine similarity
        faiss.normalize_L2(embeddings)
        index.add(embeddings)
        i += batch_size
        print("{} words done".format(index.ntotal))
    faiss.write_index(index, index_path)
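# Hypothetical usage sketch for create_index (not from the original source):
# the vocabulary file and output paths are assumptions, one word per line;
# a CUDA device is required as written.
with open("vocab.txt", encoding="utf-8") as f:
    words = [w.strip() for w in f if w.strip()]
create_index(words, index_path="words.index", vocab_path="vocab.txt",
             cache_dir="./hf_cache", batch_size=64)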
def test_sparse_routines(self):
    """ the sparse assignment routine """
    ds = datasets.SyntheticDataset(1000, 2000, 0, 200)
    xt = ds.get_train().copy()
    faiss.normalize_L2(xt)
    mask = np.abs(xt) > 0.045
    # print("fraction:", mask.sum() / mask.size)  # around 10% non-zeros
    xt[np.logical_not(mask)] = 0
    centroids = ds.get_queries()
    assert len(centroids) == 200
    xsparse = scipy.sparse.csr_matrix(xt)

    Dref, Iref = faiss.knn(xsparse.todense(), centroids, 1)

    D, I = clustering.sparse_assign_to_dense(xsparse, centroids)
    np.testing.assert_array_equal(Iref.ravel(), I)
    np.testing.assert_array_almost_equal(Dref.ravel(), D, decimal=4)

    D, I = clustering.sparse_assign_to_dense_blocks(
        xsparse, centroids, qbs=123, bbs=33, nt=4)
    np.testing.assert_array_equal(Iref.ravel(), I)
    np.testing.assert_array_almost_equal(Dref.ravel(), D, decimal=4)
def vector_search(query, model, index, num_results=10, threshold=0.75):
    """Transforms query to vector using a pretrained, sentence-level
    DistilBERT model and finds similar vectors using FAISS.
    Args:
        query (str): User query that should be more than a sentence long.
        model (sentence_transformers.SentenceTransformer.SentenceTransformer)
        index (`numpy.ndarray`): FAISS index that needs to be deserialized.
        num_results (int): Number of results to return.
        threshold (float): Minimum similarity; results scoring below it are
            filtered out of D and I.
    Returns:
        D (:obj:`numpy.array` of `float`): Distance between results and query.
        I (:obj:`numpy.array` of `int`): Paper ID of the results.
    """
    # encode a list so the result is 2-D, as normalize_L2 requires
    vector = model.encode([query])
    normalize_L2(vector)
    D, I = index.search(np.array(vector).astype("float32"), k=num_results)
    ids = I.flatten()
    distance = D.flatten()
    mask = distance > threshold
    if not distance[mask].tolist():
        # nothing passed the threshold: fall back to the top 3 with a relaxed cutoff
        ids = ids[:3]
        distance = distance[:3]
        mask = distance > threshold / 2
    return distance[mask].tolist(), ids[mask].tolist()
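# Hedged usage sketch for vector_search; the model name and index file are
# assumptions (any sentence-transformers checkpoint plus a serialized flat
# inner-product index built from that model's embeddings would do).
import faiss
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")
index = faiss.read_index("papers.index")
distances, ids = vector_search("a query sentence about transformer models", model, index)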
def IndexIVFFlat():
    d = 2048            # dimension
    nb = 1000050        # database size
    np.random.seed(1234)  # make reproducible
    training_vectors = np.random.random((nb, d)).astype('float32') * 10
    normalize_L2(training_vectors)
    nlist = 1000        # number of cluster centroids
    k = 50              # number of neighbors
    quantizer = faiss.IndexFlatIP(d)  # an IVF index is built on top of another index used as the coarse quantizer
    index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
    # by default it performs inner-product search
    assert not index.is_trained
    t_tr = time.time()
    index.train(training_vectors)
    print('tr time:', time.time() - t_tr)
    assert index.is_trained
    index.nprobe = 300  # default nprobe is 1, try a few more
    t_s = time.time()
    index.add(training_vectors)  # add may be a bit slower as well
    print('add time:', time.time() - t_s)
    t1 = time.time()
    D, I = index.search(training_vectors[:100], k)  # actual search
    t2 = time.time()
    print('faiss kmeans result times {}'.format(t2 - t1))
    # print(D[:5])  # neighbors of the 5 first queries
    print(I[:5])
    topk = 5
    np.save('rank{}'.format(topk) + '.npy', I)
    np.save('similarity{}'.format(topk) + '.npy', D)
def __init__(
    self,
    embedding_space: EmbeddingSpaceType,
    embedding_space_dims: int,
    similarity_algorithm: SimilarityAlgorithm,
):
    super(GloVeWordEmbeddingIndex, self).__init__(
        faiss_index_name='faiss_index',
        index_np_name='index_np',
        embedding_space_dims_name='embedding_space_dims',
        similarity_algorithm_name='similarity_algorithm',
    )
    self.embedding_space = embedding_space
    self.embedding_space_dims = embedding_space_dims
    self.similarity_algorithm = similarity_algorithm
    self.index_np, self.word_to_index, self.index_to_word = (
        GloVeWordEmbeddingIndex.build_index(
            embedding_space,
            embedding_space_dims,
        ))
    # for FAISS we need float32 instead of float64
    self.index_np = self.index_np.astype('float32')
    self.faiss_index = faiss.IndexFlatIP(embedding_space_dims)
    if similarity_algorithm == SimilarityAlgorithm.CosineSimilarity:
        # normalize with L2 as a proxy for cosine search
        faiss.normalize_L2(self.index_np)
    self.faiss_index.add(self.index_np)
def __init__(self, file, d, norm=True, file_str=None):
    self.vec = []
    self.txt = []
    if file.endswith('.gz'):
        f = gzip.open(file, 'rt')
    else:
        f = io.open(file, 'r', encoding='utf-8', newline='\n', errors='ignore')
    for l in f:
        l = l.rstrip().split(' ')
        if len(l) != d:
            logging.error('found {} floats instead of {}'.format(len(l), d))
            sys.exit()
        self.vec.append(l)
    self.vec = np.array(self.vec).astype('float32')
    if norm:
        faiss.normalize_L2(self.vec)
    if file_str is None:
        return
    if file_str.endswith('.gz'):
        f = gzip.open(file_str, 'rt')
    else:
        f = io.open(file_str, 'r', encoding='utf-8', newline='\n', errors='ignore')
    for l in f:
        self.txt.append(l.rstrip())
    if len(self.txt) != len(self.vec):
        logging.error('diff num of entries {} <> {} in files {} and {}'.format(
            len(self.vec), len(self.txt), file, file_str))
        sys.exit()
def build_index(hidden_states):
    d = hidden_states.shape[1]
    index = faiss.index_factory(d, "Flat", faiss.METRIC_INNER_PRODUCT)
    # normalize in place so that inner product equals cosine similarity
    faiss.normalize_L2(hidden_states)
    index.add(hidden_states)
    return index
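# A small self-contained sketch around build_index; random vectors stand in
# for real hidden states. Queries must be L2-normalized the same way as the
# index; here they already are, since build_index normalizes in place.
import numpy as np
hidden_states = np.random.rand(100, 768).astype("float32")
index = build_index(hidden_states)
D, I = index.search(hidden_states[:3], 5)  # each row's nearest neighbor is itself, score ~1.0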
def _createSenSetVecsNumpy(self):
    '''
    Load the corpus of all sentences into memory.
    :return:
    '''
    print("\nStart generating sentence vectors for all sentences....")
    start_time = datetime.datetime.now()  # taken at program start
    self.sens = []  # stores all sentences, e.g. "我 爱 你"
    senVecs = []    # stores all sentence vectors
    with open(self.sentencesfile, mode="r", encoding="utf-8") as fr:
        for line in fr:
            line = line.strip()
            if line != "":
                tokens = line.split(" ")
                tokens_ids = self._convertWords2ids(tokens)
                self.sens.append(line)
                senVecs.append(self._getSenVec(tokens_ids))
    self.senVecs = np.ascontiguousarray(senVecs)  # a numpy array
    print("\nStart running normalize_L2")
    faiss.normalize_L2(self.senVecs)
    print("\nFinished running normalize_L2")
    print("\nFinished generating sentence vectors....")
    end_time = datetime.datetime.now()  # taken at program end
    interval = (end_time - start_time).seconds  # elapsed time in seconds
    print("Sentence vectors generated; total time:", interval)
def __init__(self, file, d=0, norm=True, max_vec=1000000):
    logging.info('Reading {}'.format(file))
    self.file = file
    self.d = d             ### will contain length of vectors
    self.vec = []          ### list with all vectors found in file
    self.max_vec = max_vec
    if self.file.endswith('.gz'):
        f = gzip.open(self.file, 'rt')
    else:
        f = io.open(self.file, 'r', encoding='utf-8', newline='\n', errors='ignore')
    for l in f:
        l = l.rstrip().split(' ')
        if self.d == 0:
            self.d = len(l)
        if len(l) != self.d:
            logging.error('found a vector with {} cells instead of {} in line {} of file {}'.format(
                len(l), self.d, len(self.vec) + 1, self.file))
            sys.exit()
        self.vec.append(l)
    if self.max_vec == 0:
        self.vecs = [self.vec]
    else:
        self.vecs = [self.vec[i: i + self.max_vec]
                     for i in range(0, len(self.vec), self.max_vec)]
    logging.info('\t\tRead {} vectors into {} chunks ({} cells)'.format(
        len(self.vec), len(self.vecs), self.d))
    for i in range(len(self.vecs)):
        self.vecs[i] = np.array(self.vecs[i]).astype('float32')
        logging.info('\t\tBuilt float32 array for chunk {} with {} vectors'.format(
            i, len(self.vecs[i])))
        if norm:
            faiss.normalize_L2(self.vecs[i])
def get_intent(self, query, prefix, tasks, k_nearest=1):
    index = faiss.read_index(f"data/{prefix}_intent_index.idx")
    query_vector = np.array([self._get_embedding(query, prefix)]).astype(np.float32)
    faiss.normalize_L2(query_vector)
    similarities, similarities_ids = index.search(query_vector, k_nearest)
    return similarities_ids[0][0], tasks['task'][similarities_ids[0][0]]
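# A hedged sketch of how the file read by get_intent might have been produced;
# build_intent_index is hypothetical and only mirrors the query-side logic:
# embed, L2-normalize, add to an inner-product index, write to disk.
def build_intent_index(self, questions, prefix):
    vecs = np.array([self._get_embedding(q, prefix) for q in questions]).astype(np.float32)
    faiss.normalize_L2(vecs)  # must match the normalization applied at query time
    index = faiss.IndexFlatIP(vecs.shape[1])
    index.add(vecs)
    faiss.write_index(index, f"data/{prefix}_intent_index.idx")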
def __init__(self, iterator=None, filename=None, embeddings=None, shape=None, device="cpu"):
    self.iterator = iterator
    if os.path.exists(filename):
        print(f'Index file {filename}')
        self.index = faiss.read_index(filename)
    else:
        self.index = faiss.index_factory(shape, "Flat", faiss.METRIC_INNER_PRODUCT)
        faiss.normalize_L2(embeddings)
        self.index.add(embeddings)
        faiss.write_index(self.index, filename)
        print(f'Index written at {filename}')
    if device == "cuda":
        print('Now running on CUDA')
        self.index = faiss.index_cpu_to_all_gpus(self.index)
    print(f'Index trained - {self.index.is_trained}')
def test_normalized(self):
    rs = np.random.RandomState(123)
    m = rs.rand(40, 20).astype('float32')
    faiss.normalize_L2(m)
    comments = faiss.MatrixStats(m).comments
    print(comments)
    assert 'vectors are normalized' in comments
def _knn_faiss(data_numpy, k, metric='euclidean', use_gpu=False):
    import faiss
    data_numpy = data_numpy.astype(np.float32)
    data_numpy = np.ascontiguousarray(data_numpy, dtype=np.float32)
    if use_gpu:
        print('Using GPU for Faiss...')
        res = faiss.StandardGpuResources()
    else:
        print('Using CPU for Faiss...')
    if metric == 'euclidean':
        index = faiss.IndexFlatL2(data_numpy.shape[1])
    elif metric == 'manhattan':
        index = faiss.IndexFlat(data_numpy.shape[1], faiss.METRIC_L1)
    elif metric == 'cosine':
        index = faiss.IndexFlat(data_numpy.shape[1], faiss.METRIC_INNER_PRODUCT)
        faiss.normalize_L2(data_numpy)
    if use_gpu:
        index = faiss.index_cpu_to_gpu(res, 0, index)
    index.train(data_numpy)  # no-op for flat indexes, kept for generality
    assert index.is_trained
    index.add(data_numpy)
    index.nprobe = data_numpy.shape[0]  # only meaningful for IVF indexes; harmless on a flat index
    distances, neighbors = index.search(data_numpy, k)
    return distances, neighbors
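# Example invocation of _knn_faiss (illustrative, random data): under the
# 'cosine' metric the returned "distances" are inner products of normalized
# vectors, i.e. cosine similarities, sorted in decreasing order.
import numpy as np
data = np.random.rand(500, 64)
similarities, neighbors = _knn_faiss(data, k=10, metric='cosine')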
def evaluate(self):
    faiss.normalize_L2(self.embedding_vectors)
    self.index.add(self.embedding_vectors)
    self.D, self.I = self.index.search(self.embedding_vectors, self.k)
    self.I = self.I.astype(np.int32)
    np.savetxt(join(self.target_dir, "D.out"), self.D, delimiter=",")
    np.savetxt(join(self.target_dir, "I.out"), self.I, delimiter=",", fmt="%i")
    with open(join(self.target_dir, "filenames.txt"), "w") as f:
        f.writelines([x + "\n" for x in self.filepaths])
    with open(join(self.target_dir, "vectors.npz"), "wb") as f:
        np.save(f, self.embedding_vectors)
    # TODO: Calculate top-5 accuracy etc. from index matrix and class dictionaries
    self.A = self.vec_index_to_ad_id_func(idx=self.I)
    # Subtract query id from all columns.
    # If there is a match, the jth column will have a zero in it
    self.A_proper = self.A.copy()
    self.A[:, 1:] -= self.A[:, 0][:, None]
    k_accuracies = np.zeros(self.k)
    for k in range(1, self.k + 1):
        k_accuracies[k - 1] = np.mean(
            np.count_nonzero(self.A[:, 1:k] == 0, axis=1) >= 1)
    with open(join(self.target_dir, "k_accuracies.npz"), "wb") as f:
        np.save(f, k_accuracies)
    fig, ax = plt.subplots(1, 1)
    ax.plot(np.arange(1, self.k + 1), k_accuracies)
    plt.show()
def vector_search(
    query_vector: Union[str, np.ndarray],
    data: List[str],
    encoded_data: Optional[np.ndarray] = None,
    embed: Optional[Callable] = None,  # embed_data
    index_: str = "",  # default to indexflatl2, or indexflatip
    sanity_check: Union[bool, int] = False,
    topk: int = 5,
) -> Optional[Tuple[np.ndarray, np.ndarray]]:
    """Search via faiss."""
    if embed is None:
        embed = fetch_embed
    if encoded_data is None:
        encoded_data = embed(data)
    if isinstance(query_vector, str):
        query_vector = fetch_embed(query_vector)
    if isinstance(query_vector, np.ndarray):
        try:
            assert query_vector.shape[1] == encoded_data.shape[1]
        except AssertionError as exc:
            raise SystemExit(
                "dimensions of the query vector and of the vectors "
                "in the database do not match"
            ) from exc
    else:
        logger.info(
            "You probably need to embed (encode) the list of str first."
            "\n\te.g., embed(nameof(query_vector)). Exiting"
        )
        try:
            _ = fetch_embed(query_vector)
            assert _.shape[1] == encoded_data.shape[1]
            query_vector = _
        except Exception as exc:
            logger.error(exc)
            raise SystemExit(1) from exc
    if index_.lower() in [
            "indexflat_ip", "flat_ip", "flatip", "flat-ip", "indexflat-ip"]:
        index = faiss_flat_ip(encoded_data)
    else:  # index_.lower() in ["indexflatl2", "flat_l2", "flatl2"]
        index = faiss_flat_l2(encoded_data)
    faiss.normalize_L2(query_vector)
    _ = index.search(query_vector, topk)
    if sanity_check:
        top_k = index.search(encoded_data[:10], topk)
        print([np.round(top_k[0], 2), top_k[1]])
    return _
def _execute_one_chunk(cls, ctx, op):
    (inp,), device_id, xp = as_same_device(
        [ctx[c.key] for c in op.inputs], device=op.device, ret_extra=True)

    with device(device_id):
        # create index
        index = faiss.index_factory(inp.shape[1], op.faiss_index,
                                    op.faiss_metric_type)
        # GPU
        if device_id >= 0:  # pragma: no cover
            res = faiss.StandardGpuResources()
            index = faiss.index_cpu_to_gpu(res, device_id, index)

        # train index
        if not index.is_trained:
            assert op.n_sample is not None
            sample_indices = xp.random.choice(inp.shape[0],
                                              size=op.n_sample, replace=False)
            sampled = inp[sample_indices]
            index.train(sampled)

        if op.metric == 'cosine':
            # faiss does not support cosine distances directly,
            # data needs to be normalized before adding to the index,
            # refer to:
            # https://github.com/facebookresearch/faiss/wiki/FAQ#how-can-i-index-vectors-for-cosine-distance
            faiss.normalize_L2(inp)

        # add vectors to index
        index.add(inp)

        ctx[op.outputs[0].key] = _store_index(ctx, op, index, device_id)
def predict(self, text: str) -> str:
    response = ""
    tag = ""
    # update
    ques_embedding_dict, response_cluster_dict = self.data_controller.update()
    if self.ques_embedding_dict != ques_embedding_dict:
        self.ques_embedding_dict = ques_embedding_dict
        self.querys = self.ques_embedding_dict['sentences']
        self.querys_wo_space = [s.replace(" ", "") for s in self.querys]
        self.faiss_index, self.class_list = self._faiss_indexing()
    if self.response_cluster_dict != response_cluster_dict:
        self.response_cluster_dict = response_cluster_dict
    self.thres_prob = self.data_controller.threshold_dict[
        'scenario_similarity_threshold']
    self.thres_similar = self.data_controller.threshold_dict[
        'character_similarity_threshold']

    # 1) exact matching
    res_class = self._exact_matching(text)
    if res_class:
        response = self._generate_response(res_class)
        tag = "<Scenario>"
    # 2) similarity analysis
    else:
        # a) character similarity
        res_class = self._char_similarity_analysis(text)
        if res_class:
            response = self._generate_response(res_class)
            tag = "<Scenario>"
        # b) semantic similarity
        else:
            query_vec = self.inferencer.infer(text)
            query_vec = np.array(query_vec).astype(np.float32)
            if len(query_vec) == 0:
                return ""
            normalize_L2(query_vec)
            D, I = self.faiss_index.search(query_vec, self.k)
            topk_class = [self.class_list[i] for i in I[0]]
            pred_counts = Counter(topk_class)
            # most_common(1) yields the majority class among the top-k hits;
            # max(pred_counts) would return the largest key, not the most frequent one
            res_class, res_count = pred_counts.most_common(1)[0]
            max_prob = D[0].max()
            if res_count >= math.ceil(self.k / 2) and max_prob >= self.thres_prob:
                response = self._generate_response(res_class)
                tag = "<Scenario-Semantic | Score: {}>".format(
                    str(round(max_prob, 2)))
    if tag:
        response = response + "\n" + tag
    return response
def search_top_k(corp_emb, query_emb, embedding_dim, k, config):
    """
    Returns a tuple with ordered lists of lists of cosine distances and of the
    top k matches in corpus_embeddings. Each inner list corresponds to one query.
    Runs on GPU when one is available.

    Available metrics: faiss.METRIC_INNER_PRODUCT, faiss.METRIC_L2, ... more here
    https://github.com/facebookresearch/faiss/wiki/MetricType-and-distances

    return type = (List[List[float = cosine_distance]],
                   List[List[int = index_of_corpus_embedding]])
    """
    config.logger.info(f"Preparing index and executing the search with embedding dim {embedding_dim}")
    # Flat = exhaustive search; the PCA transform means scores are inner
    # products in the 384-d PCA space, which approximates cosine similarity
    index = faiss.index_factory(embedding_dim, "PCA384,Flat", faiss.METRIC_INNER_PRODUCT)
    faiss.normalize_L2(corp_emb)  # need to normalize query and corpus vectors for cosine distance
    faiss.normalize_L2(query_emb)
    if config.device != 'cpu':
        res = faiss.StandardGpuResources()
        if len(config.devices) > 1:
            dev_index = faiss.index_cpu_to_all_gpus(index)  # use gpu
        else:
            dev_index = faiss.index_cpu_to_gpu(res, 0, index)
    else:
        dev_index = index
    dev_index.train(corp_emb)
    dev_index.add(corp_emb)
    return dev_index.search(query_emb, k)  # return distances, indices matrices
def LoadDataNLI(fn1, fn2, fn_lbl, dim=1024, bsize=32, shuffle=False, quiet=False):
    x = np.fromfile(fn1, dtype=np.float32, count=-1)
    x.resize(x.shape[0] // dim, dim)
    faiss.normalize_L2(x)

    y = np.fromfile(fn2, dtype=np.float32, count=-1)
    y.resize(y.shape[0] // dim, dim)
    faiss.normalize_L2(y)

    lbl = np.loadtxt(fn_lbl, dtype=np.int32)
    lbl.reshape(lbl.shape[0], 1)

    if not quiet:
        print(' - read {:d}x{:d} elements in {:s}'.format(
            x.shape[0], x.shape[1], fn1))
        print(' - read {:d}x{:d} elements in {:s}'.format(
            y.shape[0], y.shape[1], fn2))
        print(' - read {:d} labels [{:d},{:d}] in {:s}'.format(
            lbl.shape[0], lbl.min(), lbl.max(), fn_lbl))

    # nli = torch.cat((x, y, torch.abs(x - y), x * y), 1)
    if not quiet:
        print(' - combine premises and hyps')
    nli = np.concatenate((x, y, np.absolute(x - y), np.multiply(x, y)), axis=1)

    D = data_utils.TensorDataset(torch.from_numpy(nli), torch.from_numpy(lbl))
    loader = data_utils.DataLoader(D, batch_size=bsize, shuffle=shuffle)
    return loader
def index(self):
    """Creates a faiss index for similarity searches over the node embeddings.
    Simple implementation of a cached property.

    Returns
    -------
    a faiss index with input embeddings added and optionally trained"""
    if self._index is None:
        if not self._masks_set:
            self.set_masks()
        if self.distance_metric == 'cosine':
            self._index = faiss.IndexFlatIP(self.embedding_dim)
            # normalize_L2 operates in place, so np.copy any views into a new array before using
            embeddings = np.copy(self.embeddings[self.entity_mask])
            faiss.normalize_L2(embeddings)
        elif self.distance_metric == 'l2':
            self._index = faiss.IndexFlatL2(self.embedding_dim)
            embeddings = self.embeddings[self.entity_mask]
        if self.train_faiss:
            training_points = min(
                len(self.node_ids) // FAISS_NODES_TO_CLUSTERS + 1,
                MAXIMUM_FAISS_CLUSTERS)
            self._index = faiss.IndexIVFFlat(self._index, self.embedding_dim,
                                             training_points)
            self._index.train(embeddings)
        self._index.add(embeddings)
        if self.faiss_gpu:
            GPU = faiss.StandardGpuResources()
            self._index = faiss.index_cpu_to_gpu(GPU, 0, self._index)
    return self._index
def faiss_flat_ip(encoded_data):
    """Faiss flat inner-product index with id mapping."""
    dim = encoded_data.shape[1]
    index = faiss.IndexIDMap(faiss.IndexFlatIP(dim))
    faiss.normalize_L2(encoded_data)
    index.add_with_ids(encoded_data, np.arange(len(encoded_data)))
    return index
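# Quick sanity check for faiss_flat_ip (illustrative data): the function
# normalizes encoded_data in place, so searching with an indexed vector
# returns its own id first with a score close to 1.0.
import numpy as np
data = np.random.rand(200, 128).astype("float32")
idmap_index = faiss_flat_ip(data)
scores, ids = idmap_index.search(data[:1], 5)  # ids[0][0] == 0, scores[0][0] ~ 1.0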
def annSearch(emds, uuid, uvec, test_label, ivec, iid, topk):
    indextree = faiss.IndexFlatIP(emds)
    faiss.normalize_L2(ivec)
    indextree.add(ivec)
    faiss.normalize_L2(uvec)
    # D, I = indextree.search(np.ascontiguousarray(uvec), topk)
    D, I = indextree.search(uvec, topk)
    score = []
    hit = 0
    # uuid: users in the test set; iid: items over the whole dataset
    for i, uid in tqdm(enumerate(uuid)):
        try:
            pred = [iid.values[x] for x in I[i]]
            recall_score = len(set(pred[:topk]) & set(test_label)) * 1.0 / len(test_label)
            score.append(recall_score)
            if test_label[uid] in pred:
                hit += 1
        except Exception:
            print(i)
    score_mean = np.mean(score)
    hit_rate = hit / len(uuid)
    return score_mean, hit_rate
def cosine_similar():
    '''
    cosine_similarity usage
    :return:
    '''
    d = 64    # dimension
    nb = 105  # database size
    # deliberately non-normalized vectors, to test the normalization step
    training_vectors = np.random.random((nb, d)).astype('float32') * 10
    print('just compare with sklearn')
    from sklearn.metrics.pairwise import cosine_similarity
    # mainly to compare the result against sklearn
    ag = cosine_similarity(training_vectors)
    fe = np.sort(ag, axis=1)
    print('normalize_L2')
    normalize_L2(training_vectors)
    print('IndexFlatIP')
    index = faiss.IndexFlatIP(d)
    index.train(training_vectors)
    print(index)
    print('train')
    print(index.is_trained)
    print('add')
    print(index)
    index.add(training_vectors)
    print('search')
    D, I = index.search(training_vectors[:100], 5)
    print(I[:5])  # indices of the 5 nearest neighbors
    print(D[:5])  # similarity values of the 5 nearest neighbors
def search(self, query, top=5, nprobe=1, ret_vec=0, index=None):
    # D, I, V = [], [], []
    # number of cluster centroids to probe; defaults to 1
    self.index.nprobe = nprobe  # self.nprobe
    # if an index number is given, use the vector it refers to  2020/9/10
    if index:
        query = self.xb[index, :]
    else:
        if not query.dtype == 'float32':
            query = query.astype('float32')
        # print(query.shape)
        # for a single query, promote the vector to 2-D
        if len(query.shape) == 1:
            query = query[np.newaxis, :]
        # normalize the vector
        if self.normalize:
            faiss.normalize_L2(query)
    # print('q,n:', (query, top))
    # search
    D, I = self.index.search(query, top)
    # optionally also return the vectors themselves  2020/9/7
    V = []
    if ret_vec:
        V = self.xb[I, :]
    return D, I, V
def faiss_search(embeddings, uids, num_results):
    """Returns ids of articles similar to the query, found via a FAISS index
    built on the given embeddings.

    - Create a matrix to store article embeddings
    - Assign dimension for the vector space
    - Build the index. IndexFlatIP: with normalized vectors, the inner
      product (IP, of IndexFlatIP) becomes cosine similarity
    - Add vectors to the index
    - Prepare the query vector
    """
    xb = np.ascontiguousarray(embeddings).astype(np.float32)
    d = xb.shape[1]
    index = faiss.IndexFlatIP(d)
    faiss.normalize_L2(xb)
    index.add(xb)
    query_vec = np.ascontiguousarray(embeddings.loc[uids]).reshape(
        1, -1).astype(np.float32)
    faiss.normalize_L2(query_vec)
    _, matches = index.search(query_vec, num_results)
    similar_embeddings = matches.tolist()[0]
    return [
        uid for uid in embeddings.iloc[similar_embeddings].index
        if uid not in uids
    ]
def submit(valid_data, all_data, mapped_data):
    t1 = time.time()
    corpus = np.array(all_data['embedding'].values.tolist()).astype('float32')
    import faiss
    faiss.normalize_L2(corpus)
    index = faiss.IndexFlatIP(corpus.shape[1])
    index.train(corpus)
    index.add(corpus)
    query = np.array(valid_data['embedding'].values.tolist()).astype('float32')
    faiss.normalize_L2(query)
    D, I = index.search(query, len(corpus))
    res = []
    for i, d in enumerate(I):
        index_lst = I[i][:3]
        paper_id_lst = [
            all_data.loc[index, 'paper_id'][0] for index in index_lst
        ]
        description_id = valid_data.loc[i, 'description_id'][0]
        res.append({
            "description_id": description_id,
            "paper_id_lst": ",".join(paper_id_lst)
        })
    print("Time {:.02f}s".format(time.time() - t1))
    res = pd.DataFrame(res)
    res.to_csv("./result/submit.csv", index=False, header=False)
def _loadTextAndEmb(textF, encoding, embF, encoderDim, unify, verbose):
    inds, sents = TextLoadUnify(textF, encoding, unify, verbose)
    emb = EmbedLoad(embF, encoderDim, verbose=verbose)
    if unify:
        emb = unique_embeddings(emb, inds)
    faiss.normalize_L2(emb)
    return inds, sents, emb
def build_advanced_index(self, vecs: 'np.ndarray'):
    """Load all vectors (in numpy ndarray) into Faiss indexers """
    import faiss
    metric = faiss.METRIC_L2
    if self.distance == 'inner_product':
        metric = faiss.METRIC_INNER_PRODUCT
    if self.distance not in {'inner_product', 'l2'}:
        self.logger.warning(
            'Invalid distance metric for Faiss index construction. Defaulting to l2 distance')
    index = self.to_device(
        index=faiss.index_factory(self.num_dim, self.index_key, metric))
    if not self.is_trained and self.train_filepath:
        train_data = self._load_training_data(self.train_filepath)
        if train_data is None:
            self.logger.warning(
                'loading training data failed. some faiss indexes require previous training.')
        else:
            train_data = train_data.astype(np.float32)
            if self.normalize:
                faiss.normalize_L2(train_data)
            self.train(index, train_data)
    self.build_partial_index(vecs, index)
    index.nprobe = self.nprobe
    return index
def create_faiss_index(sequence_vectors):
    array = np.array(sequence_vectors).astype(np.float32)
    index = faiss.IndexFlatIP(array.shape[1])
    faiss.normalize_L2(array)
    index.add(array)
    return index
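# Usage sketch for create_faiss_index (random stand-in vectors): queries must
# be normalized separately, since only the internal copy built from
# sequence_vectors is normalized inside the function.
import numpy as np
vectors = np.random.rand(50, 32).astype(np.float32)
idx = create_faiss_index(vectors)
q = vectors[:2].copy()
faiss.normalize_L2(q)
D, I = idx.search(q, 3)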
def query(self, vecs: 'np.ndarray', top_k: int, *args, **kwargs) -> Tuple['np.ndarray', 'np.ndarray']:
    if self.normalize:
        from faiss import normalize_L2
        normalize_L2(vecs)
    dist, ids = self.query_handler.search(vecs, top_k)
    keys = self.int2ext_id[self.valid_indices][ids]
    return keys, dist
def manual_trans(x):
    x = x.copy()
    faiss.normalize_L2(x)
    x = pca.apply_py(x)
    faiss.normalize_L2(x)
    return x
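# manual_trans assumes a trained PCA transform named `pca` in scope; a
# plausible construction with faiss.PCAMatrix is sketched below (the
# dimensions 256 -> 64 are illustrative assumptions).
import numpy as np
train = np.random.rand(10000, 256).astype("float32")
faiss.normalize_L2(train)
pca = faiss.PCAMatrix(256, 64)  # reduce 256-d vectors to 64-d
pca.train(train)
reduced = manual_trans(np.random.rand(5, 256).astype("float32"))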