Example #1
def annSearch(uvec, ivec, topk):
    emds = 64  # embedding dimensionality

    index = faiss.IndexFlatIP(emds)
    # faiss expects contiguous float32 arrays on both sides
    index.add(np.array(ivec).astype('float32'))

    dis, idx = index.search(np.array(uvec).astype('float32'), topk)
    return dis, idx
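
A minimal usage sketch for the function above, assuming faiss and numpy are already imported; the random vectors are purely illustrative:

import numpy as np
import faiss

rng = np.random.default_rng(0)
item_vectors = rng.random((1000, 64), dtype=np.float32)  # database side
user_vectors = rng.random((5, 64), dtype=np.float32)     # query side

# returns (scores, indices), each of shape (n_queries, topk)
scores, indices = annSearch(user_vectors, item_vectors, topk=10)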
Example #2
    def update_index(self):
        path = os.path.abspath(
            os.path.dirname(__file__) + '/../../var/model/esse/embeddings.npy')
        embeddings = self.sentence_encoder.encode(
            [esse.get_index_text() for esse in self.esses])
        np.save(path, embeddings)

        self.index = faiss.IndexFlatIP(512)
        self.index.add(embeddings)
Example #3
def load_train_data():
    eli5 = datasets.load_dataset("eli5", name="LFQA_reddit")
    eli5_train = eli5["train_eli5"]
    eli5_train_q_reps = np.memmap(
        "eli5_questions_reps.dat", dtype="float32", mode="r", shape=(eli5_train.num_rows, 128)
    )
    eli5_train_q_index = faiss.IndexFlatIP(128)
    eli5_train_q_index.add(eli5_train_q_reps)
    return (eli5_train, eli5_train_q_index)
Example #4
def faissLoadKmeans(KMEANS_PKL):

    kmeans = pickle.load(open(KMEANS_PKL, 'rb'))
    vectors = np.array(kmeans.cluster_centers_).astype('float32')

    faissIndex = faiss.IndexFlatIP(DIM_BERT)
    faissIndex.add(vectors)

    return faissIndex
Example #5
 def __load_faiss_index(vectors: List[np.ndarray], use_gpu: bool):
     vectors_dim = len(vectors[0])
     vector_stack = np.stack(vectors)
     index = faiss.IndexFlatIP(vectors_dim)
     if use_gpu:
         res = faiss.StandardGpuResources()
         index = faiss.index_cpu_to_gpu(res, 0, index)
     index.add(vector_stack)
     return index
Example #6
 def build_index(self):
     """:returns an inverted index for the search documents"""
     vectors = [self.encode(document) for document in self.documents]
     index = faiss.IndexIDMap(
         faiss.IndexFlatIP(768))  # dimensionality of vector space
     # Add document vectors into index after transforming into numpy arrays. IDs should match len(documents)
     index.add_with_ids(np.array([vec.numpy() for vec in vectors]),
                        np.arange(len(self.documents), dtype='int64'))
     return index
Example #7
 def build_index(image_features):
     """
     Builds index based on provided images' features for a fast access
     @param image_features: the features of images
     @return: Index
     """
     index = faiss.IndexFlatIP(image_features.shape[1])
     index.add(image_features)
     return index
Example #8
def faiss_knn(Q, X, k, dist='IP'):
    d = X.shape[1]
    if dist == 'IP':
        index = faiss.IndexFlatIP(d)
    elif dist == 'L2':
        index = faiss.IndexFlatL2(d)
    else:
        raise ValueError(f"Unsupported distance: {dist}")
    index.add(X)
    dists, inds = index.search(Q, k)
    return dists, inds
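
A hedged usage sketch for faiss_knn; the shapes and data are illustrative. Note that inner-product scores sort descending (larger = more similar) while L2 distances sort ascending, and IndexFlatL2 returns squared distances:

import numpy as np

X = np.random.rand(10000, 256).astype('float32')  # database vectors
Q = np.random.rand(8, 256).astype('float32')      # query vectors

dists, inds = faiss_knn(Q, X, k=5, dist='IP')  # inner product
dists, inds = faiss_knn(Q, X, k=5, dist='L2')  # squared L2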
Example #9
def index_model(names, models, norm='l2'):
    '''
    To normalize or not to normalize:
    
    - stats.stackexchange.com/questions/177905
    - stackoverflow.com/questions/36034454

    Usage:

    fp = f'{base}/models/{n}/nanotext_r89.model'
    model3 = load_embedding(fp)
    m3 = subtract_mean(model3)
    found, m, index = index_model(names, [m1, m2, m3], norm=norm)
    '''
    import faiss
    import numpy as np
    from sklearn.preprocessing import normalize

    from nanotext.io import eprint

    m = []
    found, notfound = [], 0

    # first take mean of vectors ...
    for i in names:
        model_vv = []
        try:
            for model in models:
                model_vv.append(model[i])
        except KeyError:
            notfound += 1
            continue

        sum_ = np.sum(model_vv, axis=0) / len(model_vv)
        found.append(i)
        m.append(sum_)
        # if only one model is present, this will return the original vector

    db = np.array(m, dtype='float32')
    dim = db.shape[1]  # dimensions

    # ... then normalize
    if not norm:
        index = faiss.IndexFlatL2(dim)
    elif norm == 'l2':
        index = faiss.IndexFlatIP(dim)
        db = normalize(db, norm=norm, axis=1)
        # the inner product IP of two unit length vectors = cosine similarity
    else:
        raise ValueError('This norm is not supported, abort!')

    index.add(db)
    if notfound > 0:
        fraction = round(notfound / len(names), 4)
        eprint(f'{notfound} entries ({fraction}) not found.')
    return found, db, index
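
A quick numeric check of the comment above, that the inner product of two unit-length vectors equals their cosine similarity, using the same sklearn normalize as in index_model; the toy vectors are illustrative:

import numpy as np
from sklearn.preprocessing import normalize

a = np.array([[3.0, 4.0]])
b = np.array([[1.0, 0.0]])

cosine = (a @ b.T).item() / (np.linalg.norm(a) * np.linalg.norm(b))
ip_of_units = (normalize(a) @ normalize(b).T).item()
assert np.isclose(cosine, ip_of_units)  # both are 0.6 here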
Example #10
def validate(epoch, loader, imenc, capenc, vocab, args, SETTING):
    begin = time.time()
    print("begin validation for epoch {}".format(epoch), flush=True)
    dset = EmbedDset(loader, imenc, capenc, vocab, args)
    print("val dataset created | {} ".format(sec2str(time.time()-begin)), flush=True)
    im = dset.embedded["image"]
    cap = dset.embedded["caption"]

    nd = im.shape[0]
    nq = cap.shape[0]
    d = im.shape[1]
    cpu_index = faiss.IndexFlatIP(d)

    print("# images: {}, # captions: {}, dimension: {}".format(nd, nq, d), flush=True)

    # im2cap
    cpu_index.add(cap)
    # compute similarity for every combination and sort
    # D = similarity scores, I = caption indices
    D, I = cpu_index.search(im, nq)
    data = {}
    allrank = []
    # TODO: Make more efficient, do not hardcode 5
    cap_per_image = 5
    # record the rank of each ground-truth caption (there are 5 per image)
    for i in range(cap_per_image):
        gt = (np.arange(nd) * cap_per_image).reshape(-1, 1) + i
        rank = np.where(I == gt)[1]
        allrank.append(rank)
    allrank = np.stack(allrank)
    # minimum rank over the 5 ground-truth captions (best of 5)
    allrank = np.amin(allrank, 0)
    # recall@k: fraction of queries whose correct answer ranked below k
    for rank in [1, 5, 10, 20]:
        data["i2c_recall@{}".format(rank)] = 100 * np.sum(allrank < rank) / len(allrank)
    data["i2c_median@r"] = np.median(allrank) + 1
    data["i2c_mean@r"] = np.mean(allrank)

    # cap2im
    cpu_index.reset()
    cpu_index.add(im)
    D, I = cpu_index.search(cap, nd)
    # TODO: Make more efficient, do not hardcode 5
    gt = np.arange(nq).reshape(-1, 1) // cap_per_image
    allrank = np.where(I == gt)[1]
    for rank in [1, 5, 10, 20]:
        data["c2i_recall@{}".format(rank)] = 100 * np.sum(allrank < rank) / len(allrank)
    data["c2i_median@r"] = np.median(allrank) + 1
    data["c2i_mean@r"] = np.mean(allrank)

    print("-"*50)
    print("results of cross-modal retrieval")
    for key, val in data.items():
        print("{}: {}".format(key, val), flush=True)
    print("-"*50)
    return data
Example #11
    def __init__(self, args: Namespace, dim: int = 2048) -> None:
        self.data_dir = args.data_dir
        self.images_dir = args.images_dir
        with open(path.join(args.data_dir, args.captions)) as infile:
            self.captions = infile.readlines()
        self.embeddings = np.load(path.join(args.data_dir, args.embeddings))
        # assumed: k and metric arrive via the parsed command-line args
        self.k = args.k
        self.metric = args.metric

        if self.metric == -1:
            # Cosine similarity
            self.index = faiss.IndexFlatIP(dim)
            faiss.normalize_L2(self.embeddings)
            self.index.add(self.embeddings)
        elif self.metric == 1:
            # Euclidean distance (no square root)
            self.index = faiss.IndexFlatL2(dim)
            self.index.add(self.embeddings)
        elif self.metric == 23:
            # Mahalanobis distance
            self.index = faiss.IndexFlatL2(dim)
            x_centered = self.embeddings - self.embeddings.mean(0)
            self.transform = np.linalg.inv(np.linalg.cholesky(
                np.dot(x_centered.T, x_centered) / x_centered.shape[0])).T
            self.index.add(
                np.dot(self.embeddings, self.transform).astype(np.float32))
        elif self.metric == 0:
            # Inner product
            self.index = faiss.IndexFlatIP(dim)
            self.index.add(self.embeddings)
        else:
            self.index = faiss.IndexFlat(dim, self.metric)
            self.index.add(self.embeddings)

        self.model = wide_resnet101_2(pretrained=True, progress=True)
        self.model.eval()  # Don't forget to put model in evaluation mode!
        self.model.fc = Identity()
        # Use recommended sequence of transforms for ImageNet pretrained models
        self.transforms = Compose([Resize(256, interpolation=Image.BICUBIC),  # Default is bilinear
                                   CenterCrop(224),
                                   ToTensor(),
                                   Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225])])
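
A sketch of why the Mahalanobis branch above works, under illustrative data: if the covariance factors as Sigma = L L^T, then right-multiplying row vectors by inv(L).T (the transform computed above) makes plain Euclidean distance equal the Mahalanobis distance, so queries presumably pass through np.dot(query, self.transform) before searching:

import numpy as np

rng = np.random.default_rng(0)
x = rng.normal(size=(500, 8))
xc = x - x.mean(0)
cov = xc.T @ xc / xc.shape[0]
T = np.linalg.inv(np.linalg.cholesky(cov)).T  # same whitening transform

u, v = x[0], x[1]
mahalanobis = np.sqrt((u - v) @ np.linalg.inv(cov) @ (u - v))
euclidean = np.linalg.norm(u @ T - v @ T)
assert np.isclose(mahalanobis, euclidean)  # whitened L2 == Mahalanobis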
Example #12
 def __init__(self, X, method="inner_prod", *args, **kwargs):
     import faiss
     self.method = method
     if method == "inner_prod":
         self.index = faiss.IndexFlatIP(X.shape[1])
     elif method == "l2":
         self.index = faiss.IndexFlatL2(X.shape[1])
     else:
         raise NotImplementedError()
     self.index.add(X.astype(np.float32))
Example #13
def create_index(vectors, dim=300):
    nlist = 5
    nprobe = nlist
    quantizer = faiss.IndexFlatIP(dim)
    index = faiss.IndexIVFFlat(quantizer, dim, nlist,
                               faiss.METRIC_INNER_PRODUCT)
    index.train(vectors)
    index.add(vectors)
    index.nprobe = nprobe
    return index, quantizer
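
A usage sketch with illustrative data. With nprobe set equal to nlist as above, every inverted list is scanned, so results match an exhaustive IndexFlatIP search; lowering index.nprobe afterwards trades recall for speed:

import numpy as np
import faiss

vectors = np.random.rand(2000, 300).astype('float32')
index, quantizer = create_index(vectors)

queries = np.random.rand(3, 300).astype('float32')
scores, ids = index.search(queries, 10)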
Example #14
    def __init__(self, sentence_encoder: Optional[SentenceEncoder] = None):
        self.esses: List[Esse] = []
        self.__load_esses()

        if sentence_encoder is None:
            sentence_encoder = SentenceEncoder()
        self.sentence_encoder: SentenceEncoder = sentence_encoder

        self.index: faiss.IndexFlatIP = faiss.IndexFlatIP(512)
        self.__load_index()
Example #15
def get_faiss_ip_index(d=768, use_gpu=True):
    # build an inner-product (CPU) index
    index_cpu = faiss.IndexFlatIP(d)
    if use_gpu:
        # claim single GPU resource
        resource = faiss.StandardGpuResources()
        # make it into a gpu index
        index_gpu = faiss.index_cpu_to_gpu(resource, 0, index_cpu)
        return index_gpu
    return index_cpu
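
A hedged usage sketch; the embeddings and queries are placeholders. One caveat worth noting: a GPU index cannot be serialized directly, so it has to be copied back to the CPU first:

import numpy as np
import faiss

embeddings = np.random.rand(10000, 768).astype('float32')
queries = np.random.rand(4, 768).astype('float32')

index = get_faiss_ip_index(d=768, use_gpu=True)
index.add(embeddings)
scores, ids = index.search(queries, 10)

cpu_copy = faiss.index_gpu_to_cpu(index)  # move back before writing to disk
faiss.write_index(cpu_copy, 'vectors.index')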
Example #16
 def __build_index(self, index_dimension):
     if self.index_type is IndexType.L2_INDEX:
         log.debug("Building L2 index")
         index = faiss.IndexFlatL2(index_dimension)
     elif self.index_type is IndexType.COSINE_INDEX:
         log.debug("Building cosine index")
         index = faiss.IndexFlatIP(index_dimension)
     else:
         raise ValueError(f"Unknown index type {self.index_type}")
     self.__index = faiss.IndexIDMap(index)
Example #17
 def _init_index(self):
     quantizer = faiss.IndexFlatIP(self.dimension)
     if self.n_pq is None:
         self.index = faiss.IndexIVFFlat(quantizer, self.dimension,
                                         self.n_clusters)
     else:
         self.index = faiss.IndexIVFPQ(quantizer, self.dimension,
                                       self.n_clusters, self.n_pq,
                                       self.n_bytes)
     self.index.nprobe = self.nprob  # the faiss attribute is spelled 'nprobe'
Example #18
def get_faiss_index(faiss_index_path):
    if os.path.exists(faiss_index_path):
        faiss_index = faiss.read_index(faiss_index_path)
        print('Read faiss index from {}'.format(faiss_index_path))
        return faiss_index
    else:
        faiss_index = faiss.IndexFlatIP(4096)
        faiss_index = faiss.IndexIDMap2(faiss_index)
        print('Creating new faiss index at {}'.format(faiss_index_path))
        return faiss_index
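
A hedged usage sketch for the loader above; the filename, vectors, and ids are illustrative. The reason for wrapping in IndexIDMap2 rather than plain IndexIDMap is that it also keeps vectors addressable by external id via reconstruct:

import numpy as np
import faiss

index = get_faiss_index('my_vectors.index')

vecs = np.random.rand(100, 4096).astype('float32')
ids = np.arange(1000, 1100, dtype='int64')  # arbitrary external ids
index.add_with_ids(vecs, ids)

scores, found_ids = index.search(vecs[:1], 5)  # search returns the external ids
original = index.reconstruct(1000)             # look a stored vector up by id
faiss.write_index(index, 'my_vectors.index')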
Example #19
def kNN(x, y, k, use_ann_search=False, ann_num_clusters=32768, ann_num_cluster_probe=3):
    start_time = time.time()
    if use_ann_search:
        print("Perform approx. kNN search")
        n_cluster = min(ann_num_clusters, int(y.shape[0]/1000))
        quantizer = faiss.IndexFlatIP(y.shape[1])
        index = faiss.IndexIVFFlat(quantizer, y.shape[1], n_cluster, faiss.METRIC_INNER_PRODUCT)
        index.nprobe = ann_num_cluster_probe
        index.train(y)
        index.add(y)
        sim, ind = index.search(x, k)
    else:
        print("Perform exact search")
        idx = faiss.IndexFlatIP(y.shape[1])
        idx.add(y)
        sim, ind = idx.search(x, k)

    print("Done: {:.2f} sec".format(time.time()-start_time))
    return sim, ind
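
A usage sketch with illustrative data. Since both branches score by inner product, normalizing x and y first turns the scores into cosine similarities:

import numpy as np
import faiss

x = np.random.rand(100, 512).astype('float32')    # queries
y = np.random.rand(50000, 512).astype('float32')  # database

faiss.normalize_L2(x)  # in place; makes IP equivalent to cosine
faiss.normalize_L2(y)

sim, ind = kNN(x, y, k=10)                       # exact search
sim, ind = kNN(x, y, k=10, use_ann_search=True)  # approximate search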
Example #20
    def _get_faiss_index(self):
        # with Pool(cpu_count()) as p:
        #     question_bert = p.map(eval, self.df["Q_FFNN_embeds"].tolist())
        #     answer_bert = p.map(eval, self.df["A_FFNN_embeds"].tolist())
        question_bert = self.df["Q_FFNN_embeds"].tolist()
        self.df.drop(columns=["Q_FFNN_embeds"], inplace=True)
        answer_bert = self.df["A_FFNN_embeds"].tolist()
        self.df.drop(columns=["A_FFNN_embeds"], inplace=True)
        question_bert = np.array(question_bert, dtype='float32')
        answer_bert = np.array(answer_bert, dtype='float32')

        self.answer_index = faiss.IndexFlatIP(answer_bert.shape[-1])

        self.question_index = faiss.IndexFlatIP(question_bert.shape[-1])

        self.answer_index.add(answer_bert)
        self.question_index.add(question_bert)

        del answer_bert, question_bert
Example #21
 def __init__(self,
              vector_sz: int,
              buffer_size: int = 50000,
              index_factory_string: str = None):
     super(DenseFlatIndexer, self).__init__(buffer_size=buffer_size)
     self.index = faiss.IndexFlatIP(vector_sz)
     if index_factory_string:
         self.index = faiss.index_factory(vector_sz, index_factory_string,
                                          faiss.METRIC_INNER_PRODUCT)
         self.index.nprobe = 32
Example #22
    def load_movie_vectors_into_faiss(self):
        """Load the movie vectors into faiss."""
        movie_output_vectors, movie_ids_index = self.tower_model_cls.get_movie_vectors(
        )

        movie_output_vectors = movie_output_vectors.astype(np.float32)

        faiss_model = faiss.IndexFlatIP(self.tower_model_cls.dense_size)
        faiss_model.add(movie_output_vectors)
        return faiss_model, movie_ids_index
Example #23
def train_faiss(item_feature, D):
    '''
    use IndexFlatIP to get similar item
    '''
    # construct the index
    index = faiss.IndexFlatIP(D)
    index.add(item_feature)
    print(index.ntotal)
    dists, I = index.search(item_feature, 200)
    return I
Example #24
def main():
    print("loading X_PCA...")
    X_pca = joblib.load("../chapter09/X_PCA")
    t_index = joblib.load("../chapter09/t_index")
    faiss_pca = faiss.IndexFlatIP(300)
    faiss_pca.add(X_pca.astype('float32'))

    print("loading word2vec...")
    word_vectors = joblib.load("word_vectors")
    word_index = joblib.load("word_index")
    faiss_w2v = faiss.IndexFlatIP(300)
    faiss_w2v.add(word_vectors.astype('float32'))
    with open("family.txt") as f, open(
            "pca_family.txt", mode="w") as f_pca, open("w2v_family.txt",
                                                       mode="w") as f_w2v:
        for line in map(lambda x: x.rstrip(), f):
            word = [""] * 3
            word[0], word[1], word[2], *_ = line.split()
            try:
                # similar-vector search
                # vectors from chapter 9
                v1 = X_pca[t_index[word[0]]]
                v2 = X_pca[t_index[word[1]]]
                v3 = X_pca[t_index[word[2]]]
                vec_pca = v2 - v1 + v3
                sim_num = faiss_pca.search(
                    np.array([vec_pca]).astype('float32'), 1)[1][0][0]
                pred_word = list(t_index.keys())[sim_num]
                f_pca.write(f"{word[0]} {word[1]} {word[2]} {pred_word}\n")
            except KeyError:
                f_pca.write(f"{word[0]} {word[1]} {word[2]} -\n")
            try:
                # word2vec
                v1 = word_vectors[word_index[word[0]]]
                v2 = word_vectors[word_index[word[1]]]
                v3 = word_vectors[word_index[word[2]]]
                vec_w2v = v2 - v1 + v3
                sim_num = faiss_w2v.search(
                    np.array([vec_w2v]).astype('float32'), 1)[1][0][0]
                pred_word = list(word_index.keys())[sim_num]
                f_w2v.write(f"{word[0]} {word[1]} {word[2]} {pred_word}\n")
            except KeyError:
                f_w2v.write(f"{word[0]} {word[1]} {word[2]} -\n")
Example #25
 def setup_model_utils():
     """
     Loads the nlp SpaCy model.
     """
     ModelUtils.nlp = spacy.load(Config.get_config("spacy_model_name_key"))
     ModelUtils.nlp.max_length = 10030000
     ModelUtils.generate_embeddings_matrix()
     ModelUtils.index = faiss.IndexFlatIP(ModelUtils.dimensions)
     faiss.normalize_L2(ModelUtils.embeddings)
     ModelUtils.index.add(ModelUtils.embeddings)
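
Because the index above stores L2-normalized embeddings in an inner-product index (i.e. cosine search), queries presumably need the same normalization; a hedged sketch, assuming the embeddings matrix was built from the same spaCy vectors:

import faiss

query = ModelUtils.nlp("example query").vector.reshape(1, -1).astype('float32')
faiss.normalize_L2(query)  # match the normalization applied at build time
scores, ids = ModelUtils.index.search(query, 5)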
Example #26
 def __init__(self,
              feats,
              k,
              index_path='',
              index_key='',
              nprobe=128,
              omp_num_threads=None,
              rebuild_index=True,
              verbose=True,
              **kwargs):
     import faiss
     if omp_num_threads is not None:
         faiss.omp_set_num_threads(omp_num_threads)
     self.verbose = verbose
     with Timer('[faiss] build index', verbose):
         if index_path != '' and not rebuild_index and os.path.exists(
                 index_path):
             print('[faiss] read index from {}'.format(index_path))
             index = faiss.read_index(index_path)
         else:
             feats = feats.astype('float32')
             size, dim = feats.shape
             index = faiss.IndexFlatIP(dim)
             if index_key != '':
                 assert index_key.find(
                     'HNSW') < 0, 'HNSW returns distances instead of sims'
                 metric = faiss.METRIC_INNER_PRODUCT
                 nlist = min(4096, 8 * round(math.sqrt(size)))
                 if index_key == 'IVF':
                     quantizer = index
                     index = faiss.IndexIVFFlat(quantizer, dim, nlist,
                                                metric)
                 else:
                     index = faiss.index_factory(dim, index_key, metric)
                 if index_key.find('Flat') < 0:
                     assert not index.is_trained
                 index.train(feats)
                 index.nprobe = min(nprobe, nlist)
                 assert index.is_trained
                 print('nlist: {}, nprobe: {}'.format(nlist, nprobe))
             index.add(feats)
             if index_path != '':
                 print('[faiss] save index to {}'.format(index_path))
                 mkdir_if_no_exists(index_path)
                 faiss.write_index(index, index_path)
     with Timer('[faiss] query topk {}'.format(k), verbose):
         knn_ofn = index_path + '.npz'
         if os.path.exists(knn_ofn):
             print('[faiss] read knns from {}'.format(knn_ofn))
             self.knns = np.load(knn_ofn)['data']
         else:
             sims, nbrs = index.search(feats, k=k)
             # store (neighbor ids, 1 - similarity) so entries read as distances
             self.knns = [(np.array(nbr, dtype=np.int32),
                           1 - np.array(sim, dtype=np.float32))
                          for nbr, sim in zip(nbrs, sims)]
Example #27
 def build_index(path, features: np.ndarray, train, normalize):
     if path is None:
         if normalize:
             features = features / (
                 (features**2).sum(axis=1, keepdims=True)**0.5)
         dim = features.shape[1]
         if not train:
             index = faiss.IndexFlatIP(dim)
             index.add(features)
         else:
             quantizer = faiss.IndexFlatIP(dim)
             num_clusters = 100
             index = faiss.IndexIVFFlat(quantizer, dim, num_clusters,
                                        faiss.METRIC_INNER_PRODUCT)
             index.train(features)
             index.add(features)
     else:
         assert os.path.exists(path), f"{path} does not exist!"
         index = faiss.read_index(path)
     return index
Example #28
def build_index(corpus_embedding, n_cluster=256, embedding_size=768, nprobe=4):
    quantizer = faiss.IndexFlatIP(embedding_size)
    index = faiss.IndexIVFFlat(quantizer, embedding_size, n_cluster,
                               faiss.METRIC_INNER_PRODUCT)
    index.nprobe = nprobe

    corpus_embeddings = corpus_embedding / np.linalg.norm(corpus_embedding,
                                                          axis=1)[:, None]
    index.train(corpus_embeddings)
    index.add(corpus_embeddings)
    return index
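
The manual row-wise division above is equivalent to faiss.normalize_L2, which normalizes a float32 array in place; a quick check with illustrative data:

import numpy as np
import faiss

emb = np.random.rand(1000, 768).astype('float32')
manual = emb / np.linalg.norm(emb, axis=1)[:, None]
faiss.normalize_L2(emb)  # in-place equivalent
assert np.allclose(manual, emb)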
Example #29
 def __setstate__(self, newstate):
     embedding_space_dims = newstate[newstate['embedding_space_dims_name']]
     similarity_algorithm = newstate[newstate['similarity_algorithm_name']]
     index_np = newstate[newstate['index_np_name']]
     faiss_index = faiss.IndexFlatIP(embedding_space_dims)
     if similarity_algorithm == SimilarityAlgorithm.CosineSimilarity:
         # normalize with L2 as a proxy for cosine search
         faiss.normalize_L2(index_np)
     faiss_index.add(index_np)
     newstate[newstate['faiss_index_name']] = faiss_index
     self.__dict__.update(newstate)
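
The matching __getstate__ is not shown; for a flat index the stored matrix can be recovered with reconstruct_n, which is one way it could have produced index_np. A minimal helper sketch with a hypothetical name:

import faiss
import numpy as np

def flat_index_to_array(index: faiss.IndexFlat) -> np.ndarray:
    # pull the stored vectors back out so they can be pickled,
    # since the swig-wrapped index object itself does not pickle cleanly
    return index.reconstruct_n(0, index.ntotal)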
Example #30
def _sim_faiss(query_features, index_features, KNN):
    """Faissでsimilarity"""
    import faiss
    assert query_features.shape[1] == index_features.shape[1]
    dim = query_features.shape[1]
    Nq = query_features.shape[0]
    Nd = index_features.shape[0]
    index = faiss.IndexFlatIP(dim)
    index.add(index_features)
    D, I = index.search(query_features, KNN)
    return D, I, (Nq, Nd)
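
A hedged usage sketch for _sim_faiss with illustrative shapes:

import numpy as np

index_features = np.random.rand(10000, 128).astype('float32')
query_features = np.random.rand(4, 128).astype('float32')

D, I, (nq, nd) = _sim_faiss(query_features, index_features, KNN=5)
# D: inner-product scores of shape (4, 5); I: row indices into index_features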