Example #1
import time

import faiss
import numpy as np

vectors = []
vector_ids = []
id_query = {}
with open("embed.txt", "r") as f:
    for line in f:
        line_lst = line.strip().split("\t")
        vectors.append(list(map(float,(line_lst[2].split()))))
        vector_ids.append(int(line_lst[1]))
        id_query[int(line_lst[1])] = line_lst[0]

vectors = np.array(vectors)
vectors = vectors.astype("float32")
vec_dim = vectors.shape[1]  # embedding dimensionality
# Create the index

quantizer = faiss.IndexFlatL2(vec_dim)  # use L2 (Euclidean) distance as the metric
nlist = 16384
faiss_index = faiss.IndexIVFFlat(quantizer, vec_dim, nlist, faiss.METRIC_L2)
faiss_index.nprobe = 16

# assert not faiss_index.is_trained
faiss_index.train(vectors)
faiss_index.add(vectors) 

faiss.write_index(faiss_index,"large.index")
# Query vectors (take the first 20 as an example)
query_vectors = vectors[:20]
# Search results:
# the top-k distances and the top-k row indices for each query,
# both ndarrays of shape (len(query_vectors), topk)

res_distance, res_index = faiss_index.search(query_vectors, 5)
t = time.time()
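Since the snippet persists the index with faiss.write_index, a natural follow-up (not part of the original code, just a hedged sketch) is to reload it and map the returned row positions back to the original ids through vector_ids and id_query; the file name, nprobe, and top-k values below simply reuse the ones above.

# Hedged follow-up sketch: reload the saved index and translate row positions
# back into the original ids. Reuses vector_ids / id_query from the code above.
reloaded_index = faiss.read_index("large.index")
reloaded_index.nprobe = 16  # same probe count as above
res_distance, res_index = reloaded_index.search(query_vectors, 5)
for row, (dists, positions) in enumerate(zip(res_distance, res_index)):
    # search() returns positions in insertion order, so map them through vector_ids
    hits = [(vector_ids[pos], id_query[vector_ids[pos]], dist)
            for pos, dist in zip(positions, dists)]
    print(row, hits)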
Example #2
def find_nearest_neighbors(
    target, emb, k=5, metric="euclidean", gpu_id=None, exact=True
):
    """Find the nearest neighbors for each point.

    :param target: vectors for the points for which we find the nearest neighbors
    :type target: numpy.ndarray (num_target, dim)
    :param emb: vectors for the points from which we find the nearest neighbors.
    :type emb: numpy.ndarray (num_entities, dim)
    :param k: Number of nearest neighbors, defaults to 5
    :type k: int, optional
    :param metric: Distance metric for finding nearest neighbors. Available metrics: `metric="euclidean"`, `metric="cosine"`, `metric="dotsim"`
    :type metric: str
    :return: query-row indices (nodes), IDs of the nearest neighbors in emb (neighbors), and distances
    :rtype: nodes (numpy.ndarray), neighbors (numpy.ndarray), distances (numpy.ndarray)

    .. highlight:: python
    .. code-block:: python

        >>> import emlens
        >>> import numpy as np
        >>> emb = np.random.randn(100, 20)
        >>> target = np.random.randn(10, 20)
        >>> A = emlens.find_nearest_neighbors(target, emb, k = 10)
    """
    if not emb.flags["C_CONTIGUOUS"]:
        emb = emb.copy(order="C")
    if not target.flags["C_CONTIGUOUS"]:
        target = target.copy(order="C")
    emb = emb.astype(np.float32)
    target = target.astype(np.float32)
    # Find the nearest neighbors
    if metric == "euclidean":
        if exact:
            index = faiss.IndexFlatL2(emb.shape[1])
        else:
            quantiser = faiss.IndexFlatL2(emb.shape[1])
            nlist = int(np.ceil(10 * np.sqrt(emb.shape[0])))
            index = faiss.IndexIVFFlat(quantiser, emb.shape[1], nlist, faiss.METRIC_L2)
            index.train(emb)
    elif metric == "cosine":
        denom = np.array(np.linalg.norm(emb, axis=1)).reshape(-1)
        denom[np.isclose(denom, 0)] = 1
        emb = np.einsum("i,ij->ij", 1 / denom, emb)

        denom = np.array(np.linalg.norm(target, axis=1)).reshape(-1)
        denom[np.isclose(denom, 0)] = 1
        target = np.einsum("i,ij->ij", 1 / denom, target)

        if exact:
            index = faiss.IndexFlatIP(emb.shape[1])
        else:
            quantiser = faiss.IndexFlatIP(emb.shape[1])
            nlist = int(np.ceil(10 * np.sqrt(emb.shape[0])))
            index = faiss.IndexIVFFlat(
                quantiser, emb.shape[1], nlist, faiss.METRIC_INNER_PRODUCT
            )
            index.train(emb)
    elif metric == "dotsim":
        if exact:
            index = faiss.IndexFlatIP(emb.shape[1])
        else:
            quantiser = faiss.IndexFlatIP(emb.shape[1])
            nlist = int(np.ceil(10 * np.sqrt(emb.shape[0])))
            index = faiss.IndexIVFFlat(
                quantiser, emb.shape[1], nlist, faiss.METRIC_INNER_PRODUCT
            )
            index.train(emb)
    else:
        raise NotImplementedError("does not support metric: {}".format(metric))

    if gpu_id is None:
        gpu_id = 0

    if k >= 2048:  # if k is larger than that supported by GPU
        index.add(emb)
    else:
        try:
            res = faiss.StandardGpuResources()
            index = faiss.index_cpu_to_gpu(res, gpu_id, index)
            index.add(emb)
        except (RuntimeError, AttributeError):
            index.add(emb)
    distances, neighbors = index.search(target, k=k)

    assert distances.dtype == "float32"
    assert neighbors.dtype == "int64"

    nodes = (np.arange(target.shape[0]).reshape((-1, 1)) @ np.ones((1, k))).astype(int)
    neighbors = neighbors.astype(int)
    return nodes, neighbors, distances
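A small usage sketch for the function above (random toy data, cosine metric); worth noting because it actually returns three arrays, query-row indices, neighbor indices into emb, and distances, even though the docstring example assigns a single value:

import numpy as np

emb = np.random.randn(1000, 32)    # points to search among
target = np.random.randn(5, 32)    # query points
nodes, neighbors, distances = find_nearest_neighbors(target, emb, k=3, metric="cosine")
# nodes[i, j] is the query row, neighbors[i, j] the index into emb,
# distances[i, j] the inner product of the L2-normalized vectors (cosine similarity)
print(nodes.shape, neighbors.shape, distances.shape)  # (5, 3) for each array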
Example #3
print(index.ntotal)
serarch_i = np.asarray([data[700], data[100]])

k = 4  # we want to see 4 nearest neighbors
start = time.perf_counter()
D, I = index.search(serarch_i[:5], k)  # sanity check
end = time.perf_counter()
print(end - start)

# ----------------------------------------------------------
# Speed up the search
nlist = 100  # number of cluster centroids
k = 4
quantizer = faiss.IndexFlatL2(d)  # the other index
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
# here we specify METRIC_L2 explicitly (it is also the default metric)
assert not index.is_trained
index.train(data)
assert index.is_trained

start_k_suoyin = time.perf_counter()
index.add(data)  # add may be a bit slower as well
end_k_suoyin = time.perf_counter()
print('the time to add with the k-means (IVF) index:')
print(end_k_suoyin - start_k_suoyin)

serarch_a = np.asarray([data[700], data[200]])
# print serarch_a

start = time.perf_counter()
Example #4
    def __init__(self,
                 load_path: str,
                 word_to_idlist_filename: str,
                 entities_list_filename: str,
                 entities_ranking_filename: str,
                 vectorizer_filename: str,
                 faiss_index_filename: str,
                 chunker: NerChunker = None,
                 ner: Chainer = None,
                 ner_parser: EntityDetectionParser = None,
                 entity_ranker: RelRankerBertInfer = None,
                 num_faiss_candidate_entities: int = 20,
                 num_entities_for_bert_ranking: int = 50,
                 num_faiss_cells: int = 50,
                 use_gpu: bool = True,
                 save_path: str = None,
                 fit_vectorizer: bool = False,
                 max_tfidf_features: int = 1000,
                 include_mention: bool = False,
                 ngram_range: List[int] = None,
                 num_entities_to_return: int = 10,
                 lang: str = "ru",
                 use_descriptions: bool = True,
                 lemmatize: bool = False,
                 **kwargs) -> None:
        """

        Args:
            load_path: path to folder with inverted index files
            word_to_idlist_filename: file with a dict mapping words (keys) to the start and end indices of
                the corresponding entity ids in entities_list_filename
            entities_list_filename: file with the list of entity ids from the knowledge base
            entities_ranking_filename: file with dict of entity ids (keys) and number of relations in Wikidata
                for entities
            vectorizer_filename: filename with TfidfVectorizer data
            faiss_index_filename: file with Faiss index of words
            chunker: component deeppavlov.models.kbqa.ner_chunker
            ner: config for entity detection
            ner_parser: component deeppavlov.models.kbqa.entity_detection_parser
            entity_ranker: component deeppavlov.models.kbqa.rel_ranking_bert_infer
            num_faiss_candidate_entities: number of nearest neighbors for the entity substring from the text
            num_entities_for_bert_ranking: number of candidate entities for BERT ranking using description and context
            num_faiss_cells: number of Voronoi cells for Faiss index
            use_gpu: whether to use GPU for faster search of candidate entities
            save_path: path to folder with inverted index files
            fit_vectorizer: whether to build index with Faiss library
            max_tfidf_features: maximal number of features for TfidfVectorizer
            include_mention: whether to leave entity mention in the context (during BERT ranking)
            ngram_range: char ngrams range for TfidfVectorizer
            num_entities_to_return: number of candidate entities returned for each substring
            lang: russian or english
            use_descriptions: whether to perform entity ranking by context and description
            lemmatize: whether to lemmatize tokens
            **kwargs:
        """
        super().__init__(save_path=save_path, load_path=load_path)
        self.morph = pymorphy2.MorphAnalyzer()
        self.lemmatize = lemmatize
        self.word_to_idlist_filename = word_to_idlist_filename
        self.entities_list_filename = entities_list_filename
        self.entities_ranking_filename = entities_ranking_filename
        self.vectorizer_filename = vectorizer_filename
        self.faiss_index_filename = faiss_index_filename
        self.num_entities_for_bert_ranking = num_entities_for_bert_ranking
        self.num_faiss_candidate_entities = num_faiss_candidate_entities
        self.num_faiss_cells = num_faiss_cells
        self.use_gpu = use_gpu
        self.chunker = chunker
        self.ner = ner
        self.ner_parser = ner_parser
        self.entity_ranker = entity_ranker
        self.fit_vectorizer = fit_vectorizer
        self.max_tfidf_features = max_tfidf_features
        self.include_mention = include_mention
        self.ngram_range = ngram_range
        self.num_entities_to_return = num_entities_to_return
        self.lang_str = f"@{lang}"
        if self.lang_str == "@en":
            self.stopwords = set(stopwords.words("english"))
        elif self.lang_str == "@ru":
            self.stopwords = set(stopwords.words("russian"))
        self.use_descriptions = use_descriptions

        self.load()

        if self.fit_vectorizer:
            self.vectorizer = TfidfVectorizer(
                analyzer="char_wb",
                ngram_range=tuple(self.ngram_range),
                max_features=self.max_tfidf_features,
                max_df=0.85)
            self.vectorizer.fit(self.word_list)
            self.matrix = self.vectorizer.transform(self.word_list)
            self.dense_matrix = self.matrix.toarray()
            if self.num_faiss_cells > 1:
                quantizer = faiss.IndexFlatIP(self.max_tfidf_features)
                self.faiss_index = faiss.IndexIVFFlat(quantizer,
                                                      self.max_tfidf_features,
                                                      self.num_faiss_cells)
                self.faiss_index.train(self.dense_matrix.astype(np.float32))
            else:
                self.faiss_index = faiss.IndexFlatIP(self.max_tfidf_features)
                if self.use_gpu:
                    res = faiss.StandardGpuResources()
                    self.faiss_index = faiss.index_cpu_to_gpu(
                        res, 0, self.faiss_index)
            self.faiss_index.add(self.dense_matrix.astype(np.float32))
            self.save_vectorizers_data()
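For context, a minimal standalone sketch (not DeepPavlov code, toy data only) of how a TF-IDF character n-gram index like the one built in this constructor can be queried: vectorize the candidate words, densify the matrix, add it to an inner-product index, then transform the query substring the same way and search. TfidfVectorizer L2-normalizes rows by default, so the inner product behaves like cosine similarity.

import faiss
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

words = ["moscow", "moskva", "london", "paris", "saint petersburg"]  # toy word list
vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(2, 3), max_features=1000)
matrix = vectorizer.fit_transform(words).toarray().astype(np.float32)

index = faiss.IndexFlatIP(matrix.shape[1])  # data set is tiny, so no IVF cells here
index.add(matrix)

query = vectorizer.transform(["moscaw"]).toarray().astype(np.float32)  # misspelled mention
scores, ids = index.search(query, 3)
print([words[i] for i in ids[0]], scores[0])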
Example #5
            result = {}
            result["dim"] = d
            result["nb"] = nb
            result["k"] = k
            print(result)
            xb = np.random.random((nb, d)).astype('float32')
            xb[:, 0] += np.arange(nb) / 1000.
            xq = np.random.random((nq, d)).astype('float32')
            xq[:, 0] += np.arange(nq) / 1000.

            normalize_L2(xb)
            normalize_L2(xq)

            nlist = nb // 10000
            quantizer = faiss.IndexFlatIP(d)  # the other index
            index = faiss.IndexIVFFlat(quantizer, d, nlist,
                                       faiss.METRIC_INNER_PRODUCT)
            index.nprobe = nlist // 4
            index.verbose = True
            assert not index.is_trained
            index.train(xb)
            assert index.is_trained

            index.add(xb)  # add may be a bit slower as well
            spent = []
            for i in range(100):
                start = datetime.datetime.now()
                D, I = index.search(xq[:1000], k)  # actual search
                end = datetime.datetime.now()
                s = end - start
                spent.append(s.total_seconds())
            result["IVF_avg_spent"] = sum(spent) / 100
Example #6
def predict_topk(biosyn,
                 eval_dictionary,
                 eval_queries,
                 topk,
                 score_mode='hybrid',
                 type_given=False):
    """
    Parameters
    ----------
    score_mode : str
        hybrid, dense, sparse
    """
    encoder = biosyn.get_dense_encoder()
    tokenizer = biosyn.get_dense_tokenizer()
    sparse_encoder = biosyn.get_sparse_encoder()
    sparse_weight = biosyn.get_sparse_weight().item()  # must be scalar value

    # useful if we're conditioning on types
    all_indv_types = [x for t in eval_dictionary[:, 1] for x in t.split('|')]
    unique_types = np.unique(all_indv_types).tolist()
    v_check_type = np.vectorize(check_label)
    inv_idx = {
        t: v_check_type(eval_dictionary[:, 1], t).nonzero()[0]
        for t in unique_types
    }

    # embed dictionary
    dict_sparse_embeds = biosyn.embed_sparse(names=eval_dictionary[:, 0],
                                             show_progress=True)
    dict_dense_embeds = biosyn.embed_dense(names=eval_dictionary[:, 0],
                                           show_progress=True)

    # build the sparse index
    if not type_given:
        sparse_index = nmslib.init(method='hnsw',
                                   space='negdotprod_sparse_fast',
                                   data_type=nmslib.DataType.SPARSE_VECTOR)
        sparse_index.addDataPointBatch(dict_sparse_embeds)
        sparse_index.createIndex({'post': 2}, print_progress=False)
    else:
        sparse_index = {}
        for sty, indices in inv_idx.items():
            sparse_index[sty] = nmslib.init(
                method='hnsw',
                space='negdotprod_sparse_fast',
                data_type=nmslib.DataType.SPARSE_VECTOR)
            sparse_index[sty].addDataPointBatch(dict_sparse_embeds[indices])
            sparse_index[sty].createIndex({'post': 2}, print_progress=False)

    # build the dense index
    d = dict_dense_embeds.shape[1]
    if not type_given:
        nembeds = dict_dense_embeds.shape[0]
        if nembeds < 10000:  # if the number of embeddings is small, don't approximate
            dense_index = faiss.IndexFlatIP(d)
            dense_index.add(dict_dense_embeds)
        else:
            nlist = int(math.floor(
                math.sqrt(nembeds)))  # number of quantized cells
            nprobe = int(math.floor(
                math.sqrt(nlist)))  # number of the quantized cells to probe
            quantizer = faiss.IndexFlatIP(d)
            dense_index = faiss.IndexIVFFlat(quantizer, d, nlist,
                                             faiss.METRIC_INNER_PRODUCT)
            dense_index.train(dict_dense_embeds)
            dense_index.add(dict_dense_embeds)
            dense_index.nprobe = nprobe
    else:
        dense_index = {}
        for sty, indices in inv_idx.items():
            sty_dict_dense_embeds = dict_dense_embeds[indices]
            nembeds = sty_dict_dense_embeds.shape[0]
            if nembeds < 10000:  # if the number of embeddings is small, don't approximate
                dense_index[sty] = faiss.IndexFlatIP(d)
                dense_index[sty].add(sty_dict_dense_embeds)
            else:
                nlist = int(math.floor(
                    math.sqrt(nembeds)))  # number of quantized cells
                nprobe = int(math.floor(math.sqrt(
                    nlist)))  # number of the quantized cells to probe
                quantizer = faiss.IndexFlatIP(d)
                dense_index[sty] = faiss.IndexIVFFlat(
                    quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
                dense_index[sty].train(sty_dict_dense_embeds)
                dense_index[sty].add(sty_dict_dense_embeds)
                dense_index[sty].nprobe = nprobe

    # respond to mention queries
    queries = []
    for eval_query in tqdm(eval_queries, total=len(eval_queries)):
        mentions = eval_query[0].replace("+", "|").split("|")
        golden_cui = eval_query[1].replace("+", "|")
        golden_sty = eval_query[2].replace("+", "|")
        pmid = eval_query[3]
        start_char = eval_query[4]
        end_char = eval_query[5]

        dict_mentions = []
        for mention in mentions:

            mention_sparse_embeds = biosyn.embed_sparse(
                names=np.array([mention]))
            mention_dense_embeds = biosyn.embed_dense(
                names=np.array([mention]))

            # search the sparse index
            if not type_given:
                sparse_nn = sparse_index.knnQueryBatch(mention_sparse_embeds,
                                                       k=topk,
                                                       num_threads=20)
            else:
                sparse_nn = sparse_index[golden_sty].knnQueryBatch(
                    mention_sparse_embeds, k=topk, num_threads=20)
            sparse_idxs, _ = zip(*sparse_nn)
            s_candidate_idxs = np.asarray(sparse_idxs)
            if type_given:
                # reverse mask index mapping
                s_candidate_idxs = inv_idx[golden_sty][s_candidate_idxs]
            s_candidate_idxs = s_candidate_idxs.astype(np.int64)

            # search the dense index
            if not type_given:
                _, d_candidate_idxs = dense_index.search(
                    mention_dense_embeds, topk)
            else:
                _, d_candidate_idxs = dense_index[golden_sty].search(
                    mention_dense_embeds, topk)
                # reverse mask index mapping
                d_candidate_idxs = inv_idx[golden_sty][d_candidate_idxs]
            d_candidate_idxs = d_candidate_idxs.astype(np.int64)

            # get the reduced candidate set
            reduced_candidate_idxs = np.unique(
                np.hstack([
                    s_candidate_idxs.reshape(-1, ),
                    d_candidate_idxs.reshape(-1, )
                ]))

            # get score matrix
            sparse_score_matrix = biosyn.get_score_matrix(
                query_embeds=mention_sparse_embeds,
                dict_embeds=dict_sparse_embeds[
                    reduced_candidate_idxs, :]).todense()
            dense_score_matrix = biosyn.get_score_matrix(
                query_embeds=mention_dense_embeds,
                dict_embeds=dict_dense_embeds[reduced_candidate_idxs, :])

            if score_mode == 'hybrid':
                score_matrix = sparse_weight * sparse_score_matrix + dense_score_matrix
            elif score_mode == 'dense':
                score_matrix = dense_score_matrix
            elif score_mode == 'sparse':
                score_matrix = sparse_score_matrix
            else:
                raise NotImplementedError()

            # take care of getting the best indices
            candidate_idxs = biosyn.retrieve_candidate(
                score_matrix=score_matrix, topk=topk)
            candidate_idxs = reduced_candidate_idxs[candidate_idxs]

            np_candidates = eval_dictionary[candidate_idxs].squeeze()
            dict_candidates = []
            for np_candidate in np_candidates:
                dict_candidates.append({
                    'name':
                    np_candidate[0],
                    'sty':
                    np_candidate[1],
                    'cui':
                    np_candidate[2],
                    'label':
                    check_label(np_candidate[2], golden_cui)
                })
            dict_mentions.append({
                'mention': mention,
                'golden_cui': golden_cui,  # golden_cui can be composite cui
                'pmid': pmid,
                'start_char': start_char,
                'end_char': end_char,
                'candidates': dict_candidates
            })
        queries.append({'mentions': dict_mentions})

    result = {'queries': queries}

    return result
Example #7
df.drop_duplicates(inplace=True, subset=["description"])
top_k_hits = 3

df.dropna(inplace=True, subset=["description"])

model = SentenceTransformer(
    'bert-base-nli-stsb-mean-tokens')  # BERT model fine tuned on STS dataset

embedding_size = 768  # Size of embeddings of each book description
top_k = 3  # Number of similarity matchings to output
embedding_cache_path = "data.pkl"
num_clusters = 200

# Define FAISS
quantizer = faiss.IndexFlatIP(embedding_size)
index = faiss.IndexIVFFlat(quantizer, embedding_size, num_clusters,
                           faiss.METRIC_INNER_PRODUCT)

index.nprobe = 3

if not os.path.exists(embedding_cache_path):

    descriptions = []
    titles = []
    isbn13 = []
    isbn = []

    for row in df.itertuples():
        descriptions.append(row.description)
        titles.append(row.title)
        isbn13.append(row.isbn13)
        isbn.append(row.isbn)
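The snippet is cut off while collecting the book metadata. A plausible continuation of the cache-miss branch (an assumption, not the original code; numpy is assumed to be imported as np, and the dataset is assumed to have at least num_clusters descriptions) would encode the descriptions, L2-normalize them so that METRIC_INNER_PRODUCT behaves like cosine similarity, train the IVF index, and search it:

    # hypothetical continuation of the cache-miss branch above
    corpus_embeddings = model.encode(descriptions, show_progress_bar=True,
                                     convert_to_numpy=True).astype("float32")
    # normalize so that inner product equals cosine similarity
    corpus_embeddings = corpus_embeddings / np.linalg.norm(
        corpus_embeddings, axis=1, keepdims=True)

    index.train(corpus_embeddings)  # the IVF index must be trained before add()
    index.add(corpus_embeddings)

    query = model.encode(["a detective story set in victorian london"],
                         convert_to_numpy=True).astype("float32")
    query = query / np.linalg.norm(query, axis=1, keepdims=True)
    distances, hits = index.search(query, top_k_hits)
    for score, idx in zip(distances[0], hits[0]):
        print(round(float(score), 3), titles[idx])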
Example #8
def IVFFlatGpu(config):
    print("IVFFlatGpu, ", config)
    d = config['dimension']  # dimension
    nb = config['db_size']  # database size
    nq = config['query_num']  # nb of queries
    topk = config['top_k']
    nlist = config['nlist']
    nprobe = config['nprobe']
    search_repeat = 10

    res = faiss.StandardGpuResources()  # use a single GPU
    # temp memory
    if config["temp_memory"] == 0:
        res.noTempMemory()
    elif config["temp_memory"] != -1:
        res.setTempMemory(config["temp_memory"] * 1024 * 1024)

    index_list = []
    create_ave_duration = 0
    search_ave_duration = 0

    if config['test_batch_write'] == True:
        batch_write_ave_duration = 0
        batch_write_num = config['write_batch_num']
        batch_write_time = int(nb / config['write_batch_num'])
        print("batch_write_time = ", batch_write_num)
        for i in range(config['db_num']):
            # Using an IVF index
            quantizer = faiss.IndexFlatL2(d)  # the other index
            index_ivf = faiss.IndexIVFFlat(quantizer, d, nlist,
                                           faiss.METRIC_L2)
            gpu_index_ivf = faiss.index_cpu_to_gpu(res, 0, index_ivf)
            batch_write_ave_one_lib = 0
            for j in range(batch_write_time):
                np.random.seed(i * batch_write_time + j)
                xb = np.random.random((batch_write_num, d)).astype('float32')
                xb[:, 0] += np.arange(batch_write_num) / 1000.
                begin_time = time.time()
                if not gpu_index_ivf.is_trained:
                    print("train, j=", j)
                    gpu_index_ivf.train(xb)
                gpu_index_ivf.add(xb)
                duration = time.time() - begin_time
                batch_write_ave_one_lib += duration
                batch_write_ave_duration += duration
            print(i, ",batch_write_ave_one_lib = ",
                  (batch_write_ave_one_lib / batch_write_time) * 1000 * 1000,
                  " us")
            index_list.append(index_ivf)
        print("batch_write_ave_duration = ",
              (batch_write_ave_duration / len(index_list) / batch_write_time) *
              1000 * 1000, " us")

        return index_list

    for i in range(config['db_num']):
        np.random.seed(i)  # make reproducible
        xb = np.random.random((nb, d)).astype('float32')
        xb[:, 0] += np.arange(nb) / 1000.
        begin_time = time.time()
        quantizer = faiss.IndexFlatL2(d)  # the other index
        index_ivf = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
        # here we specify METRIC_L2 explicitly (it is also the default metric)
        # make it an IVF GPU index
        gpu_index_ivf = faiss.index_cpu_to_gpu(res, 0, index_ivf)
        assert not gpu_index_ivf.is_trained
        gpu_index_ivf.train(xb)  # train the index
        assert gpu_index_ivf.is_trained
        gpu_index_ivf.add(xb)  # add vectors to the index
        gpu_index_ivf.nprobe = nprobe
        duration = time.time() - begin_time
        create_ave_duration += duration
        index_list.append(gpu_index_ivf)
        if i == 0:
            gpu_index_ivf.search(xb[:5], 4)
    print("craete ave duration = ", create_ave_duration / len(index_list),
          " s")
    if len(index_list) == 0:
        return index_list
    for i in range(len(index_list)):
        for j in range(search_repeat):
            np.random.seed(i * search_repeat + j + config['db_num'])
            xq = np.random.random((nq, d)).astype('float32')
            xq[:, 0] += np.arange(nq) / 1000.
            begin_time = time.time()
            index_list[i].search(xq, topk)  # actual search
            duration = time.time() - begin_time
            search_ave_duration += duration

    print("search index aver time = ",
          search_ave_duration / len(index_list) / search_repeat, " s")
    return index_list
Example #9
    def build_index(self,
                    sentences_or_file_path: Union[str, List[str]],
                    use_faiss: bool = None,
                    faiss_fast: bool = False,
                    device: str = None,
                    batch_size: int = 64):

        if use_faiss is None or use_faiss:
            try:
                import faiss
                assert hasattr(faiss, "IndexFlatIP")
                use_faiss = True
            except (ImportError, AssertionError):
                logger.warning(
                    "Failed to import faiss. If you want to use faiss, install it through PyPI. Continuing with brute-force search."
                )
                use_faiss = False

        # if the input is a string, we assume it is the path of a file that stores the sentences
        if isinstance(sentences_or_file_path, str):
            sentences = []
            with open(sentences_or_file_path, "r") as f:
                logging.info("Loading sentences from %s ..." %
                             (sentences_or_file_path))
                for line in tqdm(f):
                    sentences.append(line.rstrip())
            sentences_or_file_path = sentences

        logger.info("Encoding embeddings for sentences...")
        embeddings = self.encode(sentences_or_file_path,
                                 device=device,
                                 batch_size=batch_size,
                                 normalize_to_unit=True,
                                 return_numpy=True)

        logger.info("Building index...")
        self.index = {"sentences": sentences_or_file_path}

        if use_faiss:
            quantizer = faiss.IndexFlatIP(embeddings.shape[1])
            if faiss_fast:
                index = faiss.IndexIVFFlat(
                    quantizer, embeddings.shape[1],
                    min(self.num_cells, len(sentences_or_file_path)))
            else:
                index = quantizer

            if (self.device == "cuda" and device != "cpu") or device == "cuda":
                if hasattr(faiss, "StandardGpuResources"):
                    logger.info("Use GPU-version faiss")
                    res = faiss.StandardGpuResources()
                    res.setTempMemory(20 * 1024 * 1024 * 1024)
                    index = faiss.index_cpu_to_gpu(res, 0, index)
                else:
                    logger.info("Use CPU-version faiss")
            else:
                logger.info("Use CPU-version faiss")

            if faiss_fast:
                index.train(embeddings.astype(np.float32))
            index.add(embeddings.astype(np.float32))
            index.nprobe = min(self.num_cells_in_search,
                               len(sentences_or_file_path))
            self.is_faiss_index = True
        else:
            index = embeddings
            self.is_faiss_index = False
        self.index["index"] = index
        logger.info("Finished")
Example #10
def train_faiss(item_vector):
    quantizer = faiss.IndexFlatL2(item_vector.shape[1])
    index = faiss.IndexIVFFlat(quantizer, item_vector.shape[1], 80)
    index.train(item_vector)
    index.add(item_vector)
    return index
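A short usage sketch for the helper above (random data and an arbitrary nprobe, both assumptions): the item vectors must be float32, and there should be comfortably more than 80 of them, since nlist=80 centroids are trained.

import faiss
import numpy as np

item_vector = np.random.random((10000, 64)).astype("float32")
index = train_faiss(item_vector)
index.nprobe = 8  # number of cells visited per query (arbitrary choice)
distances, ids = index.search(item_vector[:5], 10)
print(ids)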
Example #11
        "hdfs://localhost:9000/database_embeddings/*",
        recursiveFileLookup=True).select("features").toJSON().collect()
    # writing databases to database_vector.pkl
    list_all_db_vectors = []
    list_all_db_vectors_index = []
    i = 0
    for data in databases:
        data = json.loads(data)["features"]
        data = get_features(data, "database")
        data_index = [i] * len(data)
        list_all_db_vectors += data
        list_all_db_vectors_index.append(data_index)
        i += 1
    list_all_db_vectors = np.asarray(list_all_db_vectors).astype('float32')
    list_all_db_vectors = normalize(list_all_db_vectors, axis=1, norm='l2')
    list_all_db_vectors_index = flatten(list_all_db_vectors_index)
    with open("database_vector_index.pkl", "wb") as f:
        pickle.dump(list_all_db_vectors_index, f)
    # initializing database vectors
    # with open("database_vector.pkl",'rb') as f :
    #     database_vector = pickle.load(f)
    # initialization for FAISS algorithm
    cluster = NUMBER_OF_CLUSTER
    dimension = list_all_db_vectors[0].shape[0]
    quantiser = faiss.IndexFlatIP(dimension)
    index = faiss.IndexIVFFlat(quantiser, dimension, cluster,
                               faiss.METRIC_INNER_PRODUCT)
    # training index on database vectors
    index.train(list_all_db_vectors)
    index.add(list_all_db_vectors)
    faiss.write_index(index, "database_faiss.index")
Example #12
def search(q, query_type):
    videos = findVideos(q)
    timings = {}
    for each_video in videos:
        df_embeddings = pd.read_csv("data/" + str(each_video) +
                                    '/person_embeddings_mapping.csv',
                                    sep='\t')
        cols = list(df_embeddings)
        cols.insert(0, cols.pop(cols.index('Embeddings')))
        df_embeddings = df_embeddings.loc[:, cols]

        x = str(df_embeddings['Embeddings'].tolist()).replace("\'", "")
        x = ast.literal_eval(x)
        y = numpy.array(x)
        y = y.astype('float32')
        d = 128
        nlist = 1
        k = 1
        quantizer = faiss.IndexFlatL2(d)  # the other index
        index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)

        index.train(y)  # is there any difference between t and y?

        index.add(y)  # add may be a bit slower as well

        D, I = index.search(q, k)  # actual search
        pos = [0] * len(I)
        p = [0] * len(I)
        # if face is not present: then add to the list
        if I.size == 0:
            print("Not found")

        else:
            for i in range(len(I)):
                pos[i] = I[i][0]
                p[i] = df_embeddings.iloc[pos[i], 1]

        df_person_bitmap = pd.read_csv("data/" + str(each_video) +
                                       '/person_bitmap_vector.csv',
                                       sep='\t')
        person_bitmap = [0] * len(p)
        for i in range(len(p)):
            person_bitmap[i] = df_person_bitmap.loc[
                df_person_bitmap['person_label'] == int(
                    p[i])]['BitMap'].values[0]
            person_bitmap[i] = str(person_bitmap[i]).replace("\'", "")
            person_bitmap[i] = json.loads(person_bitmap[i])
            person_bitmap[i] = numpy.array(person_bitmap[i])

        if query_type == 'next':
            timings[each_video] = next(person_bitmap[0], person_bitmap[1])
        if query_type == 'eventually':
            timings[each_video] = eventually(person_bitmap[0],
                                             person_bitmap[1])
        if query_type == 'is_before':
            timings[each_video] = is_a_before_b(person_bitmap[0],
                                                person_bitmap[1])
        if query_type == 'interval':
            timings[each_video] = interval(person_bitmap)

    return timings
Example #13
 def __init__(self) -> None:
     quantizer = faiss.IndexFlatL2(self.d)
     self.index = faiss.IndexIVFFlat(quantizer, self.d, self.nlist, faiss.METRIC_L2)
     self.index.nprobe = self.nprobe
Example #14
 def create_index(dim: int = 512, cells: int = 100):
     return faiss.IndexIVFFlat(faiss.IndexFlatL2(dim), dim, cells)
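Worth noting: the factory above returns an untrained IVF index, so it has to be trained before anything can be added or searched. A minimal usage sketch, treating create_index as a plain function and using random data of matching dimensionality:

import faiss
import numpy as np

index = create_index(dim=512, cells=100)
data = np.random.random((20000, 512)).astype("float32")
index.train(data)   # learn the 100 coarse centroids
index.add(data)
index.nprobe = 10   # cells visited per query (arbitrary choice)
distances, ids = index.search(data[:3], 5)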
Example #15
def main(sys):
    np.seterr(over='ignore')
    m = len(sys)
    print ("The script has the name %s" % (sys[0]))
    print("initiate: %s " % (sys[0]))
    print ("Number of arguments: ", m, " arguments.")


    input_file_dataset = sys[1]
    input_file_queries = sys[2]
    k = int(sys[3])
    var = sys[4]
    run = sys[5]
    ground_truth_D = sys[6]
    ground_truth_I = sys[7]
    error = float(sys[8])
    nlist = int(sys[9])       #number of clusters
    nprobe = int(sys[10])          #number of cells to probe during search

    print("check of the arguments")
    for i in range(m):
        print("arguments: %s " % (sys[i]))



    dataset = os.path.realpath(input_file_dataset)
    queryset = os.path.realpath(input_file_queries)
    groundtruth_D = os.path.realpath(ground_truth_D)
    groundtruth_I = os.path.realpath(ground_truth_I)
    #ground_truth = os.path.realpath(output_file_gt)

    a_vectors = np.loadtxt(dataset).astype(np.float32)
    query_set = np.loadtxt(queryset).astype(np.float32)
    GT_D = np.loadtxt(groundtruth_D).astype(np.float32)
    GT_I = np.loadtxt(groundtruth_I).astype(np.float32)

    n_db = len(a_vectors)
    d = len(a_vectors[0])  #dimension of database
    n_q = len(query_set)


    # nlist = int(len(a_vectors) / k)  #number of clusters
    # nprobe = int((k/2)+1)          #how many times repeat search


    print("check of dimensions")
    print("param n_db",  n_db)
    print("param d",  d)
    print("param k",  k)
    print("param n_q",  n_q)
    print("param nlist",  nlist)
    print("param nprobe",  nprobe)
    print("param error",  error)



    print("faiss ...")
    start1 = time.perf_counter()
    quantizer = faiss.IndexFlatL2(d)   # build the index
    index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
    assert not index.is_trained
    index.train(a_vectors)
    assert index.is_trained
    index.add(a_vectors)
    stop1 = time.perf_counter()


    start2 = time.perf_counter()
    index.nprobe = nprobe  # if set to nlist this is the same as brute force; part of the speed/accuracy tradeoff
    D, I = index.search(query_set, k)     # actual search
    stop2 = time.perf_counter()



    #run recall
    recall_i = recall_similar_match( GT_I, I)
    recall_d = recall_with_error( GT_D, D, error)

    stringname_D = 'D' + sys[0][1:7] + '_' + var +'.txt'
    stringname_I = 'I' + sys[0][1:7] + '_' + var +'.txt'
    np.savetxt(stringname_D, D)
    np.savetxt(stringname_I, I)

    time1 = stop1 - start1
    time2 = stop2 - start2
    #run, filename, index_time,  build_time, recall_D, recall_I, n_db, n_q, d, k
    print_time(run, sys[0], time1, time2, recall_d, recall_i, n_db, n_q, d, k, error)
    print("finish")
Example #16
    def create_faiss(self):
        quantiser = faiss.IndexFlatL2(self.num_dimensions)
        self.faiss_params = {}
        if self.similarity_metric == 'cosine':
            self.faiss_params['preprocess_opt'] = 'norm'
            self.faiss_params['metric'] = faiss.METRIC_L2

        elif self.similarity_metric == 'inner':
            self.faiss_params['preprocess_opt'] = 'false'
            self.faiss_params['metric'] = faiss.METRIC_INNER_PRODUCT

        elif self.similarity_metric == 'euclidean':
            self.faiss_params['preprocess_opt'] = 'false'
            self.faiss_params['metric'] = faiss.METRIC_L2

        elif self.similarity_metric == 'mahalanobis':
            self.faiss_params['preprocess_opt'] = 'covar'
            self.faiss_params['metric'] = faiss.METRIC_L2

        self.faiss_indices = {
            'title':
            faiss.IndexIVFFlat(quantiser, self.num_dimensions,
                               self.num_centroids,
                               self.faiss_params['metric']),
            'abstract':
            faiss.IndexIVFFlat(quantiser, self.num_dimensions,
                               self.num_centroids,
                               self.faiss_params['metric']),
            'body':
            faiss.IndexIVFFlat(quantiser, self.num_dimensions,
                               self.num_centroids,
                               self.faiss_params['metric']),
        }

        # Title train
        self.lookup_titles = {}
        documents_with_titles = []

        self.lookup_abstracts = {}
        documents_with_abstracts = []

        self.lookup_bodies = {}
        documents_with_bodies = []
        for doc in self.documents:
            v_title = doc.mean_vector_title()
            v_abstract = doc.mean_vector('abstract')
            v_body = doc.mean_vector('body')

            if not (v_title is None or len(v_title.shape) != 1
                    and v_title.shape[0] != self.num_dimensions):
                self.lookup_titles[len(documents_with_titles)] = doc
                documents_with_titles.append(v_title)

            if not (v_abstract is None or len(v_abstract.shape) != 1
                    and v_abstract.shape[0] != self.num_dimensions):
                self.lookup_abstracts[len(documents_with_abstracts)] = doc
                documents_with_abstracts.append(v_abstract)

            if not (v_body is None or len(v_body.shape) != 1
                    and v_body.shape[0] != self.num_dimensions):
                self.lookup_bodies[len(documents_with_bodies)] = doc
                documents_with_bodies.append(v_body)

        vectors = self.search_preprocess(np.stack(documents_with_titles,
                                                  axis=0),
                                         is_train=True)
        self.faiss_indices['title'].train(vectors)
        self.faiss_indices['title'].add(vectors)

        # Train abstract
        vectors = self.search_preprocess(np.stack(documents_with_abstracts,
                                                  axis=0),
                                         is_train=True)
        self.faiss_indices['abstract'].train(vectors)
        self.faiss_indices['abstract'].add(vectors)

        # Train full
        vectors = self.search_preprocess(np.stack(documents_with_bodies,
                                                  axis=0),
                                         is_train=True)
        self.faiss_indices['body'].train(vectors)
        self.faiss_indices['body'].add(vectors)
Example #17
# define msg queue names
msg_queues = {}
msg_queues['crawler_notify'] = "CRAWLER_NOTIFY_QUE"
msg_queues['feature_detect'] = "FEATURE_DETECT_QUE"
msg_queues['detect_finish'] = "FEATURE_DETECT_FINISH_QUE"

# feature detect related config
static_image_feature_dir = "/Users/tonyyoung/test/feature/inception/image"
animated_image_feature_dir = "/Users/tonyyoung/test/feature/600"
static_dimension = 2048
animaed_dimension = 600
nlist = 20  # Number of clustering centers
quantizer_static = faiss.IndexFlatL2(static_dimension)
quantizer_animaed = faiss.IndexFlatL2(animaed_dimension)

static_image_index = faiss.IndexIVFFlat(quantizer_static, static_dimension,
                                        nlist, faiss.METRIC_L2)
animated_image_index = faiss.IndexIVFFlat(quantizer_animaed, animaed_dimension,
                                          nlist, faiss.METRIC_L2)
'''static_image_index = faiss.IndexFlatL2(static_dimension)
animated_image_index = faiss.IndexFlatL2(animaed_dimension)'''

duplicate_threshold = 10

# OSS account info.
clientId = 'LTAIW5NjZnlwWIjr'
clientSecret = 'BWajgSlWW32EtuQbTmDywvSf7pvwuj'

MEDIA_ROOT = "/home/ubuntu/workspace/smile_sv/smile/media"

# ========================= Redis operations ========================
'''sticker_md5 = "sticker_md5"
Example #18
 def __init__(self, dim=VECTOR_DIMENSION):
     # if isfile(save_path):
     #     self._index = faiss.read_index(save_path)
     # else:
     quantizer = faiss.IndexFlatL2(dim)
     self._index = faiss.IndexIVFFlat(quantizer, dim, N_LIST, faiss.METRIC_L2)
Example #19
def ivf_search():
    '''
    IndexIVFFlat inverted-file search, built on top of flat (exhaustive) search.
    Requirement: the number of rows must be at least the number of partitions nlist.
    Pros: much less computation and a clear speed-up (about 0.5 s for 5 million vectors on 8 GB).
    Cons: a small loss of accuracy, and building the inverted lists takes time.
    With an IndexIVFFlat index the dataset is partitioned: we define Voronoi cells in the
    d-dimensional space and each database vector falls into one of these cells. At search
    time, only the database vectors y that lie in the same cell as the query x, plus those
    in a few neighboring cells, are compared against the query vector.
    :return:
    '''
    nlist = 4  # number of partitions (Voronoi cells); more cells means the partitioning itself takes longer
    index = faiss.IndexFlatL2(d)
    iv_index = faiss.IndexIVFFlat(index, d, nlist, faiss.METRIC_L2)
    if not iv_index.is_trained:
        iv_index.train(xb)  # train on xb
    iv_index.add(xb)  # add xb

    # dynamically add more data
    xb1 = np.random.random((1, d)).astype('float32')
    xb1[:, 0] += np.arange(1) / 1000.
    xb1[0][0] -= 1
    print(xb1)

    iv_index.train(xb1)  # needs training
    iv_index.add(xb1)

    xb2 = np.ones((1, d)).astype('float32')
    iv_index.train(xb2)
    iv_index.add(xb2)

    iv_index.nprobe = 3  # number of cells visited during search; more cells is more accurate but slower. Here the data is split into 4 cells and only 3 of them are scanned.

    start = time.time()
    print(type(xq))
    D, I = iv_index.search(xq, K)
    print('ivf_search_time', time.time() - start)

    print('ivf_search nearest-neighbor indices', I)
    print('ivf_search distances', D)

    xq3 = np.ones((1, d)).astype('float32')
    xq3[:, 0] += np.arange(1) / 1000.
    D, I = iv_index.search(xq3, K)

    print('ivf_search nearest-neighbor indices', I)
    print('ivf_search distances', D)

    vectors = MyRecognition.face_descriptors(
        img_path=
        '/home/stringk/PycharmProjects/pytorchDemo/demos/face_recognition_demo/img/0/0image0.jpg'
    )
    for v in vectors:
        nv = np.array([v], dtype='float32')
        print(nv)
        print(type(nv))
        print(nv[0][-1])
        iv_index.train(nv)
        iv_index.add(nv)
        D, I = iv_index.search(nv, K)
        print('ivf_search nearest-neighbor indices', I)
        print('ivf_search distances', D)
Example #20
def build_index(cfg: DictConfig, model: object):
    """
    Builds faiss index from index dataset specified in the config.
        
    Args:
        cfg (DictConfig): Config file specifying index parameters
        model (object): Encoder model
    """

    # Get index dataset embeddings
    # PCA model exists and index embeddings have already been PCAed, no need to re-extract/PCA them
    if cfg.apply_pca and os.path.isfile(
            cfg.pca.pca_save_name) and os.path.isfile(
                cfg.pca_embeddings_save_name):
        logging.info("Loading reduced dimensionality embeddings")
        embeddings = h5py.File(cfg.pca_embeddings_save_name, "r")
        embeddings = embeddings[cfg.index_ds.name][:]

    elif os.path.isfile(cfg.embedding_save_name):
        logging.info("Loading previously extracted index dataset embeddings")
        embeddings = h5py.File(cfg.embedding_save_name, "r")
        embeddings = embeddings[cfg.index_ds.name][:]

    else:
        logging.info("Encoding index dataset, this may take a while")
        index_dataloader = model.setup_dataloader(cfg.index_ds,
                                                  is_index_data=True)
        embeddings, concept_ids = get_index_embeddings(cfg, index_dataloader,
                                                       model)

    # Create pca model to reduce dimensionality of index dataset and decrease memory footprint
    if cfg.apply_pca:

        # Need to train PCA model and apply PCA transformation with newly trained model
        if not os.path.isfile(cfg.pca.pca_save_name):
            logging.info(
                "Fitting PCA model for embedding dimensionality reduction")
            pca_train_set = random.sample(
                list(embeddings),
                k=int(len(embeddings) * cfg.pca.sample_fraction))
            pca = PCA(n_components=cfg.pca.output_dim)
            pca.fit(pca_train_set)
            pkl.dump(pca, open(cfg.pca.pca_save_name, "wb"))
            embeddings = reduce_embedding_dim(pca, embeddings, cfg)

        # PCA model already trained, just need to reduce dimensionality of all embeddings
        elif not os.path.isfile(cfg.pca_embeddings_save_name):
            pca = pkl.load(open(cfg.pca.pca_save_name, "rb"))
            embeddings = reduce_embedding_dim(pca, embeddings, cfg)

    # Build faiss index from embeddings
    logging.info(
        f"Training index with embedding dim size {cfg.dims} using {faiss.get_num_gpus()} gpus"
    )
    quantizer = faiss.IndexFlatL2(cfg.dims)
    index = faiss.IndexIVFFlat(quantizer, cfg.dims, cfg.nlist)
    index = faiss.index_cpu_to_all_gpus(index)
    index.train(embeddings)

    logging.info("Adding dataset embeddings to index")
    for i in tqdm(range(0, embeddings.shape[0], cfg.index_batch_size)):
        index.add(embeddings[i:i + cfg.index_batch_size])

    logging.info("Saving index")
    faiss.write_index(faiss.index_gpu_to_cpu(index), cfg.index_save_name)
    logging.info("Index built and saved")
Example #21
# features_astype = features.astype(np.float32)

# mat = faiss.PCAMatrix (1024, 128)
# mat.train(features_astype)
# assert mat.is_trained
# features_shape = mat.apply_py(features_astype)
# print(features_shape.shape)
# np.savetxt('PCA_features.txt',features_shape)

features_shape = features.astype(np.float32)  # assumption: use the raw 1024-dim features, since the PCA block above is commented out
print(features_shape.shape)

dimension = 1024
n = 95276
nlist = 50
quantiser = faiss.IndexFlatL2(dimension)  
index = faiss.IndexIVFFlat(quantiser, dimension, nlist,   faiss.METRIC_L2)

# print(index.is_trained)  
index.train(features_shape) 
# print(index.ntotal)  
index.add(features_shape)   
# print(index.is_trained)  
# print(index.ntotal)

index.nprobe = 10  # search the 10 most similar clusters
k = 5  # return the 5 nearest neighbours

a = np.reshape(features_shape[1], (1, -1))
distances, indices = index.search(a, k)
print(distances)
print(indices)
Example #22
def serve(args):
    # serve_demo: Load saved embeddings, serve question model. question in, results out.
    # serve_question: only serve question model. question in, vector out.
    # serve_context: only serve context model. context in, phrase-vector pairs out.
    # serve: serve all three.
    device = torch.device('cuda' if args.cuda else 'cpu')
    pprint(args.__dict__)

    interface = FileInterface(**args.__dict__)
    # use cache for metadata
    if args.cache:
        out = interface.cache(preprocess, args)
        processor = out['processor']
        processed_metadata = out['processed_metadata']
    else:
        processor = Processor(**args.__dict__)
        metadata = interface.load_metadata()
        processed_metadata = processor.process_metadata(metadata)

    model = Model(**args.__dict__).to(device)
    model.init(processed_metadata)
    interface.bind(processor, model)

    interface.load(args.iteration, session=args.load_dir)

    with torch.no_grad():
        model.eval()

        if args.mode == 'serve_demo':
            phrases = []
            paras = []
            results = []
            embs = []
            idxs = []
            iterator = interface.context_load(metadata=True, emb_type=args.emb_type)
            for _, (cur_phrases, each_emb, metadata) in zip(range(args.num_train_mats), iterator):
                embs.append(each_emb)
                phrases.extend(cur_phrases)
                for span in metadata['answer_spans']:
                    results.append([len(paras), span[0], span[1]])
                    idxs.append(len(idxs))
                paras.append(metadata['context'])
            if args.emb_type == 'dense':
                import faiss
                emb = np.concatenate(embs, 0)

                d = 4 * args.hidden_size * args.num_heads
                if args.metric == 'ip':
                    quantizer = faiss.IndexFlatIP(d)  # Exact Search
                elif args.metric == 'l2':
                    quantizer = faiss.IndexFlatL2(d)
                else:
                    raise ValueError()

                if args.nlist != args.nprobe:
                    # Approximate Search. nlist > nprobe makes it faster and less accurate
                    if args.bpv is None:
                        if args.metric == 'ip':
                            search_index = faiss.IndexIVFFlat(quantizer, d, args.nlist, faiss.METRIC_INNER_PRODUCT)
                        elif args.metric == 'l2':
                            search_index = faiss.IndexIVFFlat(quantizer, d, args.nlist)
                        else:
                            raise ValueError()
                    else:
                        assert args.metric == 'l2'  # only l2 is supported for product quantization
                        search_index = faiss.IndexIVFPQ(quantizer, d, args.nlist, args.bpv, 8)
                    search_index.train(emb)
                else:
                    search_index = quantizer

                search_index.add(emb)
                for cur_phrases, each_emb, metadata in iterator:
                    phrases.extend(cur_phrases)
                    for span in metadata['answer_spans']:
                        results.append([len(paras), span[0], span[1]])
                    paras.append(metadata['context'])
                    search_index.add(each_emb)

                if args.nlist != args.nprobe:
                    search_index.nprobe = args.nprobe

                def search(emb, k):
                    D, I = search_index.search(emb, k)
                    return D[0], I[0]

            elif args.emb_type == 'sparse':
                assert args.metric == 'l2'  # currently only l2 is supported (couldn't find a good ip library)
                import pysparnn.cluster_index as ci

                cp = ci.MultiClusterIndex(embs, idxs)

                for cur_phrases, each_emb, metadata in iterator:
                    phrases.extend(cur_phrases)
                    for span in metadata['answer_spans']:
                        results.append([len(paras), span[0], span[1]])
                    paras.append(metadata['context'])
                    for each_vec in each_emb:
                        cp.insert(each_vec, len(idxs))
                        idxs.append(len(idxs))

                def search(emb, k):
                    return zip(*[each[0] for each in cp.search(emb, k=k)])

            else:
                raise ValueError()

            def retrieve(question, k):
                example = {'question': question, 'id': 'real', 'idx': 0}
                dataset = (processor.preprocess(example), )
                loader = DataLoader(dataset, batch_size=1, collate_fn=processor.collate)
                batch = next(iter(loader))
                question_output = model.get_question(**batch)
                question_results = processor.postprocess_question_batch(dataset, batch, question_output)
                id_, emb = question_results[0]
                D, I = search(emb, k)
                out = [(paras[results[i][0]], results[i][1], results[i][2], '%.4r' % d.item(),)
                       for d, i in zip(D, I)]
                return out

            if args.mem_info:
                import psutil
                import os
                pid = os.getpid()
                py = psutil.Process(pid)
                info = py.memory_info()[0] / 2. ** 30
                print('Memory Use: %.2f GB' % info)

            # Demo server. Requires flask and tornado
            from flask import Flask, request, jsonify
            from flask_cors import CORS

            from tornado.wsgi import WSGIContainer
            from tornado.httpserver import HTTPServer
            from tornado.ioloop import IOLoop

            app = Flask(__name__, static_url_path='/static')

            app.config['JSONIFY_PRETTYPRINT_REGULAR'] = False
            CORS(app)

            @app.route('/')
            def index():
                return app.send_static_file('index.html')

            @app.route('/files/<path:path>')
            def static_files(path):
                return app.send_static_file('files/' + path)

            @app.route('/api', methods=['GET'])
            def api():
                query = request.args['query']
                out = retrieve(query, 5)
                return jsonify(out)

            print('Starting server at %d' % args.port)
            http_server = HTTPServer(WSGIContainer(app))
            http_server.listen(args.port)
            IOLoop.instance().start()
Example #23
 def test_IndexIVF_2(self):
     index = faiss.IndexIVFFlat(faiss.IndexFlatL2(d), d, 10)
     index.train(xt)
     index.add(xb)
Example #24
def _build_query_ann_index(
        charge: int, mz_splits: np.ndarray, vectorize: Callable, n_probe: int,
        batch_size: int, n_neighbors: int, n_neighbors_ann: int,
        precursor_tol_mass: float, precursor_tol_mode: str,
        distances: np.ndarray, indices: np.ndarray, indptr: np.ndarray,
        work_dir: str) -> pd.DataFrame:
    """
    Create ANN index(es) for spectra with the given charge per precursor m/z
    split.

    Parameters
    ----------
    charge : int
        Precursor charge of the spectra to be processed.
    mz_splits : np.ndarray
        M/z splits used to create separate ANN indexes.
    vectorize : Callable
        Function to convert the spectra to vectors.
    batch_size : int
        The number of vectors to be simultaneously added to the index.
    work_dir : str
        Directory to read and store (intermediate) results.

    Returns
    -------
    pd.DataFrame
        Metadata (identifier, precursor charge, precursor m/z) of the spectra
        for which indexes were built.
    """
    identifiers, precursor_mzs = [], []
    indptr_i = 0
    # Find neighbors per specified precursor m/z interval.
    for mz in tqdm.tqdm(mz_splits, desc='Intervals queried', unit='index'):
        pkl_filename = os.path.join(work_dir, 'spectra', f'{charge}_{mz}.pkl')
        if not os.path.isfile(pkl_filename):
            continue
        # Read the spectra for the m/z split.
        with open(pkl_filename, 'rb') as f_in:
            spectra_split = pickle.load(f_in)
        precursor_mzs_split = []
        for spec in spectra_split:
            identifiers.append(spec.identifier)
            precursor_mzs_split.append(spec.precursor_mz)
        precursor_mzs.append(np.asarray(precursor_mzs_split))
        # Convert the spectra to vectors.
        vectors_split = vectorize(spectra_split)
        n_split, dim = len(spectra_split), vectors_split.shape[1]
        # Figure out a decent value for the n_list hyperparameter based on
        # the number of vectors.
        # Rules of thumb from the Faiss wiki:
        # https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index#how-big-is-the-dataset
        if n_split == 0:
            continue
        if n_split < 10e2:
            # Use a brute-force index instead of an ANN index when there
            # are only a few items.
            n_list = -1
        elif n_split < 10e5:
            n_list = 2**math.floor(math.log2(n_split / 39))
        elif n_split < 10e6:
            n_list = 2**16
        elif n_split < 10e7:
            n_list = 2**18
        else:
            n_list = 2**20
            if n_split > 10e8:
                logger.warning('More than 1B vectors to be indexed, consider '
                               'decreasing the ANN size')
        # Create an ANN index using the inner product (proxy for cosine
        # distance) for fast NN queries.
        if n_list <= 0:
            index = faiss.IndexIDMap(faiss.IndexFlatIP(dim))
        else:
            index = faiss.IndexIVFFlat(faiss.IndexFlatIP(dim), dim, n_list,
                                       faiss.METRIC_INNER_PRODUCT)
            index.nprobe = min(math.ceil(index.nlist / 8), n_probe)
        # Compute cluster centroids.
        # noinspection PyArgumentList
        index.train(vectors_split)
        # Add the vectors to the index in batches.
        batch_size = min(n_split, batch_size)
        for batch_start in range(0, n_split, batch_size):
            batch_stop = min(batch_start + batch_size, n_split)
            # noinspection PyArgumentList
            index.add_with_ids(vectors_split[batch_start:batch_stop],
                               np.arange(batch_start, batch_stop))
        # Query the index to calculate NN distances.
        _dist_mz_interval(
            index, vectors_split, precursor_mzs[-1], batch_size, n_neighbors,
            n_neighbors_ann, precursor_tol_mass, precursor_tol_mode,
            distances, indices, indptr, indptr_i)
        indptr_i += vectors_split.shape[0]
    return pd.DataFrame({'identifier': identifiers, 'precursor_charge': charge,
                         'precursor_mz': np.hstack(precursor_mzs)})
Example #25
def main(sys):
    np.seterr(over='ignore')
    m = len(sys)
    print("The script has the name %s" % (sys[0]))
    print("initiate: %s " % (sys[0]))
    print("Number of arguments: ", m, " arguments.")

    input_file_dataset = sys[1]
    input_file_queries = sys[2]
    k = int(sys[3])
    var = sys[4]
    run = sys[5]
    ground_truth_D = sys[6]
    ground_truth_I = sys[7]
    error = float(sys[8])
    nlist = int(sys[9])  # number of inverted lists (clusters)
    nprobe = int(sys[10])  # used below as the HNSW quantizer's efSearch

    print("check of the arguments")
    for i in range(m):
        print("arguments: %s " % (sys[i]))

    dataset = os.path.realpath(input_file_dataset)
    queryset = os.path.realpath(input_file_queries)
    groundtruth_D = os.path.realpath(ground_truth_D)
    groundtruth_I = os.path.realpath(ground_truth_I)
    #ground_truth = os.path.realpath(output_file_gt)

    a_vectors = np.loadtxt(dataset).astype(np.float32)
    query_set = np.loadtxt(queryset).astype(np.float32)
    GT_D = np.loadtxt(groundtruth_D).astype(np.float32)
    GT_I = np.loadtxt(groundtruth_I).astype(np.float32)
    #
    # k = len(GT_D[0])
    n_db = len(a_vectors)
    d = len(a_vectors[0])  #dimension of database
    n_q = len(query_set)
    fo = len(a_vectors)
    # nlist = int(float(fo) / k)  #number of clusters
    # nprobe = int((k/2)+1)          #how many times repeat search

    print("check of dimensions")
    print("param n_db", n_db)
    print("param d", d)
    print("param k", k)
    print("param n_q", n_q)
    print("param nlist", nlist)
    print("param nprobe", nprobe)
    print("param error", error)

    print("faiss ...")
    start1 = time.perf_counter()  # time.clock() was removed in Python 3.8

    quantizer = faiss.IndexHNSWFlat(d, 32)
    index = faiss.IndexIVFFlat(quantizer, d, nlist)
    index.cp.min_points_per_centroid = 10  # quiet warning
    index.quantizer_trains_alone = 2

    assert not index.is_trained
    index.train(a_vectors)
    assert index.is_trained

    quantizer.hnsw.efSearch = nprobe  # HNSW search precision: higher is more accurate but slower
    index.add(a_vectors)

    stop1 = time.perf_counter()
    # ---- start search
    start2 = time.perf_counter()

    D, I = index.search(query_set, k)  # actual search

    stop2 = time.perf_counter()

    # ---- end search

    #run recall
    recall_i = recall_similar_match(GT_I, I)
    recall_d = recall_with_error(GT_D, D, error)

    stringname_D = 'D' + sys[0][1:7] + '_' + var + '.txt'
    stringname_I = 'I' + sys[0][1:7] + '_' + var + '.txt'
    np.savetxt(stringname_D, D)
    np.savetxt(stringname_I, I)

    time1 = stop1 - start1
    time2 = stop2 - start2
    # run, filename, build_time, search_time, recall_D, recall_I, n_db, n_q, d, k, error
    print_time(run, sys[0], time1, time2, recall_d, recall_i, n_db, n_q, d, k,
               error)
    print("finish")
Example #26
0
import numpy as np
import faiss  # make faiss available

if __name__ == "__main__":

    e = np.load('encodings.npy', allow_pickle=True).tolist()

    vectorEncoding = np.array([x.get('encondings') for x in e],
                              dtype=np.float32)
    vectorId = np.array([x.get('id') for x in e])

    d = len(vectorEncoding[0])
    print('Vector dimension d: {}'.format(d))

    nlist = 100  # assumed value; the original snippet passed no quantizer or nlist
    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFFlat(quantizer, d, nlist)  # build the index
    index2 = faiss.IndexIDMap(index)

    index.train(vectorEncoding)  # an IVF index must be trained before vectors are added
    index2.add_with_ids(vectorEncoding, vectorId.astype(np.int64))  # Faiss IDs must be int64

    print('Index size: {}'.format(index2.ntotal))
    faiss.write_index(index2, '10000.index')
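Note that IndexIVFFlat accepts custom IDs natively through add_with_ids, so the IndexIDMap wrapper above is only strictly required for index types that do not, such as the flat indexes. A short sketch contrasting the two on made-up data:

import faiss
import numpy as np

rng = np.random.default_rng(0)
xb = rng.standard_normal((10_000, 128)).astype(np.float32)
ids = rng.choice(10_000_000, size=len(xb), replace=False).astype(np.int64)
d = xb.shape[1]

# IndexFlatL2 rejects custom IDs, so it needs the IndexIDMap wrapper.
flat = faiss.IndexIDMap(faiss.IndexFlatL2(d))
flat.add_with_ids(xb, ids)

# IndexIVFFlat stores IDs itself: train it, then call add_with_ids directly.
ivf = faiss.IndexIVFFlat(faiss.IndexFlatL2(d), d, 100)
ivf.train(xb)
ivf.add_with_ids(xb, ids)
ivf.nprobe = 8

D, I = ivf.search(xb[:5], 3)   # I contains the custom IDs, not insertion order
print(I)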
Example #27
0
    print "add"
    # to see progress
    index.verbose = True
    index.add(xb)

    print "search"
    for efSearch in 16, 32, 64, 128, 256:
        print "efSearch", efSearch,
        index.hnsw.efSearch = efSearch
        evaluate(index)

if 'ivf' in todo:

    print "Testing IVF Flat (baseline)"
    quantizer = faiss.IndexFlatL2(d)
    index = faiss.IndexIVFFlat(quantizer, d, 16384)
    index.cp.min_points_per_centroid = 5   # quiet warning

    # to see progress
    index.verbose = True

    print "training"
    index.train(xt)

    print "add"
    index.add(xb)

    print "search"
    for nprobe in 1, 4, 16, 64, 256:
        print "nprobe", nprobe,
        index.nprobe = nprobe
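The evaluate() helper is not shown in this excerpt; a sketch of the same kind of sweep against an exact IndexFlatL2 baseline, reporting recall@1 and query time on random data, might look like this:

import time

import faiss
import numpy as np

rng = np.random.default_rng(0)
xt = rng.standard_normal((20_000, 64)).astype(np.float32)   # training vectors
xb = rng.standard_normal((50_000, 64)).astype(np.float32)   # database vectors
xq = rng.standard_normal((1_000, 64)).astype(np.float32)    # query vectors
d = xb.shape[1]

# Exact ground truth for recall@1.
gt_index = faiss.IndexFlatL2(d)
gt_index.add(xb)
_, gt = gt_index.search(xq, 1)

index = faiss.IndexIVFFlat(faiss.IndexFlatL2(d), d, 256)
index.train(xt)
index.add(xb)

for nprobe in 1, 4, 16, 64, 256:
    index.nprobe = nprobe
    t0 = time.perf_counter()
    _, I = index.search(xq, 1)
    dt = time.perf_counter() - t0
    recall = float((I[:, 0] == gt[:, 0]).mean())
    print(f"nprobe={nprobe:3d}  recall@1={recall:.3f}  {dt * 1000:.1f} ms")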
Example #28
0
    "/home/jianx/results/passage_0__emb_p__data_obj_0.pb")
query_train_embeddings = obj_reader(
    "/home/jianx/results/query_0__emb_p__data_obj_0.pb")
query_train_mapping = obj_reader(
    "/datadrive/jianx/data/annoy/100_ance_query_train_map.dict")
pid_mapping = obj_reader(
    "/datadrive/jianx/data/annoy/100_ance_passage_map.dict")

print_message("Building index")
faiss.omp_set_num_threads(16)
dim = passage_embeddings.shape[1]
if IS_FLAT:
    cpu_index = faiss.IndexFlatIP(dim)
else:
    quantizer = faiss.IndexFlatIP(dim)
    cpu_index = faiss.IndexIVFFlat(quantizer, dim, NLIST)
    assert not cpu_index.is_trained
    cpu_index.train(passage_embeddings)
    assert cpu_index.is_trained
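    # nprobe is left at its default of 1 here; increase it for higher recall at the cost of speed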

cpu_index.add(passage_embeddings)
print_message("Searching for all queries")
with open(OUT_PATH, "w+") as f:
    print_message("Writing to {}".format(OUT_PATH))
    f.write("")
for starting in range(0, len(query_train_embeddings), BATCH_SIZE):
    mini_batch = query_train_embeddings[starting:starting + BATCH_SIZE]
    _, dev_I = cpu_index.search(mini_batch, RANK)
    print_message("Batch No.{}/{}".format(
        starting / BATCH_SIZE + 1,
        len(query_train_embeddings) / BATCH_SIZE))
Example #29
0
    def test_dedup(self):
        d = 10
        nb = 1000
        nq = 200
        nt = 500
        xt, xb, xq = get_dataset_2(d, nt, nb, nq)

        # introduce duplicates
        xb[500:900:2] = xb[501:901:2]
        xb[901::4] = xb[900::4]
        xb[902::4] = xb[900::4]
        xb[903::4] = xb[900::4]

        # also in the train set
        xt[201::2] = xt[200::2]

        quantizer = faiss.IndexFlatL2(d)
        index_new = faiss.IndexIVFFlatDedup(quantizer, d, 20)

        index_new.verbose = True
        # should display
        # IndexIVFFlatDedup::train: train on 350 points after dedup (was 500 points)
        index_new.train(xt)

        index_ref = faiss.IndexIVFFlat(quantizer, d, 20)
        assert index_ref.is_trained

        index_ref.nprobe = 5
        index_ref.add(xb)
        index_new.nprobe = 5
        index_new.add(xb)

        Dref, Iref = index_ref.search(xq, 20)
        Dnew, Inew = index_new.search(xq, 20)

        for i in range(nq):
            ref = self.normalize_res(Dref[i], Iref[i])
            new = self.normalize_res(Dnew[i], Inew[i])
            assert ref == new

        # test I/O
        fd, tmpfile = tempfile.mkstemp()
        os.close(fd)
        try:
            faiss.write_index(index_new, tmpfile)
            index_st = faiss.read_index(tmpfile)
        finally:
            if os.path.exists(tmpfile):
                os.unlink(tmpfile)
        Dst, Ist = index_st.search(xq, 20)

        for i in range(nq):
            new = self.normalize_res(Dnew[i], Inew[i])
            st = self.normalize_res(Dst[i], Ist[i])
            assert st == new

        # test remove
        toremove = np.hstack((np.arange(3, 1000, 5), np.arange(850, 950)))
        index_ref.remove_ids(toremove)
        index_new.remove_ids(toremove)

        Dref, Iref = index_ref.search(xq, 20)
        Dnew, Inew = index_new.search(xq, 20)

        for i in range(nq):
            ref = self.normalize_res(Dref[i], Iref[i])
            new = self.normalize_res(Dnew[i], Inew[i])
            assert ref == new
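Outside of the test harness, a minimal sketch of IndexIVFFlatDedup on data that contains exact duplicates (random vectors, with the second half copied from the first):

import faiss
import numpy as np

rng = np.random.default_rng(0)
d = 16
xb = rng.standard_normal((2_000, d)).astype(np.float32)
xb[1_000:] = xb[:1_000]                  # second half duplicates the first half

quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlatDedup(quantizer, d, 32)
index.train(xb)                          # trains on fewer points after deduplication
index.add(xb)                            # duplicate vectors are stored only once
index.nprobe = 8

D, I = index.search(xb[:3], 5)           # duplicate IDs still appear in the results
print(I)

index.remove_ids(np.arange(0, 2_000, 2, dtype=np.int64))   # removal works as usual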
Example #30
0
import faiss
import numpy as np

if __name__ == '__main__':
    nlist = 5
    dimension = 512
    quantizer = faiss.IndexFlatL2(dimension)
    index = faiss.IndexIVFFlat(quantizer, dimension, nlist)
    features_folder = "features/features.npy"
    image_features = np.load(features_folder).astype('float32')
    index.train(image_features)
    index.add(image_features)
    faiss.write_index(index, "image_index")
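The index written above can be loaded again in a separate process; a short sketch of reading "image_index" back and querying it (the query vectors here are placeholders):

import faiss
import numpy as np

index = faiss.read_index("image_index")   # reload the IVF index from disk
index.nprobe = 3                          # probe 3 of the 5 inverted lists

queries = np.random.rand(4, 512).astype('float32')
distances, neighbors = index.search(queries, 5)
print(neighbors)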