Example #1
    def _post_process(self, dataset, resources_paths):
        if self.config.with_index:
            index_file = resources_paths["embeddings_index"]
            if os.path.exists(index_file):
                dataset.load_faiss_index("embeddings", index_file)
            else:
                if "embeddings" not in dataset.column_names:
                    raise ValueError("Couldn't build the index because there are no embeddings.")
                import faiss

                train_size = self.config.index_train_size
                logging.info("Building wiki_dpr faiss index")
                if self.config.index_name == "exact":
                    d = 768
                    index = faiss.IndexHNSWFlat(d, 512, faiss.METRIC_INNER_PRODUCT)
                    dataset.add_faiss_index("embeddings", custom_index=index)
                else:
                    d = 768
                    quantizer = faiss.IndexHNSWFlat(d, 32, faiss.METRIC_INNER_PRODUCT)
                    ivf_index = faiss.IndexIVFPQ(quantizer, d, 4096, 64, 8, faiss.METRIC_INNER_PRODUCT)
                    ivf_index.own_fields = True
                    quantizer.this.disown()
                    dataset.add_faiss_index(
                        "embeddings",
                        train_size=train_size,
                        custom_index=ivf_index,
                    )
                logging.info("Saving wiki_dpr faiss index")
                dataset.save_faiss_index("embeddings", index_file)
        return dataset
Example #2
    def build_hnsw(self, to_file, ef=2000, m=64):
        '''
        @description: train the HNSW index
        @param {type}
        to_file: path where the index will be saved
        @return:
        '''
        logging.info('Building hnsw index.')
        vecs = np.stack(self.data['custom_vec'].values).reshape(-1, 300)
        vecs = vecs.astype('float32')
        dim = self.w2v_model.vector_size

        # Declaring index
        index = faiss.IndexHNSWFlat(dim, m)  # build the index
        # note: faiss does not implement HNSW on the GPU, so the original
        # StandardGpuResources / index_cpu_to_gpu calls (whose result was discarded) are removed
        index.hnsw.efConstruction = ef
        print("add")
        index.verbose = True  # to see progress
        print('xb: ', vecs.shape)

        print('dtype: ', vecs.dtype)
        index.add(vecs)  # add vectors to the index
        print("total: ", index.ntotal)
        self.evaluate(vecs[:10000])
        faiss.write_index(index, to_file)
        return index
Example #3
    def fit(x_train):
        from benchmark_utils import timer

        D = x_train.shape[1]

        if metric in ["euclidean", "angular"]:
            index = faiss.IndexHNSWFlat(D, M)
            index.hnsw.efConstruction = 500
        else:
            raise NotImplementedError(f"The '{metric}' distance is not supported.")

        # Pre-processing:
        start = timer(use_torch=False)
        index.add(x_train)
        elapsed = timer(use_torch=False) - start

        # Return an operator for actual KNN queries:
        def f(x_test, efSearch=10):
            faiss.ParameterSpace().set_index_parameter(index, "efSearch", efSearch)
            start = timer(use_torch=False)
            distances, indices = index.search(x_test, K)
            elapsed = timer(use_torch=False) - start
            return indices, elapsed

        return f, elapsed
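A usage sketch for the operator returned above (not part of the original benchmark); it assumes metric, M and K are defined in the enclosing scope and that x_train / x_test are float32 numpy arrays:

    # build the HNSW index once, then run queries with different efSearch budgets
    f, build_time = fit(x_train)
    indices_fast, t_fast = f(x_test, efSearch=10)     # cheaper search, lower recall
    indices_slow, t_slow = f(x_test, efSearch=200)    # more graph exploration, higher recall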
Example #4
def train_coarse_quantizer(data,
                           quantizer_path,
                           num_clusters,
                           hnsw=False,
                           niter=10,
                           cuda=False):
    d = data.shape[1]

    index_flat = faiss.IndexFlatL2(d)
    # make it into a gpu index
    if cuda:
        res = faiss.StandardGpuResources()
        index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)
    clus = faiss.Clustering(d, num_clusters)
    clus.verbose = True
    clus.niter = niter
    clus.train(data, index_flat)
    centroids = faiss.vector_float_to_array(clus.centroids)
    centroids = centroids.reshape(num_clusters, d)

    if hnsw:
        quantizer = faiss.IndexHNSWFlat(d, 32)
        quantizer.hnsw.efSearch = 128
        quantizer.train(centroids)
        quantizer.add(centroids)
    else:
        quantizer = faiss.IndexFlatL2(d)
        quantizer.add(centroids)

    faiss.write_index(quantizer, quantizer_path)
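A possible follow-up, not part of the original function: the saved coarse quantizer can be reloaded and plugged into an IVF index. A minimal sketch, assuming the same data, d and num_clusters as above:

    # reload the coarse quantizer (already populated with num_clusters centroids)
    quantizer = faiss.read_index(quantizer_path)
    # build an IVF index on top of it; train() detects that the quantizer is
    # already trained and skips clustering, add() fills the inverted lists
    ivf = faiss.IndexIVFFlat(quantizer, d, num_clusters)
    ivf.train(data)
    ivf.add(data)
    distances, neighbors = ivf.search(data[:5], 10)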
Example #5
    def _post_process(self, dataset, resources_paths):
        if self.config.with_index:
            index_file = resources_paths["embeddings_index"]
            if os.path.exists(index_file):
                dataset.load_faiss_index("embeddings", index_file)
            else:
                import faiss

                train_size = self.config.index_train_size
                logging.info("Building wiki_dpr faiss index")
                if self.config.dummy:
                    dataset.add_faiss_index(
                        "embeddings",
                        string_factory="Flat",
                        metric_type=faiss.METRIC_INNER_PRODUCT,
                        train_size=train_size,
                    )
                else:
                    d = 768
                    quantizer = faiss.IndexHNSWFlat(d, 32,
                                                    faiss.METRIC_INNER_PRODUCT)
                    ivf_index = faiss.IndexIVFPQ(quantizer, d, 4096, 64, 8,
                                                 faiss.METRIC_INNER_PRODUCT)
                    ivf_index.own_fields = True
                    quantizer.this.disown()
                    dataset.add_faiss_index(
                        "embeddings",
                        train_size=train_size,
                        faiss_verbose=logging.getLogger().level <=
                        logging.DEBUG,
                        custom_index=ivf_index,
                    )
                logging.info("Saving wiki_dpr faiss index")
                dataset.save_faiss_index("embeddings", index_file)
        return dataset
Example #6
 def init_index(self, vector_sz: int):
     # IndexHNSWFlat supports L2 similarity only
     # so we have to apply DOT -> L2 similarity space conversion with the help of an extra dimension
     index = faiss.IndexHNSWFlat(vector_sz + 1, self.store_n)
     index.hnsw.efSearch = self.ef_search
     index.hnsw.efConstruction = self.ef_construction
     self.index = index
Example #7
def add_faiss_index_to_dataset(dataset):
    import faiss

    index = faiss.IndexHNSWFlat(768, 128, faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    return dataset
Example #8
    def save_faiss_index(self):
        try:
            diary_cover_pic_face_vec_fd = open(
                "./diary_cover_pic_face_vec.txt", "r")

            xb, ids = [], []
            for line in diary_cover_pic_face_vec_fd.readlines():
                line_term_list = line.split("\t")
                diary_id = line_term_list[0]
                face_feature = json.loads(line_term_list[1])
                face_feature_vec = np.array(face_feature)
                xb.append(face_feature_vec)
                ids.append(diary_id)

            xb_np = np.array(xb).astype('float32')
            ids_np = np.array(ids).astype('int64')  # faiss add_with_ids expects 64-bit ids
            index = faiss.IndexHNSWFlat(128, 32)
            index = faiss.IndexIDMap(index)
            index.add_with_ids(xb_np, ids_np)
            faiss.write_index(index, settings.INDEX_PATH)

            diary_cover_pic_face_vec_fd.close()
        except Exception:
            logging.error("catch exception, err_msg:%s" %
                          traceback.format_exc())
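A minimal sketch (not part of the original class) of reading the saved index back and querying it; because of the IndexIDMap wrapper, the neighbor ids returned by search are the original diary ids:

    index = faiss.read_index(settings.INDEX_PATH)
    query = np.random.rand(1, 128).astype('float32')   # placeholder query vector
    distances, diary_ids = index.search(query, 5)      # diary_ids holds the ids passed to add_with_ids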
Example #9
 def init_index(self):
     index = faiss.IndexHNSWFlat(self.vector_size + 1, 512)
     index.hnsw.efSearch = 128
     index.hnsw.efConstruction = 200
     self.index = index
     self._deserialize_index()
     self._index_initialized = True
Example #10
    def _post_process(self, dataset, resources_paths):
        if self.config.with_index:
            index_file = resources_paths["embeddings_index"]
            if os.path.exists(index_file):
                dataset.load_faiss_index("embeddings", index_file)
            else:
                if "embeddings" not in dataset.column_names:
                    raise ValueError("Couldn't build the index because there are no embeddings.")
                import faiss

                d = 768
                train_size = self.config.index_train_size
                logger.info("Building wiki_dpr faiss index")
                if self.config.index_name == "exact":
                    index = faiss.IndexHNSWSQ(d, faiss.ScalarQuantizer.QT_8bit, 128, faiss.METRIC_INNER_PRODUCT)
                    index.hnsw.efConstruction = 200
                    index.hnsw.efSearch = 128
                    dataset.add_faiss_index("embeddings", custom_index=index, train_size=train_size)
                else:
                    quantizer = faiss.IndexHNSWFlat(d, 128, faiss.METRIC_INNER_PRODUCT)
                    quantizer.hnsw.efConstruction = 200
                    quantizer.hnsw.efSearch = 128
                    ivf_index = faiss.IndexIVFPQ(quantizer, d, 4096, 128, 8, faiss.METRIC_INNER_PRODUCT)
                    ivf_index.nprobe = 64
                    ivf_index.own_fields = True
                    quantizer.this.disown()
                    dataset.add_faiss_index(
                        "embeddings",
                        train_size=train_size,
                        custom_index=ivf_index,
                    )
                logger.info("Saving wiki_dpr faiss index")
                dataset.save_faiss_index("embeddings", index_file)
        return dataset
Example #11
    def __init__(self,
                 image,
                 target_mask,
                 source_mask,
                 patch_size=(11, 11),
                 patch_weight=None,
                 num_neighbors=1):
        im_h, im_w, im_ch = image.shape

        if patch_weight is None:
            self.patch_weight = np.ones(patch_size, dtype=_im_dtype)
        else:
            self.patch_weight = patch_weight

        self.patch_size = patch_size
        self.num_neighb = num_neighbors

        print("Build NNF index: ", end=" ")
        start = time.time()

        if _NN_algorithm != "PatchMatch":
            self.source_ind = op.masked_indices(source_mask)
            self.target_ind = op.masked_indices(target_mask)

            # convert array indices to patch indices
            pad = patch_size[0] // 2
            ind_y, ind_x = np.divmod(self.source_ind, im_w)
            self.source_ind = (ind_x - pad) + (ind_y - pad) * (im_w - 2 * pad)

            source_point_cloud = (
                extract_patches_2d(image, patch_size=patch_size)[self.source_ind]
                .reshape((self.source_ind.size, -1))
                * np.repeat(np.sqrt(self.patch_weight), im_ch)
            )

            # need this because of FLANN bug (?) with memory release
            self.target_point_cloud = np.zeros(
                (self.target_ind.size, source_point_cloud.shape[-1]),
                dtype=_im_dtype)

        if _NN_algorithm == "FLANN":
            self.nn = flann.FLANN()
            self.nn.build_index(source_point_cloud,
                                algorithm="kdtree",
                                trees=1)  #, log_level = "info")
        elif _NN_algorithm == "Sklearn":
            self.nn = NearestNeighbors(
                n_neighbors=num_neighbors,
                algorithm='kd_tree',
                metric='minkowski',
                n_jobs=-1)  #,metric_params={'w':self.patch_weight})
            self.nn.fit(X=source_point_cloud)
        elif _NN_algorithm == "FAISS":
            self.nn = faiss.IndexHNSWFlat(source_point_cloud.shape[1], 50)
            self.nn.add(source_point_cloud)

        if _NN_algorithm == "PatchMatch":
            self.nn = pm.PatchMatch(target_mask,
                                    source_mask,
                                    patch_size=patch_size,
                                    lambdas=np.ones_like(image,
                                                         dtype=_im_dtype))

        print('%f sec' % (time.time() - start))
Example #12
 def fit(self, X):
     self.index = faiss.IndexHNSWFlat(len(X[0]), self.method_param["M"])
     self.index.hnsw.efConstruction = self.method_param["efConstruction"]
     self.index.verbose = True
     if (self.metric == 'angular'):
         X = X / np.linalg.norm(X, axis=1)[:, np.newaxis]
     self.index.add(X)
     faiss.omp_set_num_threads(1)
Example #13
    def test_size_t_ptr(self):
        # issue 1064
        index = faiss.IndexHNSWFlat(10, 32)

        hnsw = index.hnsw
        index.add(np.random.rand(100, 10).astype('float32'))
        be = np.empty(2, 'uint64')
        hnsw.neighbor_range(23, 0, faiss.swig_ptr(be), faiss.swig_ptr(be[1:]))
Example #14
    def test_wrapped_quantizer_HNSW(self):
        faiss.omp_set_num_threads(1)

        def bin2float(v):
            def byte2float(byte):
                return np.array(
                    [-1.0 + 2.0 * (byte & (1 << b) != 0) for b in range(0, 8)])

            return np.hstack([byte2float(byte)
                              for byte in v]).astype('float32')

        def floatvec2nparray(v):
            return np.array([np.float32(v.at(i)) for i in range(0, v.size())]) \
                     .reshape(-1, d)

        d = 256
        nt = 12800
        nb = 10000
        nq = 500
        (xt, xb, xq) = make_binary_dataset(d, nb, nt, nq)

        index_ref = faiss.IndexBinaryFlat(d)

        index_ref.add(xb)

        nlist = 256
        clus = faiss.Clustering(d, nlist)
        clus_index = faiss.IndexFlatL2(d)

        xt_f = np.array([bin2float(v) for v in xt])
        clus.train(xt_f, clus_index)

        centroids = floatvec2nparray(clus.centroids)
        hnsw_quantizer = faiss.IndexHNSWFlat(d, 32)
        hnsw_quantizer.add(centroids)
        hnsw_quantizer.is_trained = True
        wrapped_quantizer = faiss.IndexBinaryFromFloat(hnsw_quantizer)

        assert nlist == hnsw_quantizer.ntotal
        assert nlist == wrapped_quantizer.ntotal
        assert wrapped_quantizer.is_trained

        index = faiss.IndexBinaryIVF(wrapped_quantizer, d,
                                     hnsw_quantizer.ntotal)
        index.nprobe = 128

        assert index.is_trained

        index.add(xb)

        D_ref, I_ref = index_ref.search(xq, 10)
        D, I = index.search(xq, 10)

        recall = sum(gti[0] in Ii[:10] for gti, Ii in zip(I_ref, I)) \
                 / float(I_ref.shape[0])

        assert recall > 0.77, "recall = %g" % recall
Example #15
def indexed_function(text):

    print("Building Index")
    passages_path = '/home/gsir059/Music/faisis_index_test/my_kb'
    dataset = load_from_disk(passages_path)
    print("dataset loaded")
    index = faiss.IndexHNSWFlat(768, 128, faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)
    print("Building Done")
Example #16
def main(
    rag_example_args: "RagExampleArguments",
    processing_args: "ProcessingArguments",
    index_hnsw_args: "IndexHnswArguments",
):

    ######################################
    logger.info("Step 1 - Create the dataset")
    ######################################

    # The dataset needed for RAG must have three columns:
    # - title (string): title of the document
    # - text (string): text of a passage of the document
    # - embeddings (array of dimension d): DPR representation of the passage
    # Let's say you have documents in tab-separated csv files with columns "title" and "text"
    assert os.path.isfile(rag_example_args.csv_path), "Please provide a valid path to a csv file"

    # You can load a Dataset object this way
    dataset = load_dataset(
        "csv", data_files=[rag_example_args.csv_path], split="train", delimiter="\t", column_names=["title", "text"]
    )

    # More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets.html?highlight=csv#csv-files

    # Then split the documents into passages of 100 words
    dataset = dataset.map(split_documents, batched=True, num_proc=processing_args.num_proc)

    # And compute the embeddings
    ctx_encoder = DPRContextEncoder.from_pretrained(rag_example_args.dpr_ctx_encoder_model_name).to(device=device)
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(rag_example_args.dpr_ctx_encoder_model_name)
    new_features = Features(
        {"text": Value("string"), "title": Value("string"), "embeddings": Sequence(Value("float32"))}
    )  # optional, save as float32 instead of float64 to save space
    dataset = dataset.map(
        partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer),
        batched=True,
        batch_size=processing_args.batch_size,
        features=new_features,
    )

    # And finally save your dataset
    passages_path = os.path.join(rag_example_args.output_dir, "my_knowledge_dataset")
    dataset.save_to_disk(passages_path)
    # from datasets import load_from_disk
    # dataset = load_from_disk(passages_path)  # to reload the dataset

    ######################################
    logger.info("Step 2 - Index the dataset")
    ######################################

    # Let's use the Faiss implementation of HNSW for fast approximate nearest neighbor search
    index = faiss.IndexHNSWFlat(index_hnsw_args.d, index_hnsw_args.m, faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    # And save the index
    index_path = os.path.join(rag_example_args.output_dir, "my_knowledge_dataset_hnsw_index.faiss")
    dataset.get_index("embeddings").save(index_path)
Example #17
    def test_hnsw(self):
        d = self.xq.shape[1]

        index = faiss.IndexHNSWFlat(d, 16)
        index.add(self.xb)
        Dhnsw, Ihnsw = index.search(self.xq, 1)

        self.assertGreaterEqual((self.Iref == Ihnsw).sum(), 460)

        self.io_and_retest(index, Dhnsw, Ihnsw)
Example #18
def get_index(index_type, dim):
    if index_type == 'hnsw':
        m = 48
        index = faiss.IndexHNSWFlat(dim, m)
        index.hnsw.efConstruction = 128
        return index
    elif index_type == 'l2':
        return faiss.IndexFlatL2(dim)

    raise ValueError(f"Unsupported index type: {index_type}")
Example #19
def generate_faiss_index_dataset(data, ctx_encoder_name, args, device):
    """
    Adapted from Huggingface example script at https://github.com/huggingface/transformers/blob/master/examples/research_projects/rag/use_own_knowledge_dataset.py
    """
    import faiss

    if isinstance(data, str):
        dataset = load_dataset("csv",
                               data_files=data,
                               delimiter="\t",
                               column_names=["title", "text"])
    else:
        dataset = HFDataset.from_pandas(data)

    dataset = dataset.map(
        partial(split_documents,
                split_text_n=args.split_text_n,
                split_text_character=args.split_text_character),
        batched=True,
        num_proc=args.process_count,
    )

    ctx_encoder = DPRContextEncoder.from_pretrained(ctx_encoder_name).to(
        device=device)
    ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained(
        ctx_encoder_name)

    new_features = Features({
        "text": Value("string"),
        "title": Value("string"),
        "embeddings": Sequence(Value("float32"))
    })  # optional, save as float32 instead of float64 to save space
    dataset = dataset.map(
        partial(embed,
                ctx_encoder=ctx_encoder,
                ctx_tokenizer=ctx_tokenizer,
                device=device),
        batched=True,
        batch_size=args.rag_embed_batch_size,
        features=new_features,
    )
    if isinstance(data, str):
        dataset = dataset["train"]

    if args.save_knowledge_dataset:
        output_dataset_directory = os.path.join(args.output_dir,
                                                "knowledge_dataset")
        os.makedirs(output_dataset_directory, exist_ok=True)
        dataset.save_to_disk(output_dataset_directory)

    index = faiss.IndexHNSWFlat(args.faiss_d, args.faiss_m,
                                faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)

    return dataset
Example #20
    def __init__(self, vector_sz: int, buffer_size: int = 50000, store_n: int = 512
                 , ef_search: int = 128, ef_construction: int = 200):
        super(DenseHNSWFlatIndexer, self).__init__(buffer_size=buffer_size)

        # IndexHNSWFlat supports L2 similarity only
        # so we have to apply DOT -> L2 similarity space conversion with the help of an extra dimension
        index = faiss.IndexHNSWFlat(vector_sz + 1, store_n)
        index.hnsw.efSearch = ef_search
        index.hnsw.efConstruction = ef_construction
        self.index = index
        self.phi = 0
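The DOT -> L2 conversion mentioned in the comment is not shown in this snippet. A minimal sketch of the augmentation trick used in the DPR codebase, assuming numpy is imported as np and that all vectors are indexed in a single batch (the helper name below is hypothetical):

    def _augment_vectors(self, vectors):
        # extend each vector x with sqrt(phi - ||x||^2), where phi is the max squared norm;
        # then ||[q, 0] - [x, aux]||^2 = ||q||^2 + phi - 2 q.x, so an L2 search over the
        # augmented vectors ranks points by inner product with the zero-extended query
        norms = (vectors ** 2).sum(axis=1)
        self.phi = float(norms.max())
        aux = np.sqrt(self.phi - norms).reshape(-1, 1).astype('float32')
        return np.hstack((vectors, aux)).astype('float32')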
Example #21
 def __init__(self,
              dim,
              num_documents,
              num_subvectors=16,
              hnsw_m=32,
              nbits=8):
     super().__init__()
     nlist = math.floor(math.sqrt(num_documents))
     quantizer = faiss.IndexHNSWFlat(dim, hnsw_m)
     index = faiss.IndexIVFPQ(quantizer, dim, nlist, num_subvectors, nbits)
     self.index = faiss.index_cpu_to_all_gpus(index)
     self.num_training = max(nlist * 10, 256)
Example #22
 def _create_new_index(self, vector_dim: int, index_factory: str = "Flat", metric_type=faiss.METRIC_INNER_PRODUCT, **kwargs):
     if index_factory == "HNSW" and metric_type == faiss.METRIC_INNER_PRODUCT:
         # faiss index factory doesn't give the same results for HNSW IP, therefore direct init.
         # defaults here are similar to DPR codebase (good accuracy, but very high RAM consumption)
         n_links = kwargs.get("n_links", 128)
         index = faiss.IndexHNSWFlat(vector_dim, n_links, metric_type)
         index.hnsw.efSearch = kwargs.get("efSearch", 20)#20
         index.hnsw.efConstruction = kwargs.get("efConstruction", 80)#80
         logger.info(f"HNSW params: n_links: {n_links}, efSearch: {index.hnsw.efSearch}, efConstruction: {index.hnsw.efConstruction}")
     else:
         index = faiss.index_factory(vector_dim, index_factory, metric_type)
     return index
Example #23
def add_index(shard_dir, index_path):
    data_shard_list = []

    for shard_address in glob(str(shard_dir) + "/*/"):
        data_shard_list.append(load_from_disk(shard_address))

    concat = concatenate_datasets(data_shard_list)
    faiss.omp_set_num_threads(96)

    index = faiss.IndexHNSWFlat(768, 128, faiss.METRIC_INNER_PRODUCT)
    concat.add_faiss_index("embeddings", custom_index=index)
    concat.get_index("embeddings").save(
        index_path
    )  # since we load the index in to memory,we can directly update the index in the disk
Example #24
def run(args):
    csv_file = args.knowledge_file
    k_col = args.k_col
    title_col = args.title_col

    out_dir = args.out_dir
    os.makedirs(out_dir, exist_ok=True)
    passages_path = os.path.join(out_dir, KNOWLEGE_DIR)
    index_path = os.path.join(out_dir, INDEX_FILE)

    context_model = args.context_model
    device = "cuda" if (args.device
                        == "cuda") and torch.cuda.is_available() else "cpu"
    batch_size = args.batch_size
    hnsw_m = args.hnsw_m

    # load pretrained context model
    ctx_encoder = transformers.DPRContextEncoder.from_pretrained(
        context_model).to(device)
    ctx_tokenizer = transformers.tokenization_bert_japanese.BertJapaneseTokenizer.from_pretrained(
        context_model)
    embedding_dim = ctx_encoder.config.hidden_size

    # convert csv file to index
    dataset = load_dataset("csv", data_files=[csv_file], split="train")
    column_names = dataset.column_names[:]

    def set_text_title(doc):
        return {
            "text": doc[k_col],
            "title": doc[title_col] if title_col is not None else ""
        }

    dataset = dataset.map(set_text_title)
    dataset.remove_columns_(column_names)

    print(dataset[0])
    dataset = dataset.map(partial(embed,
                                  ctx_encoder=ctx_encoder,
                                  ctx_tokenizer=ctx_tokenizer,
                                  device=device),
                          batched=True,
                          batch_size=batch_size)

    # save index files
    dataset.save_to_disk(passages_path)
    index = faiss.IndexHNSWFlat(embedding_dim, hnsw_m,
                                faiss.METRIC_INNER_PRODUCT)
    dataset.add_faiss_index("embeddings", custom_index=index)
    dataset.get_index("embeddings").save(index_path)
Example #25
    def __init__(
        self,
        vector_sz: int,
        buffer_size: int = 50000,
        store_n: int = 128,
        ef_search: int = 256,
        ef_construction: int = 200,
    ):
        super(DenseHNSWFlatIndexer, self).__init__(buffer_size=buffer_size)

        index = faiss.IndexHNSWFlat(vector_sz, store_n, faiss.METRIC_INNER_PRODUCT)
        index.hnsw.efSearch = ef_search
        index.hnsw.efConstruction = ef_construction
        self.index = index
Example #26
    def _build_index(self, xb):

        d = xb.shape[-1]
        # res = faiss.StandardGpuResources()
        # index = faiss.GpuIndexFlatIP(res, d)

        # brute-force
        # index = faiss.IndexFlatL2(d)

        # HNSW
        index = faiss.IndexHNSWFlat(d, 32)

        index.add(xb)
        return index
Example #27
    def test_wrapped_quantizer_HNSW(self):
        def bin2float2d(v):
            n, d = v.shape
            vf = ((v.reshape(-1, 1) >> np.arange(8)) & 1).astype("float32")
            vf *= 2
            vf -= 1
            return vf.reshape(n, d * 8)

        d = 256
        nt = 12800
        nb = 10000
        nq = 500
        (xt, xb, xq) = make_binary_dataset(d, nb, nt, nq)

        index_ref = faiss.IndexBinaryFlat(d)

        index_ref.add(xb)

        nlist = 256
        clus = faiss.Clustering(d, nlist)
        clus_index = faiss.IndexFlatL2(d)

        xt_f = bin2float2d(xt)
        clus.train(xt_f, clus_index)

        centroids = faiss.vector_to_array(clus.centroids).reshape(-1, clus.d)
        hnsw_quantizer = faiss.IndexHNSWFlat(d, 32)
        hnsw_quantizer.add(centroids)
        hnsw_quantizer.is_trained = True
        wrapped_quantizer = faiss.IndexBinaryFromFloat(hnsw_quantizer)

        assert nlist == hnsw_quantizer.ntotal
        assert nlist == wrapped_quantizer.ntotal
        assert wrapped_quantizer.is_trained

        index = faiss.IndexBinaryIVF(wrapped_quantizer, d,
                                     hnsw_quantizer.ntotal)
        index.nprobe = 128

        assert index.is_trained

        index.add(xb)

        D_ref, I_ref = index_ref.search(xq, 10)
        D, I = index.search(xq, 10)

        recall = sum(gti[0] in Ii[:10] for gti, Ii in zip(I_ref, I)) \
                 / float(I_ref.shape[0])

        assert recall >= 0.77, "recall = %g" % recall
Example #28
    def _post_process(self, dataset, resources_paths, dl_manager):
        if self.config.with_index:
            index_file = resources_paths["embeddings_index"]
            if os.path.exists(index_file):
                print(">>> E")
                dataset.load_faiss_index("embeddings", index_file)
            else:
                try:
                    downloaded_resources = dl_manager.download_and_extract({
                        "embeddings_index":
                        os.path.join(
                            _INDEX_URL,
                            self.config.index_file.format(split=dataset.split))
                    })
                    dataset.load_faiss_index(
                        "embeddings", downloaded_resources["embeddings_index"])
                    print(">>> D")
                    dataset.save_faiss_index("embeddings", index_file)
                except ConnectionError:  # if the index of the specified split doesn't exist
                    print(">>> F")
                    import faiss

                    train_size = self.config.index_train_size
                    logging.info("Building wiki_dpr faiss index")
                    if self.config.index_type == "exact":
                        dataset.add_faiss_index(
                            "embeddings",
                            string_factory="Flat",
                            metric_type=faiss.METRIC_INNER_PRODUCT,
                        )
                    else:
                        d = 768
                        quantizer = faiss.IndexHNSWFlat(
                            d, 32, faiss.METRIC_INNER_PRODUCT)
                        ivf_index = faiss.IndexIVFPQ(
                            quantizer, d, 4096, 64, 8,
                            faiss.METRIC_INNER_PRODUCT)
                        ivf_index.own_fields = True
                        quantizer.this.disown()
                        dataset.add_faiss_index(
                            "embeddings",
                            train_size=train_size,
                            faiss_verbose=logging.getLogger().level <=
                            logging.DEBUG,
                            custom_index=ivf_index,
                        )
                    logging.info("Saving wiki_dpr faiss index")
                    dataset.save_faiss_index("embeddings", index_file)
        return dataset
Example #29
 def build(self, vectors):
     t0 = time.time()
     if self.add_noise:
         vectors += np.random.randn(vectors.shape[0],
                                    vectors.shape[1]) * self.noise_amount
     self.num_points += vectors.shape[0]
     if not self.assume_unit_normed:
         vectors = unit_norm(vectors)
     self.index = faiss.IndexHNSWFlat(vectors.shape[1], self.max_degree)
     self.index.hnsw.efConstruction = self.efConstruction
     self.index.hnsw.efSearch = self.efSearch
     self.index.metric_type = faiss.METRIC_L2
     self.index.add(vectors)
     t1 = time.time()
     self.total_insert_time += t1 - t0
Example #30
    def test_hnsw_IP(self):
        d = self.xq.shape[1]

        index_IP = faiss.IndexFlatIP(d)
        index_IP.add(self.xb)
        Dref, Iref = index_IP.search(self.xq, 1)

        index = faiss.IndexHNSWFlat(d, 16, faiss.METRIC_INNER_PRODUCT)
        index.add(self.xb)
        Dhnsw, Ihnsw = index.search(self.xq, 1)

        print('nb equal: ', (Iref == Ihnsw).sum())

        self.assertGreaterEqual((Iref == Ihnsw).sum(), 480)

        mask = Iref[:, 0] == Ihnsw[:, 0]
        assert np.allclose(Dref[mask, 0], Dhnsw[mask, 0])