Example #1
def init_index(self, vector_sz: int):
    # FAISS HNSW indexes support L2 distance only, so we apply a
    # DOT -> L2 similarity space conversion with the help of an extra dimension
    index = faiss.IndexHNSWSQ(vector_sz + 1, faiss.ScalarQuantizer.QT_8bit, self.store_n)
    index.hnsw.efSearch = self.ef_search
    index.hnsw.efConstruction = self.ef_construction
    self.index = index
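The "DOT -> L2" conversion referenced in the comment is the standard maximum-inner-product-search reduction: each corpus vector gets one extra coordinate so that L2 distance on the augmented vectors ranks results the same way as the inner product on the originals. A minimal sketch of that augmentation (the function name is illustrative, not from the snippet):

import numpy as np

def augment_for_l2(vectors: np.ndarray) -> np.ndarray:
    """Append sqrt(max_phi - ||x||^2) to each corpus vector x, where max_phi
    is the largest squared norm in the corpus. For a query q padded with 0,
    ||q' - x'||^2 = ||q||^2 + max_phi - 2 * <q, x>, so the L2 ordering
    matches the inner-product ordering."""
    norms = (vectors ** 2).sum(axis=1)
    extra = np.sqrt(norms.max() - norms)
    return np.hstack([vectors, extra[:, None]]).astype(np.float32)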
Example #2
    def _post_process(self, dataset, resources_paths):
        if self.config.with_index:
            index_file = resources_paths["embeddings_index"]
            if os.path.exists(index_file):
                dataset.load_faiss_index("embeddings", index_file)
            else:
                if "embeddings" not in dataset.column_names:
                    raise ValueError("Couldn't build the index because there are no embeddings.")
                import faiss

                d = 768
                train_size = self.config.index_train_size
                logger.info("Building wiki_dpr faiss index")
                if self.config.index_name == "exact":
                    # "exact" index: HNSW graph over 8-bit scalar-quantized vectors
                    index = faiss.IndexHNSWSQ(d, faiss.ScalarQuantizer.QT_8bit, 128, faiss.METRIC_INNER_PRODUCT)
                    index.hnsw.efConstruction = 200
                    index.hnsw.efSearch = 128
                    dataset.add_faiss_index("embeddings", custom_index=index, train_size=train_size)
                else:
                    # compressed index: IVF-PQ behind an HNSW coarse quantizer
                    quantizer = faiss.IndexHNSWFlat(d, 128, faiss.METRIC_INNER_PRODUCT)
                    quantizer.hnsw.efConstruction = 200
                    quantizer.hnsw.efSearch = 128
                    ivf_index = faiss.IndexIVFPQ(quantizer, d, 4096, 128, 8, faiss.METRIC_INNER_PRODUCT)
                    ivf_index.nprobe = 64
                    # hand ownership of the quantizer over to the IVF index (SWIG idiom)
                    ivf_index.own_fields = True
                    quantizer.this.disown()
                    dataset.add_faiss_index(
                        "embeddings",
                        train_size=train_size,
                        custom_index=ivf_index,
                    )
                logger.info("Saving wiki_dpr faiss index")
                dataset.save_faiss_index("embeddings", index_file)
        return dataset
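Once add_faiss_index (or load_faiss_index) has attached the index, retrieval goes through the datasets FAISS API. A hedged usage sketch; the query vector below is a random stand-in and would normally come from the matching DPR question encoder:

import numpy as np

# stand-in query embedding; wiki_dpr expects 768-dim DPR question encodings
question_embedding = np.random.rand(768).astype(np.float32)

scores, passages = dataset.get_nearest_examples("embeddings", question_embedding, k=10)
print(scores)             # similarity scores, best first
print(passages["title"])  # titles of the retrieved passages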
Example #3
    # to see progress
    index.verbose = True
    index.add(xb)

    print "search"
    for efSearch in 16, 32, 64, 128, 256:
        print "efSearch", efSearch,
        index.hnsw.efSearch = efSearch
        evaluate(index)

if 'hnsw_sq' in todo:

    print "Testing HNSW with a scalar quantizer"
    # also set M so that the vectors and links both use 128 bytes per
    # entry (total 256 bytes)
    index = faiss.IndexHNSWSQ(d, faiss.ScalarQuantizer.QT_8bit, 16)

    print "training"
    # training for the scalar quantizer
    index.train(xt)

    # this is the default, higher is more accurate and slower to
    # construct
    index.hnsw.efConstruction = 40

    print "add"
    # to see progress
    index.verbose = True
    index.add(xb)

    print "search"
Example #4
def __init__(self) -> None:
    # self.d (the vector dimensionality) is assumed to be set on the class
    self.index = faiss.IndexHNSWSQ(self.d, faiss.ScalarQuantizer.QT_8bit, 64)
    self.index.hnsw.efConstruction = 80
    self.index.hnsw.efSearch = 64
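Because IndexHNSWSQ stores scalar-quantized codes, the index must be trained before vectors are added. A minimal usage sketch, assuming self.d = 768 and random data in place of real embeddings:

import faiss
import numpy as np

d = 768  # assumed vector dimensionality
xb = np.random.rand(10000, d).astype(np.float32)

index = faiss.IndexHNSWSQ(d, faiss.ScalarQuantizer.QT_8bit, 64)
index.hnsw.efConstruction = 80
index.train(xb)   # fit the 8-bit scalar quantizer
index.add(xb)

index.hnsw.efSearch = 64
distances, ids = index.search(xb[:5], 10)  # 10 nearest neighbors for 5 queries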
Example #5
import logging

import faiss
import torch


def build_index_streaming(
    cached_embeddings_path,
    output_path,
    hnsw=False,
    sq8_quantization=False,
    fp16_quantization=False,
    store_n=256,
    ef_search=32,
    ef_construction=80,
    sample_fraction=0.1,
    indexing_batch_size=5000000,
):

    vector_size = get_vectors_dim(cached_embeddings_path)

    if hnsw:
        if sq8_quantization:
            index = faiss.IndexHNSWSQ(vector_size + 1,
                                      faiss.ScalarQuantizer.QT_8bit, store_n)
        elif fp16_quantization:
            index = faiss.IndexHNSWSQ(vector_size + 1,
                                      faiss.ScalarQuantizer.QT_fp16, store_n)
        else:
            index = faiss.IndexHNSWFlat(vector_size + 1, store_n)

        index.hnsw.efSearch = ef_search
        index.hnsw.efConstruction = ef_construction
    else:
        if sq8_quantization:
            index = faiss.IndexScalarQuantizer(vector_size,
                                               faiss.ScalarQuantizer.QT_8bit,
                                               faiss.METRIC_L2)
        elif fp16_quantization:
            index = faiss.IndexScalarQuantizer(vector_size,
                                               faiss.ScalarQuantizer.QT_fp16,
                                               faiss.METRIC_L2)
        else:
            # faiss has no IndexIP; a flat inner-product index is the intended
            # fallback. No augmentation happens in the non-HNSW branch, so the
            # raw dimensionality is used (as in the SQ branches above).
            index = faiss.IndexFlatIP(vector_size)

    vector_sample, max_phi, N = get_vector_sample(cached_embeddings_path,
                                                  sample_fraction)
    if hnsw:
        vector_sample = augment_vectors(vector_sample, max_phi)

    if sq8_quantization or fp16_quantization:  # index requires training
        vs = vector_sample.numpy()
        logging.info(f'Training Quantizer with matrix of shape {vs.shape}')
        index.train(vs)
        del vs
    del vector_sample

    chunks_to_add = []
    added = 0
    for vector_chunk in parse_vectors_from_directory(cached_embeddings_path,
                                                     as_chunks=True):
        if hnsw:
            vector_chunk = augment_vectors(vector_chunk, max_phi)

        chunks_to_add.append(vector_chunk)

        if sum(c.shape[0] for c in chunks_to_add) > indexing_batch_size:
            to_add = torch.cat(chunks_to_add).numpy()
            chunks_to_add = []
            logging.info(
                f'Adding Vectors {added} -> {added + to_add.shape[0]} of {N}')
            index.add(to_add)
            added += to_add.shape[0]

    if len(chunks_to_add) > 0:
        to_add = torch.cat(chunks_to_add).numpy()
        logging.info(
            f'Adding Vectors {added} -> {added + to_add.shape[0]} of {N}')
        index.add(to_add)

    logging.info(f'Index Built, writing index to {output_path}')
    faiss.write_index(index, output_path)
    logging.info('Index dumped')
    return index
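Since the HNSW variants above index augmented (d + 1)-dimensional vectors, queries have to be padded the same way before searching; the usual convention is a zero in the extra coordinate. A small sketch under that assumption (the helper name is illustrative):

import numpy as np

def search_augmented(index, queries, k=10):
    # pad MIPS queries with a zero column to match the (d + 1)-dim index
    zeros = np.zeros((queries.shape[0], 1), dtype=np.float32)
    return index.search(np.hstack([queries.astype(np.float32), zeros]), k)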
Example #6
0, and grouping triplets of 3,5,8 and 4,7,9, which can
blend into one another in some cases.
"""
import umap
from sklearn.datasets import fetch_openml
import matplotlib.pyplot as plt
import seaborn as sns

import os.path
import faiss
import numpy as np

mnist = fetch_openml("mnist_784", version=1)
sns.set(context="paper", style="white")

index = faiss.IndexHNSWSQ(784, faiss.ScalarQuantizer.QT_8bit, 32)
index.verbose = True
faiss_index_file = 'faiss.index'
if os.path.exists(faiss_index_file):
    print('load existing index from %s' % faiss_index_file)
    index = faiss.read_index(faiss_index_file, faiss.IO_FLAG_MMAP)
    index.hnsw.efSearch = 256
else:
    # build lossy faiss index
    print('build new index and save to %s' % faiss_index_file)
    index.hnsw.efConstruction = 40
    data = np.ascontiguousarray(mnist.data, dtype=np.float32)
    # we no longer need mnist data in its original form
    print('train index...')
    index.train(data)
    print('add vectors to index...')