Example 1
    def do_mmappedIO(self, sparse):
        d = 10
        nb = 1000
        nq = 200
        nt = 200
        xt, xb, xq = get_dataset_2(d, nb, nt, nq)
        if sparse:
            fname = "/tmp/faiss_test_rareio_sparse.faissindex"
        else:
            fname = "/tmp/faiss_test_rareio_full.faissindex"

        quantizer = faiss.IndexFlatL2(d)
        index1 = faiss.IndexIVFFlat(quantizer, d, 20)
        if sparse:
            # makes the inverted lists sparse because all elements get
            # assigned to the same invlist
            xt += (np.ones(10) * 1000).astype('float32')

        index1.train(xt)
        index1.add(xb)
        faiss.write_index(index1, fname)

        index2 = faiss.read_index(fname)
        self.compare_results(index1, index2, xq)

        index3 = faiss.read_index(fname, faiss.IO_FLAG_MMAP)
        self.compare_results(index1, index3, xq)
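compare_results is a helper of the test class that is not shown here; a minimal sketch of what such a check might do, assuming it only has to compare search results from the two indexes:

import numpy as np

def compare_results(index1, index2, xq, k=10):
    # hypothetical helper: run the same queries on both indexes and verify
    # that the returned ids and distances agree
    D1, I1 = index1.search(xq, k)
    D2, I2 = index2.search(xq, k)
    assert np.all(I1 == I2)
    assert np.allclose(D1, D2)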
Example 2
def get_trained_index():
    filename = "%s/%s_%s_trained.index" % (
        tmpdir, dbname, index_key)

    if not os.path.exists(filename):
        index = faiss.index_factory(d, index_key)

        n_train = choose_train_size(index_key)

        xtsub = xt[:n_train]
        print "Keeping %d train vectors" % xtsub.shape[0]
        # make sure the data is actually in RAM and in float
        xtsub = xtsub.astype('float32').copy()
        index.verbose = True

        t0 = time.time()
        index.train(xtsub)
        index.verbose = False
        print "train done in %.3f s" % (time.time() - t0)
        print "storing", filename
        faiss.write_index(index, filename)
    else:
        print "loading", filename
        index = faiss.read_index(filename)
    return index
Example 3
def get_populated_index(preproc):

    if not index_cachefile or not os.path.exists(index_cachefile):
        if not altadd:
            gpu_index, indexall = compute_populated_index(preproc)
        else:
            gpu_index, indexall = compute_populated_index_2(preproc)
        if index_cachefile:
            print "store", index_cachefile
            faiss.write_index(indexall, index_cachefile)
    else:
        print "load", index_cachefile
        indexall = faiss.read_index(index_cachefile)
        gpu_index = None

    co = faiss.GpuMultipleClonerOptions()
    co.useFloat16 = use_float16
    co.useFloat16CoarseQuantizer = False
    co.usePrecomputed = use_precomputed_tables
    co.indicesOptions = 0
    co.verbose = 10
    co.shard = True # the replicas will be made "manually"
    t0 = time.time()
    print "CPU index contains %d vectors, move to GPU" % indexall.ntotal
    if replicas == 1:

        if not gpu_index:
            print "copying loaded index to GPUs"
            vres, vdev = make_vres_vdev()
            index = faiss.index_cpu_to_gpu_multiple(
                vres, vdev, indexall, co)
        else:
            index = gpu_index

    else:
        del gpu_index # We override the GPU index

        print "Copy CPU index to %d sharded GPU indexes" % replicas

        index = faiss.IndexProxy()

        for i in range(replicas):
            gpu0 = ngpu * i // replicas
            gpu1 = ngpu * (i + 1) // replicas
            vres, vdev = make_vres_vdev(gpu0, gpu1)

            print "   dispatch to GPUs %d:%d" % (gpu0, gpu1)

            index1 = faiss.index_cpu_to_gpu_multiple(
                vres, vdev, indexall, co)
            index1.this.disown()
            index.addIndex(index1)
        index.own_fields = True
    del indexall
    print "move to GPU done in %.3f s" % (time.time() - t0)
    return index
Example 4
def get_populated_index():

    filename = "%s/%s_%s_populated.index" % (
        tmpdir, dbname, index_key)

    if not os.path.exists(filename):
        index = get_trained_index()
        i0 = 0
        t0 = time.time()
        for xs in matrix_slice_iterator(xb, 100000):
            i1 = i0 + xs.shape[0]
            print('\radd %d:%d, %.3f s' % (i0, i1, time.time() - t0), end=' ')
            sys.stdout.flush()
            index.add(xs)
            i0 = i1
        print()
        print("Add done in %.3f s" % (time.time() - t0))
        print("storing", filename)
        faiss.write_index(index, filename)
    else:
        print "loading", filename
        index = faiss.read_index(filename)
    return index
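matrix_slice_iterator is defined elsewhere in the benchmark script; a minimal sketch of such a helper, under the assumption that it only needs to yield contiguous float32 row blocks:

import numpy as np

def matrix_slice_iterator(x, bs):
    # hypothetical helper: yield contiguous float32 blocks of at most bs rows
    for i0 in range(0, x.shape[0], bs):
        yield np.ascontiguousarray(x[i0:i0 + bs], dtype='float32')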
Example 5
    def test_dedup(self):
        d = 10
        nb = 1000
        nq = 200
        nt = 500
        xt, xb, xq = get_dataset_2(d, nt, nb, nq)

        # introduce duplicates
        xb[500:900:2] = xb[501:901:2]
        xb[901::4] = xb[900::4]
        xb[902::4] = xb[900::4]
        xb[903::4] = xb[900::4]

        # also in the train set
        xt[201::2] = xt[200::2]

        quantizer = faiss.IndexFlatL2(d)
        index_new = faiss.IndexIVFFlatDedup(quantizer, d, 20)

        index_new.verbose = True
        # should display
        # IndexIVFFlatDedup::train: train on 350 points after dedup (was 500 points)
        index_new.train(xt)

        index_ref = faiss.IndexIVFFlat(quantizer, d, 20)
        assert index_ref.is_trained

        index_ref.nprobe = 5
        index_ref.add(xb)
        index_new.nprobe = 5
        index_new.add(xb)

        Dref, Iref = index_ref.search(xq, 20)
        Dnew, Inew = index_new.search(xq, 20)

        for i in range(nq):
            ref = self.normalize_res(Dref[i], Iref[i])
            new = self.normalize_res(Dnew[i], Inew[i])
            assert ref == new

        # test I/O
        _, tmpfile = tempfile.mkstemp()
        try:
            faiss.write_index(index_new, tmpfile)
            index_st = faiss.read_index(tmpfile)
        finally:
            if os.path.exists(tmpfile):
                os.unlink(tmpfile)
        Dst, Ist = index_st.search(xq, 20)

        for i in range(nq):
            new = self.normalize_res(Dnew[i], Inew[i])
            st = self.normalize_res(Dst[i], Ist[i])
            assert st == new

        # test remove
        toremove = np.hstack((np.arange(3, 1000, 5), np.arange(850, 950)))
        index_ref.remove_ids(toremove)
        index_new.remove_ids(toremove)

        Dref, Iref = index_ref.search(xq, 20)
        Dnew, Inew = index_new.search(xq, 20)

        for i in range(nq):
            ref = self.normalize_res(Dref[i], Iref[i])
            new = self.normalize_res(Dnew[i], Inew[i])
            assert ref == new
Example 6
        similarities = scores[0]
        neighbor_ids = scores[1]
        margin_scores = scores[2]
        current_embeddings_ids = scores[3]
        with open(output_file, 'a') as output:
            for idx in range(similarities.shape[0]):
                result = str(similarities[idx].tolist()) + "\t" \
                        + str(neighbor_ids[idx].tolist()) + "\t" \
                        + str(margin_scores[idx].tolist()) + "\t" \
                        + str(current_embeddings_ids[idx])
                output.write(result)
                output.write("\n")


if __name__ == "__main__":
    argument_parser = argparse.ArgumentParser()
    argument_parser.add_argument("--index", help="Searchable index file")
    argument_parser.add_argument("--embeddings",
                                 help="Embeddings for sentences")
    argument_parser.add_argument("--batch-size",
                                 help="Batch size for searching")
    argument_parser.add_argument("--output", help="Output file")
    argument_parser.add_argument("--neighbors",
                                 help="Number of nearest neighbors")
    arguments = argument_parser.parse_args()
    index = faiss.read_index(arguments.index)
    index = faiss.index_cpu_to_all_gpus(index)
    k = int(arguments.neighbors)
    write_results_to_file(arguments.output, index, arguments.embeddings,
                          int(arguments.batch_size), k)
Example 7
 def __init_indexes(self):
     for fname in self.base_dir.glob(self.pattern):
         print(fname)
         idx = fname.stem.split('_')[-1]
         self.indexes[int(idx)] = faiss.read_index(str(fname))
Example 8
def merge_indexes(subindex_dir, trained_index_path, target_index_path,
                  target_idx2id_path, target_inv_path):
    # target_inv_path = merged_index.ivfdata
    names = os.listdir(subindex_dir)
    idx2id_paths = [
        os.path.join(subindex_dir, name) for name in names
        if name.endswith('.hdf5')
    ]
    index_paths = [
        os.path.join(subindex_dir, name) for name in names
        if name.endswith('.faiss')
    ]
    print(len(idx2id_paths))
    print(len(index_paths))

    print('copying idx2id')
    with h5py.File(target_idx2id_path, 'w') as out:
        for idx2id_path in tqdm(idx2id_paths, desc='copying idx2id'):
            with h5py.File(idx2id_path, 'r') as in_:
                for key, g in in_.items():
                    offset = str(g.attrs['offset'])
                    assert key == offset
                    group = out.create_group(offset)
                    group.create_dataset('doc', data=in_[key]['doc'])
                    group.create_dataset('word', data=in_[key]['word'])

    print('loading invlists')
    ivfs = []
    for index_path in tqdm(index_paths, desc='loading invlists'):
        # the IO_FLAG_MMAP is to avoid actually loading the data thus
        # the total size of the inverted lists can exceed the
        # available RAM
        index = faiss.read_index(index_path, faiss.IO_FLAG_MMAP)
        ivfs.append(index.invlists)

        # avoid that the invlists get deallocated with the index
        index.own_invlists = False

    # construct the output index
    index = faiss.read_index(trained_index_path)

    # prepare the output inverted lists. They will be written
    # to merged_index.ivfdata
    invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size,
                                         target_inv_path)

    # merge all the inverted lists
    print('merging')
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in tqdm(ivfs):
        ivf_vector.push_back(ivf)

    print("merge %d inverted lists " % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())
    print(ntotal)

    # now replace the inverted lists in the output index
    index.ntotal = ntotal
    index.replace_invlists(invlists)

    print('writing index')
    faiss.write_index(index, target_index_path)
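Once merge_indexes has run, the merged index can be reopened without pulling the inverted lists into RAM; a short usage sketch, with the file name as an assumption:

import faiss
import numpy as np

# "merged.index" stands for target_index_path; its .ivfdata file must sit in
# the same directory, hence IO_FLAG_ONDISK_SAME_DIR
index = faiss.read_index("merged.index", faiss.IO_FLAG_ONDISK_SAME_DIR)
index.nprobe = 16  # number of inverted lists visited per query
xq = np.random.rand(5, index.d).astype('float32')
D, I = index.search(xq, 10)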
Example 9
    def setup_faiss(self, args):
        if not args.dstore_filename:
            raise ValueError('Cannot build a datastore without the data.')

        start = time.time()
        index = faiss.read_index(args.indexfile, faiss.IO_FLAG_ONDISK_SAME_DIR)
        print('Reading datastore took {} s'.format(time.time() - start))
        index.nprobe = args.probe

        if args.dstore_fp16:
            print('Keys are fp16 and vals are int16')
            # Load the keys even if they are not being used.
            # if not args.no_load_keys:
            self.keys = np.memmap(args.dstore_filename + '_keys.npy',
                                  dtype=np.float16,
                                  mode='r',
                                  shape=(self.dstore_size, self.dimension))
            self.vals = np.memmap(args.dstore_filename + '_vals.npy',
                                  dtype=np.int16,
                                  mode='r',
                                  shape=(self.dstore_size, 1))
        else:
            print('Keys are fp32 and vals are int64')
            # Load the keys even if they are not being used.
            # if not args.no_load_keys:
            self.keys = np.memmap(args.dstore_filename + '_keys.npy',
                                  dtype=np.float32,
                                  mode='r',
                                  shape=(self.dstore_size, self.dimension))
            self.vals = np.memmap(args.dstore_filename + '_vals.npy',
                                  dtype=np.int64,
                                  mode='r',
                                  shape=(self.dstore_size, 1))

        # If you wish to load all the keys into memory
        # CAUTION: Only do this if your RAM can handle it!
        if args.move_dstore_to_mem:
            print('Loading to memory...')
            start = time.time()

            if not args.no_load_keys:
                del self.keys
                self.keys_from_memmap = np.memmap(
                    args.dstore_filename + '_keys.npy',
                    dtype=np.float32,
                    mode='r',
                    shape=(self.dstore_size, self.dimension))
                self.keys = np.zeros(
                    (self.dstore_size, self.dimension),
                    dtype=np.float16 if args.dstore_fp16 else np.float32)
                self.keys = self.keys_from_memmap[:]
                self.keys = self.keys.astype(
                    np.float16 if args.dstore_fp16 else np.float32)

            del self.vals
            self.vals_from_memmap = np.memmap(args.dstore_filename +
                                              '_vals.npy',
                                              dtype=np.int64,
                                              mode='r',
                                              shape=(self.dstore_size, 1))
            self.vals = np.zeros(
                (self.dstore_size, 1),
                dtype=np.int16 if args.dstore_fp16 else np.int64)
            self.vals = self.vals_from_memmap[:]
            self.vals = self.vals.astype(
                np.int16 if args.dstore_fp16 else np.int64)
            print('Loading to memory took {} s'.format(time.time() - start))

        return index
Example 10
origins = [
    "http://clipis.co",
    "https://clipis.co",
    "http://*****:*****",
]


@app.post("/encode/image", summary="Index one image")
async def encode_image(data: Image):
    print(data)
    img_name = data.name
    data = data.base64img.encode('ascii')
    index_one_image(data, img_name, model, preprocess, device)
    return Response("Successful", status_code=200)


@app.post("/encode/images", summary="Index multiple images at once")
async def encode_images(data: ImageBatch):
    image_base64_list = data.img_list
Example 11
######################################################

xt, xb, xq, gt = datasets.load_data(dataset=args.db,
                                    compute_gt=args.compute_gt)

nq, d = xq.shape
nb, d = xb.shape

######################################################
# Make index
######################################################

if os.path.exists(args.indexfile):

    print "reading", args.indexfile
    index = faiss.read_index(args.indexfile)

    if isinstance(index, faiss.IndexPreTransform):
        index_hnsw = faiss.downcast_index(index.index)
        vec_transform = index.chain.at(0).apply_py
    else:
        index_hnsw = index
        vec_transform = lambda x: x

    hnsw = index_hnsw.hnsw
    hnsw_stats = faiss.cvar.hnsw_stats

else:

    print "build index, key=", args.indexkey
Example 12
if stage == 0:
    # train the index
    xt = fvecs_read("sift1M/sift_learn.fvecs")
    index = faiss.index_factory(xt.shape[1], "IVF4096,Flat")
    print("training index")
    index.train(xt)
    print("write " + tmpdir + "trained.index")
    faiss.write_index(index, tmpdir + "trained.index")

if 1 <= stage <= 4:
    # add 1/4 of the database to 4 independent indexes
    bno = stage - 1
    xb = fvecs_read("sift1M/sift_base.fvecs")
    i0, i1 = int(bno * xb.shape[0] / 4), int((bno + 1) * xb.shape[0] / 4)
    index = faiss.read_index(tmpdir + "trained.index")
    print("adding vectors %d:%d" % (i0, i1))
    index.add_with_ids(xb[i0:i1], np.arange(i0, i1))
    print("write " + tmpdir + "block_%d.index" % bno)
    faiss.write_index(index, tmpdir + "block_%d.index" % bno)

machine_ports = [
    ('localhost', 12010),
    ('localhost', 12011),
    ('localhost', 12012),
    ('localhost', 12013),
]
v6 = False

if 5 <= stage <= 8:
    # load an index slice and launch index
Example 13
import face_recognition
import faiss
import numpy as np
from pathlib import Path
from milvus import Milvus, IndexType, MetricType, Status
from face_recognition_app.models.images import Images
from face_recognition_app import app

index = faiss.read_index(app.config['INDEXES_LOCATION'] + 'faiss.index')
embeddings = np.load(app.config['EMBEDDINGS_LOCATION'] + 'embeddings.npy',
                     allow_pickle=True).tolist()

milvus = Milvus(host=app.config['MILVUS_HOST'], port=app.config['MILVUS_PORT'])

MAX_RESULT = 64


class Embeddings():
    def is_similar(current_image_encoded, unknown_face_encodings):
        # match your image with the image and check if it matches
        result = face_recognition.compare_faces(unknown_face_encodings,
                                                current_image_encoded,
                                                tolerance=0.6)

        # check if it was a match
        return bool(result[0])

    def distance(current_image_encoded, unknown_face_encodings):
Example 14
    def build(self, config):
        '''
            build index from scratch
        '''
        operation_method = config.get("index_operation", "new").lower()

        gallery_images, gallery_docs = split_datafile(
            config['data_file'], config['image_root'], config['delimiter'])

        # when removing data from the index, there is no need to extract features
        if operation_method != "remove":
            gallery_features = self._extract_features(gallery_images, config)
        assert operation_method in [
            "new", "remove", "append"
        ], "Only 'new', 'remove' and 'append' operations are supported"

        # vector.index: faiss index file
        # id_map.pkl: use this file to map id to image_doc
        if operation_method in ["remove", "append"]:
            # if remove or append, vector.index and id_map.pkl must exist
            assert os.path.exists(
                os.path.join(config["index_dir"], "vector.index")
            ), "The vector.index does not exist in {} when 'index_operation' is not None".format(
                config["index_dir"])
            assert os.path.exists(
                os.path.join(config["index_dir"], "id_map.pkl")
            ), "The id_map.pkl does not exist in {} when 'index_operation' is not None".format(
                config["index_dir"])
            index = faiss.read_index(
                os.path.join(config["index_dir"], "vector.index"))
            with open(os.path.join(config["index_dir"], "id_map.pkl"),
                      'rb') as fd:
                ids = pickle.load(fd)
            assert index.ntotal == len(
                ids.keys()), "number of vectors in index does not match id_map"
        else:
            if not os.path.exists(config["index_dir"]):
                os.makedirs(config["index_dir"], exist_ok=True)
            index_method = config.get("index_method", "HNSW32")

            # if using an IVF method, compute the number of inverted lists automatically
            if index_method == "IVF":
                index_method = index_method + str(
                    min(int(len(gallery_images) // 8), 65536)) + ",Flat"

            # for binary index, add B at head of index_method
            if config["dist_type"] == "hamming":
                index_method = "B" + index_method

            #dist_type
            dist_type = faiss.METRIC_INNER_PRODUCT if config[
                "dist_type"] == "IP" else faiss.METRIC_L2

            #build index
            if config["dist_type"] == "hamming":
                index = faiss.index_binary_factory(config["embedding_size"],
                                                   index_method)
            else:
                index = faiss.index_factory(config["embedding_size"],
                                            index_method, dist_type)
                index = faiss.IndexIDMap2(index)
            ids = {}

        if config["index_method"] == "HNSW32":
            logger.warning(
                "The HNSW32 method dose not support 'remove' operation")

        if operation_method != "remove":
            # calculate id for new data
            start_id = max(ids.keys()) + 1 if ids else 0
            ids_now = (
                np.arange(0, len(gallery_images)) + start_id).astype(np.int64)

            # only train when new index file
            if operation_method == "new":
                if config["dist_type"] == "hamming":
                    index.add(gallery_features)
                else:
                    index.train(gallery_features)

            if not config["dist_type"] == "hamming":
                index.add_with_ids(gallery_features, ids_now)

            for i, d in zip(list(ids_now), gallery_docs):
                ids[i] = d
        else:
            if config["index_method"] == "HNSW32":
                raise RuntimeError(
                    "The index_method: HNSW32 dose not support 'remove' operation"
                )
            # remove ids in id_map, remove index data in faiss index
            remove_ids = list(
                filter(lambda k: ids.get(k) in gallery_docs, ids.keys()))
            remove_ids = np.asarray(remove_ids)
            index.remove_ids(remove_ids)
            for k in remove_ids:
                del ids[k]

        # store faiss index file and id_map file
        if config["dist_type"] == "hamming":
            faiss.write_index_binary(
                index, os.path.join(config["index_dir"], "vector.index"))
        else:
            faiss.write_index(
                index, os.path.join(config["index_dir"], "vector.index"))

        with open(os.path.join(config["index_dir"], "id_map.pkl"), 'wb') as fd:
            pickle.dump(ids, fd)
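At query time, the two files written by this build step are loaded back together; a short sketch of that side, with the directory name as an assumption (this is not part of the original class):

import os
import pickle

import faiss
import numpy as np

index_dir = "./index"  # assumed value of config["index_dir"]
index = faiss.read_index(os.path.join(index_dir, "vector.index"))
with open(os.path.join(index_dir, "id_map.pkl"), "rb") as fd:
    id_map = pickle.load(fd)

xq = np.random.rand(1, index.d).astype('float32')  # stand-in query feature
D, I = index.search(xq, 5)
docs = [id_map[int(i)] for i in I[0] if int(i) >= 0]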
Example 15
    def __init__(
        self,
        model: str = "paraphrase-multilingual-MiniLM-L12-v2",
        idx_path: str = os.path.join(
            os.path.expanduser('~'),
            ".dialobot",
            "intent/",
        ),
        idx_file: str = "intent.idx",
        dataset_file: str = "dataset.pkl",
        fallback_threshold: float = 0.6,
        topk: int = 5,
        labeling_count: int = 20,
        device="cpu",
    ) -> None:
        """
        IntentRetriever using USE and faiss.
        Dialobot conducts fallback checking through vector retrieval.

        Args:
            model (str): model name for sentence transformers
            idx_path (str): path to save dataset
            idx_file (str): file name of trained faiss
            dataset_file (str): file name of dataset
            fallback_threshold (float): threshold for fallback checking
            topk (int): number of distances to return
            labeling_count (int) : Minimum Labeling Count

        References:
            Universal Sentence Encoder (Cer et al., 2018)
            https://arxiv.org/abs/1803.11175

            Billion-scale similarity search with GPUs (Johnson et al., 2017)
            https://arxiv.org/abs/1702.08734

        Note:
            If the number of data is smaller than the labeling_count,
            it is classified as 'the number of data',
            and if it is more than that,
            it is classified as 'int(the number of data / topk)' labels.

        Examples:
            >>> # 1. create retriever
            >>> retriever = IntentRetriever()
            >>> # 2. add data, batch data
            >>> retriever.add(("What time is it now?", "time"))
            >>> retriever.add(("Tell me today's weather", "weather"))
            >>> retriever.add([("What time do we meet tomorrow?", "time"),  ("How will the weather be tomorrow?", "weather")])
            >>> # 3. remove data
            >>> retriever.remove(("What time is it now?", "time"))
            >>> # 4. recognize intent
            >>> retriever.recognize("Tell me tomorrow's weather")
            'weather'
            >>> # 5. set `True` param `detail` if you want more information
            >>> retriever.recognize("Tell me tomorrow's weather", detail=True)
            {'intent': 'weather', 'scores': {'weather': 0.98, 'greeting': 0.69, ...}
            >>> # 6. clear all dataset
            >>> retriever.clear()
        """
        assert model in self.available_models(), \
            "param `retriever_model` must be one of {}".format(str(list(self.available_models())))
        self.device = device
        self.model = SentenceTransformer(model).to(self.device)
        self.dim = RETRIEVER_MODELS_DIMENSION[model]
        self.topk = topk
        self.labeling_count = labeling_count

        self.quantizer = faiss.IndexFlatIP(self.dim)

        self.idx_path = idx_path
        self.idx_file = idx_file
        self.dataset_file = dataset_file
        self.fallback_threshold = fallback_threshold

        if os.path.exists(idx_path + idx_file):
            self.index = faiss.read_index(idx_path + idx_file)
            self.nlist = int(self.index.ntotal / self.topk)
        else:
            self.nlist = 1
            self.index = faiss.IndexIVFFlat(
                self.quantizer,
                self.dim,
                self.nlist,
                faiss.METRIC_INNER_PRODUCT,
            )

        if os.path.exists(idx_path + dataset_file):
            with open(idx_path + dataset_file, mode="rb") as f:
                self.dataset: List[Tuple[str, np.ndarray,
                                         str]] = pickle.load(f)
        else:
            os.makedirs(idx_path, exist_ok=True)
            self.dataset: List[Tuple[str, np.ndarray, str]] = []
Example 16
 def __init__(self, path, nprobe, k=1024):
     index = faiss.read_index(path)
     index.nprobe = nprobe
     self.index = index
     self.k = k
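Only the constructor is shown above; a hedged sketch of how such a wrapper might expose search (the class and method names are assumptions, not part of the original snippet):

import faiss
import numpy as np

class FaissSearcher:
    def __init__(self, path, nprobe, k=1024):
        index = faiss.read_index(path)
        index.nprobe = nprobe
        self.index = index
        self.k = k

    def search(self, queries):
        # queries: float32 array of shape (n, d)
        queries = np.ascontiguousarray(queries, dtype='float32')
        return self.index.search(queries, self.k)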
Example 17
import matplotlib.pyplot as plt
import seaborn as sns
import umap
from sklearn.datasets import fetch_openml

import os.path
import faiss
import numpy as np

mnist = fetch_openml("mnist_784", version=1)
sns.set(context="paper", style="white")

index = faiss.IndexHNSWSQ(784, faiss.ScalarQuantizer.QT_8bit, 32)
index.verbose = True
faiss_index_file = 'faiss.index'
if os.path.exists(faiss_index_file):
    print('load existing index from %s' % faiss_index_file)
    index = faiss.read_index(faiss_index_file, faiss.IO_FLAG_MMAP)
    index.hnsw.efSearch = 256
else:
    # build lossy faiss index
    print('build new index and save to %s' % faiss_index_file)
    index.hnsw.efConstruction = 40
    data = np.ascontiguousarray(mnist.data, dtype=np.float32)
    # we no longer need mnist data in its original form
    print('train index...')
    index.train(data)
    print('add vectors to index...')
    index.add(data)
    print('save...')
    faiss.write_index(index, faiss_index_file)

reducer = umap.UMAP(random_state=42, init="random", verbose=True, n_epochs=200)
Example 18
def run_index(args):
    dump_names = os.listdir(os.path.join(args.dump_dir, args.phrase_dir))
    dump_paths = sorted([
        os.path.join(args.dump_dir, args.phrase_dir, name)
        for name in dump_names if name.endswith('.hdf5')
    ])

    data = None
    if args.stage in ['all', 'coarse']:
        if args.replace or not os.path.exists(args.quantizer_path):
            if not os.path.exists(args.index_dir):
                os.makedirs(args.index_dir)
            start_data, avg_vec, std_vec = sample_data(
                dump_paths,
                doc_sample_ratio=args.doc_sample_ratio,
                vec_sample_ratio=args.vec_sample_ratio,
                norm_th=args.norm_th)
            with open(os.path.join(args.index_dir, 'avg_vec.pkl'), 'wb') as fp:
                pickle.dump(avg_vec, fp)
            with open(os.path.join(args.index_dir, 'std_vec.pkl'), 'wb') as fp:
                pickle.dump(std_vec, fp)

    if args.stage in ['all', 'fine']:
        if args.replace or not os.path.exists(args.trained_index_path):
            if start_data is None:
                start_data, avg_vec, std_vec = sample_data(
                    dump_paths,
                    doc_sample_ratio=args.doc_sample_ratio,
                    vec_sample_ratio=args.vec_sample_ratio,
                    norm_th=args.norm_th,
                    hnsw=args.hnsw)
            train_index(start_data,
                        args.quantizer_path,
                        args.trained_index_path,
                        args.num_clusters,
                        fine_quant=args.fine_quant,
                        cuda=args.cuda,
                        hnsw=args.hnsw)

    if args.stage in ['all', 'add']:
        if args.replace or not os.path.exists(args.index_path):
            avg_vec = None
            std_vec = None
            # with open(os.path.join(args.index_dir, 'avg_vec.pkl'), 'rb') as fp:
            #     avg_vec = pickle.load(fp)
            # with open(os.path.join(args.index_dir, 'std_vec.pkl'), 'rb') as fp:
            #     std_vec = pickle.load(fp)

            if args.dump_paths is not None:
                dump_paths = args.dump_paths
                if not os.path.exists(args.subindex_dir):
                    os.makedirs(args.subindex_dir)
            add_to_index(
                dump_paths,
                args.trained_index_path,
                args.index_path,
                args.idx2id_path,
                cuda=args.cuda,
                num_docs_per_add=args.num_docs_per_add,
                offset=args.offset,
                norm_th=args.norm_th,
                fine_quant=args.fine_quant,
                avg_vec=avg_vec,
                std_vec=std_vec,
                first_passage=args.first_passage,
                index_filter=args.index_filter,
            )

    if args.stage == 'merge':
        if args.replace or not os.path.exists(args.index_path):
            merge_indexes(args.subindex_dir, args.trained_index_path,
                          args.index_path, args.idx2id_path, args.inv_path)

    if args.stage == 'move':
        index = faiss.read_index(args.trained_index_path)
        invlists = faiss.OnDiskInvertedLists(index.nlist, index.code_size,
                                             args.inv_path)
        index.replace_invlists(invlists)
        faiss.write_index(index, args.index_path)
Example 19
    def __init__(self,
                 phrase_dump_dir,
                 tfidf_dump_dir,
                 start_index_path,
                 idx2id_path,
                 max_norm_path,
                 doc_rank_fn,
                 cuda=False,
                 dump_only=False):

        # If dump dir is a file, use it as a dump.
        if os.path.isdir(phrase_dump_dir):
            self.phrase_dump_paths = sorted([
                os.path.join(phrase_dump_dir, name)
                for name in os.listdir(phrase_dump_dir) if 'hdf5' in name
            ])
            dump_names = [
                os.path.splitext(os.path.basename(path))[0]
                for path in self.phrase_dump_paths
            ]
            self.dump_ranges = [
                list(map(int, name.split('-'))) for name in dump_names
            ]
        else:
            self.phrase_dump_paths = [phrase_dump_dir]
        self.phrase_dumps = [
            h5py.File(path, 'r') for path in self.phrase_dump_paths
        ]

        # Load tfidf dump
        assert os.path.isdir(tfidf_dump_dir), tfidf_dump_dir
        self.tfidf_dump_paths = sorted([
            os.path.join(tfidf_dump_dir, name)
            for name in os.listdir(tfidf_dump_dir) if 'hdf5' in name
        ])
        tfidf_dump_names = [
            os.path.splitext(os.path.basename(path))[0]
            for path in self.tfidf_dump_paths
        ]
        if '-' in tfidf_dump_names[0]:  # Range check
            tfidf_dump_ranges = [
                list(map(int,
                         name.split('_')[0].split('-')))
                for name in tfidf_dump_names
            ]
            assert tfidf_dump_ranges == self.dump_ranges
        self.tfidf_dumps = [
            h5py.File(path, 'r') for path in self.tfidf_dump_paths
        ]
        logger.info(f'using doc ranker functions: {doc_rank_fn["index"]}')
        self.doc_rank_fn = doc_rank_fn
        if dump_only:
            return

        # Read index
        logger.info(f'Reading {start_index_path}')
        self.start_index = faiss.read_index(start_index_path,
                                            faiss.IO_FLAG_ONDISK_SAME_DIR)
        self.idx_f = self.load_idx_f(idx2id_path)
        with open(max_norm_path, 'r') as fp:
            self.max_norm = json.load(fp)

        # Options
        self.num_docs_list = []
        self.cuda = cuda
        if self.cuda:
            assert torch.cuda.is_available(
            ), f"Cuda availability {torch.cuda.is_available()}"
            self.device = torch.device('cuda')
        else:
            self.device = torch.device("cpu")
Example 20
data = np.array(data).astype('float32')
printTime()

log_instance.info('normalize_L2')
normalize_L2(data)
log_instance.info("start upload data faiss!")
printTime()

# index = faiss.IndexFlatIP(data_dimension)
# index.train(data)
# log_instance.info(index.is_trained)
# index.add(data)

print("loading", "jiadian_populated.index")
index = faiss.read_index("jiadian_populated.index")

log_instance.info(index.ntotal)
log_instance.info("end upload data faiss!")
printTime()

query = np.array(search_lst).astype('float32')
normalize_L2(query)
topN = 3

log_instance.info('start query!')
dis, ind = index.search(query, topN)
printTime()
log_instance.info('end query!')

Example 21
    p.add_argument('--port', type=int, default=8080, help='Port to listen on')
    p.add_argument('--index',
                   default=[],
                   required=True,
                   action="append",
                   help="Location of FAISS index file.")
    p.add_argument(
        '--map',
        type=str,
        default='',
        help='Location of file mapping index IDs to other IDs as needed.')
    args = p.parse_args()

    # Import the index
    # Just taking the first element for this example, but the index flag now supports multiple index files. Use them as you need.
    idx = faiss.read_index(args.index[0])

    # Create the ID map
    id_map = {}
    if args.map:
        with open(args.map, 'r') as f:
            for line in f:
                line = line.strip()
                if line.startswith('#'): continue
                k, v = line.split(':')
                id_map[int(k.strip())] = v.strip()

    idxsvc = IndexSvc(__name__, host="::", port=args.port)
    idxsvc.RegisterQuery(idx)
    idxsvc.set_map(id_map)
    idxsvc.run()
Example 22
def loadIndex(filename):
    return faiss.read_index(filename)
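loadIndex is a thin wrapper around faiss.read_index; a minimal round-trip sketch using it, with the dimension and path as assumptions:

import faiss
import numpy as np

d = 64
xb = np.random.rand(1000, d).astype('float32')
index = faiss.IndexFlatL2(d)
index.add(xb)
faiss.write_index(index, "/tmp/flat.index")

index2 = loadIndex("/tmp/flat.index")
assert index2.ntotal == index.ntotal
D, I = index2.search(xb[:3], 5)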
Example 23
if stage == 0:
    # train the index
    xt = fvecs_read("sift1M/sift_learn.fvecs")
    index = faiss.index_factory(xt.shape[1], "IVF4096,Flat")
    print("training index")
    index.train(xt)
    print("write " + tmpdir + "trained.index")
    faiss.write_index(index, tmpdir + "trained.index")


if 1 <= stage <= 4:
    # add 1/4 of the database to 4 independent indexes
    bno = stage - 1
    xb = fvecs_read("sift1M/sift_base.fvecs")
    i0, i1 = int(bno * xb.shape[0] / 4), int((bno + 1) * xb.shape[0] / 4)
    index = faiss.read_index(tmpdir + "trained.index")
    print("adding vectors %d:%d" % (i0, i1))
    index.add(xb[i0:i1])
    print("write " + tmpdir + "block_%d.index" % bno)
    faiss.write_index(index, tmpdir + "block_%d.index" % bno)


if stage == 5:
    # merge the images into an on-disk index
    # first load the inverted lists
    ivfs = []
    for bno in range(4):
        # the IO_FLAG_MMAP is to avoid actually loading the data thus
        # the total size of the inverted lists can exceed the
        # available RAM
        print("read " + tmpdir + "block_%d.index" % bno)
Example 24
def pdd_research_idx(input_idx_file, output_test_result, query_num,
                     sku2vector_dir, jd_check_file):

    data = []
    sku_lst = []
    error_sku = []
    idx = 0

    log_instance.info("start upload data sku part!")
    printTime()

    f_name_lst = []
    for root, dirs, files in os.walk(sku2vector_dir):
        for f_name in files:
            if not f_name.startswith('part_'): continue
            f_name = os.path.join(root, f_name)
            f_name_lst.append(f_name)
    f_name_lst = sorted(f_name_lst)

    for f_name in f_name_lst:
        with open(f_name) as f1:
            for line in f1:
                line = line.strip()
                if line == '': continue
                lst1 = line.split('\t')
                if len(lst1) != 2: continue
                lst1 = [tmp for tmp in lst1]
                sku, vec_str = lst1

                lst2 = vec_str.split("|")
                lst2 = [float(d1) for d1 in lst2]
                data.append(lst2)
                sku_lst.append(sku)
                if idx % 5000 == 0:
                    log_instance.info("idx: %s" % idx)
                idx += 1

    log_instance.info("end upload data sku part!")
    printTime()

    log_instance.info(len(error_sku))
    log_instance.info(len(data))

    search_lst = data[:query_num]
    query = np.array(search_lst).astype('float32')
    normalize_L2(query)
    topN = 3

    log_instance.info('loading index from %s!' % (input_idx_file))
    printTime()
    index = faiss.read_index(input_idx_file)
    log_instance.info('ended loading index from %s!' % (input_idx_file))
    printTime()

    log_instance.info('start query!')
    faiss.omp_set_num_threads(10)
    dis, ind = index.search(query, topN)
    printTime()
    log_instance.info('end query!')

    def result_dealing(dis, ind, sku_lst, result_warehouse_list):
        score_lst = dis.tolist()
        idx_lst = ind.tolist()
        r_lst = []

        for j in range(len(score_lst)):
            s1, s2, s3 = score_lst[j]
            i1, i2, i3 = idx_lst[j]
            sku1_pdd = sku_lst[j]
            # if sku1 not in sku_name_dict:
            #     n1 = "unk"
            #     log_instance.info(sku1)
            # else:
            #     n1 = sku_name_dict[sku1]
            # if sku2 not in sku_name_dict:
            #     n2 = "unk"
            #     log_instance.info(sku2)
            # else:
            #     n2 = sku_name_dict[sku2]
            # if sku3 not in sku_name_dict:
            #     n3 = "unk"
            #     log_instance.info(sku3)
            # else:
            #     n3 = sku_name_dict[sku3]
            r_lst.append(
                "%s\t%s:%s\t%s:%s\t%s:%s" %
                (sku1_pdd, result_warehouse_list[i1], s1,
                 result_warehouse_list[i2], s2, result_warehouse_list[i3], s3))

        with open(output_test_result, "w") as f2:
            f2.write("\n".join(r_lst))
            f2.flush()

    with open(jd_check_file, "r", encoding="utf-8") as f3:
        result_warehouse_list = json.load(f3)

    result_dealing(dis, ind, sku_lst, result_warehouse_list)

    log_instance.info("finished!")
Example 25
 def initialize(self):
     self.index = faiss.read_index(self.faiss_filepath)
Example 26
def add_to_index(dump_paths,
                 trained_index_path,
                 target_index_path,
                 idx2id_path,
                 num_docs_per_add=1000,
                 cuda=False,
                 fine_quant='SQ4',
                 offset=0,
                 norm_th=999,
                 ignore_ids=None,
                 avg_vec=None,
                 std_vec=None,
                 first_passage=False,
                 index_filter=-1e8):

    sidx2doc_id = []
    sidx2word_id = []
    dumps = [h5py.File(dump_path, 'r') for dump_path in dump_paths]
    # filter dumps
    if index_filter != -1e8:
        f_dumps = [
            h5py.File(dump_path.replace('/phrase/', '/filter/'), 'r')
            for dump_path in dump_paths
        ]

    print('reading %s' % trained_index_path)
    start_index = faiss.read_index(trained_index_path)
    if 'none' not in fine_quant:
        index_ivf = faiss.extract_index_ivf(start_index)
        index_ivf.make_direct_map()
        index_ivf.set_direct_map_type(faiss.DirectMap.Hashtable)

    if cuda:
        if 'PQ' in fine_quant:
            index_ivf = faiss.extract_index_ivf(start_index)
            quantizer = index_ivf.quantizer
            quantizer_gpu = faiss.index_cpu_to_all_gpus(quantizer)
            index_ivf.quantizer = quantizer_gpu
        else:
            res = faiss.StandardGpuResources()
            co = faiss.GpuClonerOptions()
            co.useFloat16 = True
            start_index = faiss.index_cpu_to_gpu(res, 0, start_index, co)

    print('adding following dumps:')
    for dump_path in dump_paths:
        print(dump_path)
    start_total = 0
    start_total_prev = 0
    cnt = 0
    for di, phrase_dump in enumerate(tqdm(dumps, desc='dumps')):
        starts = []
        start_valids = []
        dump_length = len(phrase_dump)
        for i, (doc_idx, doc_group) in enumerate(
                tqdm(phrase_dump.items(), desc='adding %d' % di)):
            if ignore_ids is not None and doc_idx in ignore_ids:
                continue
            num_start = doc_group['start'].shape[0]
            if num_start == 0: continue
            cnt += 1

            # First passage only
            if first_passage:
                f2o_start = doc_group['f2o_start'][:]
                cut = sum(f2o_start < doc_group['len_per_para'][0])
                start = int8_to_float(doc_group['start'][:cut],
                                      doc_group.attrs['offset'],
                                      doc_group.attrs['scale'])
                num_start = start.shape[0]
            # Apply index filter
            elif index_filter != -1e8:
                o2f_start = {
                    orig: ft
                    for ft, orig in enumerate(doc_group['f2o_start'][:])
                }
                filter_start = f_dumps[di][doc_idx]['filter_start'][:]
                filter_end = f_dumps[di][doc_idx]['filter_end'][:]
                start_idxs, = np.where(filter_start > index_filter)
                end_idxs, = np.where(filter_end > index_filter)
                save_idx = set(np.concatenate([start_idxs, end_idxs]))
                save_idx = sorted(
                    [o2f_start[si] for si in save_idx if si in o2f_start])
                start = int8_to_float(doc_group['start'][save_idx],
                                      doc_group.attrs['offset'],
                                      doc_group.attrs['scale'])
                num_start = start.shape[0]
            else:
                start = int8_to_float(doc_group['start'][:],
                                      doc_group.attrs['offset'],
                                      doc_group.attrs['scale'])
            start_valid = np.linalg.norm(start, axis=1) <= norm_th

            starts.append(start)
            start_valids.append(start_valid)
            sidx2doc_id.extend([int(doc_idx)] * num_start)
            if index_filter == -1e8:
                sidx2word_id.extend(range(num_start))
            else:
                sidx2word_id.extend(save_idx)
            start_total += num_start

            if len(starts) > 0 and ((i % num_docs_per_add == 0) or
                                    (i == dump_length - 1)):
                print('adding at %d' % (i + 1))
                add_with_offset(
                    start_index,
                    concat_vectors(starts),
                    concat_vectors(start_valids),
                    start_total_prev,
                    offset,
                    fine_quant,
                )
                start_total_prev = start_total
                starts = []
                start_valids = []
        if len(starts) > 0:
            print('final adding at %d' % (i + 1))
            add_with_offset(
                start_index,
                concat_vectors(starts),
                concat_vectors(start_valids),
                start_total_prev,
                offset,
                fine_quant,
            )
            start_total_prev = start_total
    print('number of docs', cnt)

    for dump in dumps:
        dump.close()

    if cuda:
        print('moving back to cpu')
        if 'PQ' in fine_quant:
            index_ivf.quantizer = quantizer
            del quantizer_gpu
        else:
            start_index = faiss.index_gpu_to_cpu(start_index)

    print('start_index ntotal: %d' % start_index.ntotal)
    print(start_total)
    sidx2doc_id = np.array(sidx2doc_id, dtype=np.int32)
    sidx2word_id = np.array(sidx2word_id, dtype=np.int32)

    print('writing index and metadata')
    with h5py.File(idx2id_path, 'w') as f:
        g = f.create_group(str(offset))
        g.create_dataset('doc', data=sidx2doc_id)
        g.create_dataset('word', data=sidx2word_id)
        g.attrs['offset'] = offset

    faiss.write_index(start_index, target_index_path)
    print('done')
Example 27
 def load_index(self, index_dir: str):
     index_path = os.path.join(index_dir, 'index')
     docid_path = os.path.join(index_dir, 'docid')
     index = faiss.read_index(index_path)
     docids = self.load_docids(docid_path)
     return index, docids
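load_docids is not shown in this snippet; one plausible implementation, assuming the docid file stores one external document id per line:

def load_docids(docid_path):
    # hypothetical helper: one external document id per line
    with open(docid_path) as f:
        return [line.rstrip('\n') for line in f]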
Example 28
xt, xb, xq, gt = datasets.load_data(dataset=args.db,
                                    compute_gt=args.compute_gt)

print("dataset sizes: train %s base %s query %s GT %s" %
      (xt.shape, xb.shape, xq.shape, gt.shape))

nq, d = xq.shape
nb, d = xb.shape

######################################################
# Make index
######################################################

if args.indexfile and os.path.exists(args.indexfile):

    print("reading", args.indexfile)
    index = faiss.read_index(args.indexfile)

    if isinstance(index, faiss.IndexPreTransform):
        index_ivf = faiss.downcast_index(index.index)
    else:
        index_ivf = index
        assert isinstance(index_ivf, faiss.IndexIVF)
        vec_transform = lambda x: x
    assert isinstance(index_ivf, faiss.IndexIVF)

else:

    print("build index, key=", args.indexkey)

    index = faiss.index_factory(d, args.indexkey)