def main():
    parser = argparse.ArgumentParser(
        description='make index for a subset of the data')

    def aa(*args, **kwargs):
        group.add_argument(*args, **kwargs)

    group = parser.add_argument_group('index type')
    aa('--inputindex',
       default=workdir + 'trained.faissindex',
       help='empty input index to fill in')
    aa('--nt', default=-1, type=int, help='nb of openmp threads to use')

    group = parser.add_argument_group('db options')
    aa('--input', default=deep1bdir + "base.fvecs")
    aa('--bs', default=2**18, type=int, help='batch size for db access')
    aa('--i0', default=0, type=int, help='lower bound to index')
    aa('--i1', default=-1, type=int, help='upper bound of vectors to index')

    group = parser.add_argument_group('output')
    aa('-o', default='/tmp/x', help='output index')
    aa('--keepquantizer',
       default=False,
       action='store_true',
       help='by default we remove the data from the quantizer to save space')

    args = parser.parse_args()
    print('args=', args)

    print('start accessing data')
    src = produce_batches(args)

    print('loading index', args.inputindex)
    index = faiss.read_index(args.inputindex)

    if args.nt != -1:
        faiss.omp_set_num_threads(args.nt)

    t0 = time.time()
    ntot = 0
    for ids, x in rate_limited_iter(src):
        print('add %d:%d (%.3f s)' % (ntot, ntot + ids.size, time.time() - t0))
        index.add_with_ids(np.ascontiguousarray(x, dtype='float32'), ids)
        ntot += ids.size

    index_ivf = faiss.extract_index_ivf(index)
    print('invlists stats: imbalance %.3f' %
          index_ivf.invlists.imbalance_factor())
    index_ivf.invlists.print_stats()

    if not args.keepquantizer:
        print('resetting quantizer content')
        index_ivf = faiss.extract_index_ivf(index)
        index_ivf.quantizer.reset()

    print('store output', args.o)
    faiss.write_index(index, args.o)
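Stripped of the argument parsing and the external helpers (produce_batches, rate_limited_iter), the core pattern above is: load an empty trained IVF index, add ID'd batches with add_with_ids, and write the result. A minimal self-contained sketch of the same flow, using random data and a throwaway index purely for illustration:

import numpy as np
import faiss

d = 64
index = faiss.index_factory(d, "IVF256,Flat")
index.train(np.random.rand(20000, d).astype('float32'))  # stands in for the pre-trained input index

xb = np.random.rand(5000, d).astype('float32')
ids = np.arange(5000, dtype='int64')
for i0 in range(0, len(xb), 1000):  # add in batches, as the script above does
    index.add_with_ids(xb[i0:i0 + 1000], ids[i0:i0 + 1000])

faiss.write_index(index, '/tmp/filled.faissindex')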
Example #2
    def get_cluster_ids(self, list_num: int) -> np.ndarray:
        """
        TODO: docstring

        """

        # TODO: assert IVF
        assert self.is_trained

        # This fixes problem with SWIG and numpy int
        list_num = int(list_num)

        index = faiss.read_index(str(self.tempdir / self.MERGED_INDEX_NAME))

        # Get the IVF from potentially opaque index
        invlists = faiss.extract_index_ivf(index).invlists
        list_size = invlists.list_size(list_num)
        list_ids = np.zeros(list_size, dtype=np.int64)
        temp_ids = invlists.get_ids(list_num)

        # Need to copy since memory will be deallocated along with the invlist.
        faiss.memcpy(faiss.swig_ptr(list_ids), temp_ids, list_ids.nbytes)
        invlists.release_ids(list_num, temp_ids)

        if self.multi_id:
            list_ids = self._invert_cantor_pairing_vec(list_ids)

        return list_ids
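The memcpy above copies the IDs out before the inverted list releases them. A roughly equivalent standalone sketch (not part of the class above) uses faiss.rev_swig_ptr, which wraps the raw pointer as a numpy view that only needs a .copy() before release_ids:

import faiss

def ids_of_list(index, list_num):
    # works on any in-RAM IVF-based index
    invlists = faiss.extract_index_ivf(index).invlists
    list_num = int(list_num)  # avoid SWIG/numpy int issues
    list_size = invlists.list_size(list_num)
    ids_ptr = invlists.get_ids(list_num)
    ids = faiss.rev_swig_ptr(ids_ptr, list_size).copy()  # detach before releasing
    invlists.release_ids(list_num, ids_ptr)
    return ids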
Example #3
    def __init__(self, index, sub_indexes):
        self.index = index
        self.code_size = faiss.extract_index_ivf(index.index).code_size
        self.sub_indexes = sub_indexes
        self.ni = len(self.sub_indexes)
        # pool of threads. Each thread manages one sub-index.
        self.pool = ThreadPool(self.ni)
        self.verbose = False
Example #4
def merge_on_disk(trained_index: faiss.Index, shard_fnames: List[str],
                  ivfdata_fname: str) -> None:
    """
	Adds the contents of the indexes stored in shard_fnames into the index
    trained_index. The on-disk data is stored in ivfdata_fname.

    Args:
        trained_index: The trained index to add the data to.
        shard_fnames: A list of the partial index filenames.
        ivfdata_fname: The filename for the on-disk extracted data.

	"""

    # Load the inverted lists
    ivfs = []
    for fname in shard_fnames:
        # The IO_FLAG_MMAP is to avoid actually loading the data, and thus the
        # total size of the inverted lists can exceed the available RAM
        index = faiss.read_index(fname, faiss.IO_FLAG_MMAP)
        index_ivf = faiss.extract_index_ivf(index)
        ivfs.append(index_ivf.invlists)

        # Avoid deallocating the invlists with the index
        index_ivf.own_invlists = False

    # Construct the output index
    index = trained_index
    index_ivf = faiss.extract_index_ivf(index)

    assert index.ntotal == 0, 'The trained index should be empty'

    # Prepare the output inverted lists, which are written to ivfdata_fname.
    invlists = faiss.OnDiskInvertedLists(index_ivf.nlist, index_ivf.code_size,
                                         ivfdata_fname)

    # Merge all the inverted lists
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in ivfs:
        ivf_vector.push_back(ivf)

    n_total = invlists.merge_from(ivf_vector.data(), ivf_vector.size())

    # Replace the inverted lists in the output index
    index.ntotal = index_ivf.ntotal = n_total
    index_ivf.replace_invlists(invlists, True)
    invlists.this.disown()
Example #5
def merge_ondisk(trained_index: faiss.Index, shard_fnames: List[str],
                 ivfdata_fname: str) -> None:
    """Add the contents of the indexes stored in shard_fnames into the index
    trained_index. The on-disk data is stored in ivfdata_fname"""
    assert not isinstance(
        trained_index,
        faiss.IndexIVFPQR), "IndexIVFPQR is not supported as an on disk index."
    # merge the images into an on-disk index
    # first load the inverted lists
    ivfs = []
    for fname in shard_fnames:
        # IO_FLAG_MMAP avoids actually loading the data, so the total
        # size of the inverted lists can exceed the available RAM
        LOG.info("read " + fname)
        index = faiss.read_index(fname, faiss.IO_FLAG_MMAP)
        index_ivf = faiss.extract_index_ivf(index)
        ivfs.append(index_ivf.invlists)

        # avoid that the invlists get deallocated with the index
        index_ivf.own_invlists = False

    # construct the output index
    index = trained_index
    index_ivf = faiss.extract_index_ivf(index)

    assert index.ntotal == 0, "works only on empty index"

    # prepare the output inverted lists. They will be written
    # to ivfdata_fname
    invlists = faiss.OnDiskInvertedLists(index_ivf.nlist, index_ivf.code_size,
                                         ivfdata_fname)

    # merge all the inverted lists
    ivf_vector = faiss.InvertedListsPtrVector()
    for ivf in ivfs:
        ivf_vector.push_back(ivf)

    LOG.info("merge %d inverted lists " % ivf_vector.size())
    ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size())

    # now replace the inverted lists in the output index
    index.ntotal = index_ivf.ntotal = ntotal
    index_ivf.replace_invlists(invlists, True)
    invlists.this.disown()
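A hedged usage sketch for merge_ondisk; the shard and output filenames below are placeholders. The merged index file stays small because the inverted-list payload lives in the .ivfdata file, which IO_FLAG_ONDISK_SAME_DIR resolves relative to the index file's directory at load time:

import faiss

shard_fnames = ['shard_0.index', 'shard_1.index']  # hypothetical shard files
trained = faiss.read_index('trained.index')  # empty, trained IVF index
merge_ondisk(trained, shard_fnames, 'merged_index.ivfdata')
faiss.write_index(trained, 'merged_index.index')

# later, possibly in another process:
index = faiss.read_index('merged_index.index', faiss.IO_FLAG_ONDISK_SAME_DIR)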
def train_index(start_data,
                quantizer_path,
                trained_index_path,
                num_clusters,
                fine_quant='SQ4',
                cuda=False,
                hnsw=False):
    ds = start_data.shape[1]
    quantizer = faiss.IndexFlatIP(ds)

    # Used only for reimplementation
    if fine_quant == 'SQ4':
        start_index = faiss.IndexIVFScalarQuantizer(
            quantizer, ds, num_clusters, faiss.ScalarQuantizer.QT_4bit,
            faiss.METRIC_INNER_PRODUCT)

    # Default index type
    elif 'OPQ' in fine_quant:
        code_size = int(fine_quant[fine_quant.index('OPQ') + 3:])
        if hnsw:
            start_index = faiss.IndexHNSWPQ(ds, "HNSW32,PQ96",
                                            faiss.METRIC_INNER_PRODUCT)
        else:
            opq_matrix = faiss.OPQMatrix(ds, code_size)
            opq_matrix.niter = 10
            sub_index = faiss.IndexIVFPQ(quantizer, ds, num_clusters,
                                         code_size, 8,
                                         faiss.METRIC_INNER_PRODUCT)
            start_index = faiss.IndexPreTransform(opq_matrix, sub_index)
    elif 'none' in fine_quant:
        start_index = faiss.IndexFlatIP(ds)
    else:
        raise ValueError(fine_quant)

    start_index.verbose = False
    if cuda:
        # Convert to GPU index
        res = faiss.StandardGpuResources()
        co = faiss.GpuClonerOptions()
        co.useFloat16 = True
        gpu_index = faiss.index_cpu_to_gpu(res, 0, start_index, co)
        gpu_index.verbose = False

        # Train on GPU and back to CPU
        gpu_index.train(start_data)
        start_index = faiss.index_gpu_to_cpu(gpu_index)
    else:
        start_index.train(start_data)

    # Make sure to set direct map again
    if 'none' not in fine_quant:
        index_ivf = faiss.extract_index_ivf(start_index)
        index_ivf.make_direct_map()
        index_ivf.set_direct_map_type(faiss.DirectMap.Hashtable)
    faiss.write_index(start_index, trained_index_path)
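A hedged call sketch for train_index, just to show the expected argument shapes; the array, path, and cluster count are placeholders (note that quantizer_path is accepted but never used by the body shown above):

import numpy as np

train_vecs = np.random.rand(200000, 768).astype('float32')  # (n_train, dim)
train_index(train_vecs,
            quantizer_path=None,
            trained_index_path='/tmp/trained.faiss',
            num_clusters=4096,
            fine_quant='SQ4',
            cuda=False,
            hnsw=False)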
Example #7
    def __init__(self,
                 invlist_fnames,
                 empty_index_fname,
                 masked_index_fname=None):

        self.indexes = indexes = []
        ilv = faiss.InvertedListsPtrVector()

        for fname in invlist_fnames:
            if os.path.exists(fname):
                print('reading', fname, end='\r', flush=True)
                index = faiss.read_index(fname)
                indexes.append(index)
                il = faiss.extract_index_ivf(index).invlists
            else:
                assert False
            ilv.push_back(il)
        print()

        self.big_il = faiss.VStackInvertedLists(ilv.size(), ilv.data())
        if masked_index_fname:
            self.big_il_base = self.big_il
            print('loading', masked_index_fname)
            self.masked_index = faiss.read_index(
                masked_index_fname,
                faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY)
            self.big_il = faiss.MaskedInvertedLists(
                faiss.extract_index_ivf(self.masked_index).invlists,
                self.big_il_base)

        print('loading empty index', empty_index_fname)
        self.index = faiss.read_index(empty_index_fname)
        ntotal = self.big_il.compute_ntotal()

        print('replace invlists')
        index_ivf = faiss.extract_index_ivf(self.index)
        index_ivf.replace_invlists(self.big_il, False)
        index_ivf.ntotal = self.index.ntotal = ntotal
        index_ivf.parallel_mode = 1  # seems reasonable to do this all the time

        quantizer = faiss.downcast_index(index_ivf.quantizer)
        quantizer.hnsw.efSearch = 1024
Example #8
    def test_ivf_train_2level(self):
        " check 2-level clustering with IVF training "
        ds = datasets.SyntheticDataset(32, 10000, 1000, 200)
        index = faiss.index_factory(ds.d, "PCA16,IVF100,SQ8")
        faiss.extract_index_ivf(index).nprobe = 10
        index.train(ds.get_train())
        index.add(ds.get_database())
        Dref, Iref = index.search(ds.get_queries(), 1)

        index = faiss.index_factory(ds.d, "PCA16,IVF100,SQ8")
        faiss.extract_index_ivf(index).nprobe = 10
        clustering.train_ivf_index_with_2level(index,
                                               ds.get_train(),
                                               verbose=True)
        index.add(ds.get_database())
        Dnew, Inew = index.search(ds.get_queries(), 1)

        # normally 47 / 200 differences
        ndiff = (Iref != Inew).sum()
        self.assertLess(ndiff, 50)
Example #9
    def get_cluster_sizes(self) -> List[int]:
        """Returns the number of vectors assigned to each cluster."""

        # TODO: assert IVF
        assert self.is_trained

        index = faiss.read_index(str(self.tempdir / self.MERGED_INDEX_NAME))

        # Get the IVF from potentially opaque index
        invlists = faiss.extract_index_ivf(index).invlists
        list_sizes = [invlists.list_size(i) for i in range(invlists.nlist)]
        return list_sizes
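The per-list sizes can be folded into the balance statistic that invlists.imbalance_factor() reports (1.0 means perfectly balanced lists). Sketch below, where wrapper stands for a hypothetical instance of this class:

import numpy as np

sizes = np.asarray(wrapper.get_cluster_sizes(), dtype=np.float64)
imbalance = len(sizes) * (sizes ** 2).sum() / sizes.sum() ** 2
print('imbalance factor %.3f over %d lists' % (imbalance, len(sizes)))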
Example #10
    def __init__(self,
                 phrase_dump_dir,
                 index_path,
                 idx2id_path,
                 cuda=False,
                 logging_level=logging.INFO):
        self.phrase_dump_dir = phrase_dump_dir

        # Read index
        self.index = {}
        logger.info(f'Reading {index_path}')
        self.index = faiss.read_index(index_path,
                                      faiss.IO_FLAG_ONDISK_SAME_DIR)
        self.max_idx = 1e8 if 'PQ' not in index_path else 1e9
        logger.info(
            f'index ntotal: {self.index.ntotal} | PQ: {"PQ" in index_path}')

        # Read idx2id
        self.idx_f = {}
        logger.info('Load idx2id on memory')
        self.idx_f = self.load_idx_f(idx2id_path)
        self.offset = None
        self.scale = None
        self.doc_groups = None

        # Options
        logger.setLevel(logging_level)
        self.num_docs_list = []
        self.cuda = cuda
        if self.cuda:
            assert torch.cuda.is_available(
            ), f"Cuda availability {torch.cuda.is_available()}"
            self.device = torch.device('cuda')
            logger.info("Load IVF on GPU")
            index_ivf = faiss.extract_index_ivf(self.index)
            quantizer = index_ivf.quantizer
            quantizer_gpu = faiss.index_cpu_to_all_gpus(quantizer)
            index_ivf.quantizer = quantizer_gpu
        else:
            self.device = torch.device("cpu")

        # Load metadata on RAM if possible
        doc_group_path = os.path.join(
            self.phrase_dump_dir[:self.phrase_dump_dir.index('/phrase')],
            'dph_meta_compressed.pkl')  # 1 min
        if os.path.exists(doc_group_path) and ('PQ' in index_path):
            logger.info(
                f"Loading metadata on RAM from {doc_group_path} (for PQ only)")
            self.doc_groups = pickle.load(open(doc_group_path, 'rb'))
        else:
            logger.info(
                f"Will read metadata directly from hdf5 files (requires SSDs for faster inference)"
            )
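The CUDA branch above moves only the coarse quantizer to the GPU, so coarse assignment is accelerated while the (typically compressed) inverted lists stay in CPU RAM. The pattern in isolation, assuming a CPU IVF-based index is already loaded into index:

index_ivf = faiss.extract_index_ivf(index)
quantizer_gpu = faiss.index_cpu_to_all_gpus(index_ivf.quantizer)
index_ivf.quantizer = quantizer_gpu
# keep quantizer_gpu referenced for as long as the index is searched;
# to move back, restore the original quantizer as the add_to_index examples below do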
    def training_initialize(self, index, quantizer):
        """
        The index and quantizer should be owned by caller.
        """

        assert self.ngpu > 0

        s = time.time()
        self.index_ivf = faiss.extract_index_ivf(index)
        self.clustering_index = faiss.index_cpu_to_all_gpus(quantizer)
        self.index_ivf.clustering_index = self.clustering_index
        print(time.time() - s)
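clustering_index only affects training: the coarse k-means assignments run on the GPUs while the index itself stays on the CPU. A minimal end-to-end sketch of the same recipe (dimensions and sizes are placeholders, and a GPU build of faiss is assumed):

import numpy as np
import faiss

d = 128
index = faiss.index_factory(d, "IVF4096,PQ32")
index_ivf = faiss.extract_index_ivf(index)
clustering_index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d))
index_ivf.clustering_index = clustering_index  # keep this reference alive during train()
index.train(np.random.rand(200000, d).astype('float32'))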
Example #12
    def get_centroids(self) -> np.ndarray:
        """Returns the IVF centroids."""

        # TODO: assert IVF
        assert self.is_trained

        index = faiss.read_index(str(self.tempdir / self.MERGED_INDEX_NAME))

        # Get the IVF from potentially opaque index
        index_ivf = faiss.extract_index_ivf(index)

        centroids = index_ivf.quantizer.reconstruct_n(0, index_ivf.nlist)
        return centroids
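The returned matrix has shape (nlist, d). A quick hedged use is finding which inverted list a vector would fall into by brute force against the centroids (wrapper is again a hypothetical instance, and an L2 coarse quantizer is assumed; inner-product indexes assign by the largest dot product instead):

import numpy as np

centroids = wrapper.get_centroids()  # (nlist, d) float32
q = np.random.rand(centroids.shape[1]).astype('float32')  # a fake query
nearest_list = int(np.argmin(((centroids - q) ** 2).sum(axis=1)))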
Example #13
    def train_quantizer(self, data):
        db_ids = [t[0] for t in data]
        #        [print("CHANGE BACK TO 60 * 65536") for _ in range(1000)]
        data = random.sample(data, 60 * 65536)
        vectors = [np.reshape(t[1], (1, -1)) for t in data]
        vectors = np.concatenate(vectors, axis=0)
        if not self.index.is_trained:
            print("training product quantizer")
            index_ivf = faiss.extract_index_ivf(self.index)
            clustering_index = faiss.index_cpu_to_all_gpus(
                faiss.IndexFlatL2(768))
            index_ivf.clustering_index = clustering_index
            self.index.train(vectors)
Example #14
    def transform_and_assign(self, xq):
        index = self.index

        if isinstance(index, faiss.IndexPreTransform):
            assert index.chain.size() == 1
            vt = index.chain.at(0)
            xq = vt.apply_py(xq)

        # perform quantization
        index_ivf = faiss.extract_index_ivf(index)
        quantizer = index_ivf.quantizer
        coarse_dis, list_nos = quantizer.search(xq, index_ivf.nprobe)
        return xq, list_nos, coarse_dis
Example #15
    def ivf_search_preassigned(self, xq, list_nos, coarse_dis, k):
        index_ivf = faiss.extract_index_ivf(self.index)
        n, d = xq.shape
        assert d == index_ivf.d
        n2, d2 = list_nos.shape
        assert list_nos.shape == coarse_dis.shape
        assert n2 == n
        assert d2 == index_ivf.nprobe
        D = np.empty((n, k), dtype='float32')
        I = np.empty((n, k), dtype='int64')
        index_ivf.search_preassigned(
            n, faiss.swig_ptr(xq), k,
            faiss.swig_ptr(list_nos), faiss.swig_ptr(coarse_dis),
            faiss.swig_ptr(D), faiss.swig_ptr(I), False)
        return D, I
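transform_and_assign and ivf_search_preassigned are meant to be chained: quantize the queries once, then search only the preassigned lists. A hedged usage sketch, where obj stands for an instance of the class these methods belong to:

import numpy as np

xq = np.random.rand(100, obj.index.d).astype('float32')
xq_t, list_nos, coarse_dis = obj.transform_and_assign(xq)
D, I = obj.ivf_search_preassigned(xq_t, list_nos, coarse_dis, k=10)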
Example #16
    def ivf_range_search_preassigned(self, xq, list_nos, coarse_dis, radius):
        index_ivf = faiss.extract_index_ivf(self.index)
        n, d = xq.shape
        assert d == index_ivf.d
        n2, d2 = list_nos.shape
        assert list_nos.shape == coarse_dis.shape
        assert n2 == n
        assert d2 == index_ivf.nprobe
        res = faiss.RangeSearchResult(n)

        index_ivf.range_search_preassigned(n, faiss.swig_ptr(xq), radius,
                                           faiss.swig_ptr(list_nos),
                                           faiss.swig_ptr(coarse_dis), res)

        lims = faiss.rev_swig_ptr(res.lims, n + 1).copy()
        nd = int(lims[-1])
        D = faiss.rev_swig_ptr(res.distances, nd).copy()
        I = faiss.rev_swig_ptr(res.labels, nd).copy()
        return lims, D, I
Example #17
    def build(self, use_gpu=False):
        self.vectors = np.array(self.vectors)

        faiss.normalize_L2(self.vectors)

        logging.info('Indexing {} vectors'.format(self.vectors.shape[0]))

        if self.vectors.shape[0] > 50000:
            num_centroids = 8 * int(
                math.sqrt(math.pow(2, int(math.log(self.vectors.shape[0],
                                                   2)))))

            logging.info('Using {} centroids'.format(num_centroids))

            self.index = faiss.index_factory(
                self.d, "IVF{}_HNSW32,Flat".format(num_centroids))

            ngpu = faiss.get_num_gpus()
            if ngpu > 0 and use_gpu:
                logging.info('Using {} GPUs'.format(ngpu))

                index_ivf = faiss.extract_index_ivf(self.index)
                clustering_index = faiss.index_cpu_to_all_gpus(
                    faiss.IndexFlatL2(self.d))
                index_ivf.clustering_index = clustering_index

            logging.info('Training index...')

            self.index.train(self.vectors)
        else:
            self.index = faiss.IndexFlatL2(self.d)
            if faiss.get_num_gpus() > 0 and use_gpu:
                self.index = faiss.index_cpu_to_all_gpus(self.index)

        logging.info('Adding vectors to index...')

        self.index.add(self.vectors)
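After build(), searching looks the same whichever branch created the index; only the IVF branch has an nprobe knob. A hedged sketch, where builder and query_vectors are hypothetical:

import numpy as np
import faiss

queries = np.array(query_vectors, dtype='float32')
faiss.normalize_L2(queries)  # match the normalization applied in build()
if builder.vectors.shape[0] > 50000:  # the IVF branch was taken
    faiss.extract_index_ivf(builder.index).nprobe = 16
D, I = builder.index.search(queries, 10)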
Example #18
def simulate_mee_runtime(n_videos=1000000, d=256, n_query=100, max_neighbors=100, n_runs=5, n_warmup_runs=10):
    """ Search over a database of shape [n_videos, d] with query of shape [n_query, d].
    For each query, return max_neighbors results.
    """
    import faiss
    torch.cuda.synchronize()
    st_time = time.time()
    fake_database = faiss.rand((n_videos, d))
    fake_query = faiss.rand((n_query, d))
    torch.cuda.synchronize()
    logger.info("Construct fake database + query time {}".format(time.time() - st_time))

    torch.cuda.synchronize()
    st_time = time.time()
    index = faiss.index_factory(d, "IVF4096,Flat", faiss.METRIC_L2)
    index_ivf = faiss.extract_index_ivf(index)
    clustering_index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d))
    index_ivf.clustering_index = clustering_index
    torch.cuda.synchronize()
    logger.info("Build/Move to GPU? index time {}".format(time.time() - st_time))

    st_time = time.time()
    torch.cuda.synchronize()
    index_ivf.train(fake_database)
    torch.cuda.synchronize()
    logger.info("Train index time {}".format(time.time() - st_time))

    # the vectors have to be added before searching, otherwise the timed
    # searches below would run against empty inverted lists
    index_ivf.add(fake_database)

    times = []
    for _ in range(n_warmup_runs+n_runs):
        torch.cuda.synchronize()
        st_time = time.time()
        D, I = index_ivf.search(fake_query, max_neighbors)
        torch.cuda.synchronize()
        times.append(time.time() - st_time)
    avg_time = np.mean(times[n_warmup_runs:]) * 2  # video + sub
    logger.info("Avg searching time ({} runs) {}".format(n_runs, avg_time))
    return avg_time
Example #19
    return parser.parse_args()


if __name__ == '__main__':
    args = arguments()

    reader = MemoryMappedDatasetReader(args.input_database, start=True)
    n, d = reader.shape
    start = datetime.now()
    print(f"Starting at {start}", flush=True)
    if args.size == 'large':
        outputfile = args.input_database / "trained.faiss.index"
        index = faiss.index_factory(d, "OPQ64_128,IVF262144_HNSW32,PQ64")
        #index  = faiss.index_factory(d, "OPQ64_128,IVF16384_HNSW32,PQ64")
        #index  = faiss.index_factory(d, "OPQ64_128,IVF8192_HNSW32,PQ64")
        ivf = faiss.extract_index_ivf(index)
        clustering_index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(64))
        ivf.clustering_index = clustering_index
        print(f"Sent clustering index to GPU", flush=True)

        print(f"Clustering ({n}, {d}) matrix ({args.size} mode)", flush=True)
        index.train(reader.embedding_matrix)
        print("Finished training", flush=True)
        faiss.write_index(index, str(outputfile))
    else:
        outputfile = args.input_database / "trained.kdtree.index"
        print(f"Clustering ({n}, {d}) matrix ({args.size} mode)", flush=True)
        kdt = KDTree(reader.embedding_matrix, metric='euclidean')
        print("Finished training")
        joblib.dump(kdt, str(outputfile))
Example #20
    def test_parenthesis_2(self):
        index = faiss.index_factory(50, "PCA30,IVF32(PQ15),Flat")
        index_ivf = faiss.extract_index_ivf(index)
        quantizer = faiss.downcast_index(index_ivf.quantizer)
        self.assertEqual(quantizer.pq.M, 15)
        self.assertEqual(quantizer.d, 30)

    def __init__(self, s: int, index: faiss.Index):
        rpc.Server.__init__(self, s)
        self.index = index
        self.index_ivf = faiss.extract_index_ivf(index)
Example #22
def add_to_index(dump_paths,
                 trained_index_path,
                 target_index_path,
                 idx2id_path,
                 num_docs_per_add=1000,
                 cuda=False,
                 fine_quant='SQ4',
                 offset=0,
                 norm_th=999,
                 ignore_ids=None):

    sidx2doc_id = []
    sidx2word_id = []
    dumps = [h5py.File(dump_path, 'r') for dump_path in dump_paths]
    print('reading %s' % trained_index_path)
    start_index = faiss.read_index(trained_index_path)
    start_index.make_direct_map()
    start_index.set_direct_map_type(faiss.DirectMap.Hashtable)

    if cuda:
        if 'PQ' in fine_quant:
            index_ivf = faiss.extract_index_ivf(start_index)
            quantizer = index_ivf.quantizer
            quantizer_gpu = faiss.index_cpu_to_all_gpus(quantizer)
            index_ivf.quantizer = quantizer_gpu
        else:
            res = faiss.StandardGpuResources()
            co = faiss.GpuClonerOptions()
            co.useFloat16 = True
            start_index = faiss.index_cpu_to_gpu(res, 0, start_index, co)

    print('adding following dumps:')
    for dump_path in dump_paths:
        print(dump_path)
    start_total = 0
    start_total_prev = 0
    cnt = 0
    for di, phrase_dump in enumerate(tqdm(dumps, desc='dumps')):
        starts = []
        start_valids = []
        dump_length = len(phrase_dump)
        for i, (doc_idx, doc_group) in enumerate(
                tqdm(phrase_dump.items(), desc='adding %d' % di)):
            if ignore_ids is not None and doc_idx in ignore_ids:
                continue
            num_start = doc_group['start'].shape[0]
            if num_start == 0: continue
            cnt += 1

            start = int8_to_float(doc_group['start'][:],
                                  doc_group.attrs['offset'],
                                  doc_group.attrs['scale'])
            start_valid = np.linalg.norm(start, axis=1) <= norm_th

            starts.append(start)
            start_valids.append(start_valid)
            sidx2doc_id.extend([int(doc_idx)] * num_start)
            sidx2word_id.extend(range(num_start))
            start_total += num_start

            if len(starts) > 0 and ((i % num_docs_per_add == 0) or
                                    (i == dump_length - 1)):
                print('adding at %d' % (i + 1))
                add_with_offset(start_index, concat_vectors(starts),
                                concat_vectors(start_valids), start_total_prev,
                                offset)
                start_total_prev = start_total
                starts = []
                start_valids = []
        if len(starts) > 0:
            print('final adding at %d' % (i + 1))
            add_with_offset(start_index, concat_vectors(starts),
                            concat_vectors(start_valids), start_total_prev,
                            offset)
            start_total_prev = start_total
    print('number of docs', cnt)

    for dump in dumps:
        dump.close()

    if cuda:
        print('moving back to cpu')
        if 'PQ' in fine_quant:
            index_ivf.quantizer = quantizer
            del quantizer_gpu
        else:
            start_index = faiss.index_gpu_to_cpu(start_index)

    print('start_index ntotal: %d' % start_index.ntotal)
    print(start_total)
    sidx2doc_id = np.array(sidx2doc_id, dtype=np.int32)
    sidx2word_id = np.array(sidx2word_id, dtype=np.int32)

    print('writing index and metadata')
    with h5py.File(idx2id_path, 'w') as f:
        g = f.create_group(str(offset))
        g.create_dataset('doc', data=sidx2doc_id)
        g.create_dataset('word', data=sidx2word_id)
        g.attrs['offset'] = offset

    faiss.write_index(start_index, target_index_path)
    print('done')
Example #23
    def set_prefetch_nthread(self, nt):
        for idx in self.indexes:
            il = faiss.downcast_InvertedLists(
                faiss.extract_index_ivf(idx).invlists)
            il.prefetch_nthread = nt
Example #24
    def set_parallel_mode(self, pm):
        index_ivf = faiss.extract_index_ivf(self.index)
        index_ivf.parallel_mode = pm
def add_to_index(dump_paths,
                 trained_index_path,
                 target_index_path,
                 idx2id_path,
                 num_docs_per_add=1000,
                 cuda=False,
                 fine_quant='SQ4',
                 offset=0,
                 norm_th=999,
                 ignore_ids=None,
                 avg_vec=None,
                 std_vec=None,
                 first_passage=False,
                 index_filter=-1e8):

    sidx2doc_id = []
    sidx2word_id = []
    dumps = [h5py.File(dump_path, 'r') for dump_path in dump_paths]
    # filter dumps
    if index_filter != -1e8:
        f_dumps = [
            h5py.File(dump_path.replace('/phrase/', '/filter/'), 'r')
            for dump_path in dump_paths
        ]

    print('reading %s' % trained_index_path)
    start_index = faiss.read_index(trained_index_path)
    if 'none' not in fine_quant:
        index_ivf = faiss.extract_index_ivf(start_index)
        index_ivf.make_direct_map()
        index_ivf.set_direct_map_type(faiss.DirectMap.Hashtable)

    if cuda:
        if 'PQ' in fine_quant:
            index_ivf = faiss.extract_index_ivf(start_index)
            quantizer = index_ivf.quantizer
            quantizer_gpu = faiss.index_cpu_to_all_gpus(quantizer)
            index_ivf.quantizer = quantizer_gpu
        else:
            res = faiss.StandardGpuResources()
            co = faiss.GpuClonerOptions()
            co.useFloat16 = True
            start_index = faiss.index_cpu_to_gpu(res, 0, start_index, co)

    print('adding following dumps:')
    for dump_path in dump_paths:
        print(dump_path)
    start_total = 0
    start_total_prev = 0
    cnt = 0
    for di, phrase_dump in enumerate(tqdm(dumps, desc='dumps')):
        starts = []
        start_valids = []
        dump_length = len(phrase_dump)
        for i, (doc_idx, doc_group) in enumerate(
                tqdm(phrase_dump.items(), desc='adding %d' % di)):
            if ignore_ids is not None and doc_idx in ignore_ids:
                continue
            num_start = doc_group['start'].shape[0]
            if num_start == 0: continue
            cnt += 1

            # First passage only
            if first_passage:
                f2o_start = doc_group['f2o_start'][:]
                cut = sum(f2o_start < doc_group['len_per_para'][0])
                start = int8_to_float(doc_group['start'][:cut],
                                      doc_group.attrs['offset'],
                                      doc_group.attrs['scale'])
                num_start = start.shape[0]
            # Apply index filter
            elif index_filter != -1e8:
                o2f_start = {
                    orig: ft
                    for ft, orig in enumerate(doc_group['f2o_start'][:])
                }
                filter_start = f_dumps[di][doc_idx]['filter_start'][:]
                filter_end = f_dumps[di][doc_idx]['filter_end'][:]
                start_idxs, = np.where(filter_start > index_filter)
                end_idxs, = np.where(filter_end > index_filter)
                save_idx = set(np.concatenate([start_idxs, end_idxs]))
                save_idx = sorted(
                    [o2f_start[si] for si in save_idx if si in o2f_start])
                start = int8_to_float(doc_group['start'][save_idx],
                                      doc_group.attrs['offset'],
                                      doc_group.attrs['scale'])
                num_start = start.shape[0]
            else:
                start = int8_to_float(doc_group['start'][:],
                                      doc_group.attrs['offset'],
                                      doc_group.attrs['scale'])
            start_valid = np.linalg.norm(start, axis=1) <= norm_th

            starts.append(start)
            start_valids.append(start_valid)
            sidx2doc_id.extend([int(doc_idx)] * num_start)
            if index_filter == -1e8:
                sidx2word_id.extend(range(num_start))
            else:
                sidx2word_id.extend(save_idx)
            start_total += num_start

            if len(starts) > 0 and ((i % num_docs_per_add == 0) or
                                    (i == dump_length - 1)):
                print('adding at %d' % (i + 1))
                add_with_offset(
                    start_index,
                    concat_vectors(starts),
                    concat_vectors(start_valids),
                    start_total_prev,
                    offset,
                    fine_quant,
                )
                start_total_prev = start_total
                starts = []
                start_valids = []
        if len(starts) > 0:
            print('final adding at %d' % (i + 1))
            add_with_offset(
                start_index,
                concat_vectors(starts),
                concat_vectors(start_valids),
                start_total_prev,
                offset,
                fine_quant,
            )
            start_total_prev = start_total
    print('number of docs', cnt)

    for dump in dumps:
        dump.close()

    if cuda:
        print('moving back to cpu')
        if 'PQ' in fine_quant:
            index_ivf.quantizer = quantizer
            del quantizer_gpu
        else:
            start_index = faiss.index_gpu_to_cpu(start_index)

    print('start_index ntotal: %d' % start_index.ntotal)
    print(start_total)
    sidx2doc_id = np.array(sidx2doc_id, dtype=np.int32)
    sidx2word_id = np.array(sidx2word_id, dtype=np.int32)

    print('writing index and metadata')
    with h5py.File(idx2id_path, 'w') as f:
        g = f.create_group(str(offset))
        g.create_dataset('doc', data=sidx2doc_id)
        g.create_dataset('word', data=sidx2word_id)
        g.attrs['offset'] = offset

    faiss.write_index(start_index, target_index_path)
    print('done')
Example #26
    def set_nprobe(self, nprobe):
        index_ivf = faiss.extract_index_ivf(self.index)
        index_ivf.nprobe = nprobe
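extract_index_ivf reaches the IVF layer even when it is wrapped in an IndexPreTransform, which is why these small setters work on opaque indexes. An alternative that avoids unwrapping by hand is the ParameterSpace interface; a hedged standalone sketch (index is any loaded IVF-based index, not the class attribute above):

index_ivf = faiss.extract_index_ivf(index)
index_ivf.nprobe = 64
# equivalent, letting faiss traverse any wrapping IndexPreTransform itself:
faiss.ParameterSpace().set_index_parameter(index, 'nprobe', 64)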
Example #27
    def __init__(self,
                 phrase_dump_dir,
                 index_path,
                 idx2id_path,
                 cuda=False,
                 logging_level=logging.INFO):
        self.phrase_dump_dir = phrase_dump_dir

        # Read index
        self.index = {}
        logger.info(
            f'Reading {index_path} - could take up to 15 mins depending on the file reading speed of HDD/SSD'
        )
        self.index = faiss.read_index(index_path,
                                      faiss.IO_FLAG_ONDISK_SAME_DIR)
        self.reconst_fn = faiss.downcast_index(self.index.index).reconstruct
        self.R = torch.FloatTensor(
            faiss.vector_to_array(
                faiss.downcast_VectorTransform(
                    self.index.chain.at(0)).A).reshape(self.index.d,
                                                       self.index.d))
        self.max_idx = 1e8 if 'PQ' not in index_path else 1e9
        logger.info(
            f'index ntotal: {self.index.ntotal} | PQ: {"PQ" in index_path}')

        # Read idx2id
        self.idx_f = {}
        logger.info('Load idx2id on memory')
        self.idx_f = self.load_idx_f(idx2id_path)
        self.offset = None
        self.scale = None
        self.doc_groups = None

        # Options
        logger.setLevel(logging_level)
        self.num_docs_list = []
        self.cuda = cuda
        if self.cuda:
            assert torch.cuda.is_available(
            ), f"Cuda availability {torch.cuda.is_available()}"
            self.device = torch.device('cuda')
            logger.info("Load IVF on GPU")
            index_ivf = faiss.extract_index_ivf(self.index)
            index_ivf.nprobe = 256
            quantizer = index_ivf.quantizer
            quantizer_gpu = faiss.index_cpu_to_all_gpus(quantizer)
            index_ivf.quantizer = quantizer_gpu
            self.R = self.R.to(self.device)
            logger.info(f"N probe: {index_ivf.nprobe}")
        else:
            self.device = torch.device("cpu")
            index_ivf = faiss.extract_index_ivf(self.index)
            index_ivf.nprobe = 256

        # For sentence split
        self.sentencizer = English()
        self.sentencizer.add_pipe(self.sentencizer.create_pipe('sentencizer'))

        # Load metadata on RAM if possible
        doc_group_path = os.path.join(
            self.phrase_dump_dir[:self.phrase_dump_dir.index('/phrase')],
            'meta_compressed.pkl')
        if os.path.exists(doc_group_path) and ('PQ' in index_path):
            logger.info(
                f"Loading metadata on RAM from {doc_group_path} (for PQ only)")
            self.doc_groups = pickle.load(open(doc_group_path, 'rb'))
        else:
            logger.info(
                f"Will read metadata directly from hdf5 files (requires SSDs for faster inference)"
            )