Example #1
    def test_factory(self):
        import faiss

        index = FaissIndex(string_factory="Flat")
        index.add_vectors(np.eye(5, dtype=np.float32))
        self.assertIsInstance(index.faiss_index, faiss.IndexFlat)
        index = FaissIndex(string_factory="LSH")
        index.add_vectors(np.eye(5, dtype=np.float32))
        self.assertIsInstance(index.faiss_index, faiss.IndexLSH)
        with self.assertRaises(ValueError):
            _ = FaissIndex(string_factory="Flat",
                           custom_index=faiss.IndexFlat(5))
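
FaissIndex above is the wrapper under test; its string_factory argument is presumably forwarded to faiss.index_factory. A minimal sketch of the same behaviour against faiss directly:

import faiss
import numpy as np

# "Flat" and "LSH" are standard index_factory strings; the factory
# returns the concrete (downcast) index type.
flat = faiss.index_factory(5, "Flat")
flat.add(np.eye(5, dtype=np.float32))
assert isinstance(flat, faiss.IndexFlat)

lsh = faiss.index_factory(5, "LSH")
lsh.add(np.eye(5, dtype=np.float32))
assert isinstance(lsh, faiss.IndexLSH)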
Example #2
    def fit(self, Ciu, show_progress=True):
        import faiss

        # train the model
        super(FaissAlternatingLeastSquares, self).fit(Ciu, show_progress)

        self.quantizer = faiss.IndexFlat(self.factors)

        if self.use_gpu:
            self.gpu_resources = faiss.StandardGpuResources()

        item_factors = self.item_factors.astype('float32')

        if self.approximate_recommend:
            log.debug("Building faiss recommendation index")

            # build up an inner product index here
            if self.use_gpu:
                index = faiss.GpuIndexIVFFlat(self.gpu_resources, self.factors,
                                              self.nlist,
                                              faiss.METRIC_INNER_PRODUCT)
            else:
                index = faiss.IndexIVFFlat(self.quantizer, self.factors,
                                           self.nlist,
                                           faiss.METRIC_INNER_PRODUCT)

            index.train(item_factors)
            index.add(item_factors)
            index.nprobe = self.nprobe
            self.recommend_index = index

        if self.approximate_similar_items:
            log.debug("Building faiss similar items index")

            # likewise build up a cosine index for similar_items, using an
            # inner product index on normalized vectors
            norms = numpy.linalg.norm(item_factors, axis=1)
            norms[norms == 0] = 1e-10

            normalized = (item_factors.T / norms).T.astype('float32')
            if self.use_gpu:
                index = faiss.GpuIndexIVFFlat(self.gpu_resources, self.factors,
                                              self.nlist,
                                              faiss.METRIC_INNER_PRODUCT)
            else:
                index = faiss.IndexIVFFlat(self.quantizer, self.factors,
                                           self.nlist,
                                           faiss.METRIC_INNER_PRODUCT)

            index.train(normalized)
            index.add(normalized)
            index.nprobe = self.nprobe
            self.similar_items_index = index
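
The cosine index built for similar_items above relies on a standard trick: inner-product search over L2-normalized vectors. A self-contained sketch with made-up sizes:

import faiss
import numpy as np

d, nlist = 64, 16
xb = np.random.rand(1000, d).astype('float32')
faiss.normalize_L2(xb)  # in place; cosine == inner product on unit vectors

quantizer = faiss.IndexFlat(d, faiss.METRIC_INNER_PRODUCT)
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_INNER_PRODUCT)
index.train(xb)
index.add(xb)
index.nprobe = 4
D, I = index.search(xb[:5], 10)  # similarities, most similar first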
Example #3
    def __init__(self,
                 name,
                 dimensions,
                 create_ind2id=True,
                 create_ind2sent=False):
        self.name = name
        self.dimensions = dimensions
        self.index = None
        self.mutex = Lock()
        self.create_ind2id = create_ind2id
        self.create_ind2sent = create_ind2sent
        try:
            print("try to load " + self.name + " faiss index")
            self.index = faiss.read_index(FOLDER + self.name + '.index')
            print("successfuly loaded ", self.index.ntotal,
                  " entries in " + self.name + " index")
        except Exception:
            print("failed to load " + self.name + " faiss index")
            self.index = faiss.IndexFlat(self.dimensions)  # build the index

        if self.create_ind2id:
            # index to id
            self.ind2id = {}
            try:
                print("try load  " + self.name + " ind2id index")
                json_file = json.load(open(FOLDER + self.name + ".json", "r"))
                self.ind2id = {}
                for k, v in json_file.items():
                    self.ind2id[int(k)] = v
                print("successfuly  " + self.name + " loaded ",
                      len(self.ind2id), " entries in ind2id")
            except Exception:
                print("failed to load " + self.name + " ind2id")
                self.ind2id = {}

        # index to sentence
        if self.create_ind2sent:
            self.ind2sent = {}
            try:
                print("try load  " + self.name + " ind2sent index")
                json_file = json.load(
                    open(FOLDER + self.name + "-sentence.json", "r"))
                self.ind2sent = {}
                for k, v in json_file.items():
                    self.ind2sent[int(k)] = v
                print("successfuly  " + self.name + " loaded ",
                      len(self.ind2sent), " entries in ind2sent")
            except Exception:
                print("failed to load " + self.name + " ind2sent index")
                self.ind2sent = {}
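
The loading logic above implies a matching save step that is not part of the snippet. A hypothetical counterpart, assuming the same FOLDER storage directory:

    def save(self):
        # Persist the faiss index and, if enabled, the ind2id mapping.
        # json stringifies the int keys; __init__ converts them back.
        faiss.write_index(self.index, FOLDER + self.name + '.index')
        if self.create_ind2id:
            with open(FOLDER + self.name + '.json', 'w') as f:
                json.dump(self.ind2id, f)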
Example #4
 def build(self, vectors):
     t0 = time.time()
     if self.add_noise:
         vectors += np.random.randn(vectors.shape[0],
                                    vectors.shape[1]) * self.noise_amount
     self.num_points += vectors.shape[0]
     if not self.assume_unit_normed:
         vectors = unit_norm(vectors)
     import faiss
     self.index = faiss.IndexFlat(vectors.shape[1])
     self.index.verbose = True
     self.index.metric_type = faiss.METRIC_INNER_PRODUCT
     self.index.add(vectors)
     t1 = time.time()
     self.total_insert_time += t1 - t0
Example #5
 def test_remove_id_map(self):
     sub_index = faiss.IndexFlat(5)
     xb = np.zeros((10, 5), dtype='float32')
     xb[:, 0] = np.arange(10) + 1000
     index = faiss.IndexIDMap2(sub_index)
     index.add_with_ids(xb, np.arange(10) + 100)
     assert index.reconstruct(104)[0] == 1004
     index.remove_ids(np.array([103]))
     assert index.reconstruct(104)[0] == 1004
     try:
         index.reconstruct(103)
     except RuntimeError:  # faiss surfaces C++ exceptions as RuntimeError
         pass
     else:
         assert False, 'should have raised an exception'
Example #6
def make_indices_copy_from_cpu(nlist, d, qtype, by_residual, metric, clamp):
    to_train = make_t(10000, d, clamp)

    quantizer_cp = faiss.IndexFlat(d, metric)
    idx_cpu = faiss.IndexIVFScalarQuantizer(quantizer_cp, d, nlist,
                                            qtype, metric, by_residual)

    idx_cpu.train(to_train)
    idx_cpu.add(to_train)

    res = faiss.StandardGpuResources()
    res.noTempMemory()
    idx_gpu = faiss.GpuIndexIVFScalarQuantizer(res, idx_cpu)

    return idx_cpu, idx_gpu
Example #7
    def subtest(self, mt):
        d = 32
        xt, xb, xq = get_dataset_2(d, 1000, 2000, 200)
        nlist = 64

        gt_index = faiss.IndexFlat(d, mt)
        gt_index.add(xb)
        gt_D, gt_I = gt_index.search(xq, 10)
        quantizer = faiss.IndexFlat(d, mt)
        for qname in '8bit 4bit 8bit_uniform 4bit_uniform fp16'.split():
            qtype = getattr(faiss.ScalarQuantizer, 'QT_' + qname)
            index = faiss.IndexIVFScalarQuantizer(quantizer, d, nlist, qtype,
                                                  mt)
            index.train(xt)
            index.add(xb)
            index.nprobe = 4  # hopefully more robust than 1
            D, I = index.search(xq, 10)
            ninter = faiss.eval_intersection(I, gt_I)
            print('(%d, %s): %d, ' % (mt, repr(qname), ninter))
            assert ninter >= self.ref_results[(mt, qname)] - 4

            D2, I2 = self.subtest_add2col(xb, xq, index, qname)

            assert np.all(I2 == I)
Example #8
    def do_test_range(self, metric):
        ds = datasets.SyntheticDataset(32, 0, 1000, 10)
        xq = ds.get_queries()
        xb = ds.get_database()
        D, I = faiss.knn(xq, xb, 10, metric=metric)
        threshold = float(D[:, -1].mean())

        index = faiss.IndexFlat(32, metric)
        index.add(xb)
        ref_lims, ref_D, ref_I = index.range_search(xq, threshold)

        new_lims, new_D, new_I = range_ground_truth(
            xq, ds.database_iterator(bs=100), threshold, metric_type=metric)

        evaluation.test_ref_range_results(ref_lims, ref_D, ref_I, new_lims,
                                          new_D, new_I)
Example #9
 def __init__(self, dim: int, save_path: str, num_threads: int = None):
     """
     Constructor.
     :param dim:
     :param save_path:
     :param num_threads
     """
     self.dim = dim
     if num_threads is not None and num_threads > 0:
         faiss.omp_set_num_threads(num_threads)
     if isfile(save_path):
         logging.debug("restore: %s", save_path)
         self._index = faiss.read_index(save_path)
     else:
         self._sub_index = faiss.IndexFlat(dim)
         self._index = faiss.IndexIDMap2(self._sub_index)
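
IndexIDMap2 (unlike plain IndexIDMap) also keeps a reverse id map, so stored vectors can be reconstructed by their external id. A small standalone sketch:

import faiss
import numpy as np

d = 8
index = faiss.IndexIDMap2(faiss.IndexFlat(d))
xb = np.random.rand(4, d).astype('float32')
index.add_with_ids(xb, np.array([10, 20, 30, 40], dtype='int64'))

D, I = index.search(xb[:1], 2)  # I reports the custom ids, not 0..3
vec = index.reconstruct(20)     # fetch a stored vector back by its id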
Example #10
 def faiss_search_impl(emb_q, emb_id, emb_size, shift, k=50, search_batch_sz=50000, gpu=True):
     index = faiss.IndexFlat(emb_size)
     if gpu:
         index = faiss.index_cpu_to_all_gpus(index)
     index.add(emb_id)
     print('Total index =', index.ntotal)
     vals, inds = [], []
     for i_batch in range(0, len(emb_q), search_batch_sz):
         val, ind = index.search(emb_q[i_batch:min(i_batch + search_batch_sz, len(emb_q))], k)
         val = 1 - val
         vals.append(val)
         inds.append(ind + shift)
         # print(vals[-1].size())
         # print(inds[-1].size())
     del index, emb_id, emb_q
     vals, inds = np.concatenate(vals), np.concatenate(inds)
     return vals, inds
Example #11
def build_index(X: np.ndarray,
                pct_probe: float = DEFAULT_PCT_PROBE,
                approximate: bool = True,
                use_gpu: bool = True):
    """Buid a FAISS index from a reference dataframe.

    Args:
        X: The vectors to add to the index.
        pct_probe: The minimum fraction of nearest lists to search. If
            the product of pct_probe and the number of lists is less
            than 1, one list will be searched.
        approximate: Whether to build an approximate or exact index.

    Returns:
        An (index, lookup) tuple where the lookup returns the filepath
        for a given entry in the index.
    """
    if X is None:
        return None
    d = X.shape[1]
    if approximate:
        ntotal = X.shape[0]
        nlist = int(max(min(4 * np.sqrt(ntotal), ntotal / 39), 1))
        quantizer = faiss.IndexFlatL2(d)
        index = faiss.IndexIVFFlat(quantizer, d, nlist)
        gpu = False
        if use_gpu:
            try:
                res = faiss.StandardGpuResources()
                index = faiss.index_cpu_to_gpu(res, 0, index)
                gpu = True
            except AttributeError:
                LOGGER.info("Building approximate FAISS index on CPU.")
        index.train(X)
        batch_size = 10_000
        for i in range(0, X.shape[0], batch_size):
            index.add(X[i:i + batch_size])
        if gpu:
            index = faiss.index_gpu_to_cpu(index)  # pylint: disable=no-member
        nprobe = max(math.ceil(pct_probe * nlist), 1)
        faiss.ParameterSpace().set_index_parameter(index, "nprobe", nprobe)
    else:
        index = faiss.IndexFlat(d)
        index.add(X)  # pylint: disable=no-value-for-parameter
    return index
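
Hypothetical usage of build_index, assuming the module-level imports (faiss, numpy, math, LOGGER) of the original file; the sizes are chosen so the IVF training set is large enough:

import numpy as np

X = np.random.rand(5000, 128).astype('float32')
index = build_index(X, pct_probe=0.05, approximate=True, use_gpu=False)
D, I = index.search(X[:10], 5)  # 5 nearest stored vectors per query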
Example #12
    def test_PR_multiple(self):
        metric = faiss.METRIC_L2
        ds = datasets.SyntheticDataset(32, 1000, 1000, 10)
        xq = ds.get_queries()
        xb = ds.get_database()

        # good for ~10k results
        threshold = 15

        index = faiss.IndexFlat(32, metric)
        index.add(xb)
        ref_lims, ref_D, ref_I = index.range_search(xq, threshold)

        # now make a slightly suboptimal index
        index2 = faiss.index_factory(32, "PCA16,Flat")
        index2.train(ds.get_train())
        index2.add(xb)

        # PCA reduces distances so will have more results
        new_lims, new_D, new_I = index2.range_search(xq, threshold)

        all_thr = np.array([5.0, 10.0, 12.0, 15.0])
        for mode in "overall", "average":
            ref_precisions = np.zeros_like(all_thr)
            ref_recalls = np.zeros_like(all_thr)

            for i, thr in enumerate(all_thr):

                lims2, _, I2 = evaluation.filter_range_results(
                    new_lims, new_D, new_I, thr)

                prec, recall = evaluation.range_PR(ref_lims,
                                                   ref_I,
                                                   lims2,
                                                   I2,
                                                   mode=mode)

                ref_precisions[i] = prec
                ref_recalls[i] = recall

            precisions, recalls = evaluation.range_PR_multiple_thresholds(
                ref_lims, ref_I, new_lims, new_D, new_I, all_thr, mode=mode)

            np.testing.assert_array_almost_equal(ref_precisions, precisions)
            np.testing.assert_array_almost_equal(ref_recalls, recalls)
Example #13
    def do_test(self, nq, metric_type=faiss.METRIC_L2, k=10):
        d = 32
        nb = 1000
        nt = 0

        (xt, xb, xq) = get_dataset_2(d, nt, nb, nq)
        index = faiss.IndexFlat(d, metric_type)

        ### k-NN search

        index.add(xb)
        D1, I1 = index.search(xq, k)

        if metric_type == faiss.METRIC_L2:
            all_dis = ((xq.reshape(nq, 1, d) - xb.reshape(1, nb, d)) ** 2).sum(2)
            Iref = all_dis.argsort(axis=1)[:, :k]
        else:
            all_dis = np.dot(xq, xb.T)
            Iref = all_dis.argsort(axis=1)[:, ::-1][:, :k]

        Dref = all_dis[np.arange(nq)[:, None], Iref]
        self.assertLessEqual((Iref != I1).sum(), Iref.size * 0.0001)
        #  np.testing.assert_equal(Iref, I1)
        np.testing.assert_almost_equal(Dref, D1, decimal=5)

        ### Range search

        radius = float(np.median(Dref[:, -1]))

        lims, D2, I2 = index.range_search(xq, radius)

        for i in range(nq):
            l0, l1 = lims[i:i + 2]
            _, Il = D2[l0:l1], I2[l0:l1]
            if metric_type == faiss.METRIC_L2:
                Ilref, = np.where(all_dis[i] < radius)
            else:
                Ilref, = np.where(all_dis[i] > radius)
            Il.sort()
            Ilref.sort()
            np.testing.assert_equal(Il, Ilref)
            np.testing.assert_almost_equal(
                all_dis[i, Ilref], D2[l0:l1],
                decimal=5
            )
Example #14
    def do_test_knn(self, mt):
        d = 10
        nb = 100
        nq = 50
        nt = 0
        xt, xb, xq = get_dataset_2(d, nt, nb, nq)

        index = faiss.IndexFlat(d, mt)
        index.add(xb)

        D, I = index.search(xq, 10)

        dis = faiss.pairwise_distances(xq, xb, mt)
        o = dis.argsort(axis=1)
        assert np.all(I == o[:, :10])

        for q in range(nq):
            assert np.all(D[q] == dis[q, I[q]])
Example #15
    def subtest_build(self, knn_graph, thresh, metric=faiss.METRIC_L2):
        d = self.xq.shape[1]
        metrics = {faiss.METRIC_L2: 'L2', faiss.METRIC_INNER_PRODUCT: 'IP'}

        flat_index = faiss.IndexFlat(d, metric)
        flat_index.add(self.xb)
        Dref, Iref = flat_index.search(self.xq, 1)

        index = faiss.IndexNSGFlat(d, 16, metric)
        index.verbose = True

        index.build(self.xb, knn_graph)
        Dnsg, Insg = index.search(self.xq, 1)

        recalls = (Iref == Insg).sum()
        print('metric: {}, nb equal: {}'.format(metrics[metric], recalls))
        self.assertGreaterEqual(recalls, thresh)
        self.subtest_connectivity(index, self.xb.shape[0])
Example #16
    def fit(self, data):
        data = data.astype('float32')
        factors = data.shape[1]

        if self.gpu:
            self.res = faiss.StandardGpuResources()
            self.index = faiss.GpuIndexIVFFlat(self.res, factors, self.nlist,
                                               faiss.METRIC_INNER_PRODUCT)

        else:
            self.quantizer = faiss.IndexFlat(factors)
            self.index = faiss.IndexIVFFlat(self.quantizer, factors,
                                            self.nlist,
                                            faiss.METRIC_INNER_PRODUCT)

        self.index.train(data)
        self.index.add(data)
        self.index.nprobe = self.nprobe
Example #17
def evaluate(model, dataloader, writer, epoch, device, loss_type='bce'):
    contexts = []
    answers = []

    model.eval()
    loss_history = []
    for (context, context_len), (answer, answer_len) in dataloader:
        context_embeddings = model(
            context.to(device))  # [batch_size, emb_size]
        answer_embeddings = model(answer.to(device))  # [batch_size, emb_size]

        if loss_type == 'bce':
            loss = bce(context_embeddings, answer_embeddings)
        elif loss_type == 'triplet':
            loss = triplet_loss(context_embeddings, answer_embeddings)
        else:
        raise NotImplementedError('No such loss')
        loss_history.append(loss.item())

        contexts.append(context_embeddings.cpu().detach().numpy())
        answers.append(answer_embeddings.cpu().detach().numpy())

    loss_value = np.mean(loss_history)

    contexts = np.concatenate(contexts)  # np.concatenate tolerates a ragged final batch
    answers = np.concatenate(answers)

    emb_size = answers.shape[1]
    faiss_index = faiss.IndexFlat(emb_size)

    faiss_index.verbose = True
    faiss_index.add(answers)
    _, indexes = faiss_index.search(contexts, k=100)

    mrr = calculate_mrr(y_true=np.arange(indexes.shape[0]).reshape(-1, 1),
                        preds=indexes)
    write_metrics(writer,
                  epoch * len(dataloader),
                  loss_value,
                  mrr=mrr,
                  prefix='eval')
    print(
        f'Epoch = {epoch}, step = {epoch * len(dataloader)}, eval_loss = {loss_value}, mrr = {mrr}'
    )
Example #18
    def __init__(self, args: Namespace, dim: int = 2048) -> None:
        self.data_dir = args.data_dir
        self.images_dir = args.images_dir
        with open(path.join(args.data_dir, args.captions)) as infile:
            self.captions = infile.readlines()
        self.embeddings = np.load(path.join(args.data_dir, args.embeddings))
        # k and metric are not constructor parameters; assuming they are
        # carried on the argparse Namespace
        self.k = args.k
        self.metric = args.metric

        if self.metric == -1:
            # Cosine similarity
            self.index = faiss.IndexFlatIP(dim)
            faiss.normalize_L2(self.embeddings)
            self.index.add(self.embeddings)
        elif self.metric == 1:
            # Euclidean distance (no square root)
            self.index = faiss.IndexFlatL2(dim)
            self.index.add(self.embeddings)
        elif self.metric == 23:
            # Mahalanobis distance
            self.index = faiss.IndexFlatL2(dim)
            x_centered = self.embeddings - self.embeddings.mean(0)
            self.transform = np.linalg.inv(np.linalg.cholesky(
                np.dot(x_centered.T, x_centered) / x_centered.shape[0])).T
            self.index.add(
                np.dot(self.embeddings, self.transform).astype(np.float32))
        elif self.metric == 0:
            # Inner product
            self.index = faiss.IndexFlatIP(dim)
            self.index.add(self.embeddings)
        else:
            self.index = faiss.IndexFlat(dim, self.metric)
            self.index.add(self.embeddings)

        self.model = wide_resnet101_2(pretrained=True, progress=True)
        self.model.eval()  # Don't forget to put model in evaluation mode!
        self.model.fc = Identity()
        # Use recommended sequence of transforms for ImageNet pretrained models
        self.transforms = Compose([Resize(256, interpolation=Image.BICUBIC),  # Default is bilinear
                                   CenterCrop(224),
                                   ToTensor(),
                                   Normalize(mean=[0.485, 0.456, 0.406],
                                             std=[0.229, 0.224, 0.225])])
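
For the Mahalanobis branch, queries must be projected through the same whitening matrix before searching, since Mahalanobis distance equals L2 distance after whitening. A hedged sketch of the matching query path (search_mahalanobis is not part of the original class):

    def search_mahalanobis(self, query_embeddings, k):
        # Whiten the queries exactly as the database was whitened in
        # __init__; L2 distance in this space is Mahalanobis distance.
        q = np.dot(query_embeddings, self.transform).astype(np.float32)
        return self.index.search(q, k)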
Example #19
    def make_knn_graph(self, metric):
        n = self.xb.shape[0]
        d = self.xb.shape[1]
        index = faiss.IndexFlat(d, metric)
        index.add(self.xb)
        _, I = index.search(self.xb, self.GK + 1)
        knn_graph = np.zeros((n, self.GK), dtype=np.int64)

        # For the inner product distance, the distance between a vector and itself
        # may not be the smallest, so it is not guaranteed that I[:, 0] is the query itself.
        for i in range(n):
            cnt = 0
            for j in range(self.GK + 1):
                if I[i, j] != i:
                    knn_graph[i, cnt] = I[i, j]
                    cnt += 1
                if cnt == self.GK:
                    break
        return knn_graph
Example #20
    def test_hnsw(self):

        d = 10
        nb = 1000
        nq = 100
        nt = 0
        xt, xb, xq = get_dataset_2(d, nt, nb, nq)

        mt = faiss.METRIC_L1

        index = faiss.IndexHNSW(faiss.IndexFlat(d, mt))
        index.add(xb)

        D, I = index.search(xq, 10)

        dis = faiss.pairwise_distances(xq, xb, mt)

        for q in range(nq):
            assert np.all(D[q] == dis[q, I[q]])
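
The test above works because IndexFlat accepts metrics beyond L2 and inner product. A standalone sketch checking brute-force METRIC_L1 distances against a NumPy reference:

import faiss
import numpy as np

d = 10
xb = np.random.rand(1000, d).astype('float32')
xq = np.random.rand(5, d).astype('float32')

index = faiss.IndexFlat(d, faiss.METRIC_L1)
index.add(xb)
D, I = index.search(xq, 10)

ref = np.abs(xq[:, None, :] - xb[None, :, :]).sum(-1)  # all pairwise L1
np.testing.assert_allclose(D, np.sort(ref, axis=1)[:, :10], rtol=1e-5)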
Example #21
def build_coo_graph(ids, embeds, k=100):
    # find kNN
    X = np.vstack(embeds)
    X /= np.linalg.norm(X, axis=1)[:, np.newaxis]
    d = X.shape[1]
    n_cells = int(math.sqrt(X.shape[0]))
    n_probe = 50
    quantizer = faiss.IndexFlat(d, faiss.METRIC_INNER_PRODUCT)
    index = faiss.IndexIVFFlat(quantizer, d, n_cells,
                               faiss.METRIC_INNER_PRODUCT)
    index.train(X)
    index.nprobe = n_probe
    index.add(X)
    D, I = index.search(X, k)

    # build sparse coo graph
    dim = max(ids) + 1
    v_remap_ids = np.vectorize(lambda i: ids[i])
    I = v_remap_ids(I)
    rows = np.tile(np.asarray(ids)[:, np.newaxis],
                   (1, I.shape[1])).reshape(-1, )
    cols = I.reshape(-1, )
    data = D.reshape(-1, )
    not_self_loop_mask = (rows != cols)
    rows = rows[not_self_loop_mask]
    cols = cols[not_self_loop_mask]
    data = data[not_self_loop_mask]

    # add in fake edge
    rows = np.concatenate((rows, np.arange(dim - 1), np.arange(1, dim)),
                          axis=0)
    cols = np.concatenate((cols, np.arange(1, dim), np.arange(dim - 1)),
                          axis=0)
    data = np.concatenate((data, np.repeat(1e-12, 2 * (dim - 1))), axis=0)

    # make the graph symmetric
    _rows = np.concatenate((rows, cols), axis=0)
    _cols = np.concatenate((cols, rows), axis=0)
    _data = np.concatenate((data, data), axis=0)
    coo = csr_matrix((_data, (_rows, _cols)), shape=(dim, dim)).tocoo()

    return coo
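
Hypothetical usage of build_coo_graph, assuming the module-level imports (faiss, numpy, math, scipy's csr_matrix) of the original file; with sizes this small, faiss may warn about the IVF training set:

import numpy as np

rng = np.random.default_rng(0)
ids = list(range(100))
embeds = [rng.random(16, dtype=np.float32) for _ in ids]

coo = build_coo_graph(ids, embeds, k=5)
print(coo.shape, coo.nnz)  # (100, 100) symmetric similarity graph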
Example #22
def prepare_coarse_quantizer(preproc, cent_cachefile, ncent, is_gpu):

    if cent_cachefile and os.path.exists(cent_cachefile):
        print("load centroids", cent_cachefile)
        centroids = np.load(cent_cachefile)
    else:
        nt = max(1000000, 256 * ncent)
        print("train coarse quantizer...")
        t0 = time.time()
        centroids = train_coarse_quantizer(xt, ncent, preproc, is_gpu)
        print("centroids:", centroids[128])
        print("Coarse train time: %.3f s" % (time.time() - t0))
        if cent_cachefile:
            print("store centroids", cent_cachefile)
            np.save(cent_cachefile, centroids)

    coarse_quantizer = faiss.IndexFlat(preproc.d_out, fmetric)
    coarse_quantizer.add(centroids)

    return coarse_quantizer
Example #23
    def do_test_compute_GT(self, metric=faiss.METRIC_L2):
        d = 64
        xt, xb, xq = get_dataset_2(d, 0, 10000, 100)

        index = faiss.IndexFlat(d, metric)
        index.add(xb)
        Dref, Iref = index.search(xq, 10)

        # iterator function on the matrix

        def matrix_iterator(xb, bs):
            for i0 in range(0, xb.shape[0], bs):
                yield xb[i0:i0 + bs]

        Dnew, Inew = knn_ground_truth(xq, matrix_iterator(xb, 1000), 10,
                                      metric)

        np.testing.assert_array_equal(Iref, Inew)
        # decimal = 4 required when run on GPU
        np.testing.assert_almost_equal(Dref, Dnew, decimal=4)
Example #24
 def __init__(self,
              vec_dimension: int,
              transformation: Callable = None,
              metric: str = 'l2',
              num_clusters: int = None,
              num_probe: int = None,
              num_bytes: int = None):
     self.quantizer = faiss.IndexFlat(vec_dimension, self.metrics[metric])
     if num_bytes is not None:
         self.index = faiss.IndexIVFPQ(self.quantizer, vec_dimension,
                                       num_clusters, num_bytes,
                                       self.metrics[metric])
     else:
         self.index = faiss.IndexIVFFlat(self.quantizer, vec_dimension,
                                         num_clusters, self.metrics[metric])
     if num_probe is not None:  # guard: the default of None would not be a valid nprobe
         self.index.nprobe = num_probe
     self.index.make_direct_map()
     self.transformation = transformation
     self.mapper = {}
     self.inverted_mapper = {}
Example #25
def range_search_gpu(xq, r2, index_gpu, index_cpu):
    """GPU does not support range search, so we emulate it with
    knn search + fallback to CPU index.

    The index_cpu can either be a CPU index or a numpy table that will
    be used to construct a Flat index if needed.
    """
    nq, d = xq.shape
    LOG.debug("GPU search %d queries" % nq)
    k = min(index_gpu.ntotal, 1024)
    D, I = index_gpu.search(xq, k)
    if index_gpu.metric_type == faiss.METRIC_L2:
        mask = D[:, k - 1] < r2
    else:
        mask = D[:, k - 1] > r2
    if mask.sum() > 0:
        LOG.debug("CPU search remain %d" % mask.sum())
        if isinstance(index_cpu, np.ndarray):
            # then it is in fact an array that we have to make flat
            xb = index_cpu
            index_cpu = faiss.IndexFlat(d, index_gpu.metric_type)
            index_cpu.add(xb)
        lim_remain, D_remain, I_remain = index_cpu.range_search(xq[mask], r2)
    LOG.debug("combine")
    D_res, I_res = [], []
    nr = 0
    for i in range(nq):
        if not mask[i]:
            if index_gpu.metric_type == faiss.METRIC_L2:
                nv = (D[i, :] < r2).sum()
            else:
                nv = (D[i, :] > r2).sum()
            D_res.append(D[i, :nv])
            I_res.append(I[i, :nv])
        else:
            l0, l1 = lim_remain[nr], lim_remain[nr + 1]
            D_res.append(D_remain[l0:l1])
            I_res.append(I_remain[l0:l1])
            nr += 1
    lims = np.cumsum([0] + [len(di) for di in D_res])
    return lims, np.hstack(D_res), np.hstack(I_res)
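
A usage sketch for range_search_gpu, assuming a GPU build of faiss and the module-level imports (faiss, numpy, LOG) of the original file:

import faiss
import numpy as np

d = 32
xb = np.random.rand(10000, d).astype('float32')
xq = np.random.rand(100, d).astype('float32')

index_cpu = faiss.IndexFlat(d)
index_cpu.add(xb)
index_gpu = faiss.index_cpu_to_all_gpus(index_cpu)

# r2 is a squared L2 radius, since the index metric is L2
lims, D, I = range_search_gpu(xq, 4.0, index_gpu, index_cpu)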
Example #26
    def test_IndexFlat(self):
        d = 32
        nb = 1000
        nt = 0
        nq = 200

        (xt, xb, xq) = get_dataset_2(d, nt, nb, nq)
        index = faiss.IndexFlat(d, faiss.METRIC_L2)

        index.add(xb)

        # invalid k
        k = -5
        self.assertRaises(AssertionError, index.search, xq, k)

        # valid k
        k = 5
        D, I = index.search(xq, k)

        self.assertEqual(D.shape[0], nq)
        self.assertEqual(D.shape[1], k)
Example #27
    def fit(self, dataloader):
        idx = 0
        for (_, _), (answer, _) in tqdm(dataloader):
            #if (idx % 100):
            #    print(idx)
            device = torch.device(
                'cuda' if torch.cuda.is_available() else 'cpu')
            answer = answer.to(device)
            vector = self.model(answer)
            self.vect2text[vector] = answer
            idx += 1

        vectors = torch.stack(list(self.vect2text.keys()))

        emb_size = vectors.shape[2]
        self.faiss_index = faiss.IndexFlat(emb_size)

        self.faiss_index.verbose = True
        #print(vectors.shape[0], vectors.shape[2])
        self.faiss_index.add(vectors.detach().cpu().numpy().reshape(
            vectors.shape[0], vectors.shape[2]))
Example #28
    def subtest_add2col(self, xb, xq, index, qname):
        """Test with 2 additional dimensions to take also the non-SIMD
        codepath. We don't retrain anything but add 2 dims to the
        queries, the centroids and the trained ScalarQuantizer.
        """
        nb, d = xb.shape

        d2 = d + 2
        xb2 = self.add2columns(xb)
        xq2 = self.add2columns(xq)

        nlist = index.nlist
        quantizer = faiss.downcast_index(index.quantizer)
        quantizer2 = faiss.IndexFlat(d2, index.metric_type)
        centroids = faiss.vector_to_array(quantizer.xb).reshape(nlist, d)
        centroids2 = self.add2columns(centroids)
        quantizer2.add(centroids2)
        index2 = faiss.IndexIVFScalarQuantizer(
            quantizer2, d2, index.nlist, index.sq.qtype,
            index.metric_type)
        index2.nprobe = 4
        if qname in ('8bit', '4bit'):
            trained = faiss.vector_to_array(index.sq.trained).reshape(2, -1)
            nt = trained.shape[1]
            # 2 lines: vmins and vdiffs
            new_nt = int(nt * d2 / d)
            trained2 = np.hstack((
                trained,
                np.zeros((2, new_nt - nt), dtype='float32')
            ))
            trained2[1, nt:] = 1.0   # set vdiff to 1 to avoid div by 0
            faiss.copy_array_to_vector(trained2.ravel(), index2.sq.trained)
        else:
            index2.sq.trained = index.sq.trained

        index2.is_trained = True
        index2.add(xb2)
        return index2.search(xq2, 10)
Example #29
    def test_query_iterator(self, metric=faiss.METRIC_L2):
        ds = datasets.SyntheticDataset(32, 0, 1000, 1000)
        xq = ds.get_queries()
        xb = ds.get_database()
        D, I = faiss.knn(xq, xb, 10, metric=metric)
        threshold = float(D[:, -1].mean())
        print(threshold)

        index = faiss.IndexFlat(32, metric)
        index.add(xb)
        ref_lims, ref_D, ref_I = index.range_search(xq, threshold)

        def matrix_iterator(xb, bs):
            for i0 in range(0, xb.shape[0], bs):
                yield xb[i0:i0 + bs]

        # check repro OK
        _, new_lims, new_D, new_I = range_search_max_results(
            index, matrix_iterator(xq, 100), threshold)

        evaluation.test_ref_range_results(
            ref_lims, ref_D, ref_I,
            new_lims, new_D, new_I
        )

        max_res = ref_lims[-1] // 2

        new_threshold, new_lims, new_D, new_I = range_search_max_results(
            index, matrix_iterator(xq, 100), threshold, max_results=max_res)

        self.assertLessEqual(new_lims[-1], max_res)

        ref_lims, ref_D, ref_I = index.range_search(xq, new_threshold)

        evaluation.test_ref_range_results(
            ref_lims, ref_D, ref_I,
            new_lims, new_D, new_I
        )
Example #30
    def do_test(self, metric):
        d = 32
        xt, xb, xq = get_dataset_2(d, 2000, 1000, 200)
        index1 = faiss.index_factory(d, "PQ4x4np", metric)
        Dref, Iref = faiss.knn(xq, xb, 10, metric)

        index1.train(xt)
        index1.add(xb)

        D1, I1 = index1.search(xq, 100)

        recall1 = (I1 == Iref[:, :1]).sum()

        # add refine index on top
        index_flat = faiss.IndexFlat(d, metric)
        index_flat.add(xb)

        index2 = faiss.IndexRefine(index1, index_flat)
        index2.k_factor = 10.0
        D2, I2 = index2.search(xq, 10)

        # check distance is computed properly
        for i in range(len(xq)):
            x1 = xq[i]
            x2 = xb[I2[i, 5]]
            if metric == faiss.METRIC_L2:
                dref = ((x1 - x2) ** 2).sum()
            else:
                dref = np.dot(x1, x2)
            np.testing.assert_almost_equal(dref, D2[i, 5], decimal=5)

        # check that with refinement, the recall@10 is the same as
        # the original recall@100
        recall2 = (I2 == Iref[:, :1]).sum()
        # print("recalls", recall1, recall2)
        self.assertEqual(recall1, recall2)