Code Example #1
File: nn_search.py  Project: facebookresearch/muss
def get_nearest_sentence_ids(query_index, db_index, topk, nprobe, batch_size=1024, use_gpu=True):
    try:
        faiss.ParameterSpace().set_index_parameter(db_index, 'nprobe', nprobe)
    except RuntimeError as e:
        if 'could not set parameter nprobe' in str(e):
            pass
        else:
            raise e
    if use_gpu:
        db_index = faiss.index_cpu_to_all_gpus(db_index)
    all_distances = np.empty((query_index.ntotal, topk))
    all_sentence_ids = np.empty((query_index.ntotal, topk), dtype=int)
    for batch_idx in range((query_index.ntotal // batch_size) + 1):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, query_index.ntotal)
        actual_batch_size = end_idx - start_idx
        query_embeddings = query_index.reconstruct_n(start_idx, actual_batch_size)  # TODO: Do this in the background
        distances, sentence_ids = db_index.search(query_embeddings, topk)
        all_distances[start_idx:end_idx] = distances
        all_sentence_ids[start_idx:end_idx] = sentence_ids
    # If distances are sorted in descending order, we make them ascending instead for the following code to work
    if np.all(np.diff(all_distances) <= 0):
        # This is tailored for transforming cosine similarity into a pseudo-distance: the maximum cosine similarity is 1 (vectors are equal).
        # Hence distance = 1 - cosine will always be positive and will be equal to 0 when vectors are equal.
        all_distances = 1 - all_distances
    return all_distances, all_sentence_ids.astype(int)
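
A minimal usage sketch, not from the muss project: it assumes the query sentences and the database sentences have already been embedded and written to disk as faiss indexes; the paths and parameter values below are purely illustrative.

import faiss

query_index = faiss.read_index('query_sentences.faiss_index')   # hypothetical path
db_index = faiss.read_index('db_sentences.faiss_index')         # hypothetical path
distances, sentence_ids = get_nearest_sentence_ids(
    query_index, db_index, topk=8, nprobe=16,
    use_gpu=faiss.get_num_gpus() > 0)
print(distances.shape, sentence_ids.shape)  # both (query_index.ntotal, 8)
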
Code Example #2
    def __init__(self, dim=10, nlist=100, gpu=-1):
        self.dim = dim
        self.nlist = nlist  # number of cluster centroids (IVF lists)
        #self.index = faiss.IndexFlatL2(dim)    # build the index
        quantizer = faiss.IndexFlatL2(dim)  # coarse quantizer for the IVF index

        # faiss.METRIC_L2: faiss defines two similarity metrics,
        # faiss.METRIC_L2 (Euclidean distance) and faiss.METRIC_INNER_PRODUCT (inner product).
        # METRIC_L2 is passed explicitly here; use faiss.METRIC_INNER_PRODUCT for inner-product search instead.
        self.index = faiss.IndexIVFFlat(quantizer, dim, self.nlist,
                                        faiss.METRIC_L2)

        try:
            if gpu >= 0:
                if gpu == 0:
                    # use a single GPU
                    res = faiss.StandardGpuResources()
                    gpu_index = faiss.index_cpu_to_gpu(res, 0, self.index)
                else:
                    gpu_index = faiss.index_cpu_to_all_gpus(self.index)

                self.index = gpu_index
        except Exception:
            # fall back to the CPU index if GPU resources cannot be set up
            pass

        # data
        self.xb = None
Code Example #3
    def __init__(self,
                 iterator=None,
                 filename=None,
                 embeddings=None,
                 shape=None,
                 device="cpu"):

        self.iterator = iterator

        if os.path.exists(filename):

            print(f'Loading existing index from {filename}')
            self.index = faiss.read_index(filename)

        else:

            self.index = faiss.index_factory(shape, "Flat",
                                             faiss.METRIC_INNER_PRODUCT)
            faiss.normalize_L2(embeddings)
            self.index.add(embeddings)
            faiss.write_index(self.index, filename)
            print(f'Index written at {filename}')

        if device == "cuda":
            print('Now running on CUDA')
            self.index = faiss.index_cpu_to_all_gpus(self.index)

        print(f'Index trained - {self.index.is_trained}')
Code Example #4
File: test_gpu_basics.py  Project: zjw0304/faiss
        def create_gpu(dim):
            gpu_quantizer = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(dim))

            index = create_cpu(dim)
            index.clustering_index = gpu_quantizer
            index.dont_dealloc_me = gpu_quantizer
            return index
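
The snippet above is from a unit test and calls a create_cpu helper defined elsewhere in that file. The self-contained sketch below is an assumption about the overall pattern it exercises: keep an IVF index on the CPU but run its k-means training on all visible GPUs by swapping in a GPU-resident clustering_index.

import faiss
import numpy as np

d, nlist = 64, 256
xt = np.random.rand(10000, d).astype('float32')   # training vectors

quantizer = faiss.IndexFlatL2(d)                   # coarse quantizer stays on CPU
cpu_index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)

gpu_quantizer = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d))
cpu_index.clustering_index = gpu_quantizer         # clustering assignments run on the GPUs
cpu_index.dont_dealloc_me = gpu_quantizer          # keep a reference alive, as the test does

cpu_index.train(xt)
print(cpu_index.is_trained)
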
Code Example #5
def get_nearestneighbors_faiss(xq,
                               xb,
                               k,
                               device,
                               needs_exact=True,
                               verbose=False):
    assert device in ["cpu", "cuda"]

    if verbose:
        print("Computing nearest neighbors (Faiss)")

    if needs_exact or device == 'cuda':
        index = faiss.IndexFlatL2(xq.shape[1])
    else:
        index = faiss.index_factory(xq.shape[1], "HNSW32")
        index.hnsw.efSearch = 64
    if device == 'cuda':
        index = faiss.index_cpu_to_all_gpus(index)

    start = time.time()
    index.add(xb)
    _, I = index.search(xq, k)
    if verbose:
        print("  NN search (%s) done in %.2f s" %
              (device, time.time() - start))

    return I
Code Example #6
File: similarity_search.py  Project: TIBHannover/VIVA
def load_index(path_index, mode="cpu"):
    index = faiss.read_index(path_index)
    if mode == "gpu":
        ngpus = faiss.get_num_gpus()
        if ngpus > 0:
            index = faiss.index_cpu_to_all_gpus(index)
    return index
Code Example #7
def get_knn(reference_embeddings,
            test_embeddings,
            k,
            embeddings_come_from_same_source=False):
    """
    Finds the k elements in reference_embeddings that are closest to each
    element of test_embeddings.
    Args:
        reference_embeddings: numpy array of size (num_samples, dimensionality).
        test_embeddings: numpy array of size (num_samples2, dimensionality).
        k: int, number of nearest neighbors to find
        embeddings_come_from_same_source: if True, then the nearest neighbor of
                                         each element (which is actually itself)
                                         will be ignored.
    """
    d = reference_embeddings.shape[1]
    logging.info("running k-nn with k=%d" % k)
    logging.info("embedding dimensionality is %d" % d)
    index = faiss.IndexFlatL2(d)
    if faiss.get_num_gpus() > 0:
        index = faiss.index_cpu_to_all_gpus(index)
    index.add(reference_embeddings)
    _, indices = index.search(test_embeddings, k + 1)
    if embeddings_come_from_same_source:
        return indices[:, 1:]
    return indices[:, :k]
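
A quick usage sketch with random data (assumed, not from the original source); faiss expects contiguous float32 arrays:

import numpy as np

rng = np.random.default_rng(0)
reference = rng.random((1000, 128), dtype=np.float32)
queries = rng.random((10, 128), dtype=np.float32)

neighbors = get_knn(reference, queries, k=5)
print(neighbors.shape)  # (10, 5)

# When querying the reference set against itself, drop the trivial self-match:
self_neighbors = get_knn(reference, reference, k=5,
                         embeddings_come_from_same_source=True)
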
Code Example #8
def faiss_knn(feats_train, targets_train, feats_val, targets_val, k):
    feats_train = feats_train.numpy()
    targets_train = targets_train.numpy()
    feats_val = feats_val.numpy()
    targets_val = targets_val.numpy()

    d = feats_train.shape[-1]

    index = faiss.IndexFlatL2(d)  # build the index
    co = faiss.GpuMultipleClonerOptions()
    co.useFloat16 = True
    co.shard = True
    gpu_index = faiss.index_cpu_to_all_gpus(index, co)
    gpu_index.add(feats_train)

    D, I = gpu_index.search(feats_val, k)

    pred = np.zeros(I.shape[0])
    for i in range(I.shape[0]):
        votes = list(Counter(targets_train[I[i]]).items())
        shuffle(votes)
        pred[i] = max(votes, key=lambda x: x[1])[0]

    acc = 100.0 * (pred == targets_val).mean()

    return acc
Code Example #9
File: memory.py  Project: ferna11i/scan_unsupervised
    def mine_nearest_neighbors(self, topk, calculate_accuracy=True):
        # mine the topk nearest neighbors for every sample
        import faiss
        features = self.features.cpu().numpy()
        n, dim = features.shape[0], features.shape[1]
        index = faiss.IndexFlatIP(dim)
        index = faiss.index_cpu_to_all_gpus(index)
        index.add(features)
        distances, indices = index.search(features, topk +
                                          1)  # Sample itself is included

        np.save(
            "/scratch/b/bkantarc/jfern090/Projects/Lytica/results/tabledb/pretext/features.npy",
            features)

        # evaluate
        if calculate_accuracy:
            targets = self.targets.cpu().numpy()
            neighbor_targets = np.take(
                targets, indices[:,
                                 1:], axis=0)  # Exclude sample itself for eval
            anchor_targets = np.repeat(targets.reshape(-1, 1), topk, axis=1)
            accuracy = np.mean(neighbor_targets == anchor_targets)
            return indices, accuracy

        else:
            return indices
Code Example #10
def build_faiss_index(nd_feats_array, mode):
    """
    build index on CPU, a single GPU, or multiple GPUs
    :param nd_feats_array: numpy float32 feature array of shape (n, d)
    :param mode: 0: CPU; 1: GPU; 2: Multi-GPU
    :return: the built index with all feature vectors added
    """
    d = nd_feats_array.shape[1]

    cpu_index = faiss.IndexFlatL2(d)  # build the index on CPU
    if mode == 0:
        print("[INFO] Is trained? >> {}".format(cpu_index.is_trained))
        cpu_index.add(nd_feats_array)  # add vectors to the index
        print("[INFO] Capacity of gallery: {}".format(cpu_index.ntotal))

        return cpu_index
    elif mode == 1:
        ngpus = faiss.get_num_gpus()
        print("[INFO] number of GPUs:", ngpus)
        res = faiss.StandardGpuResources()  # use a single GPU
        gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index)
        gpu_index.add(nd_feats_array)  # add vectors to the index
        print("[INFO] Capacity of gallery: {}".format(gpu_index.ntotal))

        return gpu_index
    elif mode == 2:
        multi_gpu_index = faiss.index_cpu_to_all_gpus(
            cpu_index)  # build the index on multi GPUs
        multi_gpu_index.add(nd_feats_array)  # add vectors to the index
        print("[INFO] Capacity of gallery: {}".format(multi_gpu_index.ntotal))

        return multi_gpu_index
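
A usage sketch under the assumption that the gallery features form an (n, d) float32 array; the mode is picked from the number of visible GPUs:

import faiss
import numpy as np

gallery_feats = np.random.rand(50000, 512).astype('float32')

ngpus = faiss.get_num_gpus()
mode = 2 if ngpus > 1 else (1 if ngpus == 1 else 0)
index = build_faiss_index(gallery_feats, mode)

distances, ids = index.search(gallery_feats[:5], 10)  # 10-NN of the first five vectors
print(ids.shape)  # (5, 10)
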
Code Example #11
def get_k(bases, xb, params, k_data, dates):
    if params['kLineFirst']:
        dim = len(params['pickedStockKLine']['values'])
    else:
        dim = len(params['pickedStockTicks']['values'])
    #query
    kLine = params['pickedStockKLine']['values']
    open = list(map(lambda x: x[0], kLine))
    close = list(map(lambda x: x[1], kLine))
    low = list(map(lambda x: x[2], kLine))
    high = list(map(lambda x: x[3], kLine))
    volume = params['pickedStockKLine']['volumes']
    query = list(map(lambda x: x / open[0], open)) + list(
        map(lambda x: x / open[0], close)) + list(
            map(lambda x: x / open[0], low)) + list(
                map(lambda x: x / open[0], high)) + list(
                    map(lambda x: x[1] / volume[0][1], volume))
    xq = np.array([np.array(query)]).astype('float32')
    ngpus = faiss.get_num_gpus()
    #build index
    start = clock()
    cpu_index = faiss.IndexFlatL2(dim * 5)
    gpu_index = faiss.index_cpu_to_all_gpus(cpu_index)
    gpu_index.add(xb)

    D, I = gpu_index.search(xq, 10)
    #have not done
    end = clock()
    print(end - start)
    results = list(map(lambda x: bases[x], I[0]))
    print(results)
    return jsonify(back_and_front(results, k_data, dates))  #not yet
Code Example #12
def faiss_search_impl(emb_q,
                      emb_id,
                      emb_size,
                      shift,
                      k=50,
                      search_batch_sz=50000,
                      gpu=True):
    index = faiss.IndexFlat(emb_size)
    if gpu:
        index = faiss.index_cpu_to_all_gpus(index)
    index.add(emb_id)
    print('Total index =', index.ntotal)
    vals, inds = [], []
    for i_batch in tqdm(range(0, len(emb_q), search_batch_sz)):
        val, ind = index.search(
            emb_q[i_batch:min(i_batch + search_batch_sz, len(emb_q))], k)
        val = torch.from_numpy(val)
        val = 1 - val
        vals.append(val)
        inds.append(torch.from_numpy(ind) + shift)
        # print(vals[-1].size())
        # print(inds[-1].size())
    del index, emb_id, emb_q
    vals, inds = torch.cat(vals), torch.cat(inds)
    return vals, inds
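
A hedged usage sketch with synthetic embeddings (names and sizes are illustrative). shift is added to the returned ids, which matters when emb_id is only one shard of a larger embedding table; here it is simply 0.

import faiss
import numpy as np

emb_id = np.random.rand(200000, 300).astype('float32')   # database shard
emb_q = np.random.rand(5000, 300).astype('float32')      # queries

vals, inds = faiss_search_impl(emb_q, emb_id, emb_size=300, shift=0,
                               k=50, gpu=faiss.get_num_gpus() > 0)
print(vals.shape, inds.shape)  # torch tensors of shape (5000, 50)
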
Code Example #13
    def _set_mips_index(self):
        """
        Create a Faiss Flat index with inner product as the metric
        to search against
        """
        try:
            import faiss
        except ImportError:
            raise Exception(
                "Error: Please install faiss to use FaissMIPSIndex")

        if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
            print("\n> Building index", flush=True)

        cpu_index = faiss.IndexFlatIP(self.embed_size)

        if self.use_gpu:
            # create resources and config for GpuIndex
            config = faiss.GpuMultipleClonerOptions()
            config.shard = True
            config.useFloat16 = True
            gpu_index = faiss.index_cpu_to_all_gpus(cpu_index, co=config)
            self.mips_index = faiss.IndexIDMap(gpu_index)
            if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
                print(">> Initialized index on GPU", flush=True)
        else:
            # CPU index supports IDs so wrap with IDMap
            self.mips_index = faiss.IndexIDMap(cpu_index)
            if mpu.is_unitialized() or mpu.get_data_parallel_rank() == 0:
                print(">> Initialized index on CPU", flush=True)

        # if we were constructed with a BlockData, then automatically load it
        # when the FAISS structure is built
        if self.embed_data is not None:
            self.add_embed_data(self.embed_data)
Code Example #14
    def __init__(self, database, method):
        super().__init__(database, method)
        self.index = {'cosine': faiss.IndexFlatIP,
                      'euclidean': faiss.IndexFlatL2}[method](self.D)
        if os.environ.get('CUDA_VISIBLE_DEVICES'):
            self.index = faiss.index_cpu_to_all_gpus(self.index)
        self.add()
Code Example #15
def compute_GT_sliced(xb, xq, k):
    print "compute GT"
    t0 = time.time()
    nb, d = xb.shape
    nq, d = xq.shape
    rh = ResultHeap(nq, k)
    bs = 10 ** 5

    xqs = sanitize(xq)

    db_gt = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d))

    # compute ground-truth by blocks of bs, and add to heaps
    for i0 in range(0, nb, bs):
        i1 = min(nb, i0 + bs)
        xsl = sanitize(xb[i0:i1])
        db_gt.add(xsl)
        D, I = db_gt.search(xqs, k)
        rh.add_batch_result(D, I, i0)
        db_gt.reset()
        print "\r   %d/%d, %.3f s" % (i0, nb, time.time() - t0),
        sys.stdout.flush()
    print
    rh.finalize()
    gt_I = rh.I

    print "GT time: %.3f s" % (time.time() - t0)
    return gt_I
Code Example #16
def knn_gpu(x, y, k, mem=5 * 1024 * 1024 * 1024):
    # Adapted From: https://github.com/google-research/xtreme/blob/
    #               522434d1aece34131d997a97ce7e9242a51a688a/third_party/utils_retrieve.py
    import faiss

    dim = x.shape[1]
    batch_size = mem // (dim * 4)
    sim = np.zeros((x.shape[0], k), dtype=np.float32)
    ind = np.zeros((x.shape[0], k), dtype=np.int64)
    for xfrom in range(0, x.shape[0], batch_size):
        xto = min(xfrom + batch_size, x.shape[0])
        bsims, binds = [], []
        for yfrom in range(0, y.shape[0], batch_size):
            yto = min(yfrom + batch_size, y.shape[0])
            idx = faiss.IndexFlatIP(dim)
            idx = faiss.index_cpu_to_all_gpus(idx)
            idx.add(y[yfrom:yto])
            bsim, bind = idx.search(x[xfrom:xto], min(k, yto - yfrom))
            bsims.append(bsim)
            binds.append(bind + yfrom)
            del idx
        bsims = np.concatenate(bsims, axis=1)
        binds = np.concatenate(binds, axis=1)
        aux = np.argsort(-bsims, axis=1)
        for i in range(xfrom, xto):
            for j in range(k):
                sim[i, j] = bsims[i - xfrom, aux[i - xfrom, j]]
                ind[i, j] = binds[i - xfrom, aux[i - xfrom, j]]
    return sim, ind
Code Example #17
    def _faiss_index_to_device(
            index: "faiss.Index",
            device: Optional[Union[int, List[int]]] = None) -> "faiss.Index":
        """
        Sends a faiss index to a device.
        A device can either be a positive integer (GPU id), a negative integer (all GPUs),
            or a list of positive integers (select GPUs to use), or `None` for CPU.
        """

        # If device is not specified, then it runs on CPU.
        if device is None:
            return index

        import faiss  # noqa: F811

        # If the device id is given as an integer
        if isinstance(device, int):
            # Positive integers are directly mapped to GPU ids
            if device > -1:
                faiss_res = faiss.StandardGpuResources()
                index = faiss.index_cpu_to_gpu(faiss_res, device, index)
            # And negative integers mean using all GPUs
            else:
                index = faiss.index_cpu_to_all_gpus(index)
        # Device ids given as a list mean mapping to those devices specified.
        elif isinstance(device, (list, tuple)):
            index = faiss.index_cpu_to_gpus_list(index, gpus=list(device))
        else:
            raise TypeError(
                f"The argument type: {type(device)} is not expected. " +
                "Please pass in either nothing, a positive int, a negative int, or a list of positive ints."
            )

        return index
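
In its original source this is a helper method of an index wrapper class; the sketch below calls it as if it were a standalone function, purely to illustrate the device conventions the docstring describes (None keeps the index on CPU, a non-negative int selects that GPU, a negative int means all GPUs, and a list selects specific GPUs). The two-GPU list is an assumption about the machine.

import faiss
import numpy as np

index = faiss.IndexFlatIP(768)
index.add(np.random.rand(1000, 768).astype('float32'))

cpu_index = _faiss_index_to_device(index, device=None)      # stays on CPU
gpu0_index = _faiss_index_to_device(index, device=0)        # first GPU
all_gpu_index = _faiss_index_to_device(index, device=-1)    # every visible GPU
pair_index = _faiss_index_to_device(index, device=[0, 1])   # GPUs 0 and 1 only
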
Code Example #18
    def __init__(self,
                 model,
                 *,
                 num_documents,
                 doc_seq_len,
                 documents_memmap_path,
                 masks_memmap_path=None,
                 num_evidence=4,
                 reindex_batch_size=4,
                 use_faiss_ann=False):
        super().__init__()
        self.dim = model.dim
        self.num_evidence = num_evidence

        self.model = model.cuda()
        self.num_docs = num_documents
        self.doc_shape = (num_documents, doc_seq_len)
        self.documents_path = documents_memmap_path
        self.knn_path = f'{self.documents_path}.knn'

        self.use_faiss_ann = use_faiss_ann
        if use_faiss_ann:
            self.index = FaissANN(self.dim, self.num_docs)
        else:
            index = faiss.IndexFlatL2(self.dim)
            self.index = faiss.index_cpu_to_all_gpus(index)

        self.reindex_batch_size = reindex_batch_size
        self.reindex()

        self.dataset = DocumentDataset(num_documents, doc_seq_len,
                                       num_evidence, documents_memmap_path,
                                       masks_memmap_path)

        self.dataset.set_knn_path(self.knn_path)
Code Example #19
File: search.py  Project: waterbearbee/pymetric
def search_gpu(query_path, refer_path, output, topk=100):
    queryfeas, queryconts = loadFeaFromPickle(query_path)
    referfeas, referconts = loadFeaFromPickle(refer_path)
    assert(queryfeas.shape[1] == referfeas.shape[1])
    dim = int(queryfeas.shape[1])
    print("=> query feature shape: {}".format(queryfeas.shape), file=sys.stderr)
    print("=> refer feature shape: {}".format(referfeas.shape), file=sys.stderr)
    
    start = time.time()
    ngpus = faiss.get_num_gpus()
    print("=> search use gpu number of GPUs: {}".format(ngpus), file=sys.stderr)
    cpu_index = faiss.IndexFlat(dim, faiss.METRIC_INNER_PRODUCT)   # build the index
    gpu_index = faiss.index_cpu_to_all_gpus(  # build the index
            cpu_index
            )
    gpu_index.add(referfeas)                  # add vectors to the index
    print("=> building gpu index success, "
          "total index number: {}".format(gpu_index.ntotal), file=sys.stderr)
    distance, ind = gpu_index.search(queryfeas, int(topk))
    assert(distance.shape == ind.shape)
    end = time.time()
    print("=> searching total use time {}".format(end - start), file=sys.stderr)
    outdic = {}
    for key_id in range(queryfeas.shape[0]):
        querycont = queryconts[key_id]
        searchresult = [(referconts[ind[key_id][i]], distance[key_id][i]) \
                         for i in range(len(distance[key_id]))]
        outdic[querycont] = searchresult
    print("=> convert search gpu result to output format success")
    pickle.dump(outdic, open(output,"wb"), protocol=2)
Code Example #20
def create_faiss_index(vecs, method, n_gpu):
    """
    Create FAISS index on GPU(s).
    To create a GPU index with FAISS, one first creates it on the CPU and then copies it to the GPU(s).
    Note that a "flat" index means that it is brute-force, with no approximation techniques.
    """
    # Build flat CPU index given the chosen method.
    if method=='l2':
        index = faiss.IndexFlatL2(vecs.shape[1])  # Exact Search for L2
    elif method=='ip':
        index = faiss.IndexFlatIP(vecs.shape[1])  # Exact Search for Inner Product
    elif method=='cos':
        # Cosine similarity comes down to normalizing the embeddings beforehand and then applying inner product.
        vecs = preprocessing.normalize(vecs, norm='l2')
        index = faiss.IndexFlatIP(vecs.shape[1])
    else:
        print("Error: Please choose between L2 Distance ('l2'), Inner Product Distance ('ip') or Cosine Distance ('cos') as brute-force method for exact search. Exiting...")
        sys.exit(0)
    
    # Convert to flat GPU index.
    if n_gpu > 0:
        co = faiss.GpuMultipleClonerOptions()  # If using multiple GPUs, enable sharding so that the dataset is divided across the GPUs rather than replicated.
        co.shard = True
        index = faiss.index_cpu_to_all_gpus(index, co=co, ngpu=n_gpu)  # Convert CPU index to GPU index.
    
    # Add vectors to GPU index.
    index.add(vecs)
    
    # Convert back to cpu index (needed for saving it to disk).
    index = faiss.index_gpu_to_cpu(index)

    return index
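
A usage sketch with illustrative names and sizes: build a cosine-similarity index on however many GPUs are visible, then save the CPU copy that the function returns. Queries must be L2-normalized the same way the indexed vectors were.

import faiss
import numpy as np

vecs = np.random.rand(100000, 256).astype('float32')
index = create_faiss_index(vecs, method='cos', n_gpu=faiss.get_num_gpus())

faiss.write_index(index, 'vectors.index')   # hypothetical output path

queries = vecs[:3].copy()
faiss.normalize_L2(queries)                 # match the 'cos' preprocessing
scores, ids = index.search(queries, 5)
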
Code Example #21
def knn_ground_truth(xq, db_iterator, k, metric_type=faiss.METRIC_L2):
    """Computes the exact KNN search results for a dataset that possibly
    does not fit in RAM but for which we have an iterator that
    returns it block by block.
    """
    LOG.info("knn_ground_truth queries size %s k=%d" % (xq.shape, k))
    t0 = time.time()
    nq, d = xq.shape
    rh = faiss.ResultHeap(nq, k)

    index = faiss.IndexFlat(d, metric_type)
    if faiss.get_num_gpus():
        LOG.info('running on %d GPUs' % faiss.get_num_gpus())
        index = faiss.index_cpu_to_all_gpus(index)

    # compute ground-truth by blocks, and add to heaps
    i0 = 0
    for xbi in db_iterator:
        ni = xbi.shape[0]
        index.add(xbi)
        D, I = index.search(xq, k)
        I += i0
        rh.add_result(D, I)
        index.reset()
        i0 += ni
        LOG.info("%d db elements, %.3f s" % (i0, time.time() - t0))

    rh.finalize()
    LOG.info("GT time: %.3f s (%d vectors)" % (time.time() - t0, i0))

    return rh.D, rh.I
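
A minimal sketch of the block-iterator contract the function expects (synthetic data; in practice the iterator would stream the database from disk):

import numpy as np

xb = np.random.rand(500_000, 96).astype('float32')   # stand-in for an on-disk dataset
xq = np.random.rand(1000, 96).astype('float32')

def db_blocks(x, block_size=100_000):
    # yield the database block by block, as knn_ground_truth expects
    for i0 in range(0, x.shape[0], block_size):
        yield x[i0:i0 + block_size]

D, I = knn_ground_truth(xq, db_blocks(xb), k=10)
print(D.shape, I.shape)   # both (1000, 10)
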
Code Example #22
File: mine_bitexts_wei.py  Project: ftakanashi/LASER
def knnGPU(x, y, k, mem=5 * 1024 * 1024 * 1024):
    dim = x.shape[1]
    batch_size = mem // (dim * 4)
    sim = np.zeros((x.shape[0], k), dtype=np.float32)
    ind = np.zeros((x.shape[0], k), dtype=np.int64)
    for xfrom in range(0, x.shape[0], batch_size):
        xto = min(xfrom + batch_size, x.shape[0])
        bsims, binds = [], []
        for yfrom in range(0, y.shape[0], batch_size):
            yto = min(yfrom + batch_size, y.shape[0])
            # print('{}-{}  ->  {}-{}'.format(xfrom, xto, yfrom, yto))
            idx = faiss.IndexFlatIP(dim)
            idx = faiss.index_cpu_to_all_gpus(idx)
            idx.add(y[yfrom:yto])
            bsim, bind = idx.search(x[xfrom:xto], min(k, yto - yfrom))
            bsims.append(bsim)
            binds.append(bind + yfrom)
            del idx
        bsims = np.concatenate(bsims, axis=1)
        binds = np.concatenate(binds, axis=1)
        aux = np.argsort(-bsims, axis=1)
        for i in range(xfrom, xto):
            for j in range(k):
                sim[i, j] = bsims[i - xfrom, aux[i - xfrom, j]]
                ind[i, j] = binds[i - xfrom, aux[i - xfrom, j]]
    return sim, ind
Code Example #23
def run_kmeans(x, nmb_clusters):
    """
    Args:
        x: data
        nmb_clusters (int): number of clusters
    Returns:
        list: ids of data in each cluster
    """
    x = c_f.to_numpy(x).astype(np.float32)
    n_data, d = x.shape
    logging.info("running k-means clustering with k=%d" % nmb_clusters)
    logging.info("embedding dimensionality is %d" % d)

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    index = faiss.IndexFlatL2(d)
    if faiss.get_num_gpus() > 0:
        index = faiss.index_cpu_to_all_gpus(index)
    # perform the training
    clus.train(x, index)
    _, idxs = index.search(x, 1)

    return [int(n[0]) for n in idxs]
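
A usage sketch with random data (assumed): the return value is one cluster id per input row.

import numpy as np

x = np.random.rand(20000, 64).astype(np.float32)
cluster_ids = run_kmeans(x, nmb_clusters=100)
print(len(cluster_ids), max(cluster_ids))   # 20000 assignments, ids in [0, 100)
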
Code Example #24
    def mine_nearest_neighbors(self, topk, calculate_accuracy=True):
        #features = self.features.cpu().numpy()
        #print(features.shape)
        #similarities = squareform(pdist(features))
        #indices = np.argpartition(similarities, topk)[:,:topk]

        import faiss
        features = self.features.cpu().numpy()
        _, dim = features.shape[0], features.shape[1]
        index = faiss.IndexFlatIP(dim)
        index = faiss.index_cpu_to_all_gpus(index)
        index.add(features)
        _, indices = index.search(features,
                                  topk + 1)  # Sample itself is included

        # evaluate
        if calculate_accuracy:
            targets = self.targets.cpu().numpy()
            #neighbor_targets = np.take(targets, indices[:,:], axis=0) # Exclude sample itself for eval
            neighbor_targets = np.take(
                targets, indices[:,
                                 1:], axis=0)  # Exclude sample itself for eval
            anchor_targets = np.repeat(targets.reshape(-1, 1), topk, axis=1)
            accuracy = np.mean(neighbor_targets == anchor_targets)
            return indices, accuracy

        else:
            return indices
Code Example #25
    def run(self,
            data: Dict[str, np.ndarray],
            faiss_task,
            output_dir,
            opts={}):
        save_path = f"{output_dir}/{faiss_task}.in"
        if os.path.isfile(save_path):
            logger.info(f"Reading from saved path: {save_path}")
            index = faiss.read_index(save_path)
            index = faiss.index_cpu_to_all_gpus(index)
        else:
            logger.info(f"Constructing Faiss: {opts}")
            # data_db = reduce(
            #     lambda data, val: np.vstack((data, val)) if data is not None else val,
            #     data.values(),
            #     None,
            # )
            if type(data) is dict:
                data_db = np.array(list(data.values()))
            else:
                data_db = data
            logger.info(f"Shape of db {data_db.shape}")
            n_gpus = faiss.get_num_gpus()
            logger.info(f"Number of GPUs available: {n_gpus}")
            if opts.get("mips", True):
                logger.info("Building MIPS indexes")
                index = faiss.IndexFlatIP(data_db.shape[1])
                index = faiss.IndexIVFFlat(
                    index,
                    data_db.shape[1],
                    opts.get("nlist", int(4 * math.sqrt(len(data)))),
                    faiss.METRIC_INNER_PRODUCT,
                )
            else:
                index = faiss.IndexFlatL2(data_db.shape[1])
            if n_gpus > 0:
                logger.info("Building GPU model")
                index = faiss.index_cpu_to_all_gpus(index)

            logger.info("Builing Indexes")
            data_db = np.float32(data_db)
            if opts.get("mips", True):
                index.train(data_db)
            index.add(data_db)
            logger.info(f"Gpu Index: {index.ntotal}")
            faiss.write_index(faiss.index_gpu_to_cpu(index), save_path)
        return index
Code Example #26
File: faiss_gpu.py  Project: yuk12/dgl
    def __init__(self,
                 target,
                 nprobe=128,
                 index_factory_str=None,
                 verbose=False,
                 mode='proxy',
                 using_gpu=True):
        self._res_list = []

        num_gpu = faiss.get_num_gpus()
        print('[faiss gpu] #GPU: {}'.format(num_gpu))

        size, dim = target.shape
        assert size > 0, "size: {}".format(size)
        index_factory_str = "IVF{},PQ{}".format(
            min(8192, 16 * round(np.sqrt(size))),
            32) if index_factory_str is None else index_factory_str
        cpu_index = faiss.index_factory(dim, index_factory_str)
        cpu_index.nprobe = nprobe

        if mode == 'proxy':
            co = faiss.GpuClonerOptions()
            co.useFloat16 = True
            co.usePrecomputed = False

            index = faiss.IndexProxy()
            for i in range(num_gpu):
                res = faiss.StandardGpuResources()
                self._res_list.append(res)
                sub_index = faiss.index_cpu_to_gpu(
                    res, i, cpu_index, co) if using_gpu else cpu_index
                index.addIndex(sub_index)
        elif mode == 'shard':
            co = faiss.GpuMultipleClonerOptions()
            co.useFloat16 = True
            co.usePrecomputed = False
            co.shard = True
            index = faiss.index_cpu_to_all_gpus(cpu_index, co, ngpu=num_gpu)
        else:
            raise KeyError("Unknown index mode")

        index = faiss.IndexIDMap(index)
        index.verbose = verbose

        # get nlist to decide how many samples used for training
        nlist = int(
            float([
                item for item in index_factory_str.split(",") if 'IVF' in item
            ][0].replace("IVF", "")))

        # training
        if not index.is_trained:
            indexes_sample_for_train = np.random.randint(0, size, nlist * 256)
            index.train(target[indexes_sample_for_train])

        # add with ids
        target_ids = np.arange(0, size)
        index.add_with_ids(target, target_ids)
        self.index = index
Code Example #27
File: exhaustive_search.py  Project: zhyali/faiss
def range_ground_truth(xq,
                       db_iterator,
                       threshold,
                       metric_type=faiss.METRIC_L2,
                       shard=False,
                       ngpu=-1):
    """Computes the range-search search results for a dataset that possibly
    does not fit in RAM but for which we have an iterator that
    returns it block by block.
    """
    nq, d = xq.shape
    t0 = time.time()
    xq = np.ascontiguousarray(xq, dtype='float32')

    index = faiss.IndexFlat(d, metric_type)
    if ngpu == -1:
        ngpu = faiss.get_num_gpus()
    if ngpu:
        LOG.info('running on %d GPUs' % ngpu)
        co = faiss.GpuMultipleClonerOptions()
        co.shard = shard
        index_gpu = faiss.index_cpu_to_all_gpus(index, co=co, ngpu=ngpu)

    # compute ground-truth by blocks
    i0 = 0
    D = [[] for _i in range(nq)]
    I = [[] for _i in range(nq)]
    all_lims = []
    for xbi in db_iterator:
        ni = xbi.shape[0]
        if ngpu > 0:
            index_gpu.add(xbi)
            lims_i, Di, Ii = range_search_gpu(xq, threshold, index_gpu, xbi)
            index_gpu.reset()
        else:
            index.add(xbi)
            lims_i, Di, Ii = index.range_search(xq, threshold)
            index.reset()
        Ii += i0
        for j in range(nq):
            l0, l1 = lims_i[j], lims_i[j + 1]
            if l1 > l0:
                D[j].append(Di[l0:l1])
                I[j].append(Ii[l0:l1])
        i0 += ni
        LOG.info("%d db elements, %.3f s" % (i0, time.time() - t0))

    empty_I = np.zeros(0, dtype='int64')
    empty_D = np.zeros(0, dtype='float32')
    # import pdb; pdb.set_trace()
    D = [(np.hstack(i) if i != [] else empty_D) for i in D]
    I = [(np.hstack(i) if i != [] else empty_I) for i in I]
    sizes = [len(i) for i in I]
    assert len(sizes) == nq
    lims = np.zeros(nq + 1, dtype="uint64")
    lims[1:] = np.cumsum(sizes)
    return lims, np.hstack(D), np.hstack(I)
Code Example #28
File: build_and_query_index.py  Project: vadam5/NeMo
def build_index(cfg: DictConfig, model: object): 
    """
    Builds faiss index from index dataset specified in the config.
        
    Args:
        cfg: Config file specifying index parameters
    """
    # Get index dataset embeddings 
    # PCA model exists and index embeddings have already been PCAed, no need to re-extract/PCA them
    if cfg.apply_pca and os.path.isfile(cfg.pca.pca_save_name) and os.path.isfile(cfg.pca_embeddings_save_name):
        logging.info("Loading reduced dimensionality embeddings")
        embeddings = h5py.File(cfg.pca_embeddings_save_name, "r")
        embeddings = embeddings[cfg.index_ds.name][:] 

    elif os.path.isfile(cfg.embedding_save_name):
        logging.info("Loading previously extracted index dataset embeddings")
        embeddings = h5py.File(cfg.embedding_save_name, "r")
        embeddings = embeddings[cfg.index_ds.name][:]

    else:
        logging.info("Encoding index dataset, this may take a while")
        index_dataloader = model.setup_dataloader(cfg.index_ds, is_index_data=True)
        embeddings, concept_ids = get_index_embeddings(cfg, index_dataloader, model)


    # Create pca model to reduce dimensionality of index dataset and decrease memory footprint
    if cfg.apply_pca:

        # Need to train PCA model and apply PCA transformation with newly trained model
        if not os.path.isfile(cfg.pca.pca_save_name):
            logging.info("Fitting PCA model for embedding dimensionality reduction")
            pca_train_set = random.sample(list(embeddings), k=int(len(embeddings)*cfg.pca.sample_fraction))
            pca = PCA(n_components=cfg.pca.output_dim)
            pca.fit(pca_train_set)
            pkl.dump(pca, open(cfg.pca.pca_save_name, "wb"))
            embeddings = reduce_embedding_dim(pca, embeddings, cfg)
        
        #PCA model already trained, just need to reduce dimensionality of all embeddings
        elif not os.path.isfile(cfg.pca_embeddings_save_name):
            pca = pkl.load(open(cfg.pca.pca_save_name, "rb"))
            embeddings = reduce_embedding_dim(pca, embeddings, cfg)


    # Build faiss index from embeddings
    logging.info(f"Training index with embedding dim size {cfg.dims} using {faiss.get_num_gpus()} gpus")
    quantizer = faiss.IndexFlatL2(cfg.dims)
    index = faiss.IndexIVFFlat(quantizer, cfg.dims, cfg.nlist)
    index = faiss.index_cpu_to_all_gpus(index)
    index.train(embeddings)

    logging.info("Adding dataset embeddings to index")
    for i in tqdm(range(0, embeddings.shape[0], cfg.index_batch_size)):
        index.add(embeddings[i:i+cfg.index_batch_size])

    logging.info("Saving index")
    faiss.write_index(faiss.index_gpu_to_cpu(index), cfg.index_save_name)
    logging.info("Index built and saved")
Code Example #29
File: distributed_kmeans.py  Project: zz198808/milvus
    def __init__(self, x, gpu_id, verbose=False):
        DatasetAssign.__init__(self, x)
        index = faiss.IndexFlatL2(x.shape[1])
        if gpu_id >= 0:
            self.index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(),
                                                gpu_id, index)
        else:
            # -1 -> assign to all GPUs
            self.index = faiss.index_cpu_to_all_gpus(index)
Code Example #30
File: eval_knn.py  Project: UMBCvision/SSL-Backdoor
def faiss_knn(feats_train, targets_train, feats_val, targets_val,
              feats_val_poisoned, targets_val_poisoned, k):
    feats_train = feats_train.numpy()
    targets_train = targets_train.numpy()
    feats_val = feats_val.numpy()
    targets_val = targets_val.numpy()

    d = feats_train.shape[-1]

    index = faiss.IndexFlatL2(d)  # build the index
    co = faiss.GpuMultipleClonerOptions()
    co.useFloat16 = True
    co.shard = True
    gpu_index = faiss.index_cpu_to_all_gpus(index, co)
    gpu_index.add(feats_train)

    # Val clean
    D, I = gpu_index.search(feats_val, k)

    # create confusion matrix ROWS ground truth COLUMNS pred
    conf_matrix_clean = np.zeros(
        (int(targets_val.max()) + 1, int(targets_val.max()) + 1))

    pred = np.zeros(I.shape[0])
    for i in range(I.shape[0]):
        votes = list(Counter(targets_train[I[i]]).items())
        shuffle(votes)
        pred[i] = max(votes, key=lambda x: x[1])[0]
        # update confusion matrix
        conf_matrix_clean[targets_val[i], int(pred[i])] += 1

    acc = 100.0 * (pred == targets_val).mean()

    # Val poisoned
    feats_val_poisoned = feats_val_poisoned.numpy()
    targets_val_poisoned = targets_val_poisoned.numpy()

    D, I = gpu_index.search(feats_val_poisoned, k)

    # create confusion matrix ROWS ground truth COLUMNS pred
    conf_matrix_poisoned = np.zeros((int(targets_val_poisoned.max()) + 1,
                                     int(targets_val_poisoned.max()) + 1))

    pred_poisoned = np.zeros(I.shape[0])
    for i in range(I.shape[0]):
        votes = list(Counter(targets_train[I[i]]).items())
        shuffle(votes)
        pred_poisoned[i] = max(votes, key=lambda x: x[1])[0]
        # update confusion matrix
        conf_matrix_poisoned[targets_val_poisoned[i],
                             int(pred_poisoned[i])] += 1

    acc_poisoned = 100.0 * (pred_poisoned == targets_val_poisoned).mean()

    return acc, conf_matrix_clean, acc_poisoned, conf_matrix_poisoned
Code Example #31
File: 5-Multiple-GPUs.py  Project: mrlocker/faiss
import numpy as np

d = 64                           # dimension
nb = 100000                      # database size
nq = 10000                       # nb of queries
np.random.seed(1234)             # make reproducible
xb = np.random.random((nb, d)).astype('float32')
xb[:, 0] += np.arange(nb) / 1000.
xq = np.random.random((nq, d)).astype('float32')
xq[:, 0] += np.arange(nq) / 1000.

import faiss                     # make faiss available

ngpus = faiss.get_num_gpus()

print("number of GPUs:", ngpus)

cpu_index = faiss.IndexFlatL2(d)

gpu_index = faiss.index_cpu_to_all_gpus(  # build the index
    cpu_index
)

gpu_index.add(xb)              # add vectors to the index
print(gpu_index.ntotal)

k = 4                          # we want to see 4 nearest neighbors
D, I = gpu_index.search(xq, k) # actual search
print(I[:5])                   # neighbors of the 5 first queries
print(I[-5:])                  # neighbors of the 5 last queries