Example #1
    def test_knn_cpu(self):
        xb = np.random.rand(200, 32).astype('float32')
        xq = np.random.rand(100, 32).astype('float32')

        index = faiss.IndexFlatL2(32)
        index.add(xb)
        Dref, Iref = index.search(xq, 10)

        Dnew, Inew = knn(xq, xb, 10)

        assert np.all(Inew == Iref)
        assert np.allclose(Dref, Dnew)


        index = faiss.IndexFlatIP(32)
        index.add(xb)
        Dref, Iref = index.search(xq, 10)

        Dnew, Inew = knn(xq, xb, 10, metric=faiss.METRIC_INNER_PRODUCT)

        assert np.all(Inew == Iref)
        assert np.allclose(Dref, Dnew)
    def test_knn_gpu_datatypes(self):
        torch.manual_seed(10)
        d = 10
        nb = 1024
        nq = 5
        k = 10
        res = faiss.StandardGpuResources()

        # make GT on torch cpu and test using IndexFlatL2
        xb = torch.rand(nb, d, dtype=torch.float32)
        xq = torch.rand(nq, d, dtype=torch.float32)

        index = faiss.IndexFlatL2(d)
        index.add(xb)
        gt_D, gt_I = index.search(xq, k)

        xb_c = xb.cuda().half()
        xq_c = xq.cuda().half()

        # use i32 output indices
        D = torch.zeros(nq, k, device=xb_c.device, dtype=torch.float32)
        I = torch.zeros(nq, k, device=xb_c.device, dtype=torch.int32)

        faiss.knn_gpu(res, xq_c, xb_c, k, D, I)

        self.assertTrue(torch.equal(I.long().cpu(), gt_I))
        self.assertLess((D.float().cpu() - gt_D).abs().max(), 1.5e-3)

        # Test using numpy
        D = np.zeros((nq, k), dtype=np.float32)
        I = np.zeros((nq, k), dtype=np.int32)

        xb_c = xb.half().numpy()
        xq_c = xq.half().numpy()

        faiss.knn_gpu(res, xq_c, xb_c, k, D, I)

        self.assertTrue(torch.equal(torch.from_numpy(I).long(), gt_I))
        self.assertLess((torch.from_numpy(D) - gt_D).abs().max(), 1.5e-3)
Example #3
    def test_tutorial(self):
        np.random.seed(FIXED_SEED)  # make reproducible

        database = np.random.random(
            (DATABASE_SIZE, DIMENSIONS)).astype('float32')
        database[:, 0] += np.arange(DATABASE_SIZE) / NORMALIZATION_FACTOR

        query_array = np.random.random(
            (NUM_QUERIES, DIMENSIONS)).astype('float32')
        query_array[:, 0] += np.arange(NUM_QUERIES) / NORMALIZATION_FACTOR

        index = faiss.IndexFlatL2(DIMENSIONS)  # build the index
        assert index.is_trained  # should be true
        index.add(database)  # add vectors to the index

        assert index.ntotal == DATABASE_SIZE  # assert the index size is the same as database size

        D, I = index.search(database[:SMALL_NUM_QUERIES],
                            NUM_NEIGHBORS)  # sanity check

        assert (I[:SMALL_NUM_QUERIES, 0] == np.arange(SMALL_NUM_QUERIES)).all()

        self._test_index_results(
            SMALL_NUM_QUERIES,
            DATABASE_SIZE,
            NUM_NEIGHBORS,
            D,
            I,
        )

        D, I = index.search(query_array, NUM_NEIGHBORS)  # actual search

        self._test_index_results(
            NUM_QUERIES,
            DATABASE_SIZE,
            NUM_NEIGHBORS,
            D,
            I,
        )
Example #4
def get_neighborhood(data, sample_idcs, buffer=0., use_faiss=True):
    """Determine all data points within the sphere in data space defined by the samples.

    Args:
        data (np.ndarray): NxD array containing N D-dimensional data vectors
        sample_idcs (iterable of ints): indices of the data points that define the sphere
        buffer (optional, float): fraction of the radius by which to additionally enlarge the sphere
        use_faiss (optional, bool): whether to use the faiss library for the distance calculation
    """
    # get center of samples
    center = np.mean(data[sample_idcs], axis=0, keepdims=True)

    if use_faiss:
        index = faiss.IndexFlatL2(data.shape[1])  # build the index
        index.add(data.astype('float32'))  # add vectors to the index
        distances, indices = index.search(center.astype('float32'), len(data))
        distances, indices = np.sqrt(
            distances[0]), indices[0]  # faiss returns squared distances

        radius = max(
            [d for d, i in zip(distances, indices) if i in sample_idcs])
        radius += buffer * radius

        local_idcs = []
        for d, i in zip(distances, indices):
            if d > radius:
                break
            local_idcs.append(i)
        local_idcs = np.array(local_idcs)

    else:
        distances = np.array([euclidean(d, center) for d in data])

        radius = max(distances[sample_idcs])
        radius += buffer * radius

        local_idcs = np.where(distances <= radius)[0]

    return local_idcs, center, radius
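
A minimal usage sketch for get_neighborhood (assuming faiss is importable; the data and sample indices below are illustrative, not from the original source):

import numpy as np

data = np.random.rand(1000, 8).astype('float32')   # 1,000 points in 8-D
sample_idcs = [3, 42, 77]                           # points that define the sphere
local_idcs, center, radius = get_neighborhood(data, sample_idcs, buffer=0.1)
print(len(local_idcs), radius)
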
def predict_landmark_id(ids_query,
                        feats_query,
                        ids_train,
                        feats_train,
                        landmark_dict,
                        topk=3):
    print('build index...')
    cpu_index = faiss.IndexFlatL2(feats_train.shape[1])
    cpu_index.add(feats_train)
    dists, topk_idx = cpu_index.search(x=feats_query, k=topk)
    print('query search done.')

    df = pd.DataFrame(ids_query, columns=['id'])
    df['images'] = np.apply_along_axis(' '.join,
                                       axis=1,
                                       arr=ids_train[topk_idx])

    rows = []
    for imidx, (_, r) in tqdm.tqdm(enumerate(df.iterrows()), total=len(df)):
        image_ids = [name.split('/')[-1] for name in r.images.split(' ')]
        counter = Counter()
        for i, image_id in enumerate(image_ids[:topk]):
            landmark_id = landmark_dict[image_id]

            counter[landmark_id] += 1.0 - dists[imidx, i]

        landmark_id, score = counter.most_common(1)[0]
        rows.append({
            'id': r['id'],
            'landmarks': f'{landmark_id} {score:.9f}',
        })

    pred = pd.DataFrame(rows).set_index('id')
    pred['landmark_id'], pred['score'] = list(
        zip(*pred['landmarks'].apply(lambda x: str(x).split(' '))))
    pred['score'] = pred['score'].astype(np.float32)

    return pred
    def __init__(self, config={}, images_info=None, load_database=False):
        super().__init__()
        default_config = {
            'saved_model_path':
            '/media/li/lavie/dataset/birdview_dataset/saved_models',
            'num_clusters': 64,
            'final_dim': 256,
            'save_dir': None,
            'num_results': 3,
        }
        config = {**default_config, **config}

        base_model = BaseModel()
        net_vlad = NetVLAD(num_clusters=config["num_clusters"],
                           dim=256,
                           alpha=1.0,
                           outdim=config["final_dim"])
        self.model_ = EmbedNet(base_model, net_vlad)
        saved_model_file = os.path.join(config["saved_model_path"],
                                        'model-to-check-top1.pth.tar')
        model_checkpoint = torch.load(
            saved_model_file, map_location=lambda storage, loc: storage)
        self.model_.load_state_dict(model_checkpoint)

        self.save_dir_ = config['save_dir']
        self.images_info_ = [] if images_info is None else images_info
        self.index_ = faiss.IndexFlatL2(config['final_dim'])
        self.device_ = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # self.device_ = torch.device("cpu")
        self.model_.to(self.device_)
        self.input_transforms_ = input_transforms()
        self.num_results_ = config["num_results"]

        if load_database:
            self._generate_database()
        pass
Example #7
    def _fit_array(self, embs_arr, labels_arr):
        '''
        Fit current matcher to new embs and labels
        :param embs: list of embs
        :param labels: list of label (face id) for each emb
        '''
        length = embs_arr.shape[0]
        if length > 0:
            # only fit if we have data
            self._rwlock.writer_acquire()
            cpu_classifier = faiss.IndexFlatL2(Config.Matcher.EMB_LENGTH)

            # This line allows faiss to run on GPUs. There is another
            # function that lets you choose a specific number of GPUs
            # (e.g. 4), but we use this one here because GPUtil already
            # limits the number of GPUs available to us.
            self._classifier = faiss.index_cpu_to_all_gpus(cpu_classifier)
            self._classifier.add(embs_arr.astype('float32'))
            self._matcher_tup = FaissTuple(embs_arr, labels_arr, length)
            self._rwlock.writer_release()
        else:
            self._classifier = None
            self._matcher_tup = None
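
The CPU-to-all-GPUs replication used above, as a standalone sketch (assumes a CUDA-enabled faiss build; array sizes are illustrative):

import numpy as np
import faiss

embs = np.random.rand(5000, 128).astype('float32')
cpu_index = faiss.IndexFlatL2(embs.shape[1])
gpu_index = faiss.index_cpu_to_all_gpus(cpu_index)   # replicate the index on every visible GPU
gpu_index.add(embs)
D, I = gpu_index.search(embs[:10], 5)                # 5 nearest neighbours of the first 10 embeddings
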
Example #8
    def __init__(self,
                 domain,
                 candidates=10,
                 seed=None,
                 nList=100,
                 nProbe=10,
                 dataset=False):
        self.domain = domain
        self.d = len(self.domain)
        self.candNum = candidates
        self.k = 1  # number of nearest neighbors to search
        self.nlist = nList  # faiss parameter
        quantizer = faiss.IndexFlatL2(self.d)  # the other index
        self.faissIndex = faiss.IndexIVFFlat(quantizer, self.d, self.nlist)
        if not dataset:
            tempRandData = np.random.random((100000, self.d)).astype('float32')
        else:
            tempRandData = dataset

        self.faissIndex.train(tempRandData)
        self.faissIndex.nprobe = nProbe
        np.random.seed(seed)
        self.selected_set = []  # todo: make it numpy array
Example #9
    def test_read_buffer(self):
        d, n = 32, 1000
        x = np.random.uniform(size=(n, d)).astype('float32')
        index = faiss.IndexFlatL2(d)
        index.add(x)

        fd, fname = tempfile.mkstemp()
        os.close(fd)
        try:
            faiss.write_index(index, fname)

            reader = faiss.BufferedIOReader(faiss.FileIOReader(fname), 1234)

            index2 = faiss.read_index(reader)

            self.assertEqual(index.d, index2.d)
            np.testing.assert_array_equal(faiss.vector_to_array(index.xb),
                                          faiss.vector_to_array(index2.xb))

        finally:
            del reader
            if os.path.exists(fname):
                os.unlink(fname)
Example #10
def IndexCreate(dname,
                idx_type,
                verbose=False,
                normalize=True,
                save_index=False,
                dim=1024):

    assert idx_type == 'FlatL2', 'only FlatL2 index is currently supported'
    x = np.fromfile(dname, dtype=np.float32, count=-1)
    nbex = x.shape[0] // dim
    print(' - embedding: {:s} {:d} examples of dim {:d}'.format(
        dname, nbex, dim))
    x.resize(nbex, dim)
    print(' - creating FAISS index')
    idx = faiss.IndexFlatL2(dim)
    if normalize:
        faiss.normalize_L2(x)
    idx.add(x)
    if save_index:
        iname = 'TODO'
        print(' - saving index into ' + iname)
        faiss.write_index(idx, iname)
    return x, idx
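
A possible call sketch for IndexCreate (the file name is hypothetical; the file is expected to hold raw float32 vectors of the given dimension):

x, idx = IndexCreate('embeddings.f32.bin', 'FlatL2', verbose=True, dim=1024)
D, I = idx.search(x[:5], 4)   # 4 nearest neighbours of the first 5 embeddings
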
    def __init__(self, INDEX_FILE):
        # read in indexed images' feature vectors and corresponding image names
        h5f = h5py.File(INDEX_FILE, 'r')
        self.feats = h5f['dataset_1'][:]
        self.imgNames = h5f['dataset_2'][:]
        h5f.close()

        # # -----------------------------
        # index_flat = faiss.IndexFlatL2(self.feats[0].size)
        # index_flat.add(self.feats)
        # ------------------------------------
        nlist = NLIST
        self.quantizer = faiss.IndexFlatL2(
            self.feats[0].size)  # the other index
        self.index_ivfflat = faiss.IndexIVFFlat(self.quantizer,
                                                self.feats[0].size, nlist,
                                                faiss.METRIC_L2)
        # here we specify METRIC_L2, by default it performs inner-product search
        assert not self.index_ivfflat.is_trained
        self.index_ivfflat.train(self.feats)
        assert self.index_ivfflat.is_trained

        self.index_ivfflat.add(self.feats)  # add may be a bit slower as well
Example #12
    def initFaiss(self, nlist, nprobe, bytesPerVec, bytesPerSubVec, dim, matrix):
        self.nlist = nlist
        self.nprobe = nprobe
        self.bytesPerVec = bytesPerVec
        self.bytesPerSubVec = bytesPerSubVec
        self.dim = dim

        self.train_data = np.matrix(matrix).astype('float32')
        print('FAISS init quantizer', self.train_data, self.train_data.shape)
        self.f_quantizer = faiss.IndexFlatL2(self.dim)
        # Lock index read / write until it is built
        with self._lock:
            print('FAISS init index')
            self.f_index = faiss.IndexIVFPQ(self.f_quantizer, self.dim, self.nlist, self.bytesPerVec, self.bytesPerSubVec)
            print('FAISS train index')
            self.f_index.train(self.train_data)
            print('FAISS train index finished')

            # write index to disk
            self.modelLoaded = self.saveModelToDisk(model_location, self.f_index)
        self.is_initiated = self.modelLoaded

        return self.is_initiated
Example #13
    def _build_index(self, xb):
        """Build faiss index from a given set of samples

        Parameters
        ----------
        xb : torch.tensor
            tensor of samples to build the search index, shape is
            (num_samples, dim)

        Returns
        -------
        index
            faiss index built on the given samples
        """

        d = xb.size(-1)
        # brute-force search on GPU (GPU generally doesn't have enough memory)
        # res = faiss.StandardGpuResources()
        # index = faiss.GpuIndexFlatIP(res, d)

        # brute-force search on CPU
        self.index = faiss.IndexFlatL2(d)
        self.index.add(xb.detach().cpu().numpy())
        return self.index
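
The same pattern as a standalone sketch outside the class (tensor sizes are illustrative; faiss expects float32 numpy input):

import faiss
import torch

xb = torch.rand(1000, 64)                            # samples to index, shape (num_samples, dim)
index = faiss.IndexFlatL2(xb.size(-1))               # brute-force L2 search on CPU
index.add(xb.detach().cpu().numpy())                 # faiss takes float32 numpy arrays
D, I = index.search(torch.rand(5, 64).numpy(), 10)   # 10 nearest neighbours for 5 query vectors
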
Example #14
    def test_int64(self):
        # see https://github.com/facebookresearch/faiss/issues/1529
        sizeof_long = array.array("l").itemsize
        if sizeof_long == 4:
            v = faiss.LongLongVector()
        elif sizeof_long == 8:
            v = faiss.LongVector()
        else:
            raise AssertionError("weird long size")

        for i in range(10):
            v.push_back(i)
        a = faiss.vector_to_array(v)
        assert a.dtype == 'int64'
        np.testing.assert_array_equal(a, np.arange(10, dtype='int64'))

        # check if it works in an IDMap
        idx = faiss.IndexIDMap(faiss.IndexFlatL2(32))
        idx.add_with_ids(
            np.random.rand(10, 32).astype('float32'),
            np.random.randint(1000, size=10, dtype='int64')
        )
        faiss.vector_to_array(idx.id_map)
Example #15
    def fit(self, X):
        # faiss doesn't support installing, hack around by just importing
        # from the build directory by setting up PYTHONPATH
        import sys
        import os
        sys.path.append(
            os.path.join(os.path.dirname(os.path.realpath(__file__)), "..",
                         "install", "faiss"))
        import faiss

        if self._metric == 'angular':
            X = sklearn.preprocessing.normalize(X, axis=1, norm='l2')

        if X.dtype != numpy.float32:
            X = X.astype(numpy.float32)

        self.quantizer = faiss.IndexFlatL2(X.shape[1])
        index = faiss.IndexIVFFlat(self.quantizer, X.shape[1], self._n_list,
                                   faiss.METRIC_L2)
        index.train(X)
        index.add(X)
        index.nprobe = self._n_probe
        self._index = index
    def creating_and_saving_IVFFlat(self):
        try:
            log_instance.info('stp0: loading data')
            log_instance.info("data rows: %s column: %s" % self.np_data.shape)
            log_instance.info('stp1: set faiss quantizer')
            quantizer = faiss.IndexFlatL2(VEC_DIMENSION)
            log_instance.info('stp2: init IndexIVFFlat')
            ivf_index = faiss.IndexIVFFlat(quantizer, VEC_DIMENSION, NLIST,
                                           FAISS_METRIC_TYPE)
            log_instance.info('stp3: IndexIVFFlat train')
            normalize_L2(self.np_data)
            ivf_index.train(self.np_data)
            log_instance.info('stp4: IndexIVFFlat add data')
            ivf_index.add(self.np_data)
            log_instance.info('stp5: faiss index saving')
            faiss.write_index(ivf_index, self.faiss_model_save_path)

            with open(self.sku_idx_save_path, "w") as f2:
                f2.write("\n".join(self.sku_lst))
                f2.flush()
        except Exception as e:
            log_instance.error(traceback.format_exc())
            raise e
Example #17
def FlatL2Index_kb():
    index = faiss.IndexFlatL2(dim_size)
    assert index.is_trained
    index.add(index_kb)

    file = open('./data/flagL2.dat', 'w')
    start_time = time.time()
    D, I = index.search(query_list[:1], recall_size)
    print("FlatL2 index %s seconds" % (time.time() - start_time))
    D, I = index.search(query_list, recall_size)  # actual search

    with open('./data.20181219/jiayi.csv', 'w') as f:
        for idx, recall_list in enumerate(I):
            kb_str_list = []
            for kb_idx in recall_list:
                if kb_stan[kb_dict[int(kb_idx)]] not in kb_str_list:
                    kb_str_list.append(kb_stan[kb_dict[int(kb_idx)]])
                if len(kb_str_list) >= 12:
                    break
            f.write(revised_query_list[idx] + '\t' +
                    ('####').join(kb_str_list) + '\n')
        f.close()
Example #18
def recover_closest_standard(feature_matrix_all, image_paths, save_path, n_image_samples=10, n_closest=3):
    image_paths = np.array([x[0] for x in image_paths])
    sample_idxs = np.random.choice(np.arange(len(feature_matrix_all)), n_image_samples)

    faiss_search_index = faiss.IndexFlatL2(feature_matrix_all.shape[-1])
    faiss_search_index.add(feature_matrix_all)
    _, closest_feature_idxs = faiss_search_index.search(feature_matrix_all, n_closest+1)

    sample_paths = image_paths[closest_feature_idxs][sample_idxs]

    f,axes = plt.subplots(n_image_samples, n_closest+1)
    for i,(ax,plot_path) in enumerate(zip(axes.reshape(-1), sample_paths.reshape(-1))):
        ax.imshow(np.array(Image.open(plot_path)))
        ax.set_xticks([])
        ax.set_yticks([])
        if i%(n_closest+1):
            ax.axvline(x=0, color='g', linewidth=13)
        else:
            ax.axvline(x=0, color='r', linewidth=13)
    f.set_size_inches(10,20)
    f.tight_layout()
    f.savefig(save_path)
    plt.close()
Example #19
def search(search_embd):
    db = database.EmbdingDatabase()
    db.open(drop_exist=False)
    embd = db.get_embd()
    num = np.asarray(embd)
    arr = num[:, 1]
    float_arr = []
    for i in range(arr.shape[0]):
        temp = arr[i]
        float_arr.append(temp)
    float_arr = np.asarray(float_arr)

    index = faiss.IndexFlatL2(128)
    #print(index.is_trained)
    float_arr = float_arr.astype('float32')
    index.add(float_arr)  # add vectors to the index
    k = 4  # we want to see 4 nearest neighbors
    #print(index.ntotal)

    temp = []
    temp.append(search_embd)
    query = np.asarray(temp)
    D, I = index.search(query, k)
def simulate_mee_runtime(n_videos=1000000, d=256, n_query=100, max_neighbors=100, n_runs=5, n_warmup_runs=10):
    """ Search over a database of shape [n_videos, d] with query of shape [n_query, d].
    For each query, return max_neighbors results.
    """
    import faiss
    torch.cuda.synchronize()
    st_time = time.time()
    fake_database = faiss.rand((n_videos, d))
    fake_query = faiss.rand((n_query, d))
    torch.cuda.synchronize()
    logger.info("Construct fake database + query time {}".format(time.time() - st_time))

    torch.cuda.synchronize()
    st_time = time.time()
    index = faiss.index_factory(d, "IVF4096,Flat", faiss.METRIC_L2)
    index_ivf = faiss.extract_index_ivf(index)
    clustering_index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d))
    index_ivf.clustering_index = clustering_index
    torch.cuda.synchronize()
    logger.info("Build/Move to GPU? index time {}".format(time.time() - st_time))

    st_time = time.time()
    torch.cuda.synchronize()
    index_ivf.train(fake_database)
    torch.cuda.synchronize()
    logger.info("Train index time {}".format(time.time() - st_time))

    times = []
    for _ in range(n_warmup_runs+n_runs):
        torch.cuda.synchronize()
        st_time = time.time()
        D, I = index_ivf.search(fake_query, max_neighbors)
        torch.cuda.synchronize()
        times.append(time.time() - st_time)
    avg_time = np.mean(times[n_warmup_runs:]) * 2  # video + sub
    logger.info("Avg searching time ({} runs) {}".format(n_runs, avg_time))
    return avg_time
Example #21
def faiss_knn(feats_train, targets_train, feats_val, targets_val, k):
    feats_train = feats_train.numpy()
    targets_train = targets_train.numpy()
    feats_val = feats_val.numpy()
    targets_val = targets_val.numpy()

    d = feats_train.shape[-1]

    index = faiss.IndexFlatL2(d)  # build the index
    co = faiss.GpuMultipleClonerOptions()
    co.useFloat16 = True
    co.shard = True
    gpu_index = faiss.index_cpu_to_all_gpus(index, co)
    gpu_index.add(feats_train)

    D, I = gpu_index.search(feats_val, k)

    pred = np.zeros(I.shape[0], dtype=int)
    conf_mat = np.zeros((1000, 1000), dtype=int)
    for i in range(I.shape[0]):
        votes = list(Counter(targets_train[I[i]]).items())
        shuffle(votes)
        pred[i] = max(votes, key=lambda x: x[1])[0]
        conf_mat[targets_val[i], pred[i]] += 1

    acc = 100.0 * (pred == targets_val).mean()
    assert acc == (100.0 * (np.trace(conf_mat) / np.sum(conf_mat)))

    # per_cat_acc = 100.0 * (np.diag(conf_mat) / np.sum(conf_mat, axis=1))
    # sparse_cats = [58, 155, 356, 747, 865, 234, 268, 384, 385, 491, 498, 538, 646, 650, 726, 860, 887, 15, 170, 231]
    # s = ' '.join('{}'.format(c) for c in sparse_cats)
    # print('==> cats: {}'.format(s))
    # s = ' '.join('{:.1f}'.format(a) for a in per_cat_acc[sparse_cats])
    # print('==> acc/cat: {}'.format(s))
    # print('==> mean acc: {}'.format(per_cat_acc[sparse_cats].mean()))

    return acc
Example #22
    def use_faiss_backend(
        self,
        gpu=False,
        ann=False,
        ann_center=10,
        ann_nprob=1,
    ):
        # This method is just a running example of the faiss library, which supports
        #   - batch queries
        #   - approximate nearest neighbours
        #   - several distances
        # However, most of the time the vanilla pytorch version is enough.
        #
        # Some details:
        # 1. GPU is not always better than CPU, see:
        #       https://github.com/facebookresearch/faiss/wiki/Comparing-GPU-vs-CPU
        #    From my preliminary experiment (vocab size 20000+, query size 1):
        #       when ann=True, you may set gpu=False
        #       when ann=False, you may set gpu=True
        # 2. There are some overheads in the code. The faiss lib is based on numpy,
        #    while this code performs some conversions between:
        #       [tensor(GPU) <->] tensor <-> numpy [<-> GPU]
        #
        import faiss
        data = self.embed.cpu().numpy()
        dim = self.embed.size(1)
        index = faiss.IndexFlatL2(dim)
        if ann:
            fast_index = faiss.IndexIVFFlat(index, dim, ann_center)
            fast_index.train(data)
            fast_index.nprobe = ann_nprob
            index = fast_index
        index.add(data)
        if gpu:
            res = faiss.StandardGpuResources()  # use a single GPU
            index = faiss.index_cpu_to_gpu(res, 0, index)
        self.faiss_index = index
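
Once the index is built, a batched query is a single search call; a minimal sketch (the helper name and shapes are illustrative, assuming an index built as above and (batch, dim) float queries):

import numpy as np
import torch

def faiss_nearest(faiss_index, queries, k=5):
    # batch query: returns (batch, k) distances and neighbour ids as torch tensors
    q = queries.detach().cpu().numpy().astype(np.float32)
    D, I = faiss_index.search(q, k)
    return torch.from_numpy(D), torch.from_numpy(I)
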
    def open(self):
        self.db = plyvel.DB(self.path, create_if_missing=True)

        self.graph = nx.Graph()

        # self.index = hnswlib.Index(space=self.space, dim=self.dim)
        # self.index.init_index(max_elements=self.max_elements, ef_construction=self.ef, M=self.M)
        # self.index.set_ef(self.ef)

        ######################

        print("Starting")
        quantizer = faiss.IndexFlatL2(512)
        self.index = faiss.IndexIVFFlat(quantizer, 512, 100)

        assert not self.index.is_trained
        print("Loading Training Data")
        samples = np.load("../../data/samples.npy")
        print("Training")
        self.index.train(samples)
        assert self.index.is_trained

        ######################

        print("MemoryGraph: loading nodes")
        nodes = self.load_all_nodes()
        for node in nodes:
            self.graph.add_node(node["id"], f=node["f"])
            self.index.add_with_ids(np.array([node["f"]]),
                                    np.array([node["id"]]))

        print("MemoryGraph: loading edges")
        edges = self.load_all_edges()
        for from_node_id, to_node_id in edges:
            self.graph.add_edge(from_node_id, to_node_id)

        print("MemoryGraph: loaded", len(nodes), "nodes,", len(edges), "edges")
Example #24
def train_kmeans(x, k, ngpu, max_points_per_centroid=256):
    "Runs kmeans on one or several GPUs"
    d = x.shape[1]
    clus = faiss.Clustering(d, k)
    clus.verbose = True
    clus.niter = 20
    clus.max_points_per_centroid = max_points_per_centroid

    if ngpu == 0:
        index = faiss.IndexFlatL2(d)
    else:
        res = [faiss.StandardGpuResources() for i in range(ngpu)]

        flat_config = []
        for i in range(ngpu):
            cfg = faiss.GpuIndexFlatConfig()
            cfg.useFloat16 = False
            cfg.device = i
            flat_config.append(cfg)

        if ngpu == 1:
            index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
        else:
            indexes = [faiss.GpuIndexFlatL2(res[i], d, flat_config[i])
                       for i in range(ngpu)]
            index = faiss.IndexProxy()
            for sub_index in indexes:
                index.addIndex(sub_index)

    # perform the training
    clus.train(x, index)
    centroids = faiss.vector_float_to_array(clus.centroids)

    obj = faiss.vector_float_to_array(clus.obj)
    print "final objective: %.4g" % obj[-1]

    return centroids.reshape(k, d)
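
A minimal call sketch for train_kmeans (random data, CPU-only path; all values are illustrative):

import numpy as np

x = np.random.rand(10000, 64).astype('float32')   # training vectors
centroids = train_kmeans(x, k=256, ngpu=0)         # (256, 64) array of centroids
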
Example #25
    def test_IndexIVFPQ(self):
        (xt, xb, xq) = self.get_dataset()
        d = xt.shape[1]

        dev_no = 0
        usePrecomputed = True

        res = faiss.StandardGpuResources()

        flat_config = faiss.GpuIndexFlatConfig()
        flat_config.device = dev_no

        gt_index = faiss.GpuIndexFlatL2(res, d, flat_config)
        gt_index.add(xb)
        D, gt_nns = gt_index.search(xq, 1)

        coarse_quantizer = faiss.IndexFlatL2(d)
        ncentroids = int(np.sqrt(xb.shape[0])) * 4

        index = faiss.IndexIVFPQ(coarse_quantizer, d, ncentroids, 32, 8)
        # add implemented on GPU but not train
        index.train(xt)

        ivfpq_config = faiss.GpuIndexIVFPQConfig()
        ivfpq_config.device = dev_no
        ivfpq_config.usePrecomputedTables = usePrecomputed

        gpuIndex = faiss.GpuIndexIVFPQ(res, index, ivfpq_config)
        gpuIndex.setNumProbes(64)
        index.add(xb)

        D, nns = index.search(xq, 10)
        n_ok = (nns == gt_nns).sum()
        nq = xq.shape[0]
        print(ncentroids, n_ok, nq)

        self.assertGreater(n_ok, nq * 0.2)
Example #26
def run_kmeans(x, nmb_clusters, verbose=False):
    """Runs kmeans on 1 GPU.
    Args:
        x: data
        nmb_clusters (int): number of clusters
    Returns:
        list: ids of data in each cluster
    """
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)

    # Change faiss seed at each k-means so that the randomly picked
    # initialization centroids do not correspond to the same feature ids
    # from an epoch to another.
    clus.seed = np.random.randint(1234)

    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    try:
        res = faiss.StandardGpuResources()
        flat_config = faiss.GpuIndexFlatConfig()
        flat_config.useFloat16 = False
        flat_config.device = 0
        index = faiss.GpuIndexFlatL2(res, d, flat_config)
    except:
        index = faiss.IndexFlatL2(d)

    # perform the training
    clus.train(x, index)
    _, I = index.search(x, 1)
    losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    return [int(n[0]) for n in I], losses[-1]
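
A minimal call sketch for run_kmeans (illustrative random data; the function falls back to a CPU index when no GPU is available):

import numpy as np

x = np.random.rand(20000, 128).astype('float32')
cluster_ids, final_loss = run_kmeans(x, nmb_clusters=100, verbose=True)
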
Example #27
def search_index_pytorch(database, x, k):
    """
    KNN search via Faiss
    :param
        database NxC
        x MxC
    :return
        D MxK
        I MxK
    """
    Dptr = database.data_ptr()
    is_cuda = False
    if not (x.is_cuda or database.is_cuda):
        index = faiss.IndexFlatL2(database.size(-1))
    else:
        is_cuda = True
        index = faiss.GpuIndexFlatL2(GPU_RES,
                                     database.size(-1))  # dimension is 3
    index.add_c(database.size(0), faiss.cast_integer_to_float_ptr(Dptr))

    assert x.is_contiguous()
    n, d = x.size()
    assert d == index.d

    D = torch.empty((n, k), dtype=torch.float32, device=x.device)
    I = torch.empty((n, k), dtype=torch.int64, device=x.device)

    if is_cuda:
        torch.cuda.synchronize()
    xptr = __swig_ptr_from_FloatTensor(x)
    Iptr = __swig_ptr_from_LongTensor(I)
    Dptr = __swig_ptr_from_FloatTensor(D)
    index.search_c(n, xptr, k, Dptr, Iptr)
    if is_cuda:
        torch.cuda.synchronize()
    index.reset()
    return D, I
Example #28
def eval_json(img_paths,num_query,dist,query_features,gallery_features,output_dir,use_distmat):
    query_paths = img_paths[:num_query]
    gallery_paths = img_paths[num_query:]

    num_q, num_g = dist.shape
    #ensemble dist
    np.save(output_dir+'/dist.npy', dist)
    print("save dist")
    np.save(output_dir+'/query_paths.npy', query_paths)
    print("save query_paths")
    np.save(output_dir+'/gallery_paths.npy', gallery_paths)
    print("save gallery_paths")
    dim = query_features.shape[1]

    index = faiss.IndexFlatL2(dim)
    index.add(gallery_features)
    if(use_distmat):
        indices = np.argsort(dist, axis=1)
    else:
        _, indices = index.search(query_features, k=num_g)
    m,n = indices.shape
    res={}
    for i in range(m):
        tmp=[]
        for j in indices[i][:200]:
            tmp.append(gallery_paths[j][-12:])
        res[query_paths[i][-12:]]=tmp
    if use_distmat:
        print(output_dir)
        with open(output_dir+'/res_rr.json','w') as f:
            json.dump(res, f, indent=4, separators=(',', ': '))
        print("writed")
    else:
        print(output_dir)
        with open(output_dir+'/res.json','w') as f:
            json.dump(res, f, indent=4, separators=(',', ': '))
        print("writed")
Example #29
    def test_sharded(self):
        d = 32
        nb = 1000
        nq = 200
        k = 10
        rs = np.random.RandomState(123)
        xb = rs.rand(nb, d).astype('float32')
        xq = rs.rand(nq, d).astype('float32')

        index_cpu = faiss.IndexFlatL2(d)

        assert faiss.get_num_gpus() > 1

        co = faiss.GpuMultipleClonerOptions()
        co.shard = True
        index = faiss.index_cpu_to_all_gpus(index_cpu, co, ngpu=2)

        index.add(xb)
        D, I = index.search(xq, k)

        index_cpu.add(xb)
        D_ref, I_ref = index_cpu.search(xq, k)

        assert np.all(I == I_ref)

        del index
        index2 = faiss.index_cpu_to_all_gpus(index_cpu, co, ngpu=2)
        D2, I2 = index2.search(xq, k)

        assert np.all(I2 == I_ref)

        try:
            index2.add(xb)
        except RuntimeError:
            pass
        else:
            assert False, "this call should fail!"
Example #30
def main():
    # parse params:
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_path', default='EN_10MSENTS/Skipthoughts-2018_01_23-05_25_13-80.372-final', type=str)
    parser.add_argument('--books_path', default='data/EN', type=str)
    parser.add_argument('--tokenize', action='store_true')
    parser.add_argument('--gpu', action='store_true')

    parser.add_argument('--max_sent_len', default=25, type=int)
    parser.add_argument('--min_sent_len', default=5, type=int)
    parser.add_argument('--sents_per_book', default=100, type=int)
    parser.add_argument('--books_per_genre', default=20, type=int)

    args = parser.parse_args()

    # load model and dict
    model = u.load_model(args.model_path + '/model.pt')
    vocab_dict = u.load_model(args.model_path + '/model.dict.pt')

    data = make_dataframe(args, model=model, vocab=vocab_dict)

    X = data.filter(regex='neur').to_numpy()
    X = np.array(X, dtype=np.float32)

    import faiss
    index = faiss.IndexFlatL2(X.shape[1])
    index.add(X)
    print(index.ntotal)

    n_examples = 100
    n_neighbors = 5
    D, I = index.search(X[:n_examples], n_neighbors)

    for i in range(n_examples):
        print('-> src sent:', data['sentences'][i][:120])
        for cnt, (dist, idx) in enumerate(zip(D[i, :], I[i, :])):
            print(f'  {cnt + 1} @{dist:.3f}: {data["sentences"][idx][:120]}')