Example #1
def train_kmeans(x, k, ngpu):
    "Runs kmeans on one or several GPUs"
    d = x.shape[1]
    clus = faiss.Clustering(d, k)
    clus.verbose = True
    clus.niter = 20
    # otherwise the kmeans implementation sub-samples the training set
    clus.max_points_per_centroid = 10000000
    res = [faiss.StandardGpuResources() for i in range(ngpu)]
    flat_config = []
    for i in range(ngpu):
        cfg = faiss.GpuIndexFlatConfig()
        cfg.useFloat16 = False
        cfg.device = i
        flat_config.append(cfg)
    if ngpu == 1:
        index = faiss.GpuIndexFlatIP(res[-1], d, flat_config[0])
    else:
        indexes = [
            faiss.GpuIndexFlatIP(res[i], d, flat_config[i])
            for i in range(ngpu)
        ]
        index = faiss.IndexProxy()  # renamed faiss.IndexReplicas in newer faiss releases
        for sub_index in indexes:
            index.addIndex(sub_index)
    # perform the training
    clus.train(x, index)
    centroids = faiss.vector_float_to_array(clus.centroids)
    obj = faiss.vector_float_to_array(clus.obj)
    print("final objective: %.4g" % obj[-1])
    return centroids.reshape(k, d)
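A minimal usage sketch for the function above, assuming a GPU build of faiss; the array and sizes are illustrative. Note that this example clusters with an inner-product index, so inputs are typically L2-normalized first:

import numpy as np
import faiss

xt = np.random.RandomState(0).rand(10000, 64).astype('float32')
xt /= np.linalg.norm(xt, axis=1, keepdims=True)  # IP-based kmeans expects normalized vectors
centroids = train_kmeans(xt, k=256, ngpu=1)      # -> (256, 64) centroid matrix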
Example #2
    def test_update_codebooks(self):
        """Test codebook updates."""
        d = 16
        n = 500
        M = 4
        nbits = 6
        K = (1 << nbits)

        # set a larger value to make the updating process more stable
        lambd = 1e-2

        rs = np.random.RandomState(123)
        x = rs.rand(n, d).astype(np.float32)
        codes = rs.randint(0, K, (n, M)).astype(np.int32)

        lsq = faiss.LocalSearchQuantizer(d, M, nbits)
        lsq.lambd = lambd
        lsq.train(x)  # just for allocating memory for codebooks

        codebooks = faiss.vector_float_to_array(lsq.codebooks)
        codebooks = codebooks.reshape(M, K, d).copy()

        lsq.update_codebooks(sp(x), sp(codes), n)
        new_codebooks = faiss.vector_float_to_array(lsq.codebooks)
        new_codebooks = new_codebooks.reshape(M, K, d).copy()

        ref_codebooks = update_codebooks_ref(x, codes, K, lambd)

        np.testing.assert_allclose(new_codebooks, ref_codebooks, atol=1e-3)
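The sp helper used throughout these test snippets is defined elsewhere in the test file; in the faiss test suite it is an alias for faiss.swig_ptr, which hands the raw buffer of a NumPy array to the C++ layer:

import faiss

sp = faiss.swig_ptr  # pass NumPy arrays to faiss C++ methods by pointer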
Example #3
def train_kmeans(x, k, ngpu):
    "Runs kmeans on one or several GPUs"
    d = x.shape[1]
    clus = faiss.Clustering(d, k)
    clus.verbose = True
    clus.niter = 20

    # otherwise the kmeans implementation sub-samples the training set
    clus.max_points_per_centroid = 10000000

    res = [faiss.StandardGpuResources() for i in range(ngpu)]

    useFloat16 = False

    # this snippet uses the older GPU index API:
    # GpuIndexFlatL2(resources, device, d, useFloat16)
    if ngpu == 1:
        index = faiss.GpuIndexFlatL2(res[0], 0, d, useFloat16)
    else:
        indexes = [faiss.GpuIndexFlatL2(res[i], i, d, useFloat16)
                   for i in range(ngpu)]
        index = faiss.IndexProxy()
        for sub_index in indexes:
            index.addIndex(sub_index)

    # perform the training
    clus.train(x, index)
    centroids = faiss.vector_float_to_array(clus.centroids)

    obj = faiss.vector_float_to_array(clus.obj)
    print "final objective: %.4g" % obj[-1]

    return centroids.reshape(k, d)
Example #4
def train_kmeans(x,
                 num_clusters=1000,
                 gpu_ids=None,
                 niter=100,
                 nredo=1,
                 verbose=0):
    """
    Runs k-means clustering on one or several GPUs
    """
    assert np.all(~np.isnan(x)), 'x contains NaN'
    assert np.all(np.isfinite(x)), 'x contains Inf'
    if isinstance(gpu_ids, int):
        gpu_ids = [gpu_ids]
    assert gpu_ids is None or len(gpu_ids)

    d = x.shape[1]
    kmeans = faiss.Clustering(d, num_clusters)
    kmeans.verbose = bool(verbose)
    kmeans.niter = niter
    kmeans.nredo = nredo

    # otherwise the kmeans implementation sub-samples the training set
    kmeans.max_points_per_centroid = 10000000

    if gpu_ids is not None:
        res = [faiss.StandardGpuResources() for i in gpu_ids]

        flat_config = []
        for i in gpu_ids:
            cfg = faiss.GpuIndexFlatConfig()
            cfg.useFloat16 = False
            cfg.device = i
            flat_config.append(cfg)

        if len(gpu_ids) == 1:
            index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
        else:
            indexes = [
                faiss.GpuIndexFlatL2(res[i], d, flat_config[i])
                for i in range(len(gpu_ids))
            ]
            index = faiss.IndexProxy()
            for sub_index in indexes:
                index.addIndex(sub_index)
    else:
        index = faiss.IndexFlatL2(d)

    # perform the training
    kmeans.train(x, index)
    centroids = faiss.vector_float_to_array(kmeans.centroids)

    objective = faiss.vector_float_to_array(kmeans.obj)
    #logging.debug("Final objective: %.4g" % objective[-1])

    return centroids.reshape(num_clusters, d)
Example #5
 def train(self, x):
     if self.index is None:
         return super().train(x)
     else:
         n, d = x.shape
         assert d == self.d
         clus = Clustering(d, self.k, self.cp)
         clus.train(x, self.index)
         centroids = vector_float_to_array(clus.centroids)
         self.centroids = centroids.reshape(self.k, d)
         self.obj = vector_float_to_array(clus.obj)
         return self.obj[-1] if self.obj.size > 0 else 0.0
Example #6
    def build_faiss(self):
        # FAISS build
        num_clusters = 16
        niter = 5

        # 1. Clustering
        p_emb = self.p_embedding.toarray().astype(np.float32)
        emb_dim = p_emb.shape[-1]
        index_flat = faiss.IndexFlatL2(emb_dim)

        clus = faiss.Clustering(emb_dim, num_clusters)
        clus.verbose = True
        clus.niter = niter
        clus.train(p_emb, index_flat)

        centroids = faiss.vector_float_to_array(clus.centroids)
        centroids = centroids.reshape(num_clusters, emb_dim)

        quantizer = faiss.IndexFlatL2(emb_dim)
        quantizer.add(centroids)

        # 2. SQ8 + IVF indexer (IndexIVFScalarQuantizer)
        # note: the quantizer type must be passed explicitly for 8-bit codes
        self.indexer = faiss.IndexIVFScalarQuantizer(
            quantizer, quantizer.d, quantizer.ntotal,
            faiss.ScalarQuantizer.QT_8bit, faiss.METRIC_L2)
        self.indexer.train(p_emb)
        self.indexer.add(p_emb)
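A hedged sketch of how the trained IVF-SQ indexer might be queried afterwards; the nprobe value and the query slice are illustrative:

        self.indexer.nprobe = 4  # probe more clusters for better recall, slower search
        q_emb = self.p_embedding[:5].toarray().astype(np.float32)
        distances, indices = self.indexer.search(q_emb, 10)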
Example #7
def run_pca(scaled_features, pca_frac):
    """
    Run PCA on scaled features using either faiss (if available) or scikit-learn (otherwise).

    :param scaled_features: Scaled features to run PCA on
    :param pca_frac: Percent of the variance to retain
    :return: pca_features: scaled_features projected to the lower dimensional space
    :return: pca: PCA object from faiss or scikit-learn
    """
    print('Running PCA...')
    if USE_FAISS:
        d = np.size(scaled_features, 1)
        pca = faiss.PCAMatrix(d, d)
        x = np.ascontiguousarray(scaled_features).astype('float32')
        pca.train(x)
        assert pca.is_trained
        eigs = faiss.vector_float_to_array(pca.eigenvalues)
        explained_var = np.cumsum(eigs) / sum(eigs)
        num_retain = np.where(explained_var >= pca_frac)[0][0]
        pca_features = pca.apply_py(x)
        pca_features = pca_features[:, 0:(num_retain + 1)]
        pca.transform = pca.apply_py
    else:
        pca = sklearn.decomposition.PCA(pca_frac, svd_solver='full')
        pca.fit(scaled_features)
        pca_features = pca.transform(scaled_features)

    return pca_features, pca
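Example call, assuming scaled_features is an (n, d) array and 95% of the variance should be retained; new_scaled is a hypothetical array of new points to project:

pca_features, pca = run_pca(scaled_features, pca_frac=0.95)
# the faiss branch returns all d dims from transform, so cut to the retained width
projected = pca.transform(new_scaled)[:, :pca_features.shape[1]]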
Example #8
def train_coarse_quantizer(data,
                           quantizer_path,
                           num_clusters,
                           hnsw=False,
                           niter=10,
                           cuda=False):
    d = data.shape[1]

    index_flat = faiss.IndexFlatL2(d)
    # make it into a gpu index
    if cuda:
        res = faiss.StandardGpuResources()
        index_flat = faiss.index_cpu_to_gpu(res, 0, index_flat)
    clus = faiss.Clustering(d, num_clusters)
    clus.verbose = True
    clus.niter = niter
    clus.train(data, index_flat)
    centroids = faiss.vector_float_to_array(clus.centroids)
    centroids = centroids.reshape(num_clusters, d)

    if hnsw:
        quantizer = faiss.IndexHNSWFlat(d, 32)
        quantizer.hnsw.efSearch = 128
        quantizer.train(centroids)
        quantizer.add(centroids)
    else:
        quantizer = faiss.IndexFlatL2(d)
        quantizer.add(centroids)

    faiss.write_index(quantizer, quantizer_path)
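The saved quantizer can later be read back and plugged into an IVF index as its coarse quantizer; a sketch, assuming num_clusters matches the training-time value:

quantizer = faiss.read_index(quantizer_path)
index = faiss.IndexIVFFlat(quantizer, quantizer.d, num_clusters, faiss.METRIC_L2)
index.train(data)  # the IVF index itself still needs train() before add()
index.add(data)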
Example #9
    def test_decode(self):
        """Test LSQ decode"""
        d = 16
        n = 500
        M = 4
        nbits = 6
        K = (1 << nbits)

        rs = np.random.RandomState(123)
        x = rs.rand(n, d).astype(np.float32)
        codes = rs.randint(0, K, (n, M)).astype(np.int32)
        lsq = faiss.LocalSearchQuantizer(d, M, nbits)
        lsq.train(x)

        # decode x
        pack_codes = np.zeros((n, lsq.code_size)).astype(np.uint8)
        decoded_x = np.zeros((n, d)).astype(np.float32)
        lsq.pack_codes(n, sp(codes), sp(pack_codes))
        lsq.decode_c(sp(pack_codes), sp(decoded_x), n)

        # decode in Python
        codebooks = faiss.vector_float_to_array(lsq.codebooks)
        codebooks = codebooks.reshape(M, K, d).copy()
        decoded_x_ref = decode_ref(x, codebooks, codes)

        np.testing.assert_allclose(decoded_x, decoded_x_ref, rtol=1e-6)
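decode_ref is defined elsewhere in the test file; for an additive quantizer like LSQ, a plausible pure-NumPy version sums one selected codeword per codebook (a sketch, not the exact helper):

def decode_ref(x, codebooks, codes):
    # x is only needed for its shape; decoding is a sum of codewords
    n, d = x.shape
    decoded = np.zeros((n, d), dtype=np.float32)
    for m in range(codes.shape[1]):
        decoded += codebooks[m, codes[:, m]]
    return decoded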
Example #10
    def test_icm_encode(self):
        d = 16
        n = 500
        M = 4
        nbits = 4
        K = (1 << nbits)

        rs = np.random.RandomState(123)
        x = rs.rand(n, d).astype(np.float32)

        lsq = faiss.LocalSearchQuantizer(d, M, nbits)
        lsq.train(x)  # just for allocating memory for codebooks

        # compute binary terms
        binaries = np.zeros((M, M, K, K)).astype(np.float32)
        lsq.compute_binary_terms(sp(binaries))

        # compute unary terms
        unaries = np.zeros((M, n, K)).astype(np.float32)
        lsq.compute_unary_terms(sp(x), sp(unaries), n)

        # randomly generate codes
        codes = rs.randint(0, K, (n, M)).astype(np.int32)
        new_codes = codes.copy()

        # do icm encoding given binary and unary terms
        lsq.icm_encode_step(sp(new_codes), sp(unaries), sp(binaries), n, 1)

        # do icm encoding without pre-computed unary and binary terms in Python
        codebooks = faiss.vector_float_to_array(lsq.codebooks)
        codebooks = codebooks.reshape(M, K, d).copy()
        ref_codes = icm_encode_ref(x, codebooks, codes)

        np.testing.assert_array_equal(new_codes, ref_codes)
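icm_encode_ref is likewise defined elsewhere in the test file; a plausible sketch of one ICM sweep updates each codebook's codes in turn while holding the others fixed:

def icm_encode_ref(x, codebooks, codes):
    # one sweep of iterated conditional modes over the M codebooks
    M, K, d = codebooks.shape
    codes = codes.copy()
    for m in range(M):
        # residual after subtracting the fixed codewords of all other codebooks
        resid = x - sum(codebooks[j, codes[:, j]] for j in range(M) if j != m)
        # pick the codeword of codebook m closest to each residual
        dists = ((resid[:, None, :] - codebooks[m][None, :, :]) ** 2).sum(axis=-1)
        codes[:, m] = dists.argmin(axis=1)
    return codes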
Example #11
def kmeans(data, k, nrestarts=10, niters=100):
    """
    Run k-means on the input data.

    :param data: Data to be clustered
    :param k: Number of clusters
    :param nrestarts: Number of restarts of k-means to try
    :param niters: Number of iterations to perform
    :return: centroids from k-means
    """
    data = np.ascontiguousarray(data.cpu().numpy()).astype('float32')
    d = data.shape[1]

    clus = faiss.Clustering(d, k)
    clus.verbose = False
    clus.niter = niters
    clus.nredo = nrestarts
    clus.seed = defaults.seed
    clus.spherical = False
    index = faiss.IndexFlatL2(d)

    clus.train(data, index)
    centroids = faiss.vector_float_to_array(clus.centroids).reshape(k, d)
    centroids = torch.Tensor(centroids).to(defaults.device)

    return centroids
Example #12
def spherical_kmeans(data, k, nrestarts=10, niters=100):
    """
    Run spherical k-means on the input data.

    :param data: Data to be clustered
    :param k: Number of clusters
    :param nrestarts: Number of restarts of spherical k-means to try
    :param niters: Number of iterations to perform
    :return: centroids from spherical k-means
    """
    data = np.ascontiguousarray(data.cpu().numpy()).astype('float32')
    data /= np.linalg.norm(data, axis=1)[:, np.newaxis]
    d = data.shape[1]

    clus = faiss.Clustering(d, k)
    clus.verbose = False
    clus.niter = niters
    clus.nredo = nrestarts
    clus.seed = defaults.seed
    clus.spherical = True
    index = faiss.IndexFlatIP(d)

    clus.train(data, index)
    centroids = faiss.vector_float_to_array(clus.centroids).reshape(k, d)
    centroids = torch.Tensor(centroids).to(defaults.device)

    return centroids / torch.norm(centroids, 2, 1).unsqueeze(1)
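To assign points to the spherical centroids afterwards, one can search an inner-product index over the same L2-normalized space (a sketch; data must be normalized exactly as during clustering):

index = faiss.IndexFlatIP(d)
index.add(np.ascontiguousarray(centroids.cpu().numpy()))
sims, labels = index.search(data, 1)  # nearest centroid by cosine similarity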
Example #13
def clustering(data, niter=1000, verbose=True, ncentroids=1024, max_points_per_centroid=10000000, gpu_id=0, spherical=False):
    # use one gpu
    '''
    res = faiss.StandardGpuResources()
    cfg = faiss.GpuIndexFlatConfig()
    cfg.useFloat16 = False
    cfg.device = gpu_id

    d = data.shape[1]
    if spherical:
        index = faiss.GpuIndexFlatIP(res, d, cfg)
    else:
        index = faiss.GpuIndexFlatL2(res, d, cfg)
    '''
    d = data.shape[1]
    if spherical:
        index = faiss.IndexFlatIP(d)
    else:
        index = faiss.IndexFlatL2(d)

    clus = faiss.Clustering(d, ncentroids)
    clus.verbose = verbose
    clus.niter = niter
    clus.max_points_per_centroid = max_points_per_centroid

    clus.train(data, index)
    centroids = faiss.vector_float_to_array(clus.centroids)
    centroids = centroids.reshape(ncentroids, d)

    index.reset()
    index.add(centroids)
    D, I = index.search(data, 1)

    return D, I
Example #14
def run_kmeans(x, nmb_clusters, verbose=False, knn=1):
    """Runs kmeans on 1 GPU.
    Args:
        x: data
        nmb_clusters (int): number of clusters
    Returns:
        list: ids of data in each cluster
    """
    n_data, d = x.shape

    # faiss implementation of k-means
    nmb_clusters = int(nmb_clusters)
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 30
    clus.max_points_per_centroid = 10000000
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
    dists, labels = index.search(x, knn)
    losses = faiss.vector_to_array(clus.obj)
    centroids = faiss.vector_float_to_array(clus.centroids).reshape(
        nmb_clusters, d)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    return [int(n[0]) for n in labels], losses[-1], centroids, dists
Example #15
 def cluster(self, points, k, **index_kwargs):
     """Clustering given points into k clusters"""
     index = self._faiss_index_flat(points.shape[1], **index_kwargs)
     clus = faiss.Clustering(points.shape[1], k)
     clus.verbose = False
     clus.niter = 10
     clus.train(np.ascontiguousarray(points, dtype=np.float32), index)
     return faiss.vector_float_to_array(clus.centroids).reshape(
         clus.k, clus.d)
Example #16
    def test_remove(self):
        # only tests the python interface

        index = faiss.IndexFlat(5)
        xb = np.zeros((10, 5), dtype='float32')
        xb[:, 0] = np.arange(10) + 1000
        index.add(xb)
        index.remove_ids(np.arange(5) * 2)
        xb2 = faiss.vector_float_to_array(index.xb).reshape(5, 5)
        assert np.all(xb2[:, 0] == xb[np.arange(5) * 2 + 1, 0])
Example #17
    def test_remove(self):
        # only tests the python interface

        index = faiss.IndexFlat(5)
        xb = np.zeros((10, 5), dtype='float32')
        xb[:, 0] = np.arange(10) + 1000
        index.add(xb)
        index.remove_ids(np.arange(5) * 2)
        xb2 = faiss.vector_float_to_array(index.xb).reshape(5, 5)
        assert np.all(xb2[:, 0] == xb[np.arange(5) * 2 + 1, 0])
Example #18
def train_kmeans(x, num_clusters=1000, num_gpus=1):
    """
    Runs k-means clustering on one or several GPUs
    """
    d = x.shape[1]
    kmeans = faiss.Clustering(d, num_clusters)
    kmeans.verbose = True
    kmeans.niter = 20

    # otherwise the kmeans implementation sub-samples the training set
    kmeans.max_points_per_centroid = 10000000

    res = [faiss.StandardGpuResources() for i in range(num_gpus)]

    flat_config = []
    for i in range(num_gpus):
        cfg = faiss.GpuIndexFlatConfig()
        cfg.useFloat16 = False
        cfg.device = i
        flat_config.append(cfg)

    if num_gpus == 1:
        index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
    else:
        indexes = [faiss.GpuIndexFlatL2(res[i], d, flat_config[i])
                   for i in range(num_gpus)]
        index = faiss.IndexProxy()
        for sub_index in indexes:
            index.addIndex(sub_index)

    # perform the training
    kmeans.train(x, index)
    print('Total number of indexed vectors (after kmeans.train()):', index.ntotal)
    centroids = faiss.vector_float_to_array(kmeans.centroids)

    objective = faiss.vector_float_to_array(kmeans.obj)
    print('Objective values per iter:', objective)
    print("Final objective: %.4g" % objective[-1])

    # TODO: return cluster assignment

    return centroids.reshape(num_clusters, d)
Example #19
def train_kmeans(x, k, ngpu, max_points_per_centroid=256):
    "Runs kmeans on one or several GPUs"
    d = x.shape[1]
    clus = faiss.Clustering(d, k)
    clus.verbose = True
    clus.niter = 20
    clus.max_points_per_centroid = max_points_per_centroid

    if ngpu == 0:
        index = faiss.IndexFlatL2(d)
    else:
        res = [faiss.StandardGpuResources() for i in range(ngpu)]

        flat_config = []
        for i in range(ngpu):
            cfg = faiss.GpuIndexFlatConfig()
            cfg.useFloat16 = False
            cfg.device = i
            flat_config.append(cfg)

        if ngpu == 1:
            index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
        else:
            indexes = [
                faiss.GpuIndexFlatL2(res[i], d, flat_config[i])
                for i in range(ngpu)
            ]
            index = faiss.IndexReplicas()
            for sub_index in indexes:
                index.addIndex(sub_index)

    # perform the training
    clus.train(x, index)
    centroids = faiss.vector_float_to_array(clus.centroids)

    obj = faiss.vector_float_to_array(clus.obj)
    print "final objective: %.4g" % obj[-1]

    return centroids.reshape(k, d)
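This example already uses faiss.IndexReplicas, the current name of the IndexProxy wrapper seen in the earlier examples. Recent faiss builds also offer a one-call helper that achieves the same multi-GPU replication (a sketch, assuming a GPU build):

cpu_index = faiss.IndexFlatL2(d)
index = faiss.index_cpu_to_all_gpus(cpu_index)  # one replica per visible GPU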
Example #20
    def update_pseudo_labels(self, model, dataloader, device):
        import faiss, time
        #### Reset Classifier Weights
        torch.cuda.empty_cache()
        self.classifier.weight.data.normal_(0, 1 / model.feature_dim)

        with torch.no_grad():
            _ = model.eval()
            _ = model.to(device)

            memory_queue = []
            for i, input_tuple in enumerate(
                    tqdm(dataloader,
                         'Getting DC Embeddings...',
                         total=len(dataloader))):
                embed = model(input_tuple[1].type(
                    torch.FloatTensor).to(device))[-1]
                memory_queue.append(embed)
            memory_queue = torch.cat(memory_queue, dim=0).cpu().numpy()

        #PERFORM PCA
        print('Computing PCA... ', end='')
        start = time.time()
        pca_mat = faiss.PCAMatrix(memory_queue.shape[-1], self.red_dim)
        pca_mat.train(memory_queue)
        memory_queue = pca_mat.apply_py(memory_queue)
        print('Done in {}s.'.format(time.time() - start))
        #
        #
        print('Computing Pseudolabels... ', end='')
        start = time.time()
        cpu_cluster_index = faiss.IndexFlatL2(memory_queue.shape[-1])
        kmeans = faiss.Clustering(memory_queue.shape[-1], self.ncluster)
        kmeans.niter = 20
        kmeans.min_points_per_centroid = 1
        kmeans.max_points_per_centroid = 1000000000
        ### Train Kmeans
        kmeans.train(memory_queue, cpu_cluster_index)
        centroids = faiss.vector_float_to_array(kmeans.centroids).reshape(
            self.ncluster, memory_queue.shape[-1])
        ###
        faiss_search_index = faiss.IndexFlatL2(centroids.shape[-1])
        faiss_search_index.add(centroids)
        _, computed_cluster_labels = faiss_search_index.search(memory_queue, 1)
        print('Done in {}s.'.format(time.time() - start))
        ###
        self.pseudo_labels = computed_cluster_labels
        ###
        torch.cuda.empty_cache()
Example #21
def train_kmeans(x, k, ngpu):
    "Runs kmeans on one or several GPUs"
    d = x.shape[1]
    clus = faiss.Clustering(d, k)
    clus.verbose = True
    clus.niter = 20

    # otherwise the kmeans implementation sub-samples the training set
    clus.max_points_per_centroid = 10000000

    res = [faiss.StandardGpuResources() for i in range(ngpu)]

    flat_config = []
    for i in range(ngpu):
        cfg = faiss.GpuIndexFlatConfig()
        cfg.useFloat16 = False
        cfg.device = i
        flat_config.append(cfg)

    if ngpu == 1:
        index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
    else:
        indexes = [faiss.GpuIndexFlatL2(res[i], d, flat_config[i])
                   for i in range(ngpu)]
        index = faiss.IndexProxy()
        for sub_index in indexes:
            index.addIndex(sub_index)

    # perform the training
    clus.train(x, index)
    centroids = faiss.vector_float_to_array(clus.centroids)

    obj = faiss.vector_float_to_array(clus.obj)
    print "final objective: %.4g" % obj[-1]

    return centroids.reshape(k, d)
Example #22
def train_coarse_quantizer(x, k, preproc):
    d = preproc.d_out
    clus = faiss.Clustering(d, k)
    clus.verbose = True
    # clus.niter = 2
    clus.max_points_per_centroid = 10000000

    print "apply preproc on shape", x.shape, 'k=', k
    t0 = time.time()
    x = preproc.apply_py(sanitize(x))
    print "   preproc %.3f s output shape %s" % (time.time() - t0, x.shape)

    vres, vdev = make_vres_vdev()
    index = faiss.index_cpu_to_gpu_multiple(vres, vdev, faiss.IndexFlatL2(d))

    clus.train(x, index)
    centroids = faiss.vector_float_to_array(clus.centroids)

    return centroids.reshape(k, d)
Example #23
def run_kmeans(x, nmb_clusters, verbose=True):
    """Runs kmeans on 1 GPU.
    Args:
        x: data
        nmb_clusters (int): number of clusters
    Returns:
        list: ids of data in each cluster
    """
    n_data, d = x.shape
    print("x.shape:", x.shape)

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
    _, I = index.search(x, 1)
    losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))
        centroids = faiss.vector_float_to_array(clus.centroids)
        print("centroids:")
        # print(clus.centroids)
        print("type:", type(centroids))
        print("len:", len(centroids))
        print("shape:", centroids.shape)
        # print(centroids)
        centroids_rs = centroids.reshape(nmb_clusters, d)
        print("centroids_reshape:")
        print("type:", type(centroids_rs))
        print("len:", len(centroids_rs))
        print("shape:", centroids_rs.shape)
        #print(centroids_rs)
        #assert 1 == 0

    return [int(n[0]) for n in I], losses[-1]
Example #24
def train_coarse_quantizer(x, k, preproc):
    d = preproc.d_out
    clus = faiss.Clustering(d, k)
    clus.verbose = True
    # clus.niter = 2
    clus.max_points_per_centroid = 10000000

    print "apply preproc on shape", x.shape, 'k=', k
    t0 = time.time()
    x = preproc.apply_py(sanitize(x))
    print "   preproc %.3f s output shape %s" % (
        time.time() - t0, x.shape)

    vres, vdev = make_vres_vdev()
    index = faiss.index_cpu_to_gpu_multiple(
        vres, vdev, faiss.IndexFlatL2(d))

    clus.train(x, index)
    centroids = faiss.vector_float_to_array(clus.centroids)

    return centroids.reshape(k, d)
Example #25
    def test_compute_unary_terms(self):
        d = 16
        n = 500
        M = 4
        nbits = 6
        K = (1 << nbits)

        rs = np.random.RandomState(123)
        x = rs.rand(n, d).astype(np.float32)
        unaries = np.zeros((M, n, K)).astype(np.float32)

        lsq = faiss.LocalSearchQuantizer(d, M, nbits)
        lsq.train(x)  # just for allocating memory for codebooks

        lsq.compute_unary_terms(sp(x), sp(unaries), n)

        codebooks = faiss.vector_float_to_array(lsq.codebooks)
        codebooks = codebooks.reshape(M, K, d).copy()
        ref_unaries = compute_unary_terms_ref(codebooks, x)

        np.testing.assert_allclose(unaries, ref_unaries, atol=1e-4)
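compute_unary_terms_ref expands the unary part of the ICM objective, -2 <x, c> + ||c||^2, for every codeword; a plausible NumPy version (a sketch, not the exact helper from the test file):

def compute_unary_terms_ref(codebooks, x):
    # unaries[m, i, k] = -2 <x_i, c_mk> + ||c_mk||^2
    unaries = -2 * codebooks.dot(x.T)                    # (M, K, n)
    unaries += (codebooks ** 2).sum(axis=2)[:, :, None]  # add codeword norms
    return np.transpose(unaries, (0, 2, 1))              # (M, n, K)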
Example #26
def kmeans(data, k, nrestarts=10, niters=100):
    """
    Run k-means on the input data.
    """
    data = np.ascontiguousarray(data.cpu().numpy()).astype('float32')
    d = data.shape[1]

    clus = faiss.Clustering(d, k)
    clus.verbose = False
    clus.niter = niters
    clus.nredo = nrestarts
    clus.seed = defaults.seed
    clus.spherical = False

    index = faiss.IndexFlatL2(d)

    clus.train(data, index)
    centroids = faiss.vector_float_to_array(clus.centroids).reshape(k, d)
    centroids = torch.Tensor(centroids).to(defaults.device)

    return centroids
Example #27
    def nmi(self, embeddings, labels):
        if isinstance(embeddings, list):
            embeddings = np.concatenate(embeddings, axis=0)
            labels = np.concatenate(labels, axis=0)

        faiss_search_index = faiss.IndexFlatL2(embeddings.shape[-1])
        faiss_cluster_index = faiss.Clustering(embeddings.shape[-1],
                                               self.num_classes)
        faiss_cluster_index.niter = 20
        faiss_cluster_index.min_points_per_centroid = 1
        faiss_cluster_index.max_points_per_centroid = 1000000000
        #
        faiss_cluster_index.train(embeddings, faiss_search_index)
        embedding_centroids = faiss.vector_float_to_array(
            faiss_cluster_index.centroids).reshape(self.num_classes,
                                                   embeddings.shape[-1])
        #
        faiss_search_index = faiss.IndexFlatL2(embedding_centroids.shape[-1])
        faiss_search_index.add(embedding_centroids)
        _, centroids_cluster_labels = faiss_search_index.search(embeddings, 1)
        #
        NMI = metrics.cluster.normalized_mutual_info_score(
            centroids_cluster_labels.reshape(-1), labels.reshape(-1))
        #
        return [NMI]
Example #28
    target_labels   = np.array(target_labels)
    list_of_metrics = [metrics.select(metricname) for metricname in opt.evaluation_metrics]
    metrics_list    = [{} for _ in range(len(weightslist))]
    n_classes       = opt.n_classes
    for k,(weights,features) in enumerate(zip(weightslist,feature_colls)):
        features = np.vstack(features).astype('float32')

        ####################################
        cpu_cluster_index = faiss.IndexFlatL2(features.shape[-1])
        kmeans            = faiss.Clustering(features.shape[-1], n_classes)
        kmeans.niter = 20
        kmeans.min_points_per_centroid = 1
        kmeans.max_points_per_centroid = 1000000000
        ### Train Kmeans
        kmeans.train(features, cpu_cluster_index)
        centroids = faiss.vector_float_to_array(kmeans.centroids).reshape(n_classes, features.shape[-1])

        ###################################
        faiss_search_index = faiss.IndexFlatL2(centroids.shape[-1])
        faiss_search_index.add(centroids)
        _, computed_cluster_labels = faiss_search_index.search(features, 1)

        faiss_search_index  = faiss.IndexFlatL2(features.shape[-1])
        faiss_search_index.add(features)

        ##################################
        max_kval            = np.max([int(x.split('@')[-1]) for x in opt.evaluation_metrics if 'recall' in x])
        _, k_closest_points = faiss_search_index.search(features, int(max_kval+1))
        k_closest_classes   = target_labels.reshape(-1)[k_closest_points[:,1:]]
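From k_closest_classes (each row holds the labels of a sample's nearest neighbours, self-match already excluded), recall@k follows by checking whether the true label appears among the first k columns; a sketch:

        for k in [int(x.split('@')[-1]) for x in opt.evaluation_metrics if 'recall' in x]:
            hits = [gt in nn[:k] for gt, nn in zip(target_labels.reshape(-1), k_closest_classes)]
            print('recall@{}: {:.4f}'.format(k, np.mean(hits)))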

Example #29
    def compute_standard(self, opt, model, dataloader, evaltypes, device,
                         **kwargs):
        evaltypes = copy.deepcopy(evaltypes)

        n_classes = opt.n_classes
        image_paths = np.array([x[0] for x in dataloader.dataset.image_list])
        _ = model.eval()

        ###
        feature_colls = {key: [] for key in evaltypes}

        ###
        with torch.no_grad():
            target_labels = []
            final_iter = tqdm(dataloader, desc='Embedding Data...')
            image_paths = [x[0] for x in dataloader.dataset.image_list]
            for idx, inp in enumerate(final_iter):
                input_img, target = inp[1], inp[0]
                target_labels.extend(target.numpy().tolist())
                out = model(input_img.to(device))
                if isinstance(out, tuple): out, aux_f = out

                ### Include Metrics for separate linear layers.
                if hasattr(model, 'merge_linear'):
                    merged_features = model.merge_linear(
                        torch.cat([feat for feat in out.values()], dim=-1))
                    if 'merged_discriminative' not in feature_colls:
                        feature_colls['merged_discriminative'] = []
                    feature_colls['merged_discriminative'].extend(
                        merged_features.cpu().detach().numpy().tolist())
                if hasattr(model, 'separate_linear'):
                    sep_features = model.separate_linear(aux_f)
                    if 'separate_discriminative' not in feature_colls:
                        feature_colls['separate_discriminative'] = []
                    feature_colls['separate_discriminative'].extend(
                        sep_features.cpu().detach().numpy().tolist())
                if hasattr(model, 'supervised_embed'):
                    sup_features = model.supervised_embed(aux_f)
                    if 'supervised_discriminative' not in feature_colls:
                        feature_colls['supervised_discriminative'] = []
                    feature_colls['supervised_discriminative'].extend(
                        sup_features.cpu().detach().numpy().tolist())

                ### Include embeddings of all output features
                for evaltype in evaltypes:
                    if 'Combined' not in evaltype and 'Sum' not in evaltype:
                        if isinstance(out, dict):
                            feature_colls[evaltype].extend(
                                out[evaltype].cpu().detach().numpy().tolist())
                        else:
                            feature_colls[evaltype].extend(
                                out.cpu().detach().numpy().tolist())

                ### Include desired combination embeddings
                for evaltype in evaltypes:
                    ### By Weighted Concatenation
                    if 'Combined' in evaltype:
                        weights = [float(x) for x in evaltype.split('-')[1:]]
                        subevaltypes = evaltype.split('Combined_')[-1].split(
                            '-')[0].split('_')
                        weighted_subfeatures = [
                            weights[i] * out[subevaltype]
                            for i, subevaltype in enumerate(subevaltypes)
                        ]
                        if 'normalize' in model.name:
                            feature_colls[evaltype].extend(
                                torch.nn.functional.normalize(
                                    torch.cat(weighted_subfeatures, dim=-1),
                                    dim=-1).cpu().detach().numpy().tolist())
                        else:
                            feature_colls[evaltype].extend(
                                torch.cat(
                                    weighted_subfeatures,
                                    dim=-1).cpu().detach().numpy().tolist())

                    ### By Weighted Sum
                    if 'Sum' in evaltype:
                        weights = [float(x) for x in evaltype.split('-')[1:]]
                        subevaltypes = evaltype.split('Sum')[-1].split(
                            '-')[0].split('_')
                        weighted_subfeatures = [
                            weights[i] * out[subevaltype]
                            for i, subevaltype in enumerate(subevaltypes)
                        ]
                        # element-wise weighted sum of the same-shape subfeatures
                        summed = torch.stack(weighted_subfeatures).sum(dim=0)
                        if 'normalize' in model.name:
                            feature_colls[evaltype].extend(
                                torch.nn.functional.normalize(
                                    summed,
                                    dim=-1).cpu().detach().numpy().tolist())
                        else:
                            feature_colls[evaltype].extend(
                                summed.cpu().detach().numpy().tolist())

            target_labels = np.hstack(target_labels).reshape(-1, 1)

        if hasattr(model, 'merge_linear'):
            evaltypes.append('merged_discriminative')
        if hasattr(model, 'separate_linear'):
            evaltypes.append('separate_discriminative')
        if hasattr(model, 'supervised_embed'):
            evaltypes.append('supervised_discriminative')

        computed_metrics = {evaltype: {} for evaltype in evaltypes}
        extra_infos = {evaltype: {} for evaltype in evaltypes}

        ###
        for evaltype in evaltypes:
            features = np.vstack(feature_colls[evaltype]).astype('float32')

            if 'kmeans' in self.requires:
                ### Set CPU Cluster index
                cpu_cluster_index = faiss.IndexFlatL2(features.shape[-1])
                kmeans = faiss.Clustering(features.shape[-1], n_classes)
                kmeans.niter = 20
                kmeans.min_points_per_centroid = 1
                kmeans.max_points_per_centroid = 1000000000
                ### Train Kmeans
                kmeans.train(features, cpu_cluster_index)
                centroids = faiss.vector_float_to_array(
                    kmeans.centroids).reshape(n_classes, features.shape[-1])

            if 'kmeans_nearest' in self.requires:
                faiss_search_index = faiss.IndexFlatL2(centroids.shape[-1])
                faiss_search_index.add(centroids)
                _, computed_cluster_labels = faiss_search_index.search(
                    features, 1)

            if 'nearest_features' in self.requires:
                faiss_search_index = faiss.IndexFlatL2(features.shape[-1])
                faiss_search_index.add(features)

                max_kval = np.max([
                    int(x.split('@')[-1]) for x in self.metric_names
                    if 'recall' in x
                ])
                _, k_closest_points = faiss_search_index.search(
                    features, int(max_kval + 1))
                k_closest_classes = target_labels.reshape(-1)[
                    k_closest_points[:, 1:]]

            ###
            for metric in self.list_of_metrics:
                input_dict = {}
                if 'features' in metric.requires:
                    input_dict['features'] = features
                if 'target_labels' in metric.requires:
                    input_dict['target_labels'] = target_labels
                if 'kmeans' in metric.requires:
                    input_dict['centroids'] = centroids
                if 'kmeans_nearest' in metric.requires:
                    input_dict[
                        'computed_cluster_labels'] = computed_cluster_labels
                if 'nearest_features' in metric.requires:
                    input_dict['k_closest_classes'] = k_closest_classes

                computed_metrics[evaltype][metric.name] = metric(**input_dict)

            extra_infos[evaltype] = {
                'features': features,
                'target_labels': target_labels,
                'image_paths': dataloader.dataset.image_paths,
                'query_image_paths': None,
                'gallery_image_paths': None
            }

        return computed_metrics, extra_infos

        def compute_query_gallery(self, opt, model, query_dataloader,
                                  gallery_dataloader, evaltypes, device,
                                  **kwargs):
            n_classes = opt.n_classes
            query_image_paths = np.array(
                [x[0] for x in query_dataloader.dataset.image_list])
            gallery_image_paths = np.array(
                [x[0] for x in gallery_dataloader.dataset.image_list])
            _ = model.eval()

            ###
            query_feature_colls = {evaltype: [] for evaltype in evaltypes}
            gallery_feature_colls = {evaltype: [] for evaltype in evaltypes}

            ### For all test images, extract features
            with torch.no_grad():
                ### Compute Query Embedding Features
                query_target_labels = []
                query_iter = tqdm(query_dataloader,
                                  desc='Extraction Query Features')
                for idx, inp in enumerate(query_iter):
                    input_img, target = inp[1], inp[0]
                    query_target_labels.extend(target.numpy().tolist())
                    out = model(input_img.to(device))
                    if isinstance(out, tuple): out, aux_f = out

                    ### Include Metrics for separate linear layers.
                    if hasattr(model, 'merge_linear'):
                        merged_features = model.merge_linear(
                            torch.cat([feat for feat in out.values()], dim=-1))
                        if 'merged_discriminative' not in query_feature_colls:
                            query_feature_colls['merged_discriminative'] = []
                        query_feature_colls['merged_discriminative'].extend(
                            merged_features.cpu().detach().numpy().tolist())
                    if hasattr(model, 'separate_linear'):
                        sep_features = model.separate_linear(aux_f)
                        if 'separate_discriminative' not in query_feature_colls:
                            query_feature_colls['separate_discriminative'] = []
                        query_feature_colls['separate_discriminative'].extend(
                            sep_features.cpu().detach().numpy().tolist())

                    for evaltype in evaltypes:
                        if 'Combined' not in evaltype:
                            if isinstance(out, dict):
                                query_feature_colls[evaltype].extend(
                                    out[evaltype].cpu().detach().numpy(
                                    ).tolist())
                            else:
                                query_feature_colls[evaltype].extend(
                                    out.cpu().detach().numpy().tolist())

                    for evaltype in evaltypes:
                        if 'Combined' in evaltype:
                            weights = [
                                float(x) for x in evaltype.split('-')[1:]
                            ]
                            subevaltypes = evaltype.split(
                                'Combined_')[-1].split('-')[0].split('_')
                            weighted_subfeatures = [
                                weights[i] * out[subevaltype]
                                for i, subevaltype in enumerate(subevaltypes)
                            ]
                            query_feature_colls[evaltype].extend(
                                torch.nn.functional.normalize(
                                    torch.cat(weighted_subfeatures, dim=-1),
                                    dim=-1).cpu().detach().numpy().tolist())

                ### Compute Gallery Embedding Features
                gallery_target_labels = []
                gallery_iter = tqdm(gallery_dataloader,
                                    desc='Extraction Gallery Features')
                for idx, inp in enumerate(gallery_iter):
                    input_img, target = inp[1], inp[0]
                    gallery_target_labels.extend(target.numpy().tolist())
                    out = model(input_img.to(device))
                    if isinstance(out, tuple): out, aux_f = out

                    ### Include Metrics for separate linear layers.
                    if hasattr(model, 'merge_linear'):
                        merged_features = model.merge_linear(
                            torch.cat([feat for feat in out.values()], dim=-1))
                        if 'merged_discriminative' not in gallery_feature_colls:
                            gallery_feature_colls['merged_discriminative'] = []
                        gallery_feature_colls['merged_discriminative'].extend(
                            merged_features.cpu().detach().numpy().tolist())
                    if hasattr(model, 'separate_linear'):
                        sep_features = model.separate_linear(aux_f)
                        if 'separate_discriminative' not in gallery_feature_colls:
                            gallery_feature_colls[
                                'separate_discriminative'] = []
                        gallery_feature_colls['separate_discriminative'].extend(
                            sep_features.cpu().detach().numpy().tolist())

                    for evaltype in evaltypes:
                        if 'Combined' not in evaltype:
                            if isinstance(out, dict):
                                gallery_feature_colls[evaltype].extend(
                                    out[evaltype].cpu().detach().numpy(
                                    ).tolist())
                            else:
                                gallery_feature_colls[evaltype].extend(
                                    out.cpu().detach().numpy().tolist())

                    for evaltype in evaltypes:
                        if 'Combined' in evaltype:
                            weights = [
                                float(x) for x in evaltype.split('-')[1:]
                            ]
                            subevaltypes = evaltype.split(
                                'Combined_')[-1].split('-')[0].split('_')
                            weighted_subfeatures = [
                                weights[i] * out[subevaltype]
                                for i, subevaltype in enumerate(subevaltypes)
                            ]
                            gallery_feature_colls[evaltype].extend(
                                torch.nn.functional.normalize(
                                    torch.cat(weighted_subfeatures, dim=-1),
                                    dim=-1).cpu().detach().numpy().tolist())

                ###
                query_target_labels = np.hstack(query_target_labels).reshape(-1, 1)
                gallery_target_labels = np.hstack(gallery_target_labels).reshape(-1, 1)
                computed_metrics = {evaltype: {} for evaltype in evaltypes}
                extra_infos = {evaltype: {} for evaltype in evaltypes}

                if hasattr(model, 'merge_linear'):
                    evaltypes.append('merged_discriminative')
                if hasattr(model, 'separate_linear'):
                    evaltypes.append('separate_discriminative')

                ###
                for evaltype in evaltypes:
                    query_features = np.vstack(
                        query_feature_colls[evaltype]).astype('float32')
                    gallery_features = np.vstack(
                        gallery_feature_colls[evaltype]).astype('float32')

                    if 'kmeans' in self.requires:
                        ### Set CPU Cluster index
                        stackset = np.concatenate(
                            [query_features, gallery_features], axis=0)
                        stacklabels = np.concatenate(
                            [query_target_labels, gallery_target_labels],
                            axis=0)
                        cpu_cluster_index = faiss.IndexFlatL2(
                            stackset.shape[-1])
                        kmeans = faiss.Clustering(stackset.shape[-1],
                                                  n_classes)
                        kmeans.niter = 20
                        kmeans.min_points_per_centroid = 1
                        kmeans.max_points_per_centroid = 1000000000
                        ### Train Kmeans
                        kmeans.train(stackset, cpu_cluster_index)
                        centroids = faiss.vector_float_to_array(
                            kmeans.centroids).reshape(n_classes,
                                                      stackset.shape[-1])

                    if 'kmeans_nearest' in self.requires:
                        faiss_search_index = faiss.IndexFlatL2(
                            centroids.shape[-1])
                        faiss_search_index.add(centroids)
                        _, computed_cluster_labels = faiss_search_index.search(
                            stackset, 1)

                    if 'nearest_features' in self.requires:
                        faiss_search_index = faiss.IndexFlatL2(
                            gallery_features.shape[-1])
                        faiss_search_index.add(gallery_features)
                        # largest k requested by any recall@k metric
                        max_kval = np.max([
                            int(x.split('@')[-1]) for x in self.metric_names
                            if 'recall' in x
                        ])
                        _, k_closest_points = faiss_search_index.search(
                            query_features, int(max_kval))
                        k_closest_classes = gallery_target_labels.reshape(
                            -1)[k_closest_points]

                    ###
                    for metric in self.list_of_metrics:
                        input_dict = {}
                        if 'features' in metric.requires:
                            # the stacked query + gallery features
                            input_dict['features'] = stackset
                        if 'target_labels' in metric.requires:
                            input_dict['target_labels'] = stacklabels
                        if 'kmeans' in metric.requires:
                            input_dict['centroids'] = centroids
                        if 'kmeans_nearest' in metric.requires:
                            input_dict[
                                'computed_cluster_labels'] = computed_cluster_labels
                        if 'nearest_features' in metric.requires:
                            input_dict['k_closest_classes'] = k_closest_classes

                        computed_metrics[evaltype][metric.name] = metric(
                            **input_dict)

                    ###
                    extra_infos[evaltype] = {
                        'features': stackset,
                        'image_paths': None,
                        'target_labels': stacklabels,
                        'query_image_paths':
                        query_dataloader.dataset.image_paths,
                        'gallery_image_paths':
                        gallery_dataloader.dataset.image_paths
                    }

                return computed_metrics, extra_infos
Example #30
def kmeans(features, nclusters, num_iters, ngpu, njobs, seed):
    """
    Run k-means on features, generating nclusters clusters. It will use, in order of preference, Faiss, pomegranate, or
    scikit-learn.

    :param features: Features to cluster.
    :param nclusters: Number of clusters to generate.
    :param num_iters: Maximum number of iterations to perform.
    :param ngpu: Number of GPUs to use (if GPUs are available).
    :param njobs: Number of threads to use.
    :param seed: Seed for reproducibility.
    :return: centroids: The centroids found with k-means.
    """
    print('Running k-means...')
    if USE_FAISS:
        d = features.shape[1]
        pca_features = np.ascontiguousarray(features).astype('float32')

        clus = faiss.Clustering(d, nclusters)
        clus.verbose = True
        clus.niter = num_iters
        if seed is not None:
            clus.seed = seed

        # otherwise the kmeans implementation sub-samples the training set
        clus.max_points_per_centroid = 10000000

        if USE_GPU:
            res = [faiss.StandardGpuResources() for i in range(ngpu)]

            flat_config = []
            for i in range(ngpu):
                cfg = faiss.GpuIndexFlatConfig()
                cfg.useFloat16 = False
                cfg.device = i
                flat_config.append(cfg)

            if ngpu == 1:
                index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
            else:
                indexes = [
                    faiss.GpuIndexFlatL2(res[i], d, flat_config[i])
                    for i in range(ngpu)
                ]
                index = faiss.IndexProxy()
                for sub_index in indexes:
                    index.addIndex(sub_index)
        else:
            index = faiss.IndexFlatL2(d)

        clus.train(pca_features, index)
        centroids = faiss.vector_float_to_array(clus.centroids)
        centroids = centroids.reshape(nclusters, d)

    elif USE_POMEGRANATE and seed is None:
        kmeans = pomegranate.kmeans.Kmeans(nclusters,
                                           init='kmeans++',
                                           n_init=10)
        kmeans.fit(features, max_iterations=num_iters, n_jobs=njobs)
        centroids = kmeans.centroids
    else:
        if USE_POMEGRANATE and seed is not None:
            print(
                'Pomegranate does not currently support k-means with a seed. Switching to scikit-learn instead.'
            )
        print('Using scikit-learn. This may be slow!')
        kmeans = sklearn.cluster.KMeans(n_clusters=nclusters,
                                        random_state=seed).fit(features)
        centroids = kmeans.cluster_centers_

    return centroids
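Illustrative call, assuming features is an (n, d) NumPy array:

centroids = kmeans(features, nclusters=100, num_iters=25, ngpu=1, njobs=4, seed=42)
assert centroids.shape == (100, features.shape[1])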
Example #31
    def compute_standard(self, opt, model, dataloader, evaltypes, device, **kwargs):
        evaltypes = copy.deepcopy(evaltypes)

        n_classes = opt.n_classes
        image_paths     = np.array([x[0] for x in dataloader.dataset.image_list])
        _ = model.eval()

        ###
        feature_colls  = {key:[] for key in evaltypes}

        ###
        with torch.no_grad():
            target_labels = []
            final_iter = tqdm(dataloader, desc='Embedding Data...')
            image_paths= [x[0] for x in dataloader.dataset.image_list]
            for idx,inp in enumerate(final_iter):
                input_img,target = inp[1], inp[0]
                target_labels.extend(target.numpy().tolist())
                out_dict = model(input_img.to(device))
                out, extra_out = [out_dict[key] for key in ['embeds', 'extra_embeds']]

                ### Include embeddings of all output features
                for evaltype in evaltypes:
                    if isinstance(out, dict):
                        feature_colls[evaltype].extend(out[evaltype].cpu().detach().numpy().tolist())
                    else:
                        feature_colls[evaltype].extend(out.cpu().detach().numpy().tolist())


            target_labels = np.hstack(target_labels).reshape(-1,1)


        computed_metrics = {evaltype:{} for evaltype in evaltypes}
        extra_infos      = {evaltype:{} for evaltype in evaltypes}


        ###
        faiss.omp_set_num_threads(self.pars.kernels)
        res = None
        torch.cuda.empty_cache()
        if self.pars.evaluate_on_gpu:
            res = faiss.StandardGpuResources()


        import time
        for evaltype in evaltypes:
            features        = np.vstack(feature_colls[evaltype]).astype('float32')
            features_cosine = normalize(features, axis=1)

            start = time.time()

            """============ Compute k-Means ==============="""
            if 'kmeans' in self.requires:
                ### Set CPU Cluster index
                cluster_idx = faiss.IndexFlatL2(features.shape[-1])
                if res is not None: cluster_idx = faiss.index_cpu_to_gpu(res, 0, cluster_idx)
                kmeans            = faiss.Clustering(features.shape[-1], n_classes)
                kmeans.niter = 20
                kmeans.min_points_per_centroid = 1
                kmeans.max_points_per_centroid = 1000000000
                ### Train Kmeans
                kmeans.train(features, cluster_idx)
                centroids = faiss.vector_float_to_array(kmeans.centroids).reshape(n_classes, features.shape[-1])

            if 'kmeans_cosine' in self.requires:
                ### Set CPU Cluster index
                cluster_idx = faiss.IndexFlatL2(features_cosine.shape[-1])
                if res is not None: cluster_idx = faiss.index_cpu_to_gpu(res, 0, cluster_idx)
                kmeans            = faiss.Clustering(features_cosine.shape[-1], n_classes)
                kmeans.niter = 20
                kmeans.min_points_per_centroid = 1
                kmeans.max_points_per_centroid = 1000000000
                ### Train Kmeans
                kmeans.train(features_cosine, cluster_idx)
                centroids_cosine = faiss.vector_float_to_array(kmeans.centroids).reshape(n_classes, features_cosine.shape[-1])
                centroids_cosine = normalize(centroids_cosine, axis=1)


            """============ Compute Cluster Labels ==============="""
            if 'kmeans_nearest' in self.requires:
                faiss_search_index = faiss.IndexFlatL2(centroids.shape[-1])
                if res is not None: faiss_search_index = faiss.index_cpu_to_gpu(res, 0, faiss_search_index)
                faiss_search_index.add(centroids)
                _, computed_cluster_labels = faiss_search_index.search(features, 1)

            if 'kmeans_nearest_cosine' in self.requires:
                faiss_search_index = faiss.IndexFlatIP(centroids_cosine.shape[-1])
                if res is not None: faiss_search_index = faiss.index_cpu_to_gpu(res, 0, faiss_search_index)
                faiss_search_index.add(centroids_cosine)
                _, computed_cluster_labels_cosine = faiss_search_index.search(features_cosine, 1)



            """============ Compute Nearest Neighbours ==============="""
            if 'nearest_features' in self.requires:
                faiss_search_index  = faiss.IndexFlatL2(features.shape[-1])
                if res is not None: faiss_search_index = faiss.index_cpu_to_gpu(res, 0, faiss_search_index)
                faiss_search_index.add(features)

                max_kval            = np.max([int(x.split('@')[-1]) for x in self.metric_names if 'recall' in x])
                _, k_closest_points = faiss_search_index.search(features, int(max_kval+1))
                k_closest_classes   = target_labels.reshape(-1)[k_closest_points[:,1:]]

            if 'nearest_features_cosine' in self.requires:
                faiss_search_index  = faiss.IndexFlatIP(features_cosine.shape[-1])
                if res is not None: faiss_search_index = faiss.index_cpu_to_gpu(res, 0, faiss_search_index)
                faiss_search_index.add(normalize(features_cosine,axis=1))

                max_kval                   = np.max([int(x.split('@')[-1]) for x in self.metric_names if 'recall' in x])
                _, k_closest_points_cosine = faiss_search_index.search(normalize(features_cosine,axis=1), int(max_kval+1))
                k_closest_classes_cosine   = target_labels.reshape(-1)[k_closest_points_cosine[:,1:]]



            ###
            if self.pars.evaluate_on_gpu:
                features        = torch.from_numpy(features).to(self.pars.device)
                features_cosine = torch.from_numpy(features_cosine).to(self.pars.device)

            start = time.time()
            for metric in self.list_of_metrics:
                input_dict = {}
                if 'features' in metric.requires:         input_dict['features'] = features
                if 'target_labels' in metric.requires:    input_dict['target_labels'] = target_labels

                if 'kmeans' in metric.requires:           input_dict['centroids'] = centroids
                if 'kmeans_nearest' in metric.requires:   input_dict['computed_cluster_labels'] = computed_cluster_labels
                if 'nearest_features' in metric.requires: input_dict['k_closest_classes'] = k_closest_classes

                if 'features_cosine' in metric.requires:         input_dict['features_cosine'] = features_cosine

                if 'kmeans_cosine' in metric.requires:           input_dict['centroids_cosine'] = centroids_cosine
                if 'kmeans_nearest_cosine' in metric.requires:   input_dict['computed_cluster_labels_cosine'] = computed_cluster_labels_cosine
                if 'nearest_features_cosine' in metric.requires: input_dict['k_closest_classes_cosine'] = k_closest_classes_cosine

                computed_metrics[evaltype][metric.name] = metric(**input_dict)

            extra_infos[evaltype] = {'features':features, 'target_labels':target_labels,
                                     'image_paths': dataloader.dataset.image_paths,
                                     'query_image_paths':None, 'gallery_image_paths':None}

        torch.cuda.empty_cache()
        return computed_metrics, extra_infos
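
The metric objects iterated over above are defined elsewhere in the repository. As a rough sketch of the interface they appear to follow (class and attribute names here are hypothetical, not taken from the source), a recall@k metric consuming k_closest_classes and target_labels might look like this:

import numpy as np

class RecallAtK:
    """Hypothetical sketch of a recall@k metric (names assumed, not from the source)."""
    def __init__(self, k):
        self.k = k
        self.name = 'recall@{}'.format(k)
        self.requires = ['nearest_features', 'target_labels']

    def __call__(self, k_closest_classes, target_labels):
        # a query counts as a hit if its label occurs among its k nearest neighbour labels
        hits = np.any(k_closest_classes[:, :self.k] == target_labels.reshape(-1, 1), axis=1)
        return float(np.mean(hits))

Each metric exposes a `requires` list and a `name`, which is exactly how `input_dict` is assembled and how the result is keyed into `computed_metrics`.
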
Example #32
0
def eval_metrics_query_and_gallery_dataset(model, query_dataloader, gallery_dataloader, device, k_vals, opt):
    """
    Compute evaluation metrics on the test dataset, e.g. NMI, F1 and Recall@k.

    Args:
        model:               PyTorch network, network to compute evaluation metrics for.
        query_dataloader:    PyTorch Dataloader, dataloader for the query dataset, for which the nearest neighbours in the gallery dataset are retrieved.
        gallery_dataloader:  PyTorch Dataloader, dataloader for the gallery dataset, which provides the target samples to be retrieved for each query.
        device:              torch.device, Device to run inference on.
        k_vals:              list of int, recall@k values to compute.
        opt:                 argparse.Namespace, contains all training-specific parameters.
    Returns:
        F1 score (float), NMI score (float), recall_at_ks (list of float), query data embedding (np.ndarray), gallery data embedding (np.ndarray)
    """
    torch.cuda.empty_cache()

    _ = model.eval()
    n_classes = len(query_dataloader.dataset.avail_classes)

    with torch.no_grad():
        ### For all query test images, extract features
        query_target_labels, query_feature_coll     = [],[]
        query_image_paths   = [x[0] for x in query_dataloader.dataset.image_list]
        query_iter = tqdm(query_dataloader, desc='Extracting Query Features')
        for idx,inp in enumerate(query_iter):
            input_img,target = inp[-1], inp[0]
            query_target_labels.extend(target.numpy().tolist())
            out = model(input_img.to(device))
            query_feature_coll.extend(out.cpu().detach().numpy().tolist())

        ### For all gallery test images, extract features
        gallery_target_labels, gallery_feature_coll = [],[]
        gallery_image_paths = [x[0] for x in gallery_dataloader.dataset.image_list]
        gallery_iter = tqdm(gallery_dataloader, desc='Extracting Gallery Features')
        for idx,inp in enumerate(gallery_iter):
            input_img,target = inp[-1], inp[0]
            gallery_target_labels.extend(target.numpy().tolist())
            out = model(input_img.to(device))
            gallery_feature_coll.extend(out.cpu().detach().numpy().tolist())


        query_target_labels, query_feature_coll     = np.hstack(query_target_labels).reshape(-1,1), np.vstack(query_feature_coll).astype('float32')
        gallery_target_labels, gallery_feature_coll = np.hstack(gallery_target_labels).reshape(-1,1), np.vstack(gallery_feature_coll).astype('float32')

        torch.cuda.empty_cache()

        ### Set CPU Cluster index
        stackset    = np.concatenate([query_feature_coll, gallery_feature_coll],axis=0)
        stacklabels = np.concatenate([query_target_labels, gallery_target_labels],axis=0)
        cpu_cluster_index = faiss.IndexFlatL2(stackset.shape[-1])
        kmeans            = faiss.Clustering(stackset.shape[-1], n_classes)
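        # 20 iterations; the per-centroid point bounds are relaxed so faiss neither
        # warns about small clusters nor subsamples the training set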
        kmeans.niter = 20
        kmeans.min_points_per_centroid = 1
        kmeans.max_points_per_centroid = 1000000000

        ### Train Kmeans
        kmeans.train(stackset, cpu_cluster_index)
        computed_centroids = faiss.vector_float_to_array(kmeans.centroids).reshape(n_classes, stackset.shape[-1])

        ### Assign feature points to clusters
        faiss_search_index = faiss.IndexFlatL2(computed_centroids.shape[-1])
        faiss_search_index.add(computed_centroids)
        _, model_generated_cluster_labels = faiss_search_index.search(stackset, 1)

        ### Compute NMI
        NMI = metrics.cluster.normalized_mutual_info_score(model_generated_cluster_labels.reshape(-1), stacklabels.reshape(-1))

        ### Recover max(k_vals) nearest neighbours to use for recall computation
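        # query and gallery are disjoint sets, so the top-k hits are used directly
        # (no self-match to discard, unlike the single-dataset case)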
        faiss_search_index  = faiss.IndexFlatL2(gallery_feature_coll.shape[-1])
        faiss_search_index.add(gallery_feature_coll)
        _, k_closest_points = faiss_search_index.search(query_feature_coll, int(np.max(k_vals)))
        k_closest_classes   = gallery_target_labels.reshape(-1)[k_closest_points]

        ### Compute Recall
        recall_all_k = []
        for k in k_vals:
            recall_at_k = np.sum([1 for target, recalled_predictions in zip(query_target_labels, k_closest_classes) if target in recalled_predictions[:k]])/len(query_target_labels)
            recall_all_k.append(recall_at_k)
        recall_str = ', '.join('@{0}: {1:.4f}'.format(k,rec) for k,rec in zip(k_vals, recall_all_k))

        ### Compute F1 score
        F1 = f1_score(model_generated_cluster_labels, stacklabels, stackset, computed_centroids)

    return F1, NMI, recall_all_k, query_feature_coll, gallery_feature_coll
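
As an aside, the Clustering-plus-IndexFlatL2 pattern above can usually be condensed with the higher-level faiss.Kmeans wrapper. A minimal sketch, assuming a faiss version whose Kmeans constructor forwards clustering parameters (niter, min/max_points_per_centroid) as keyword arguments:

import faiss
import numpy as np

def cluster_and_assign(features, n_clusters, niter=20):
    """Sketch: k-means clustering and cluster assignment via the faiss.Kmeans wrapper."""
    features = np.ascontiguousarray(features, dtype='float32')
    kmeans = faiss.Kmeans(features.shape[1], n_clusters, niter=niter,
                          min_points_per_centroid=1,
                          max_points_per_centroid=10**9)
    kmeans.train(features)
    # after training, kmeans.index is a flat L2 index over the trained centroids
    _, cluster_labels = kmeans.index.search(features, 1)
    return kmeans.centroids, cluster_labels.reshape(-1)

Since kmeans.centroids is already a (k, d) numpy array, the vector_float_to_array/reshape step becomes unnecessary in this variant.
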
Example #33
0
def eval_metrics_one_dataset(model, test_dataloader, device, k_vals, opt):
    """
    Compute evaluation metrics on the test dataset, e.g. NMI, F1 and Recall@k.

    Args:
        model:              PyTorch network, network to compute evaluation metrics for.
        test_dataloader:    PyTorch Dataloader, dataloader for the test dataset; should use no shuffling and the correct evaluation preprocessing.
        device:             torch.device, Device to run inference on.
        k_vals:             list of int, recall@k values to compute.
        opt:                argparse.Namespace, contains all training-specific parameters.
    Returns:
        F1 score (float), NMI score (float), recall_at_k (list of float), data embedding (np.ndarray)
    """
    torch.cuda.empty_cache()

    _ = model.eval()
    n_classes = len(test_dataloader.dataset.avail_classes)

    with torch.no_grad():
        ### For all test images, extract features
        target_labels, feature_coll = [],[]
        final_iter = tqdm(test_dataloader, desc='Computing Evaluation Metrics...')
        image_paths = [x[0] for x in test_dataloader.dataset.image_list]
        for idx,inp in enumerate(final_iter):
            input_img,target = inp[-1], inp[0]
            target_labels.extend(target.numpy().tolist())
            out = model(input_img.to(device))
            feature_coll.extend(out.cpu().detach().numpy().tolist())

        target_labels = np.hstack(target_labels).reshape(-1,1)
        feature_coll  = np.vstack(feature_coll).astype('float32')

        torch.cuda.empty_cache()

        ### Set Faiss CPU Cluster index
        cpu_cluster_index = faiss.IndexFlatL2(feature_coll.shape[-1])
        kmeans            = faiss.Clustering(feature_coll.shape[-1], n_classes)
        kmeans.niter = 20
        kmeans.min_points_per_centroid = 1
        kmeans.max_points_per_centroid = 1000000000

        ### Train Kmeans
        kmeans.train(feature_coll, cpu_cluster_index)
        computed_centroids = faiss.vector_float_to_array(kmeans.centroids).reshape(n_classes, feature_coll.shape[-1])

        ### Assign feature points to clusters
        faiss_search_index = faiss.IndexFlatL2(computed_centroids.shape[-1])
        faiss_search_index.add(computed_centroids)
        _, model_generated_cluster_labels = faiss_search_index.search(feature_coll, 1)

        ### Compute NMI
        NMI = metrics.cluster.normalized_mutual_info_score(model_generated_cluster_labels.reshape(-1), target_labels.reshape(-1))

        ### Recover max(k_vals) nearest neighbours to use for recall computation
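        # every query is itself contained in the index, so retrieve k+1 neighbours
        # and drop the first column (the query's self-match) below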
        faiss_search_index  = faiss.IndexFlatL2(feature_coll.shape[-1])
        faiss_search_index.add(feature_coll)
        _, k_closest_points = faiss_search_index.search(feature_coll, int(np.max(k_vals)+1))
        k_closest_classes   = target_labels.reshape(-1)[k_closest_points[:,1:]]

        ### Compute Recall
        recall_all_k = []
        for k in k_vals:
            recall_at_k = np.sum([1 for target, recalled_predictions in zip(target_labels, k_closest_classes) if target in recalled_predictions[:k]])/len(target_labels)
            recall_all_k.append(recall_at_k)

        ### Compute F1 Score
        F1 = f1_score(model_generated_cluster_labels, target_labels, feature_coll, computed_centroids)

    return F1, NMI, recall_all_k, feature_coll
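
For reference, the per-k recall loop used in both functions admits a fully vectorized numpy formulation; a small equivalent sketch:

import numpy as np

def recall_at_ks(k_closest_classes, target_labels, k_vals):
    """Vectorized recall@k over precomputed nearest-neighbour labels."""
    targets = np.asarray(target_labels).reshape(-1, 1)
    # boolean hit matrix of shape (n_queries, max_k)
    matches = np.asarray(k_closest_classes) == targets
    return [float(np.mean(np.any(matches[:, :k], axis=1))) for k in k_vals]

This yields the same values as the explicit zip loop while avoiding per-row Python iteration.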