def test_wrapped_quantizer_HNSW(self):
        faiss.omp_set_num_threads(1)

        def bin2float(v):
            def byte2float(byte):
                return np.array(
                    [-1.0 + 2.0 * (byte & (1 << b) != 0) for b in range(0, 8)])

            return np.hstack([byte2float(byte)
                              for byte in v]).astype('float32')

        def floatvec2nparray(v):
            # convert a SWIG std::vector<float> into an (n, d) numpy array
            return faiss.vector_float_to_array(v).reshape(-1, d)

        d = 256
        nt = 12800
        nb = 10000
        nq = 500
        (xt, xb, xq) = make_binary_dataset(d, nb, nt, nq)

        index_ref = faiss.IndexBinaryFlat(d)

        index_ref.add(xb)

        nlist = 256
        clus = faiss.Clustering(d, nlist)
        clus_index = faiss.IndexFlatL2(d)

        xt_f = np.array([bin2float(v) for v in xt])
        clus.train(xt_f, clus_index)

        centroids = floatvec2nparray(clus.centroids)
        hnsw_quantizer = faiss.IndexHNSWFlat(d, 32)
        hnsw_quantizer.add(centroids)
        hnsw_quantizer.is_trained = True
        wrapped_quantizer = faiss.IndexBinaryFromFloat(hnsw_quantizer)

        assert nlist == hnsw_quantizer.ntotal
        assert nlist == wrapped_quantizer.ntotal
        assert wrapped_quantizer.is_trained

        index = faiss.IndexBinaryIVF(wrapped_quantizer, d,
                                     hnsw_quantizer.ntotal)
        index.nprobe = 128

        assert index.is_trained

        index.add(xb)

        D_ref, I_ref = index_ref.search(xq, 10)
        D, I = index.search(xq, 10)

        # recall@10 computed on neighbor ids, not distances
        recall = sum(gti[0] in Ii[:10] for gti, Ii in zip(I_ref, I)) \
                 / float(I_ref.shape[0])

        assert recall > 0.77, "recall = %g" % recall
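The trick this test exercises is IndexBinaryFromFloat, which lets a float index (here HNSW) act as the coarse quantizer of an IndexBinaryIVF. A condensed sketch of just that wiring, reusing the names from the test above:

# condensed wiring: float HNSW index -> binary wrapper -> binary IVF
hnsw = faiss.IndexHNSWFlat(d, 32)             # d is the number of bits
hnsw.add(centroids)                           # (nlist, d) float32 centroids in {-1, +1}
quantizer = faiss.IndexBinaryFromFloat(hnsw)
ivf = faiss.IndexBinaryIVF(quantizer, d, hnsw.ntotal)
ivf.nprobe = 128
ivf.add(xb)                                   # xb: packed uint8 binary vectors
D, I = ivf.search(xq, 10)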
Example 2
    def cluster(self, points, k):
        """Cluster the given points into k clusters."""
        index = self._faiss_index_flat_l2(points.shape[1])
        clus = faiss.Clustering(points.shape[1], k)
        clus.verbose = False
        clus.niter = 10
        clus.train(np.ascontiguousarray(points, dtype=np.float32), index)
        return faiss.vector_float_to_array(clus.centroids).reshape(
            clus.k, clus.d)
Example 3
    def test_redo(self):
        d = 64
        n = 1000

        rs = np.random.RandomState(123)
        x = rs.uniform(size=(n, d)).astype('float32')

        clus = faiss.Clustering(d, 20)
        clus.nredo = 1
        clus.train(x, faiss.IndexFlatL2(d))
        obj1 = faiss.vector_to_array(clus.obj)

        clus = faiss.Clustering(d, 20)
        clus.nredo = 10
        clus.train(x, faiss.IndexFlatL2(d))
        obj10 = faiss.vector_to_array(clus.obj)

        self.assertGreater(obj1[-1], obj10[-1])
Example 4
def train_kmeans(x,
                 num_clusters=1000,
                 gpu_ids=None,
                 niter=100,
                 nredo=1,
                 verbose=0):
    """
    Runs k-means clustering on one or several GPUs
    """
    assert np.all(~np.isnan(x)), 'x contains NaN'
    assert np.all(np.isfinite(x)), 'x contains Inf'
    if isinstance(gpu_ids, int):
        gpu_ids = [gpu_ids]
    assert gpu_ids is None or len(gpu_ids)

    d = x.shape[1]
    kmeans = faiss.Clustering(d, num_clusters)
    kmeans.verbose = bool(verbose)
    kmeans.niter = niter
    kmeans.nredo = nredo

    # otherwise the kmeans implementation sub-samples the training set
    kmeans.max_points_per_centroid = 10000000

    if gpu_ids is not None:
        res = [faiss.StandardGpuResources() for i in gpu_ids]

        flat_config = []
        for i in gpu_ids:
            cfg = faiss.GpuIndexFlatConfig()
            cfg.useFloat16 = False
            cfg.device = i
            flat_config.append(cfg)

        if len(gpu_ids) == 1:
            index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
        else:
            indexes = [
                faiss.GpuIndexFlatL2(res[i], d, flat_config[i])
                for i in range(len(gpu_ids))
            ]
            index = faiss.IndexProxy()
            for sub_index in indexes:
                index.addIndex(sub_index)
    else:
        index = faiss.IndexFlatL2(d)

    # perform the training
    kmeans.train(x, index)
    centroids = faiss.vector_float_to_array(kmeans.centroids)

    objective = faiss.vector_float_to_array(kmeans.obj)
    #logging.debug("Final objective: %.4g" % objective[-1])

    return centroids.reshape(num_clusters, d)
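A minimal usage sketch for train_kmeans (hypothetical data and device ids; the GPU branch needs a CUDA-enabled faiss build, and recent faiss versions provide IndexReplicas in place of the older IndexProxy used here):

import numpy as np

x = np.random.rand(100000, 128).astype('float32')
centroids = train_kmeans(x, num_clusters=1024, gpu_ids=[0, 1], niter=50)
centroids_cpu = train_kmeans(x, num_clusters=1024, gpu_ids=None)  # CPU fallback
assert centroids.shape == (1024, 128)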
Example 5
    def test_redo(self):
        d = 64
        n = 1000

        rs = np.random.RandomState(123)
        x = rs.uniform(size=(n, d)).astype('float32')

        # make sure that doing 10 redos yields a better objective than just 1

        clus = faiss.Clustering(d, 20)
        clus.nredo = 1
        clus.train(x, faiss.IndexFlatL2(d))
        obj1 = clus.iteration_stats.at(clus.iteration_stats.size() - 1).obj

        clus = faiss.Clustering(d, 20)
        clus.nredo = 10
        clus.train(x, faiss.IndexFlatL2(d))
        obj10 = clus.iteration_stats.at(clus.iteration_stats.size() - 1).obj

        self.assertGreater(obj1, obj10)
Example 6
    def test_redo_cosine(self):
        # test redo with cosine distance (inner prod, so objectives are reversed)
        d = 64
        n = 1000

        rs = np.random.RandomState(123)
        x = rs.uniform(size=(n, d)).astype('float32')
        faiss.normalize_L2(x)

        # make sure that doing 10 redos yields a better objective than just 1
        # for cosine distance, it is IP so higher is better

        clus = faiss.Clustering(d, 20)
        clus.nredo = 1
        clus.train(x, faiss.IndexFlatIP(d))
        obj1 = clus.iteration_stats.at(clus.iteration_stats.size() - 1).obj

        clus = faiss.Clustering(d, 20)
        clus.nredo = 10
        clus.train(x, faiss.IndexFlatIP(d))
        obj10 = clus.iteration_stats.at(clus.iteration_stats.size() - 1).obj

        self.assertGreater(obj10, obj1)
Example 7
def kmeans(data, nmb_clusters, preprocess=True, verbose=False):
    """Runs kmeans on 1 GPU.
    Args:
        data: input features
        nmb_clusters (int): number of clusters
        preprocess (bool): whether to run preprocess_features on the input
    Returns:
        list: ids of data in each cluster
    """

    if preprocess:
        x = preprocess_features(data)
    else:
        x = data

    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)

    # Change faiss seed at each k-means so that the randomly picked
    # initialization centroids do not correspond to the same feature ids
    # from an epoch to another.
    clus.seed = np.random.randint(1234)

    clus.niter = 20
    clus.max_points_per_centroid = 20000
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
    _, I = index.search(x, 1)

    # elahe https://github.com/facebookresearch/faiss/issues/1179
    # losses = faiss.vector_to_array(clus.obj)
    stats = clus.iteration_stats
    losses = np.array([stats.at(i).obj for i in range(stats.size())])

    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    I = [int(n[0]) for n in I]
    images_lists = [[] for i in range(nmb_clusters)]
    for i in range(len(data)):
        images_lists[I[i]].append(i)

    return images_lists, losses[-1]
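A short follow-up sketch flattening the returned per-cluster lists back into one label per sample (plain numpy; features stands in for a hypothetical (n, d) float32 array):

import numpy as np

images_lists, final_loss = kmeans(features, nmb_clusters=100)
labels = np.empty(sum(len(members) for members in images_lists), dtype=np.int64)
for cluster_id, members in enumerate(images_lists):
    labels[members] = cluster_id  # fancy indexing with the list of member ids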
Example 8
    def test_wrapped_quantizer_HNSW(self):
        def bin2float2d(v):
            n, d = v.shape
            vf = ((v.reshape(-1, 1) >> np.arange(8)) & 1).astype("float32")
            vf *= 2
            vf -= 1
            return vf.reshape(n, d * 8)

        d = 256
        nt = 12800
        nb = 10000
        nq = 500
        (xt, xb, xq) = make_binary_dataset(d, nb, nt, nq)

        index_ref = faiss.IndexBinaryFlat(d)

        index_ref.add(xb)

        nlist = 256
        clus = faiss.Clustering(d, nlist)
        clus_index = faiss.IndexFlatL2(d)

        xt_f = bin2float2d(xt)
        clus.train(xt_f, clus_index)

        centroids = faiss.vector_to_array(clus.centroids).reshape(-1, clus.d)
        hnsw_quantizer = faiss.IndexHNSWFlat(d, 32)
        hnsw_quantizer.add(centroids)
        hnsw_quantizer.is_trained = True
        wrapped_quantizer = faiss.IndexBinaryFromFloat(hnsw_quantizer)

        assert nlist == hnsw_quantizer.ntotal
        assert nlist == wrapped_quantizer.ntotal
        assert wrapped_quantizer.is_trained

        index = faiss.IndexBinaryIVF(wrapped_quantizer, d,
                                     hnsw_quantizer.ntotal)
        index.nprobe = 128

        assert index.is_trained

        index.add(xb)

        D_ref, I_ref = index_ref.search(xq, 10)
        D, I = index.search(xq, 10)

        # recall@10 computed on neighbor ids, not distances
        recall = sum(gti[0] in Ii[:10] for gti, Ii in zip(I_ref, I)) \
                 / float(I_ref.shape[0])

        assert recall >= 0.77, "recall = %g" % recall
Example 9
    def update_pseudo_labels(self, model, dataloader, device):
        import faiss, time
        #### Reset Classifier Weights
        torch.cuda.empty_cache()
        self.classifier.weight.data.normal_(0, 1 / model.feature_dim)

        with torch.no_grad():
            _ = model.eval()
            _ = model.to(device)

            memory_queue = []
            for i, input_tuple in enumerate(
                    tqdm(dataloader,
                         'Getting DC Embeddings...',
                         total=len(dataloader))):
                embed = model(input_tuple[1].type(
                    torch.FloatTensor).to(device))[-1]
                memory_queue.append(embed)
            memory_queue = torch.cat(memory_queue, dim=0).cpu().numpy()

        #PERFORM PCA
        print('Computing PCA... ', end='')
        start = time.time()
        pca_mat = faiss.PCAMatrix(memory_queue.shape[-1], self.red_dim)
        pca_mat.train(memory_queue)
        memory_queue = pca_mat.apply_py(memory_queue)
        print('Done in {}s.'.format(time.time() - start))
        #
        #
        print('Computing Pseudolabels... ', end='')
        start = time.time()
        cpu_cluster_index = faiss.IndexFlatL2(memory_queue.shape[-1])
        kmeans = faiss.Clustering(memory_queue.shape[-1], self.ncluster)
        kmeans.niter = 20
        kmeans.min_points_per_centroid = 1
        kmeans.max_points_per_centroid = 1000000000
        ### Train Kmeans
        kmeans.train(memory_queue, cpu_cluster_index)
        centroids = faiss.vector_float_to_array(kmeans.centroids).reshape(
            self.ncluster, memory_queue.shape[-1])
        ###
        faiss_search_index = faiss.IndexFlatL2(centroids.shape[-1])
        faiss_search_index.add(centroids)
        _, computed_cluster_labels = faiss_search_index.search(memory_queue, 1)
        print('Done in {}s.'.format(time.time() - start))
        ###
        self.pseudo_labels = computed_cluster_labels
        ###
        torch.cuda.empty_cache()
Example 10
def run_kmeans_multi_gpu(x,
                         nmb_clusters,
                         verbose=False,
                         seed=DEFAULT_KMEANS_SEED,
                         gpu_device=(0, 1)):
    """
    Runs kmeans on multiple GPUs.

    Args:
    -----
    x: data
    nmb_clusters (int): number of clusters
    gpu_device (sequence of int): ids of the GPUs to use (at least two)

    Returns:
    --------
    list: ids of data in each cluster
    """
    n_data, d = x.shape
    ngpus = len(gpu_device)
    assert ngpus > 1

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    clus.seed = seed
    res = [faiss.StandardGpuResources() for i in range(ngpus)]
    flat_config = []
    for i in gpu_device:
        cfg = faiss.GpuIndexFlatConfig()
        cfg.useFloat16 = False
        cfg.device = i
        flat_config.append(cfg)

    indexes = [
        faiss.GpuIndexFlatL2(res[i], d, flat_config[i]) for i in range(ngpus)
    ]
    index = faiss.IndexReplicas()
    for sub_index in indexes:
        index.addIndex(sub_index)

    # perform the training
    clus.train(x, index)
    _, I = index.search(x, 1)
    losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    return [int(n[0]) for n in I], losses[-1]
Example 11
def run_kmeans(x,
               nmb_clusters,
               verbose=False,
               seed=DEFAULT_KMEANS_SEED,
               gpu_device=0):
    """
    Runs kmeans on 1 GPU.

    Args:
    -----
    x: data
    nmb_clusters (int): number of clusters

    Returns:
    --------
    list: ids of data in each cluster
    """
    n_data, d = x.shape

    # niter = 20
    # kmeans = faiss.Kmeans(d, nmb_clusters, niter=niter, verbose=verbose, gpu=True)
    # kmeans.train(x)
    # _, I = kmeans.index.search(x, 1)
    # return [int(n[0]) for n in I], 0

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    clus.seed = seed
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = gpu_device

    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
    _, I = index.search(x, 1)
    losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    return [int(n[0]) for n in I], losses[-1]
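The commented-out block above points at the higher-level faiss.Kmeans wrapper, which bundles the Clustering object and the assignment index. A hedged sketch of the roughly equivalent call (extra keyword arguments are forwarded to the clustering parameters; gpu=True requires a GPU build):

kmeans = faiss.Kmeans(d, nmb_clusters, niter=20, verbose=verbose,
                      seed=seed, max_points_per_centroid=10000000, gpu=True)
kmeans.train(x)
_, I = kmeans.index.search(x, 1)
losses = kmeans.obj  # per-iteration objectives on most faiss versions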
Example 12
    def run_kmeans(self, x, nmb_clusters):
        _, d = x.shape
        # faiss implementation of k-means
        clus = faiss.Clustering(d, nmb_clusters)
        clus.niter = 20
        # clus.max_points_per_centroid = 10000000
        res = faiss.StandardGpuResources()
        flat_config = faiss.GpuIndexFlatConfig()
        flat_config.useFloat16 = False
        flat_config.device = 0
        self.index = faiss.GpuIndexFlatL2(res, d, flat_config)

        # perform the training
        clus.train(x, self.index)
        _, labels = self.index.search(x, 1)
        losses = faiss.vector_to_array(clus.obj)
        if self.verbose:
            print('k-means loss evolution: {0}'.format(losses))
        return labels.ravel(), losses[-1]
Example 13
def run_kmeans(x, nmb_clusters, verbose=True):
    """Runs kmeans on 1 GPU.
    Args:
        x: data
        nmb_clusters (int): number of clusters
    Returns:
        list: ids of data in each cluster
    """
    n_data, d = x.shape
    print("x.shape:", x.shape)

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
    _, I = index.search(x, 1)
    losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))
        centroids = faiss.vector_float_to_array(clus.centroids)
        print("centroids:")
        # print(clus.centroids)
        print("type:", type(centroids))
        print("len:", len(centroids))
        print("shape:", centroids.shape)
        # print(centroids)
        centroids_rs = centroids.reshape(nmb_clusters, d)
        print("centroids_reshape:")
        print("type:", type(centroids_rs))
        print("len:", len(centroids_rs))
        print("shape:", centroids_rs.shape)
        #print(centroids_rs)
        #assert 1 == 0

    return [int(n[0]) for n in I], losses[-1]
Example 14
def run_kmeans(x, nmb_clusters):
    device = x.device
    x = c_f.to_numpy(x).astype(np.float32)
    n_data, d = x.shape
    logging.info("running k-means clustering with k=%d" % nmb_clusters)
    logging.info("embedding dimensionality is %d" % d)

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    index = faiss.IndexFlatL2(d)
    if faiss.get_num_gpus() > 0:
        index = faiss.index_cpu_to_all_gpus(index)
    # perform the training
    clus.train(x, index)
    _, idxs = index.search(x, 1)

    return torch.tensor([int(n[0]) for n in idxs], dtype=torch.long, device=device)
Example 15
def train_coarse_quantizer(x, k, preproc):
    d = preproc.d_out
    clus = faiss.Clustering(d, k)
    clus.verbose = True
    # clus.niter = 2
    clus.max_points_per_centroid = 10000000

    print "apply preproc on shape", x.shape, 'k=', k
    t0 = time.time()
    x = preproc.apply_py(sanitize(x))
    print "   preproc %.3f s output shape %s" % (time.time() - t0, x.shape)

    vres, vdev = make_vres_vdev()
    index = faiss.index_cpu_to_gpu_multiple(vres, vdev, faiss.IndexFlatL2(d))

    clus.train(x, index)
    centroids = faiss.vector_float_to_array(clus.centroids)

    return centroids.reshape(k, d)
Example 16
def perform_clustering(step, features, accelerator):
	# num_cluster = [10000, 20000, 40000, 80000, 100000, 100000]
	num_cluster = [2500, 5000, 10000, 20000, 40000, 80000]
	if step>=0 and step<50000:
		i = step // 10000
		d = features.shape[1]
		k = num_cluster[i]
		clus = faiss.Clustering(d, k)
		clus.verbose = False
		clus.niter = 20
		clus.nredo = 5
		clus.seed = 0
		clus.max_points_per_centroid = 1000
		clus.min_points_per_centroid = 1
		if accelerator.is_local_main_process:	
			clus.verbose = True
			res = faiss.StandardGpuResources()
			flat_config = faiss.GpuIndexFlatConfig()
			flat_config.useFloat16 = False
			flat_config.device = accelerator.local_process_index
			index = faiss.GpuIndexFlatL2(res, d, flat_config)
			features = features.astype('float32')
			clus.train(features, index)
			num_inst = features.shape[0]
			bsz = 16
			nr_batch = int(math.ceil(num_inst / bsz))
			D_list, I_list = [], []
			for bidx in range(nr_batch):
				sidx = bidx * bsz
				eidx = min((bidx + 1) * bsz, num_inst)
				D, I = index.search(features[sidx:eidx], 1)
				D_list.append(D)
				I_list.append(I)
			idxs = np.concatenate(I_list)
			cluster_result = [int(n[0]) for n in idxs]
		else:
			cluster_result = [None for _ in range(features.shape[0])]
		torch.distributed.broadcast_object_list(cluster_result, src=0, group=None)
		cluster_result = torch.LongTensor(cluster_result).to(accelerator.device)
		return cluster_result
	else:
		return None
Example 17
def run_kmeans(x, nmb_clusters, verbose=False):
    """Runs kmeans on 1 GPU.
    Args:
        x: data
        nmb_clusters (int): number of clusters
    Returns:
        list: ids of data in each cluster
    """
    print("run kmeans begin")
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)

    # Change faiss seed at each k-means so that the randomly picked
    # initialization centroids do not correspond to the same feature ids
    # from an epoch to another.
    clus.seed = np.random.randint(1234)

    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    index = faiss.GpuIndexFlatL2(res, d, flat_config)
    #pdb.set_trace()

    # perform the training
    clus.train(x, index)
    _, I = index.search(x, 1)
    losses = np.array([
        clus.iteration_stats.at(i).obj
        for i in range(clus.iteration_stats.size())
    ])

    #losses = faiss.vector_to_array(obj)
    #pdb.set_trace()
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    return [int(n[0]) for n in I], losses[-1]
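Because the snippets on this page mix the older clus.obj API with the newer clus.iteration_stats API (see the issue linked in an earlier example), a small compatibility helper can hide the difference; a sketch, not part of the original example:

import numpy as np
import faiss

def clustering_losses(clus):
    """Per-iteration k-means objectives, across faiss versions."""
    if hasattr(clus, 'iteration_stats'):  # newer faiss builds
        stats = clus.iteration_stats
        return np.array([stats.at(i).obj for i in range(stats.size())])
    return faiss.vector_to_array(clus.obj)  # older faiss builds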
Example 18
def train_kmeans(x, num_clusters=1000, num_gpus=1):
    """
    Runs k-means clustering on one or several GPUs
    """
    d = x.shape[1]
    kmeans = faiss.Clustering(d, num_clusters)
    kmeans.verbose = True
    kmeans.niter = 20

    # otherwise the kmeans implementation sub-samples the training set
    kmeans.max_points_per_centroid = 10000000

    res = [faiss.StandardGpuResources() for i in range(num_gpus)]

    flat_config = []
    for i in range(num_gpus):
        cfg = faiss.GpuIndexFlatConfig()
        cfg.useFloat16 = False
        cfg.device = i
        flat_config.append(cfg)

    if num_gpus == 1:
        index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
    else:
        indexes = [faiss.GpuIndexFlatL2(res[i], d, flat_config[i])
                   for i in range(num_gpus)]
        index = faiss.IndexProxy()
        for sub_index in indexes:
            index.addIndex(sub_index)

    # perform the training
    kmeans.train(x, index)
    print('Total number of indexed vectors (after kmeans.train()):', index.ntotal)
    centroids = faiss.vector_float_to_array(kmeans.centroids)

    objective = faiss.vector_float_to_array(kmeans.obj)
    print('Objective values per iter:', objective)
    print("Final objective: %.4g" % objective[-1])

    # TODO: return cluster assignment

    return centroids.reshape(num_clusters, d)
Example 19
def train_kmeans(x, k, ngpu, max_points_per_centroid=256):
    "Runs kmeans on one or several GPUs"
    d = x.shape[1]
    clus = faiss.Clustering(d, k)
    clus.verbose = True
    clus.niter = 20
    clus.max_points_per_centroid = max_points_per_centroid

    if ngpu == 0:
        index = faiss.IndexFlatL2(d)
    else:
        res = [faiss.StandardGpuResources() for i in range(ngpu)]

        flat_config = []
        for i in range(ngpu):
            cfg = faiss.GpuIndexFlatConfig()
            cfg.useFloat16 = False
            cfg.device = i
            flat_config.append(cfg)

        if ngpu == 1:
            index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
        else:
            indexes = [
                faiss.GpuIndexFlatL2(res[i], d, flat_config[i])
                for i in range(ngpu)
            ]
            index = faiss.IndexReplicas()
            for sub_index in indexes:
                index.addIndex(sub_index)

    # perform the training
    clus.train(x, index)
    centroids = faiss.vector_float_to_array(clus.centroids)

    stats = clus.iteration_stats
    stats = [stats.at(i) for i in range(stats.size())]
    obj = np.array([st.obj for st in stats])
    print("final objective: %.4g" % obj[-1])

    return centroids.reshape(k, d)
Example 20
def faiss_kmeans(train_feats, val_feats, nmb_clusters):
    train_feats = train_feats.numpy()
    val_feats = val_feats.numpy()

    d = train_feats.shape[-1]

    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000

    index = faiss.IndexFlatL2(d)
    co = faiss.GpuMultipleClonerOptions()
    co.useFloat16 = True
    co.shard = True
    index = faiss.index_cpu_to_all_gpus(index, co)

    clus.train(train_feats, index)
    _, train_a = index.search(train_feats, 1)
    _, val_a = index.search(val_feats, 1)

    return list(train_a[:, 0]), list(val_a[:, 0])
Example 21
def run_kmeans(x, nmb_clusters):
    """
    Args:
        x: data
        nmb_clusters (int): number of clusters
    Returns:
        list: ids of data in each cluster
    """
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    cpu_index = faiss.IndexFlatL2(d)
    index = faiss.index_cpu_to_all_gpus(cpu_index)
    # perform the training
    clus.train(x, index)
    _, idxs = index.search(x, 1)

    return [int(n[0]) for n in idxs]
Example 22
def kmeans(data, k, nrestarts=10, niters=100):
    """
    Run k-means on the input data.
    """
    data = np.ascontiguousarray(data.cpu().numpy()).astype('float32')
    d = data.shape[1]

    clus = faiss.Clustering(d, k)
    clus.verbose = False
    clus.niter = niters
    clus.nredo = nrestarts
    clus.seed = defaults.seed
    clus.spherical = False

    index = faiss.IndexFlatL2(d)

    clus.train(data, index)
    centroids = faiss.vector_float_to_array(clus.centroids).reshape(k, d)
    centroids = torch.Tensor(centroids).to(defaults.device)

    return centroids
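A usage sketch for the function above (the data is hypothetical; defaults.device and defaults.seed come from the surrounding project's config object):

import numpy as np
import torch

data = torch.randn(5000, 128)
centroids = kmeans(data, k=50)  # (50, 128) tensor on defaults.device
# assign each point to its nearest centroid with a flat L2 index
index = faiss.IndexFlatL2(centroids.shape[1])
index.add(np.ascontiguousarray(centroids.cpu().numpy()))
_, assign = index.search(np.ascontiguousarray(data.numpy(), dtype=np.float32), 1)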
Example 23
def spherical_kmeans(data, k, nrestarts=10, niters=100):
    """
    Run spherical k-means on the input data.
    """
    data = np.ascontiguousarray(data.cpu().numpy()).astype('float32')
    data /= np.linalg.norm(data, axis=1)[:, np.newaxis]
    d = data.shape[1]

    clus = faiss.Clustering(d, k)
    clus.verbose = False
    clus.niter = niters
    clus.nredo = nrestarts
    clus.seed = defaults.seed
    clus.spherical = True

    index = faiss.IndexFlatIP(d)

    clus.train(data, index)
    centroids = faiss.vector_float_to_array(clus.centroids).reshape(k, d)
    centroids = torch.Tensor(centroids).to(defaults.device)

    return centroids / torch.norm(centroids, 2, 1).unsqueeze(1)
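And a matching sketch for the spherical variant: with unit-norm inputs, inner product against the returned unit-norm centroids is a cosine assignment (defaults again denotes the project's config object):

import torch
import torch.nn.functional as F

emb = F.normalize(torch.randn(2048, 128), dim=1)
centroids = spherical_kmeans(emb, k=100)  # rows have unit norm
assign = (emb.to(centroids.device) @ centroids.t()).argmax(dim=1)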
Example 24
def train_kmeans(x, k, ngpu):
    "Runs kmeans on one or several GPUs"
    d = x.shape[1]
    clus = faiss.Clustering(d, k)
    clus.verbose = True
    clus.niter = 20

    # otherwise the kmeans implementation sub-samples the training set
    clus.max_points_per_centroid = 10000000

    res = [faiss.StandardGpuResources() for i in range(ngpu)]

    flat_config = []
    for i in range(ngpu):
        cfg = faiss.GpuIndexFlatConfig()
        cfg.useFloat16 = False
        cfg.device = i
        flat_config.append(cfg)

    if ngpu == 1:
        index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0])
    else:
        indexes = [
            faiss.GpuIndexFlatL2(res[i], d, flat_config[i])
            for i in range(ngpu)
        ]
        index = faiss.IndexProxy()
        for sub_index in indexes:
            index.addIndex(sub_index)

    # perform the training
    clus.train(x, index)
    centroids = faiss.vector_float_to_array(clus.centroids)

    obj = faiss.vector_float_to_array(clus.obj)
    print "final objective: %.4g" % obj[-1]

    return centroids.reshape(k, d)
Example 25
def cluster(features, num_cluster):
    n_samples, dim = features.shape

    kmeans_clustering = faiss.Clustering(dim, num_cluster)
    kmeans_clustering.niter = 20
    kmeans_clustering.max_points_per_centroid = 1000000000

    gpu_resource = faiss.StandardGpuResources()
    gpu_flat = faiss.GpuIndexFlatConfig()
    gpu_flat.useFloat16 = False
    gpu_flat.device = 0

    gpu_distance_measure = faiss.GpuIndexFlatL2(gpu_resource, dim, gpu_flat)

    kmeans_clustering.train(features, gpu_distance_measure)
    _, cluster_idxs = gpu_distance_measure.search(features, 1)
    losses = faiss.vector_to_array(kmeans_clustering.obj)

    image_list = [[] for i in range(num_cluster)]
    for i in range(len(features)):
        image_list[cluster_idxs[i][0]].append(i)

    return image_list, losses[-1]
Example 26
    def nmi(self, embeddings, labels):
        if isinstance(embeddings, list):
            embeddings = np.concatenate(embeddings, axis=0)
            labels = np.concatenate(labels, axis=0)

        faiss_search_index = faiss.IndexFlatL2(embeddings.shape[-1])
        faiss_cluster_index = faiss.Clustering(embeddings.shape[-1],
                                               self.num_classes)
        faiss_cluster_index.niter = 20
        faiss_cluster_index.min_points_per_centroid = 1
        faiss_cluster_index.max_points_per_centroid = 1000000000
        #
        faiss_cluster_index.train(embeddings, faiss_search_index)
        embedding_centroids = faiss.vector_float_to_array(
            faiss_cluster_index.centroids).reshape(self.num_classes,
                                                   embeddings.shape[-1])
        #
        faiss_search_index = faiss.IndexFlatL2(embedding_centroids.shape[-1])
        faiss_search_index.add(embedding_centroids)
        _, centroids_cluster_labels = faiss_search_index.search(embeddings, 1)
        #
        NMI = metrics.cluster.normalized_mutual_info_score(
            centroids_cluster_labels.reshape(-1), labels.reshape(-1))
        #
        return [NMI]
Example 27
def run_kmeans(x, nmb_clusters, verbose=False):
    """Runs kmeans on 1 GPU.
    Args:
        x: data
        nmb_clusters (int): number of clusters
    Returns:
        list: ids of data in each cluster
    """
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 10
    clus.max_points_per_centroid = 10000000
    clus.verbose = True
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # perform the training
    clus.train(x, index)
    _, I = index.search(x, 1)
    centroids = np.array(faiss.vector_to_array(clus.centroids))
    centroids = np.reshape(centroids, (nmb_clusters, d))
    centd = pdist(centroids)
    centroids2d = TSNE(n_components=2).fit_transform(centroids)
    plt.scatter(centroids2d[:, 0], centroids2d[:, 1])
    axes = plt.gca()
    axes.set_xlim([-50, 50])
    axes.set_ylim([-50, 50])
    losses = faiss.vector_to_array(clus.obj)
    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    return [int(n[0]) for n in I], losses[-1], plt, np.mean(centd)
Example 28
def run_kmeans(x, nmb_clusters, verbose=False):
    """Runs kmeans on 1 GPU.
    Args:
        x: data
        nmb_clusters (int): number of clusters
    Returns:
        list: ids of data in each cluster
    """
    n_data, d = x.shape

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.nredo = 10

    clus.max_points_per_centroid = int(3 * (n_data / nmb_clusters))  #10000000
    res = faiss.StandardGpuResources()
    flat_config = faiss.GpuIndexFlatConfig()
    flat_config.useFloat16 = False
    flat_config.device = 0
    index = faiss.GpuIndexFlatL2(res, d, flat_config)

    # import pdb; pdb.set_trace()

    # perform the training
    clus.train(x, index)

    D, I = index.search(x, 1)
    losses = faiss.vector_to_array(clus.obj)

    if verbose:
        print('k-means loss evolution: {0}'.format(losses))

    # import pdb; pdb.set_trace()

    return ([int(n[0]) for n in I], [float(n[0]) for n in D],
            losses[-1], clus, index, flat_config)
Example 29
    def run_kmeans(x, nmb_clusters):
        n_data, d = x.shape

        # faiss implementation of k-means
        clus = faiss.Clustering(d, nmb_clusters)

        # Change faiss seed at each k-means so that the randomly picked
        # initialization centroids do not correspond to the same feature ids
        # from an epoch to another.
        clus.seed = np.random.randint(1234)

        clus.niter = 20
        clus.max_points_per_centroid = 10000000
        res = faiss.StandardGpuResources()
        flat_config = faiss.GpuIndexFlatConfig()
        flat_config.useFloat16 = False
        flat_config.device = 0
        index = faiss.GpuIndexFlatL2(res, d, flat_config)

        # perform the training
        clus.train(x, index)
        _, I = index.search(x, 1)

        return [int(n[0]) for n in I]
Example 30
def run_kmeans(x, nmb_clusters):
    """
    Args:
        x: data
        nmb_clusters (int): number of clusters
    Returns:
        list: ids of data in each cluster
    """
    n_data, d = x.shape
    logging.info("running k-means clustering with k=%d" % nmb_clusters)
    logging.info("embedding dimensionality is %d" % d)

    # faiss implementation of k-means
    clus = faiss.Clustering(d, nmb_clusters)
    clus.niter = 20
    clus.max_points_per_centroid = 10000000
    index = faiss.IndexFlatL2(d)
    if faiss.get_num_gpus() > 0:
        index = faiss.index_cpu_to_all_gpus(index)
    # perform the training
    clus.train(x, index)
    _, idxs = index.search(x, 1)

    return [int(n[0]) for n in idxs]