Example #1
    def split_highest_sse_node(self):
        highest_sse_node = self._find_highest_sse_node(self.split_dataset_loader_gen())

        leaf_id = highest_sse_node.node_id
        node_data = None
        for batch_data in self.split_dataset_loader_gen():
            labels_np, _, node_id_label_map = self.leaf_prediction_np(batch_data)
            node_label_id = node_id_label_map[leaf_id]
            node_data_batch = batch_data.data.cpu().numpy()[labels_np == node_label_id]
            if node_data is None:
                node_data = node_data_batch
            else:
                node_data = np.concatenate([node_data, node_data_batch], 0)
        init_centers = k_means(node_data, 2, n_init=20)[0]

        new_left_leaf = ECTnode.new_leaf_node(self.next_free_node_id, self, init_centers[0, :])
        new_right_leaf = ECTnode.new_leaf_node(self.next_free_node_id + 1, self, init_centers[1, :])
        highest_sse_node.split_node(self.n_splits + 1, new_left_leaf, new_right_leaf)

        self.next_free_node_id += 2
        self.n_splits += 1

        self.add_module(f"node_{new_left_leaf.node_id}", new_left_leaf)
        self.add_module(f"node_{new_right_leaf.node_id}", new_right_leaf)

        self.leaf_nodes.remove(highest_sse_node)
        self.leaf_nodes.append(new_left_leaf)
        self.leaf_nodes.append(new_right_leaf)
        self._update_leaf_node_mappings()

        self.optimizer.add_param_group({'params': new_left_leaf.parameters()})
        self.optimizer.add_param_group({'params': new_right_leaf.parameters()})

        logger.info(f"new plit now we have {self.n_leaf_nodes} leaves")
    def runSpectralEmbedding(self, X, n_components=2, n_clusters=2,
                             k_means_=False):

        # Create distance matrix
        self.create_distance_matrix(X)

        # Run spectral embedding for n_components
        embedding = SpectralEmbedding(n_components=n_components,
                                      affinity='precomputed',
                                      random_state=42,
                                      n_jobs=-1).fit(self.adjacencyMatrix)
        # Alternative way
        # embedding_otherapp = spectral_embedding(self.adjacencyMatrix,
        # n_components=n_components, norm_laplacian=True, random_state=42,
        # drop_first=True)

        # Run k means if set to True
        if k_means_:
            _, kmeans_labels, _ = k_means(X=embedding.embedding_,
                                          n_clusters=n_clusters,
                                          random_state=42, n_init=10)

            # Alternative embedding - More freedom, but slower
            # _, kmeans_labels2, _ = k_means(X=embedding_otherapp, n_clusters=
            # n_clusters, random_state=42, n_init=10)

            return kmeans_labels, embedding.embedding_
        else:
            return embedding.embedding_
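For reference, a minimal self-contained sketch of the same SpectralEmbedding-then-k_means pipeline, using a precomputed RBF affinity on synthetic two-moons data; the data set, gamma and cluster count are placeholder assumptions, not taken from the example above.

import numpy as np
from sklearn.cluster import k_means
from sklearn.datasets import make_moons
from sklearn.manifold import SpectralEmbedding
from sklearn.metrics.pairwise import rbf_kernel

# Synthetic data and a symmetric RBF affinity matrix
X, _ = make_moons(n_samples=300, noise=0.05, random_state=42)
affinity = rbf_kernel(X, gamma=20.0)

# Spectral embedding of the precomputed affinity, followed by k-means
embedding = SpectralEmbedding(n_components=2, affinity='precomputed',
                              random_state=42).fit(affinity)
_, kmeans_labels, _ = k_means(embedding.embedding_, n_clusters=2,
                              random_state=42, n_init=10)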
Example #3
def my_uniteigenvector_zeroeigenvalue_cluster(k):
    G = nx.read_gpickle('data/undirected(fortest).gpickle')
    A = nx.adjacency_matrix(G, nodelist=G.nodes()[:-1], weight='weight')
    #A=A.toarray()
    #np.fill_diagonal(A,0.01) #add node with its own weight to itself
    #Tri = np.diag(np.sum(A, axis=1))
    #L = Tri - A
    #Tri_1 = np.diag(np.reciprocal(np.sqrt(Tri).diagonal()))
    #Ls = Tri_1.dot(L).dot(Tri_1)

    Ls, dd = graph_laplacian(A, normed=True, return_diag=True)

    eigenvalue_n, eigenvector_n = eigsh(Ls * (-1), k=k,
                                        sigma=1.0, which='LM',
                                        tol=0.0)

    #for ic,vl in enumerate(eigenvalue_n):
    #    if abs(vl-0)<=1e-10:
    #        eigenvector_n[:, ic] = np.full(len(G.nodes()[:-1]),1.0 / math.sqrt(len(G.nodes()[:-1]))) # zero eigenvalue

    eigenvector_n[:, -1] = np.full(len(G.nodes()[:-1]), 1.0 / math.sqrt(len(G.nodes()[:-1])))  # zero eigenvalue

    for ir, n in enumerate(eigenvector_n):
        eigenvector_n[ir] = n / float(np.linalg.norm(n))  # normalize each row to a unit vector

    _, labels, _ = k_means(eigenvector_n, k, random_state=None,
                           n_init=100)
    return labels
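Below is a self-contained sketch of the same normalized-Laplacian pipeline built only from public SciPy/scikit-learn calls; the karate-club graph is a stand-in, since the original gpickle file is not available, and k=2 is an arbitrary choice.

import networkx as nx
import numpy as np
from scipy.sparse.csgraph import laplacian
from scipy.sparse.linalg import eigsh
from sklearn.cluster import k_means

k = 2
G = nx.karate_club_graph()                # stand-in graph
A = nx.adjacency_matrix(G).astype(float)

# Normalized graph Laplacian and its k smallest eigenvectors
# (shift-invert on -L, the same trick sklearn's spectral_embedding uses)
Ls, dd = laplacian(A, normed=True, return_diag=True)
eigenvalue_n, eigenvector_n = eigsh(Ls * (-1.0), k=k, sigma=1.0, which='LM')

# Row-normalize the eigenvectors and cluster them with k-means
eigenvector_n /= np.linalg.norm(eigenvector_n, axis=1, keepdims=True)
_, labels, _ = k_means(eigenvector_n, k, random_state=0, n_init=10)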
Example #4
    def spectral_clustering_sg(self,
                               affinity,
                               max_clusters=8,
                               eigen_solver=None,
                               random_state=None,
                               n_init=10,
                               eigen_tol=0.0,
                               assign_labels='kmeans'):

        if assign_labels not in ('kmeans', 'discretize'):
            raise ValueError("The 'assign_labels' parameter should be "
                             "'kmeans' or 'discretize', but '%s' was given" %
                             assign_labels)

        random_state = check_random_state(random_state)
        n_components = max_clusters
        maps, lambdas = self.spectral_embedding(affinity,
                                                n_components=n_components,
                                                eigen_solver=eigen_solver,
                                                random_state=random_state,
                                                eigen_tol=eigen_tol,
                                                drop_first=False)

        # determine n_clusters by the spectral gap HERE!!
        n_clusters = self.estimate_num_of_clusters(lambdas)
        if assign_labels == 'kmeans':
            _, labels, _ = k_means(maps,
                                   n_clusters,
                                   random_state=0,
                                   n_init=n_init)
        else:
            labels = discretize(maps, random_state=random_state)
        return labels
Example #5
def k_means_label(pointcloud,
                  n_clusters,
                  init,
                  precompute_distances,
                  n_init=10,
                  max_iter=300,
                  tol=1e-4,
                  random_state=None,
                  n_jobs=1,
                  algorithm="auto"):
    """
    Returns
    -------
    labels : ndarray of shape (n_samples,)
        Cluster labels only (k_means is called with return_n_iter=False,
        so just the second element of its result is returned).
    """
    res = k_means(pointcloud,
                  n_clusters,
                  init=init,
                  precompute_distances=precompute_distances,
                  n_init=n_init,
                  max_iter=max_iter,
                  verbose=False,
                  tol=tol,
                  random_state=random_state,
                  copy_x=True,
                  n_jobs=n_jobs,
                  algorithm=algorithm,
                  return_n_iter=False)

    return res[1]
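A hypothetical usage of the wrapper above on random data. Note that precompute_distances and n_jobs were removed from sklearn.cluster.k_means in newer releases, so this assumes an older scikit-learn in which the wrapper still runs; the point cloud and parameters are placeholders.

import numpy as np

pointcloud = np.random.RandomState(0).rand(500, 3)
labels = k_means_label(pointcloud, n_clusters=4, init='k-means++',
                       precompute_distances='auto', n_init=10)
print(labels.shape)  # (500,)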
Example #6
 def _create_root_node_centers(self):
     node_data = None
     for batch_data in self.split_dataset_loader_gen():
         if node_data is None:
             node_data = batch_data.detach().cpu().numpy()
         else:
             node_data = np.concatenate([node_data, batch_data.detach().cpu().numpy()], 0)
     return k_means(node_data, 2, n_init=20)[0]
Example #7
def run_experiment(ae_model_path):
    logger.info(
        f"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(
        f"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(f"Working now on {ae_model_path.name}")
    logger.info(
        f"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(
        f"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    new_seed = random.randint(0, 1000)
    logger.info(f"Seed value for this is: {new_seed}")
    set_random_seed(new_seed)

    ae_module = stacked_ae(pt_data.shape[1], [500, 500, 2000, 10],
                           weight_initalizer=torch.nn.init.xavier_normal_,
                           activation_fn=lambda x: F.relu(x),
                           loss_fn=None,
                           optimizer_fn=None)

    model_data = torch.load(ae_model_path, map_location='cpu')
    ae_module.load_state_dict(model_data)
    ae_module = ae_module.cuda()

    # Get embedded data
    embedded_data = None
    for batch_data in torch.utils.data.DataLoader(pt_data,
                                                  batch_size=256,
                                                  shuffle=False):
        embedded_batch_np = ae_module.forward(
            batch_data.cuda())[0].detach().cpu().numpy()
        if embedded_data is None:
            embedded_data = embedded_batch_np
        else:
            embedded_data = np.concatenate([embedded_data, embedded_batch_np],
                                           0)
    del ae_module

    # Perform k-means
    k_means_labels = k_means(embedded_data, n_clusters, n_init=20)[1]

    k_means_nmi_value = nmi(gold_labels,
                            k_means_labels,
                            average_method='arithmetic')
    k_means_acc_value = cluster_acc(gold_labels, k_means_labels)[0]

    result_file = Path(f"{result_dir}/results_ae_kmeans_{dataset_name}.txt")
    result_file_exists = result_file.exists()
    f = open(result_file, "a+")
    if not result_file_exists:
        f.write("#\"ae_model_name\"\t\"NMI\"\t\"ACC\"\n")
    f.write(
        f"{ae_model_path.name}\t{k_means_nmi_value}\t{k_means_acc_value}\n")
    f.close()
Example #8
 def Final_Result(self):
     self.Scablekmeans_ProcessingCenter()
     try:
         return kmean.k_means(self.matrix,
                              self.k,
                              init=numpy.array(self.process_center),
                              n_init=1)
     except:
         print("최종 계산된 Center의 갯수가 K 보다 작습니다...", "\n K 값 : ", self.k,
               " /  Center 갯수 : ", len(self.process_center))
Example #9
    def fit(self, X, y=None):
        """Creates an affinity matrix for X using the selected affinity,
        then applies spectral clustering to this affinity matrix.
        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            OR, if affinity==`precomputed`, a precomputed affinity
            matrix of shape (n_samples, n_samples)
        """

        # this class is not tested with sparse matrix.
        # any contribution (report, coding) is welcome!
        X = check_array(X,
                        accept_sparse=['csr', 'csc', 'coo'],
                        dtype=np.float64)

        ell = self.n_clusters + 1  # +1 for drop_first, x2 for zero suppression in frequent_direction.
        k = self.n_buffer_rows
        if self.affinity == 'rbf':
            self.affinity_matrix_, dd = laplacian_sketch_rbf_kernel(
                X, ell, k, normed=self.normed, gamma=self.gamma)
        elif self.affinity == 'cosine':
            self.affinity_matrix_, dd = laplacian_sketch_cosine_similarity(
                X, ell, k, normed=self.normed)
        else:
            params = self.kernel_params
            if params is None:
                params = {}
            if callable(self.affinity):
                self.affinity_matrix_, dd = laplacian_sketch(
                    X, ell, k, False, self.normed, self.affinity, params)
            else:
                warnings.warn("%s is unknown kernel" % self.affinity)

        random_state = check_random_state(self.random_state)

        # spectral embedding post process.
        maps = spectral_embedding_imitation(self.affinity_matrix_,
                                            dd,
                                            n_components=self.n_clusters,
                                            random_state=random_state,
                                            drop_first=False)

        if self.assign_labels == 'kmeans':
            _, self.labels_, _ = k_means(maps,
                                         self.n_clusters,
                                         random_state=random_state,
                                         n_init=self.n_init)
        else:
            self.labels_ = discretize(maps, random_state=random_state)
Example #10
    def cluster(self, affinities):
        laplacian, diagonal = graphutil.graph_laplacian(affinities,
                                                        normed=True,
                                                        return_diag=True)

        self.embedding = self.embed(laplacian, diagonal, self.k, self.tol)

        centroid_vals, self.labels, _ = k_means(self.embedding,
                                                self.k,
                                                random_state=self.rand,
                                                n_init=self.n_init,
                                                init=self.init_centroids)

        self.centroids = []
        for c in centroid_vals:
            self.centroids.append(
                np.argmin([np.sum((c - e)**2) for e in self.embedding]))

        return self.labels
Example #11
def run_experiment(ae_model_path):
    logger.info(
        f"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(
        f"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(f"Working now on {ae_model_path.name}")
    logger.info(
        f"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    logger.info(
        f"++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
    )
    new_seed = random.randint(0, 1000)
    logger.info(f"Seed value for this is: {new_seed}")
    set_random_seed(new_seed)
    train = torch.utils.data.TensorDataset(pt_data)
    train_loader = torch.utils.data.DataLoader(train,
                                               batch_size=256,
                                               shuffle=True)

    n_features = pt_data.shape[1]
    # Same loss as in the DEC implementation
    ae_reconstruction_loss_fn = lambda x, y: torch.mean((x - y)**2)
    ae_module = stacked_ae(n_features, [500, 500, 2000, 10],
                           weight_initalizer=torch.nn.init.xavier_normal_,
                           activation_fn=lambda x: F.relu(x),
                           loss_fn=None,
                           optimizer_fn=None)

    model_data = torch.load(ae_model_path, map_location='cpu')
    ae_module.load_state_dict(model_data)
    ae_module = ae_module.cuda()

    node_data = None
    for batch_data in torch.utils.data.DataLoader(pt_init_sample,
                                                  batch_size=256,
                                                  shuffle=True):
        embedded_batch_np = ae_module.forward(
            batch_data.cuda())[0].detach().cpu().numpy()
        if node_data is None:
            node_data = embedded_batch_np
        else:
            node_data = np.concatenate([node_data, embedded_batch_np], 0)
    init_centers = k_means(node_data, n_clusters, n_init=20)[0]

    # Initialize cluster centers based on a smaller sample
    cluster_module = DEC(init_centers).cuda()
    optimizer = torch.optim.Adam(list(ae_module.parameters()) +
                                 list(cluster_module.parameters()),
                                 lr=0.001)

    def evaluate(train_round_idx, ae_module, cluster_module):
        test_loader = torch.utils.data.DataLoader(
            torch.utils.data.TensorDataset(pt_data), batch_size=256)

        pred_labels = np.zeros(pt_data.shape[0], dtype=int)
        index = 0
        n_batches = 0
        for batch_data in test_loader:
            batch_data = batch_data[0].cuda()
            n_batches += 1
            batch_size = batch_data.shape[0]
            embedded_data, reconstructed_data = ae_module.forward(batch_data)
            labels = cluster_module.prediction_hard_np(embedded_data)
            pred_labels[index:index + batch_size] = labels
            index = index + batch_size
        pred_tree = dendrogram_purity_tree_from_clusters(
            cluster_module, pred_labels, 'single')
        pred_tree2 = dendrogram_purity_tree_from_clusters(
            cluster_module, pred_labels, 'complete')
        lp = leaf_purity(pred_tree, gold_labels)
        leaf_purity_value = f"{lp[0]:1.3}\t({lp[1]:1.3})"
        dp_value_single = dendrogram_purity(pred_tree, gold_labels)
        dp_value_complete = dendrogram_purity(pred_tree2, gold_labels)
        logger.info(
            f"{train_round_idx} Evaluation:  leaf_purity: {leaf_purity_value}, purity_single: {dp_value_single}, purity_complete: {dp_value_complete}"
        )
        return dp_value_single, dp_value_complete, leaf_purity_value

    evaluate("init", ae_module, cluster_module)

    n_rounds = 40000
    train_round_idx = 0
    while True:  # each iteration is equal to an epoch
        for batch_data in train_loader:
            train_round_idx += 1
            if train_round_idx > n_rounds:
                break
            batch_data = batch_data[0].cuda()

            embedded_data, reconstructed_data = ae_module.forward(batch_data)
            ae_loss = ae_reconstruction_loss_fn(batch_data, reconstructed_data)

            cluster_loss = cluster_module.loss_dec_compression(embedded_data)
            loss = cluster_loss + 0.1 * ae_loss
            if train_round_idx == 1 or train_round_idx % 100 == 0:
                logger.info(
                    f"{train_round_idx} - loss in this batch: cluster_loss:{cluster_loss.item()} "
                    f"ae_loss:{ae_loss.item()} total_loss: {ae_loss.item() + cluster_loss.item()}"
                )

            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if train_round_idx % 2000 == 0:
                evaluate(train_round_idx, ae_module, cluster_module)
        else:  # for-else: this branch runs only if the inner loop did not break, so continue the outer while loop; otherwise break out of it as well
            continue
        break  # Break while loop here

    # Write last evaluation

    dp_value_single, dp_value_complete, leaf_purity_value = evaluate(
        "", ae_module, cluster_module)
    result_file = Path(result_dir, f"results_{dataset_name}.txt")
    result_file_exists = result_file.exists()
    f = open(result_file, "a+")
    if not result_file_exists:
        f.write(
            "#\"ae_model_name\"\t\"Dendrogram_Purity Single\"\t\"Dendrogram_Purity Complete\"\t\"Leaf_Purity\t(Std)\"\n"
        )
    f.write(
        f"{ae_model_path.name}\t{dp_value_single}\t{dp_value_complete}\t{leaf_purity_value}\n"
    )
    f.close()
Example #12
def euclidean_distance(vec1, vec2):  # header reconstructed for readability; the original function name is not shown in this truncated snippet
    dist = numpy.sqrt(numpy.sum(numpy.square(vec1 - vec2)))
    return dist
K=65536
print(K)
with open('ox5kdelf-full', 'rb') as file:
    b = pickle.load(file)
with open('ox5kdelfquery-full', 'rb') as file:
    a = pickle.load(file)
print(a[0]['filename'],b[30]['filename'],b[69]['filename'],b[75]['filename'])
c=[]
print(type(b[0]['descriptor_np_list'][0]))
for i in range(len(b)):
    for j in range(len(b[i]['descriptor_np_list'])):
        c.append(b[i]['descriptor_np_list'][j])
c=np.array(c)
codewords, _, _, _ = k_means(c, K, max_iter=20, return_n_iter=True)
code=[]
query=[]
'''
i=0
gd=np.zeros((K,40), dtype=np.float32)
for j in range(len(a[i]['descriptor_np_list'])):
    x=a[i]['descriptor_np_list'][j].reshape(1,40)
    tmp,_=vq(x,codewords)
    
    gd[tmp]+=x-codewords[tmp]
gd0=gd.reshape(1,-1)
print(gd0)

i=30
gd=np.zeros((K,40), dtype=np.float32)
Example #13
import torch.utils.data
from sklearn.cluster.k_means_ import k_means
from ect.methods.DEC import DEC
from scripts.Config import *
from scripts.projection_problem.common_stuff import *

ae_module, pt_data, gold_labels, _, train_loader, ae_reconstruction_loss_fn = init_data_and_ae(
)

embedded_data_np = ae_module.encode(pt_data).detach().cpu().numpy()

dec_module = DEC(k_means(embedded_data_np, 2)[0]).cuda()

optimizer = torch.optim.Adam(list(ae_module.parameters()) +
                             list(dec_module.parameters()),
                             lr=0.001)

gamma = 0.1  # Put 0.0 here for pure DEC

n_rounds = 2000
train_round_idx = 0
while True:  # each iteration is equal to an epoch
    for batch_data in train_loader:
        train_round_idx += 1
        if train_round_idx > n_rounds:
            break
        batch_data = batch_data[0]

        embedded_data, reconstructed_data = ae_module.forward(batch_data)
        ae_loss = ae_reconstruction_loss_fn(batch_data, reconstructed_data)
Example #14
				# print descrs_for_vocab.shape

				# result = []
				# for x in xrange(0,30):
				# 	print x
				# 	num_of_cluster = 1+20*x
				# 	_, _, inertia_  = k_means(descrs_for_vocab, num_of_cluster)
				# 	result.append([num_of_cluster, inertia_])

				# import matplotlib.pyplot as plt
				# plt.plot(*zip(*result))
				# plt.show()

				print "clustering sift features to form vocabulary"
				print datetime.now()
				vocab, _, _ = k_means(descrs_for_vocab, NUM_OF_WORD_FOR_VOCAB, verbose=True)
				savemat(join(result_dir, "vocab.mat"),{"vocab":vocab})
			
			else:
				vocab = loadmat(join(result_dir, "vocab.mat"))['vocab']
				print vocab.shape



			# extract sift features, dowmsample if needed, convert to BOW
			# 1000 sift features * 128dim * 2byte -> 4 images per MB ->  4000 images per GB
			if not isfile(join(result_dir, "train_bow.mat")):
				print "computing bag of word representation for train images. This may take a while, but the result will be saved for future usage"
				train_image_path = []
				train_image_classes = []
				query_image_path = []
Example #15
def fit(self, X, y=None, sample_weight=None):
    """Compute k-means clustering.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Training instances to cluster. It must be noted that the data
        will be converted to C ordering, which will cause a memory
        copy if the given data is not C-contiguous.

    y : Ignored
        not used, present here for API consistency by convention.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None)

    """
    if self.n_init <= 0:
        raise ValueError("Invalid number of initializations."
                         " n_init=%d must be bigger than zero." % self.n_init)

    random_state = check_random_state(self.random_state)

    if self.max_iter <= 0:
        raise ValueError('Number of iterations should be a positive number,'
                         ' got %d instead' % self.max_iter)

    if self.precompute_distances == 'auto':
        precompute_distances = False
    elif isinstance(self.precompute_distances, bool):
        precompute_distances = self.precompute_distances
    else:
        raise ValueError("precompute_distances should be 'auto' or True/False"
                         ", but a value of %r was passed" %
                         self.precompute_distances)

    # avoid forcing order when copy_x=False
    order = "C" if self.copy_x else None
    X = check_array(X,
                    accept_sparse='csr',
                    dtype=[np.float64, np.float32],
                    order=order,
                    copy=self.copy_x)

    daal_ready = not sp.issparse(X) and not precompute_distances
    daal_ready = daal_ready and hasattr(X, '__array__')

    if daal_ready:
        X_len = _num_samples(X)
        daal_ready = (self.n_clusters <= X_len)
        if daal_ready and sample_weight is not None:
            sample_weight = np.asarray(sample_weight)
            daal_ready = (sample_weight.shape[0] == X_len) and (np.allclose(
                sample_weight, np.ones_like(sample_weight)))

    if not daal_ready:
        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
            k_means(
                X, n_clusters=self.n_clusters, sample_weight=sample_weight, init=self.init,
                n_init=self.n_init, max_iter=self.max_iter, verbose=self.verbose,
                precompute_distances=precompute_distances,
                tol=self.tol, random_state=random_state, copy_x=self.copy_x,
                n_jobs=self.n_jobs, algorithm=self.algorithm,
                return_n_iter=True)
    else:
        X = check_array(X, dtype=[np.float64, np.float32])
        self.cluster_centers_, self.labels_, self.inertia_, self.n_iter_ = \
            _daal4py_k_means_dense(
                X, self.n_clusters, self.max_iter, self.tol, self.init, self.n_init,
                random_state)
    return self
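For orientation, a minimal sketch of the estimator-level call this fit method serves, shown with stock scikit-learn's KMeans as a stand-in for the daal4py-patched class; whether the DAAL branch or the plain k_means branch above is taken depends on sparsity, precompute_distances and the sample weights.

import numpy as np
from sklearn.cluster import KMeans

X = np.random.RandomState(0).rand(1000, 8)   # dense, C-contiguous data
km = KMeans(n_clusters=5, n_init=10, random_state=0).fit(X)
print(km.inertia_, km.n_iter_)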
Example #16
				# print descrs_for_vocab.shape

				# result = []
				# for x in xrange(0,30):
				# 	print x
				# 	num_of_cluster = 1+20*x
				# 	_, _, inertia_  = k_means(descrs_for_vocab, num_of_cluster)
				# 	result.append([num_of_cluster, inertia_])

				# import matplotlib.pyplot as plt
				# plt.plot(*zip(*result))
				# plt.show()

				print "clustering sift features to form vocabulary"
				print datetime.now()
				vocab, _, _ = k_means(descrs_for_vocab, NUM_OF_WORD_FOR_VOCAB)
				savemat(join(result_dir, "vocab.mat"),{"vocab":vocab})
			
			else:
				vocab = loadmat(join(result_dir, "vocab.mat"))['vocab']
				print vocab.shape



			# extract sift features, dowmsample if needed, convert to BOW
			# 1000 sift features * 128dim * 2byte -> 4 images per MB ->  4000 images per GB
			if not isfile(join(result_dir, "train_bow.mat")):
				print "computing bag of word representation for train images. This may take a while, but the result will be saved for future usage"
				train_image_path = []
				train_image_classes = []
				class_mapping = []
Example #17
def cloudstering(dendrogram, catalog, criteria, user_k, user_ams, user_scalpars, user_iter, 
    save_isol_leaves, save_clust_leaves, save_branches, blind, rms, s2nlim, locscal):

    """
    SCIMES main function. It collects the parents/children
    of all structures within the dendrogram, together with
    their properties. It calls the affinity matrix-related
    functions (for creation, rescaling, cluster counting),
    and it runs the actual spectral clustering routine several
    times, computing the silhouette of the current configuration
    each time. Input parameters are passed in by the
    SpectralCloudstering class.
    
    Parameters
    -----------

    dendrogram: 'astrodendro.dendrogram.Dendrogram' instance
        The dendrogram to clusterize.

    catalog: 'astropy.table.table.Table' instance
        A catalog containing all properties of the dendrogram
        structures. Generally generated with ppv_catalog module.

    header: 'astropy.io.fits.header.Header' instance
        The header of the fits data the dendrogram was 
        generated from. Necessary to obtain the assignment cubes.

    criteria: list of strings
        Clustering criteria referred to the structure properties
        in the catalog (default ['volume', 'luminosity']).

    user_k: int
        The expected number of clusters, if not provided
        it will be guessed automatically through the eigenvalues
        of the unsmoothed affinity matrix.

    user_ams: numpy array
        User-provided affinity matrix. If this is not
        furnished, it is automatically generated from the
        volume and/or luminosity criteria.

    user_scalpars: list of floats
        User-provided scaling parameters to smooth the
        affinity matrices.

    user_iter: int
        User-provided number of k-means iterations.
    
    save_isol_leaves: bool
        Consider the isolated leaves (without parent) 
        as individual 'clusters'. Useful for low
        resolution data where the beam size
        corresponds to the size of a Giant
        Molecular Cloud.

    save_clust_leaves: bool
        Consider unclustered leaves as individual
        'clusters'. This keyword will not include
        the isolated leaves without parents.

    save_all_leaves: bool
        Trigger both save_isol_leaves and
        save_clust_leaves.

    save_branches: bool
        Retain all isolated branches usually discarded
        by the cluster analysis.

    save_all: bool
        Trigger all save_isol_leaves, 
        save_clust_leaves, and save_branches.        
    
    rms: int or float
        Noise level of the observation. Necessary to
        calculate the scaling parameter above a certain
        signal-to-noise ratio.

    s2nlim: int or float
        Signal-to-noise limit above which the
        scaling parameter is calculated. Needed
        only if rms is not np.nan.

    blind: bool
        Show the affinity matrices. 
        Matplotlib required.

    locscal: bool
        Smooth the affinity matrices using a local
        scaling technique.


    Return
    -------

    clusts: list
        The dendrogram branch indexes corresponding to the
        identified clusters

    catalog: 'astropy.table.table.Table' instance
        The input catalog updated with dendrogram structure
        parent, ancestor, number of leaves, and type 
        ('T', trunks or branches without parent; 'B', branches
        with parent; 'L', leaves). 

    AMs: numpy array
        The affinity matrices calculated by the algorithm
    
    escalpars: list
        Estimated scaling parameters for the different
        affinity matrices
    
    silhouette: float
        Silhouette of the best cluster configuration

    """

    # Collecting all connectivity and other information into more handy lists
    all_structures_idx = np.arange(len(catalog[criteria[0]].data), dtype='int')

    all_levels = []
    brc_levels = []

    all_leav_names = []
    all_leav_idx = []

    all_brc_names = []
    all_brc_idx = []

    all_parents = []
    all_children = []

    all_struct_names = []
    all_ancestors = []

    all_struct_ancestors = []
    all_struct_parents = []
    all_struct_types = []
    nleaves = []

    trunk_brs_idx = []
    two_clust_idx = []    
    mul_leav_idx = []

    s2ns = []

    for structure_idx in all_structures_idx:

        s = dendrogram[structure_idx]
        all_levels.append(s.level)
        
        s2ns.append(dendrogram[structure_idx].height/rms)

        all_struct_names.append(str(s.idx))
        all_struct_ancestors.append(s.ancestor.idx)
        if s.parent:
            all_struct_parents.append(s.parent.idx)
        else:
            all_struct_parents.append(-1)
        nleaves.append(len(s.sorted_leaves()))

        ancestors = []
        anc = s.parent
        while anc != None:

            ancestors.append(anc.idx)
            anc = anc.parent

        ancestors.append(s.idx)
        all_ancestors.append(ancestors)

        # If structure is a leaf find all the parents
        if s.is_leaf and s.parent != None:

            par = s.parent
            all_leav_names.append(str(s.idx))

            parents = []
            
            while par != None:

                parents.append(par.idx)
                par = par.parent
                
            parents.append(len(catalog[criteria[0]].data)) # This is the trunk!
            all_parents.append(parents)
            
        # If the structure is a branch, find all its leaves
        if s.is_branch:

            brc_levels.append(s.level)
            all_brc_idx.append(s.idx)
            all_brc_names.append(str(s.idx))
            
            children = []
            
            for leaf in s.sorted_leaves():
                children.append(leaf.idx)
                
            all_children.append(children)

            # Trunk branches
            if s.parent == None:

                trunk_brs_idx.append(s.idx)
                all_leav_idx = all_leav_idx + children

                if s.children[0].is_branch or s.children[1].is_branch:
                    mul_leav_idx = mul_leav_idx + children
                else:
                    two_clust_idx.append(s.idx)

                all_struct_types.append('T')

            else:

                all_struct_types.append('B')
        
        else:

            all_struct_types.append('L')


    two_clust_idx = np.unique(two_clust_idx).tolist()
    
    dict_parents = dict(zip(all_leav_names,all_parents))            
    dict_children = dict(zip(all_brc_names,all_children))    
    dict_ancestors = dict(zip(all_struct_names,all_ancestors))

    all_levels.append(-1)
    all_levels = np.asarray(all_levels)

    # Retrieving the needed properties from the catalog
    # and adding fake "trunk" properties   
    props = []
    for crit in criteria:
        prop = catalog[crit].data.tolist()
        tprop = sum(catalog[crit].data[trunk_brs_idx])
        prop.append(tprop)
        props.append(prop)
    
    s2ns.append(1)
    props.append(s2ns)


    # Generating affinity matrices if not provided
    if user_ams is None:

        AMs = aff_matrix(len(all_leav_idx), len(catalog[criteria[0]].data), \
            all_leav_idx, all_brc_idx, brc_levels, dict_children, props)

        if blind == False:

            # Showing all affinity matrices
            for i, crit in enumerate(criteria):

                plt.matshow(AMs[i,:,:])
                plt.title('"'+crit+'" affinity matrix', fontsize = 'medium')
                plt.xlabel('leaf index')
                plt.ylabel('leaf index')    
                plt.colorbar()
        
    else:

        AMs = user_ams


    S2Nmat = AMs[-1,:,:]
    AMs = AMs[:-1,:,:]

    # Check if the affinity matrix has more than 2 elements
    # otherwise return everything as clusters ("save_all").
    if AMs.shape[1] <= 2:

        print("--- Not necessary to cluster. 'save_all' keyword triggered")

        all_leaves = []
        for leaf in dendrogram.leaves:
            all_leaves.append(leaf.idx)

        clusts = all_leaves

        return clusts, AMs
        
                
    # Check whether the affinity matrix scaling parameters
    # are provided by the user; if so, use them, otherwise
    # calculate them

    """
    scpars = np.zeros(len(criteria))
    
    if user_scalpars is not None:
        print("- Using user-provided scaling parameters")
        user_scalpars = np.asarray(user_scalpars)
        scpars[0:len(user_scalpars)] = user_scalpars
    """
       
    scpars = np.array(user_scalpars)         

    print("- Start spectral clustering")

    # Selecting the criteria and merging the matrices    
    escalpars = []
    AM = np.ones(AMs[0,:,:].shape)
    for i, crit in enumerate(criteria):

        print("-- Rescaling %s matrix" % crit)
        AMc, sigma = mat_smooth(AMs[i,:,:], S2Nmat, s2nlim = s2nlim, 
            scalpar = scpars[i], lscal = locscal)        
        AM = AM*AMc
        escalpars.append(sigma)
            
    
    # Making the reduced affinity matrices
    mul_leav_mat = []
    for mli in mul_leav_idx:
        mul_leav_mat.append(all_leav_idx.index(mli))

    mul_leav_mat = np.asarray(mul_leav_mat)
    rAM = AM[mul_leav_mat,:]
    rAM = rAM[:,mul_leav_mat]

    if blind == False:
            
        # Showing the final affinity matrix
        plt.matshow(AM)
        plt.colorbar()
        plt.title('Final Affinity Matrix')
        plt.xlabel('leaf index')
        plt.ylabel('leaf index')

      
    # Guessing the number of clusters
    # if not provided

    if user_k == 0:   
        kg = guessk(rAM)
    else:
        kg = user_k-len(two_clust_idx)

    print("-- Guessed number of clusters = %i" % (kg+len(two_clust_idx)))
    
    if kg > 1:

        print("-- Number of k-means iteration: %i" % user_iter)

        # Find the best cluster number
        sils = []

        min_ks = max(2,kg-15)
        max_ks = min(kg+15,rAM.shape[0]-1)
                
        clust_configs = []

        for ks in range(min_ks,max_ks):

            try:
                
                evecs = spectral_embedding(rAM, n_components=ks,
                                        eigen_solver='arpack',
                                        random_state=222,
                                        eigen_tol=0.0, drop_first=False)
                _, all_clusters, _ = k_means(evecs, ks, random_state=222, n_init=user_iter)
                
                sil = silhouette_score(evecs, np.asarray(all_clusters), metric='euclidean')

                clust_configs.append(all_clusters)

            except np.linalg.LinAlgError:

                sil = 0
                
            sils.append(sil)
                    
        # Use the best cluster number to generate clusters                    
        best_ks = sils.index(max(sils))+min_ks
        print("-- Best cluster number found through SILHOUETTE (%f)= %i" % (max(sils), best_ks+len(two_clust_idx)))        
        silhouette = max(sils)
        
        all_clusters = clust_configs[np.argmax(sils)]
                        
    else:

        print("-- Not necessary to cluster")
        all_clusters = np.zeros(len(mul_leav_idx), dtype=np.int32)

    clust_branches = clust_cleaning(mul_leav_idx, all_clusters, dict_parents, dict_children, dict_ancestors, savebranches = save_branches)
    clusts = clust_branches + two_clust_idx

    print("-- Final cluster number (after cleaning) %i" % len(clusts))
    

    # Calculate the silhouette after cluster cleaning
    #fclusts_idx = np.ones(len(mul_leav_idx))
    fclusts_idx = -1*all_clusters

    i = 1
    for clust in clusts:
        i += 1
        fleavs = dendrogram[clust].sorted_leaves()

        fleavs_idx = []
        for fleav in fleavs:
            fleavs_idx.append(fleav.idx)

        fleavs_idx = np.asarray(fleavs_idx)

        # Find the position of the cluster leaves
        pos = np.where(np.in1d(mul_leav_idx,fleavs_idx))[0]
        fclusts_idx[pos] = i

    oldclusts = np.unique(fclusts_idx[fclusts_idx < 0])

    for oldclust in oldclusts:
        fclusts_idx[fclusts_idx == oldclust] = np.max(fclusts_idx)+1

    evecs = spectral_embedding(rAM, n_components=ks,
                            eigen_solver='arpack',
                            random_state=222,
                            eigen_tol=0.0, drop_first=False)
    sil = silhouette_score(evecs, fclusts_idx, metric='euclidean')

    print("-- Final clustering configuration silhoutte %f" % sil)


    all_struct_types = np.asarray(all_struct_types)
    all_struct_parents = np.asarray(all_struct_parents)

    # Add the isolated leaves to the cluster list, if required
    if save_isol_leaves:

        isol_leaves = all_structures_idx[(all_struct_parents == -1) & (all_struct_types == 'L')]
        clusts = clusts + list(isol_leaves)

        print("SAVE_ISOL_LEAVES triggered. Isolated leaves added.") 
        print("-- Total cluster number %i" % len(clusts))


    # Add the unclustered leaves within clusters to the cluster list, if required
    if save_clust_leaves:

        isol_leaves = all_structures_idx[(all_struct_parents == -1) & (all_struct_types == 'L')]

        all_leaves = []
        for leaf in dendrogram.leaves:
            all_leaves.append(leaf.idx)

        clust_leaves = []
        for clust in clusts:
            for leaf in dendrogram[clust].sorted_leaves():
                clust_leaves.append(leaf.idx)

        unclust_leaves = list(set(all_leaves)-set(clust_leaves + list(isol_leaves)))
        clusts = clusts + unclust_leaves

        print("SAVE_CLUST_LEAVES triggered. Unclustered leaves added.")
        print("-- Total cluster number %i" % len(clusts))
    

    # Update the catalog with new information
    catalog['parent'] = all_struct_parents
    catalog['ancestor'] = all_struct_ancestors
    catalog['n_leaves'] = nleaves
    catalog['structure_type'] = all_struct_types

    return clusts, catalog, AMs, escalpars, silhouette
Example #18
    def train(self):
        print(f"{datetime.now()} Pre-training evaluation:")

        loss, nmi, acc = self._evaluation()
        print(f"loss: {loss}, acc: {acc}, nmi: {nmi}")

        for e in range(self.current_epoch, self.config.epochs):
            print(f"\n{datetime.now()} epoch {e}/{self.config.epochs}")
            end = time.time()

            if self.config.refine_epoch == e:
                print(f"{datetime.now()} starting refinement stage, targets will be reassigned using k-means")
                with open(os.path.join(self.out_dir, "no_refine_run_stats.pickle"), "wb") as handle:
                    pickle.dump(self.run_stats, handle)

            if self.config.refine_epoch <= e:
                # we are in refinement stage, reassign targets with k-means
                preds = []
                self.model.eval()

                for batch in self.eval_dataloader:
                    images, _ = batch
                    preds.append(self.model(images.cuda()).data.cpu().numpy())

                preds = np.concatenate(preds)
                _, labels, _ = k_means.k_means(preds, self.config.k)

                # find permutation of labels that is closest to previous
                num_correct = np.zeros((self.config.k, self.config.k))
                prev_labels = np.argmax(self.targets, axis=1)
                for c_1 in range(self.config.k):
                    for c_2 in range(self.config.k):
                        num_correct[c_1, c_2] = int(((labels == c_1) * (prev_labels == c_2)).sum())
                _, assignments, _ = lap.lapjv(self.n_data - num_correct)
                reordered = np.zeros(self.n_data, dtype=int)
                for c in range(self.config.k):
                    reordered[labels == c] = assignments[c]

                self.targets = np.eye(self.config.k)[reordered]

            if self.config.rotnet:
                # train an epoch on rotation auxiliary task
                for batch in self.rot_dataloader:
                    images, labels = batch

                    unpack_images = []
                    for i in range(len(images[0])):
                        for r in range(4):
                            unpack_images.append(images[r][i])

                    unpack_images = np.stack(unpack_images, axis=0)
                    labels = np.reshape(labels, newshape=-1)

                    self.model.train()

                    images = torch.tensor(unpack_images, dtype=torch.float, device="cuda")
                    labels = labels.cuda()

                    out = self.model(images, rot_head=True)
                    loss = self.rot_crit(out, labels)

                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

            # train an epoch on main clustering task
            for batch in self.train_dataloader:
                images1, images2, indices = batch

                if self.config.refine_epoch > e:
                    # optimize and update targets
                    self.model.eval()

                    pred = self.model(images1.cuda()).data.cpu().numpy()

                    batch_targets = self.targets[indices]

                    cost = euclidean_distances(pred, batch_targets)
                    _, assignments, _ = lap.lapjv(cost)

                    for i, idx in enumerate(indices):
                        self.targets[idx] = batch_targets[assignments[i]]

                images = images2.cuda()
                batch_targets = torch.tensor(self.targets[indices], dtype=torch.float, device="cuda")

                self.model.train()
                pred = self.model(images)
                loss = self.clustering_crit(pred, batch_targets)

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            self.lr_scheduler.step()

            loss, nmi, acc = self._evaluation()
            self.run_stats["loss"].append(loss)
            self.run_stats["acc"].append(acc)
            self.run_stats["nmi"].append(nmi)

            print(f"{datetime.now()} train epoch took: {int(time.time() - end)}s")
            print(f"{datetime.now()} loss: {loss}, acc: {acc}, nmi: {nmi}")

            self.current_epoch = e

            if e % self.config.plot_rate == 0:
                fig, ax = plt.subplots(len(self.run_stats), figsize=(10, 30))

                for i, run_stat_name in enumerate(self.run_stats.keys()):
                    ax[i].plot(range(e + 1), self.run_stats[run_stat_name])
                    title = run_stat_name + ' (' + str(format(self.run_stats[run_stat_name][-1], '.4f')) + ')'
                    ax[i].set_title(title)

                plt.savefig(os.path.join(self.out_dir, "plots"))
                plt.close()

                self.save_checkpoint(self.out_dir)
Example #19
def spectral_clustering(affinity,
                        n_clusters=8,
                        n_components=None,
                        eigen_solver=None,
                        random_state=None,
                        n_init=10,
                        k=None,
                        eigen_tol=0.0,
                        assign_labels='kmeans',
                        mode=None):
    """Apply clustering to a projection to the normalized laplacian.

    In practice Spectral Clustering is very useful when the structure of
    the individual clusters is highly non-convex or more generally when
    a measure of the center and spread of the cluster is not a suitable
    description of the complete cluster. For instance when clusters are
    nested circles in the 2D plane.

    If affinity is the adjacency matrix of a graph, this method can be
    used to find normalized graph cuts.

    Parameters
    -----------
    affinity: array-like or sparse matrix, shape: (n_samples, n_samples)
        The affinity matrix describing the relationship of the samples to
        embed. **Must be symmetric**.

        Possible examples:
          - adjacency matrix of a graph,
          - heat kernel of the pairwise distance matrix of the samples,
          - symmetric k-nearest neighbours connectivity matrix of the samples.

    n_clusters: integer, optional
        Number of clusters to extract.

    n_components: integer, optional, default is k
        Number of eigen vectors to use for the spectral embedding

    eigen_solver: {None, 'arpack' or 'amg'}
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities

    random_state: int seed, RandomState instance, or None (default)
        A pseudo random number generator used for the initialization
        of the lobpcg eigen vectors decomposition when eigen_solver == 'amg'
        and by the K-Means initialization.

    n_init: int, optional, default: 10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.

    eigen_tol : float, optional, default: 0.0
        Stopping criterion for eigendecomposition of the Laplacian matrix
        when using arpack eigen_solver.

    assign_labels : {'kmeans', 'discretize'}, default: 'kmeans'
        The strategy to use to assign labels in the embedding
        space.  There are two ways to assign labels after the laplacian
        embedding.  k-means can be applied and is a popular choice. But it can
        also be sensitive to initialization. Discretization is another
        approach which is less sensitive to random initialization. See
        the 'Multiclass spectral clustering' paper referenced below for
        more details on the discretization approach.

    Returns
    -------
    labels: array of integers, shape: n_samples
        The labels of the clusters.

    maps: array-like, shape: (n_samples, n_components)
        The spectral embedding the labels were computed from.

    References
    ----------

    - Normalized cuts and image segmentation, 2000
      Jianbo Shi, Jitendra Malik
      http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324

    - A Tutorial on Spectral Clustering, 2007
      Ulrike von Luxburg
      http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323

    - Multiclass spectral clustering, 2003
      Stella X. Yu, Jianbo Shi
      http://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf

    Notes
    ------
    The graph should contain only one connected component; otherwise
    the results make little sense.

    This algorithm solves the normalized cut for k=2: it is a
    normalized spectral clustering.
    """
    if assign_labels not in ('kmeans', 'discretize'):
        raise ValueError("The 'assign_labels' parameter should be "
                         "'kmeans' or 'discretize', but '%s' was given" %
                         assign_labels)

    if k is not None:
        warnings.warn(
            "'k' was renamed to n_clusters and will "
            "be removed in 0.15.", DeprecationWarning)
        n_clusters = k
    if mode is not None:
        warnings.warn(
            "'mode' was renamed to eigen_solver "
            "and will be removed in 0.15.", DeprecationWarning)
        eigen_solver = mode

    random_state = check_random_state(random_state)
    n_components = n_clusters if n_components is None else n_components
    maps = spectral_embedding(affinity,
                              n_components=n_components,
                              eigen_solver=eigen_solver,
                              random_state=random_state,
                              eigen_tol=eigen_tol,
                              drop_first=False)

    if assign_labels == 'kmeans':
        _, labels, _ = k_means(maps,
                               n_clusters,
                               random_state=random_state,
                               n_init=n_init)
    else:
        labels = discretize(maps, random_state=random_state)

    return labels, maps
Example #20
def spectral_clustering(affinity, n_clusters=8, n_components=None,
                        eigen_solver=None, random_state=None, n_init=10,
                        k=None, eigen_tol=0.0,
                        assign_labels='kmeans',
                        mode=None):
    """Apply clustering to a projection to the normalized laplacian.

    In practice Spectral Clustering is very useful when the structure of
    the individual clusters is highly non-convex or more generally when
    a measure of the center and spread of the cluster is not a suitable
    description of the complete cluster. For instance when clusters are
    nested circles in the 2D plane.

    If affinity is the adjacency matrix of a graph, this method can be
    used to find normalized graph cuts.

    Parameters
    -----------
    affinity: array-like or sparse matrix, shape: (n_samples, n_samples)
        The affinity matrix describing the relationship of the samples to
        embed. **Must be symmetric**.

        Possible examples:
          - adjacency matrix of a graph,
          - heat kernel of the pairwise distance matrix of the samples,
          - symmetric k-nearest neighbours connectivity matrix of the samples.

    n_clusters: integer, optional
        Number of clusters to extract.

    n_components: integer, optional, default is k
        Number of eigen vectors to use for the spectral embedding

    eigen_solver: {None, 'arpack' or 'amg'}
        The eigenvalue decomposition strategy to use. AMG requires pyamg
        to be installed. It can be faster on very large, sparse problems,
        but may also lead to instabilities

    random_state: int seed, RandomState instance, or None (default)
        A pseudo random number generator used for the initialization
        of the lobpcg eigen vectors decomposition when eigen_solver == 'amg'
        and by the K-Means initialization.

    n_init: int, optional, default: 10
        Number of times the k-means algorithm will be run with different
        centroid seeds. The final results will be the best output of
        n_init consecutive runs in terms of inertia.

    eigen_tol : float, optional, default: 0.0
        Stopping criterion for eigendecomposition of the Laplacian matrix
        when using arpack eigen_solver.

    assign_labels : {'kmeans', 'discretize'}, default: 'kmeans'
        The strategy to use to assign labels in the embedding
        space.  There are two ways to assign labels after the laplacian
        embedding.  k-means can be applied and is a popular choice. But it can
        also be sensitive to initialization. Discretization is another
        approach which is less sensitive to random initialization. See
        the 'Multiclass spectral clustering' paper referenced below for
        more details on the discretization approach.

    Returns
    -------
    labels: array of integers, shape: n_samples
        The labels of the clusters.

    References
    ----------

    - Normalized cuts and image segmentation, 2000
      Jianbo Shi, Jitendra Malik
      http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324

    - A Tutorial on Spectral Clustering, 2007
      Ulrike von Luxburg
      http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323

    - Multiclass spectral clustering, 2003
      Stella X. Yu, Jianbo Shi
      http://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf

    Notes
    ------
    The graph should contain only one connected component; otherwise
    the results make little sense.

    This algorithm solves the normalized cut for k=2: it is a
    normalized spectral clustering.
    """
    if assign_labels not in ('kmeans', 'discretize'):
        raise ValueError("The 'assign_labels' parameter should be "
                         "'kmeans' or 'discretize', but '%s' was given"
                         % assign_labels)

    if k is not None:
        warnings.warn("'k' was renamed to n_clusters and will "
                      "be removed in 0.15.",
                      DeprecationWarning)
        n_clusters = k
    if mode is not None:
        warnings.warn("'mode' was renamed to eigen_solver "
                      "and will be removed in 0.15.",
                      DeprecationWarning)
        eigen_solver = mode

    random_state = check_random_state(random_state)
    n_components = n_clusters if n_components is None else n_components
    maps = spectral_embedding(affinity, n_components=n_components,
                              eigen_solver=eigen_solver,
                              random_state=random_state,
                              eigen_tol=eigen_tol, drop_first=False)

    if assign_labels == 'kmeans':
        _, labels, _ = k_means(maps, n_clusters, random_state=random_state,
                               n_init=n_init)
    else:
        labels = discretize(maps, random_state=random_state)

    return labels
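A hedged usage sketch for the spectral_clustering helper above, assuming its own dependencies (spectral_embedding, k_means, discretize, check_random_state) are already imported: build a symmetric RBF affinity for two concentric circles, a stand-in data set with an arbitrary gamma, and cluster it.

import numpy as np
from sklearn.datasets import make_circles
from sklearn.metrics.pairwise import rbf_kernel

X, _ = make_circles(n_samples=400, factor=0.4, noise=0.04, random_state=0)
affinity = rbf_kernel(X, gamma=10.0)      # symmetric affinity matrix
labels = spectral_clustering(affinity, n_clusters=2, random_state=0)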