def test_check_sample_weight():
    # None should be expanded to uniform weights with the right length,
    # sum and dtype
    from sklearn.cluster.k_means_ import _check_sample_weight
    sample_weight = None
    checked_sample_weight = _check_sample_weight(X, sample_weight)
    assert_equal(_num_samples(X), _num_samples(checked_sample_weight))
    assert_almost_equal(checked_sample_weight.sum(), _num_samples(X))
    assert_equal(X.dtype, checked_sample_weight.dtype)
def _check_normalize_sample_weight(sample_weight, X):
    """Set sample_weight if None, and check for correct dtype"""
    sample_weight_was_none = sample_weight is None

    sample_weight = _check_sample_weight(X, sample_weight)

    if not sample_weight_was_none:
        # normalize the weights to sum up to n_samples
        # an array of 1 (i.e. sample_weight is None) is already normalized
        n_samples = len(sample_weight)
        scale = n_samples / sample_weight.sum()
        sample_weight *= scale

    return sample_weight
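# A minimal usage sketch of the normalization above: explicit weights are
# rescaled so they sum to n_samples, while a None sample_weight comes back
# as an (already normalized) array of ones. This assumes the helpers defined
# in this module; the toy arrays below are made up for illustration.
def _demo_normalize_sample_weight():
    import numpy as np
    X = np.arange(12, dtype=np.float64).reshape(4, 3)  # 4 samples, 3 features
    w = np.array([1.0, 2.0, 3.0, 4.0])                 # sums to 10, not 4
    w_norm = _check_normalize_sample_weight(w, X)
    assert np.isclose(w_norm.sum(), X.shape[0])        # rescaled to sum to 4
    ones = _check_normalize_sample_weight(None, X)
    assert np.allclose(ones, 1.0)                      # uniform weights kept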
def _labels_inertia(X, sample_weight, x_squared_norms, centers,
                    distances, same_cluster_size=False):
    """E step of the K-means EM algorithm.

    Compute the labels and the inertia of the given samples and centers.
    This will compute the distances in-place.

    Parameters
    ----------
    X : float64 array-like or CSR sparse matrix, shape (n_samples, n_features)
        The input samples to assign to the labels.

    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.

    x_squared_norms : array, shape (n_samples,)
        Precomputed squared euclidean norm of each data point, to speed up
        computations.

    centers : float array, shape (k, n_features)
        The cluster centers.

    distances : float array, shape (n_samples,)
        Pre-allocated array to be filled in with each sample's distance
        to the closest center.

    same_cluster_size : bool, default False
        If True, force all clusters to contain the same number of samples
        (n_samples must be divisible by the number of centers).

    Returns
    -------
    labels : int array, shape (n_samples,)
        The resulting assignment.

    inertia : float
        Sum of squared distances of samples to their closest cluster center.
    """
    sample_weight = _check_sample_weight(X, sample_weight)
    n_samples = X.shape[0]
    n_clusters = centers.shape[0]

    # See http://jmonlong.github.io/Hippocamplus/2018/06/09/cluster-same-size/#same-size-k-means-variation
    if same_cluster_size:
        cluster_size = n_samples // n_clusters
        labels = np.zeros(n_samples, dtype=np.int32)
        mindist = np.zeros(n_samples, dtype=np.float32)
        # count how many samples have been labeled in each cluster
        counters = np.zeros(n_clusters, dtype=np.int32)
        # dist: (n_samples, n_clusters)
        dist = euclidean_distances(X, centers, squared=False)
        # assign the most "decided" samples first: those whose closest and
        # farthest centers are furthest apart
        closeness = dist.min(axis=-1) - dist.max(axis=-1)
        ranking = np.argsort(closeness)
        for r in ranking:
            while True:
                label = dist[r].argmin()
                if counters[label] < cluster_size:
                    labels[r] = label
                    counters[label] += 1
                    # squared distances are used for inertia in this function
                    mindist[r] = dist[r, label] ** 2
                    break
                else:
                    # cluster is full: rule it out and retry with the
                    # next closest center
                    dist[r, label] = np.inf
    else:
        # Break up nearest neighbor distance computation into batches to
        # prevent memory blowup in the case of a large number of samples
        # and clusters.
        # TODO: Once PR #7383 is merged use check_inputs=False in
        # metric_kwargs.
        labels, mindist = pairwise_distances_argmin_min(
            X=X, Y=centers, metric='euclidean',
            metric_kwargs={'squared': True})

    # cython k-means code assumes int32 inputs
    labels = labels.astype(np.int32, copy=False)
    if n_samples == distances.shape[0]:
        # distances will be changed in-place
        distances[:] = mindist
    inertia = (mindist * sample_weight).sum()
    return labels, inertia
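# Hedged usage sketch for the same-size E-step above: with 6 samples and 2
# centers, each cluster must end up with exactly 6 // 2 == 3 members, even
# though all points are closer to one center than the other would prefer.
# Assumes the module-level imports used above (np, row_norms,
# euclidean_distances); the toy data is illustrative only.
def _demo_same_cluster_size_labels():
    import numpy as np
    from sklearn.utils.extmath import row_norms
    X = np.array([[0.0], [0.1], [0.2], [10.0], [10.1], [10.2]])
    centers = np.array([[0.0], [10.0]])
    distances = np.zeros(X.shape[0], dtype=X.dtype)
    labels, inertia = _labels_inertia(
        X, None, row_norms(X, squared=True), centers,
        distances=distances, same_cluster_size=True)
    # both clusters received exactly 3 samples
    assert (np.bincount(labels, minlength=2) == 3).all()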
def kmeans_lloyd(X, sample_weight, n_clusters, max_iter=300,
                 init='k-means++', verbose=False, x_squared_norms=None,
                 random_state=None, tol=1e-4, same_cluster_size=False):
    """A single run of k-means, assumes preparation completed prior.

    Parameters
    ----------
    X : array-like of floats, shape (n_samples, n_features)
        The observations to cluster.

    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    max_iter : int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.

    init : {'k-means++', 'random', an ndarray, or a callable}, optional
        Method for initialization, defaults to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments X, k and a
        random state and return an initialization.

    verbose : boolean, optional
        Verbosity mode.

    x_squared_norms : array, optional
        Precomputed x_squared_norms.

    random_state : int, RandomState instance or None (default)
        Determines random number generation for centroid initialization.
        Use an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    tol : float, optional
        The relative increment in the results before declaring convergence.

    same_cluster_size : bool, default False
        If True, force all clusters to contain the same number of samples
        (n_samples must be divisible by n_clusters).

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances to
        the closest centroid for all observations in the training set).

    n_iter : int
        Number of iterations run.
""" random_state = check_random_state(random_state) if same_cluster_size: assert len(X) % n_clusters == 0, "#samples is not divisible by #clusters" if verbose: print("\n==> Starting k-means clustering...\n") sample_weight = _check_sample_weight(X, sample_weight) x_squared_norms = row_norms(X, squared=True) best_labels, best_inertia, best_centers = None, None, None # init centers = _init_centroids(X, n_clusters, init, random_state=random_state, x_squared_norms=x_squared_norms) if verbose: print("Initialization complete") # Allocate memory to store the distances for each sample to its # closer center for reallocation in case of ties distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype) # iterations for i in range(max_iter): centers_old = centers.copy() # labels assignment is also called the E-step of EM labels, inertia = \ _labels_inertia(X, sample_weight, x_squared_norms, centers, distances=distances, same_cluster_size=same_cluster_size) # computation of the means is also called the M-step of EM centers = _centers_dense( X, sample_weight, labels, n_clusters, distances) if verbose: print("Iteration %2d, inertia %.3f" % (i, inertia)) if best_inertia is None or inertia < best_inertia: best_labels = labels.copy() best_centers = centers.copy() best_inertia = inertia center_shift_total = squared_norm(centers_old - centers) if center_shift_total <= tol: if verbose: print("Converged at iteration %d: " "center shift %e within tolerance %e" % (i, center_shift_total, tol)) break if center_shift_total > 0: # rerun E-step in case of non-convergence so that predicted labels # match cluster centers best_labels, best_inertia = \ _labels_inertia(X, sample_weight, x_squared_norms, best_centers, distances=distances, same_cluster_size=same_cluster_size) return best_labels, best_inertia, best_centers, i + 1
def subspace_kmeans_single(X, sample_weight, n_clusters, init='k-means++',
                           max_iter=300, tol=1e-4, tol_eig=-1e-10,
                           verbose=False, x_squared_norms=None,
                           random_state=None):
    random_state = check_random_state(random_state)
    sample_weight = _check_sample_weight(X, sample_weight)

    best_labels, best_inertia, best_centers = None, None, None

    # init
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)

    # === Beginning of original implementation of initialization ===
    # Dimensionality of original space
    d = X.shape[1]
    # Set initial V as QR-decomposed Q of random matrix
    rand_vals = random_state.random_sample(d ** 2).reshape(d, d)
    V, _ = np.linalg.qr(rand_vals, mode='complete')
    # Set initial m as d/2
    m = d // 2
    # Scatter matrix of the dataset in the original space
    S_D = np.dot(X.T, X)
    # Projection onto the first m attributes
    P_C = np.eye(m, M=d).T
    # === End of original implementation of initialization ===

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # === Beginning of original implementation of E-step of EM ===
        X_C = np.dot(np.dot(X, V), P_C)
        mu_C = np.dot(np.dot(centers, V), P_C)
        labels, _ = pairwise_distances_argmin_min(
            X=X_C, Y=mu_C, metric='euclidean',
            metric_kwargs={'squared': True})
        labels = labels.astype(np.int32)
        # === End of original implementation of E-step of EM ===

        # computation of the means is also called the M-step of EM
        centers = _k_means._centers_dense(X, sample_weight, labels,
                                          n_clusters, distances)

        # === Beginning of original implementation of M-step of EM ===
        S = np.zeros((d, d))
        for j in range(n_clusters):
            X_j = X[labels == j] - centers[j]
            S += np.dot(X_j.T, X_j)
        Sigma = S - S_D
        evals, evecs = np.linalg.eigh(Sigma)
        # ascending order: the most negative eigenvalues span the
        # clustered space
        idx = np.argsort(evals)
        V = evecs[:, idx]
        m = len(np.where(evals < tol_eig)[0])
        if m == 0:
            raise ValueError(
                'Dimensionality of clustered space is 0. '
                'The dataset is better explained by a single cluster.')
        P_C = np.eye(m, M=d).T
        inertia = 0.0
        for j in range(n_clusters):
            inertia += row_norms(X[labels == j] - centers[j],
                                 squared=True).sum()
        # === End of original implementation of M-step of EM ===

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms, best_centers,
                            precompute_distances=False, distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
def _spherical_kmeans_single_lloyd(
    X,
    n_clusters,
    sample_weight=None,
    max_iter=300,
    init="k-means++",
    verbose=False,
    x_squared_norms=None,
    random_state=None,
    tol=1e-4,
    precompute_distances=True,
):
    """
    Modified from sklearn.cluster.k_means_.k_means_single_lloyd.
    """
    random_state = check_random_state(random_state)
    sample_weight = _check_sample_weight(sample_weight, X)

    best_labels, best_inertia, best_centers = None, None, None

    # init
    centers = _init_centroids(
        X, n_clusters, init, random_state=random_state,
        x_squared_norms=x_squared_norms
    )
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # labels assignment
        # TODO: _labels_inertia should be done with cosine distance
        #       since ||a - b||^2 = 2(1 - cos(a,b)) when a,b are unit
        #       normalized this doesn't really matter.
        labels, inertia = _labels_inertia(
            X,
            sample_weight,
            x_squared_norms,
            centers,
            precompute_distances=precompute_distances,
            distances=distances,
        )

        # computation of the means
        if sp.issparse(X):
            centers = _k_means._centers_sparse(
                X, sample_weight, labels, n_clusters, distances
            )
        else:
            centers = _k_means._centers_dense(
                X.astype(np.float64),
                sample_weight.astype(np.float64),
                labels,
                n_clusters,
                distances.astype(np.float64),
            )

        # l2-normalize centers (this is the main contribution here)
        centers = normalize(centers)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print(
                    "Converged at iteration %d: "
                    "center shift %e within tolerance %e"
                    % (i, center_shift_total, tol)
                )
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = _labels_inertia(
            X,
            sample_weight,
            x_squared_norms,
            best_centers,
            precompute_distances=precompute_distances,
            distances=distances,
        )

    return best_labels, best_inertia, best_centers, i + 1
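# Hedged sketch of calling the spherical variant above on l2-normalized rows
# (the algorithm assumes unit-norm inputs, which is why the centers are
# re-normalized each iteration). Assumes the sklearn version this snippet was
# written against, where _labels_inertia accepts precompute_distances; the
# toy data is illustrative only.
def _demo_spherical_kmeans():
    import numpy as np
    from sklearn.preprocessing import normalize
    from sklearn.utils.extmath import row_norms
    rng = np.random.RandomState(0)
    X = normalize(rng.randn(40, 8))  # 40 points on the unit sphere in R^8
    labels, inertia, centers, n_iter = _spherical_kmeans_single_lloyd(
        X, n_clusters=4, random_state=0,
        x_squared_norms=row_norms(X, squared=True))
    # centers stay on the unit sphere thanks to the normalize() step
    assert np.allclose(np.linalg.norm(centers, axis=1), 1.0)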
def sub_kmeans_single_(self, X, sample_weight, x_squared_norms, tol,
                       random_state):
    random_state = check_random_state(random_state)
    sample_weight = _check_sample_weight(X, sample_weight)

    best_labels, best_inertia, best_centers = None, None, None
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)
    centers = _init_centroids(X, self.n_clusters, init='k-means++',
                              random_state=random_state,
                              x_squared_norms=x_squared_norms)

    d = X.shape[1]  # dimensionality of original space
    m = d // 2      # dimensionality of clustered space
    SD = np.dot(X.T, X)  # scatter matrix of the dataset in the original space

    # orthonormal matrix of a rigid transformation
    V, _ = np.linalg.qr(random_state.random_sample(d ** 2).reshape(d, d),
                        mode='complete')

    for i in range(self.max_iter):
        centers_old = centers.copy()

        # get the clusters' labels
        labels = self.assignment_step_(X=X, V=V, centers=centers, m=m)

        # compute new centers and sum the clusters' scatter matrices
        centers = _k_means._centers_dense(X, sample_weight, labels,
                                          self.n_clusters, distances)
        S = self.update_step_(X, centers, labels)

        # sorted eigenvalues and eigenvectors of SIGMA = S - SD
        V, m = self.eigen_decomposition_(S - SD)
        if m == 0:
            raise ValueError('Might be a single cluster (m = 0).')

        # inertia: sum of squared distances of samples to their closest
        # cluster center
        inertia = sum(
            row_norms(X[labels == j] - centers[j], squared=True).sum()
            for j in range(self.n_clusters))
        # print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            # print("Converged at iteration %d: center shift %e within "
            #       "tolerance %e" % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = _labels_inertia(
            X, sample_weight, x_squared_norms, best_centers,
            precompute_distances=False, distances=distances)

    return best_centers, best_labels, best_inertia