def _labels_inertia_skl(X, sample_weight, x_squared_norms, centers,
                        precompute_distances=True, distances=None):
    """
    E step of the K-means EM algorithm.

    Computes the labels and the inertia of the given samples and centers.
    This will compute the distances in-place.

    :param X: float64 array-like or CSR sparse matrix,
        shape (n_samples, n_features)
        The input samples to assign to the labels.
    :param sample_weight: array-like, shape (n_samples,)
        The weights for each observation in X.
    :param x_squared_norms: array, shape (n_samples,)
        Precomputed squared euclidean norm of each data point,
        to speed up computations.
    :param centers: float array, shape (k, n_features)
        The cluster centers.
    :param precompute_distances: boolean, default: True
        Precompute distances (faster but takes more memory).
    :param distances: float array, shape (n_samples,)
        Pre-allocated array to be filled in with each sample's
        distance to the closest center.
    :return: labels: int array of shape (n_samples,)
        The resulting assignment.
    :return: inertia: float
        Sum of squared distances of samples to their closest
        cluster center.
    """
    n_samples = X.shape[0]
    sample_weight = _check_sample_weight(sample_weight, X)
    # set the default value of labels to -1 to be able to detect
    # any anomaly easily
    labels = numpy.full(n_samples, -1, numpy.int32)
    if distances is None:
        distances = numpy.zeros(shape=(0,), dtype=X.dtype)
    # distances will be changed in-place
    if issparse(X):
        inertia = _assign_labels_csr(
            X, sample_weight, x_squared_norms, centers, labels,
            distances=distances)
    else:
        if precompute_distances:
            return _labels_inertia_precompute_dense(
                norm='l2', X=X, sample_weight=sample_weight,
                centers=centers, distances=distances)
        inertia = _assign_labels_array(
            X, sample_weight, x_squared_norms, centers, labels,
            distances=distances)
    return labels, inertia
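
# A minimal usage sketch for ``_labels_inertia_skl`` (illustrative only: the
# data is synthetic and the private helpers called above, such as
# ``_labels_inertia_precompute_dense`` and ``_check_sample_weight``, are
# assumed to be defined elsewhere in this module). With a dense X and the
# default ``precompute_distances=True``, the function takes the
# precomputed-dense path, which does not use ``x_squared_norms``:
#
#   >>> import numpy
#   >>> rnd = numpy.random.RandomState(0)
#   >>> X = rnd.randn(10, 2)
#   >>> centers = X[:3].copy()
#   >>> weights = numpy.ones(X.shape[0])
#   >>> labels, inertia = _labels_inertia_skl(
#   ...     X, sample_weight=weights, x_squared_norms=None, centers=centers)
#   >>> labels.shape
#   (10,)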
def _labels_inertia(norm, X, sample_weight, centers,
                    precompute_distances=True, distances=None):
    """
    E step of the K-means EM algorithm.

    Computes the labels and the inertia of the given samples and centers.
    This will compute the distances in-place.

    :param norm: 'l1' or 'l2'
    :param X: float64 array-like or CSR sparse matrix,
        shape (n_samples, n_features)
        The input samples to assign to the labels.
    :param sample_weight: array-like, shape (n_samples,)
        The weights for each observation in X.
    :param centers: float array, shape (k, n_features)
        The cluster centers.
    :param precompute_distances: boolean, default: True
        Precompute distances (faster but takes more memory).
    :param distances: float array, shape (n_samples,)
        Pre-allocated array to be filled in with each sample's
        distance to the closest center.
    :return: labels: int array of shape (n_samples,)
        The resulting assignment.
    :return: inertia: float
        Sum of distances of samples to their closest cluster center
        (squared distances for norm 'l2').
    """
    if norm == 'l2':
        return _labels_inertia_skl(
            X, sample_weight=sample_weight, centers=centers,
            precompute_distances=precompute_distances,
            x_squared_norms=None)
    sample_weight = _check_sample_weight(sample_weight, X)
    if distances is None:
        distances = numpy.zeros(shape=(0,), dtype=X.dtype)
    # distances will be changed in-place
    if issparse(X):
        raise NotImplementedError(  # pragma no cover
            "Sparse matrix is not implemented for norm 'l1'.")
    if precompute_distances:
        return _labels_inertia_precompute_dense(
            norm=norm, X=X, sample_weight=sample_weight,
            centers=centers, distances=distances)
    raise NotImplementedError(  # pragma no cover
        "precompute_distances is False, not implemented for norm 'l1'.")
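
# A minimal usage sketch for ``_labels_inertia`` (illustrative only,
# synthetic data). ``norm='l2'`` delegates to ``_labels_inertia_skl``;
# ``norm='l1'`` only supports a dense X with the default
# ``precompute_distances=True``:
#
#   >>> import numpy
#   >>> rnd = numpy.random.RandomState(0)
#   >>> X = rnd.randn(10, 2)
#   >>> centers = X[:3].copy()
#   >>> weights = numpy.ones(X.shape[0])
#   >>> labels, inertia = _labels_inertia('l1', X, weights, centers)
#   >>> labels.shape
#   (10,)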
def _kmeans_single_lloyd(norm, X, sample_weight, n_clusters, max_iter=300,
                         init='k-means++', verbose=False, random_state=None,
                         tol=1e-4):
    """
    A single run of k-means, assumes preparation completed prior.

    :param norm: 'l1' or 'l2'
    :param X: array-like of floats, shape (n_samples, n_features)
        The observations to cluster.
    :param sample_weight: array-like, shape (n_samples,)
        The weights for each observation in X.
    :param n_clusters: int
        The number of clusters to form as well as the number of
        centroids to generate.
    :param max_iter: int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.
    :param init: {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, defaults to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments X, k and
        a random state and return an initialization.
    :param tol: float, optional
        The relative increment in the results before declaring convergence.
    :param verbose: boolean, optional
        Verbosity mode.
    :param random_state: int, RandomState instance or None (default)
        Determines random number generation for centroid initialization.
        Use an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.
    :return: label: integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.
    :return: inertia: float
        The final value of the inertia criterion (sum of distances to
        the closest centroid for all observations in the training set,
        squared distances for norm 'l2').
    :return: centroid: float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.
    :return: n_iter: int
        Number of iterations run.
    """
    random_state = check_random_state(random_state)
    sample_weight = _check_sample_weight(sample_weight, X)

    best_labels, best_inertia, best_centers = None, None, None

    # init
    centers = _init_centroids(norm, X, n_clusters, init,
                              random_state=random_state)
    if verbose:  # pragma no cover
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closest center for reallocation in case of ties
    distances = numpy.zeros(shape=(X.shape[0],), dtype=X.dtype)
    X_sort_index = numpy.argsort(X, axis=0)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # labels assignment is also called the E-step of EM
        labels, inertia = _labels_inertia(
            norm, X, sample_weight, centers, distances=distances)

        # computation of the means is also called the M-step of EM
        centers = _centers_dense(X, sample_weight, labels, n_clusters,
                                 distances, X_sort_index)

        if verbose:  # pragma no cover
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = numpy.sum(
            numpy.abs(centers_old - centers).ravel())
        if center_shift_total <= tol:
            if verbose:  # pragma no cover
                print("Converged at iteration %d: "
                      "center shift %r within tolerance %r"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = _labels_inertia(
            norm, X, sample_weight, best_centers, distances=distances)

    return best_labels, best_inertia, best_centers, i + 1