def kmeans_constrained_single(X, n_clusters, size_min=None, size_max=None,
                              max_iter=300, init='k-means++', verbose=False,
                              x_squared_norms=None, random_state=None,
                              tol=1e-4):
    """A single run of k-means constrained, assumes preparation completed prior.

    Parameters
    ----------
    X : array-like of floats, shape (n_samples, n_features)
        The observations to cluster.

    size_min : int, optional, default: None
        Constrain the label assignment so that each cluster has a minimum
        size of size_min. If None, no constraint is applied.

    size_max : int, optional, default: None
        Constrain the label assignment so that each cluster has a maximum
        size of size_max. If None, no constraint is applied.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    max_iter : int, optional, default: 300
        Maximum number of iterations of the k-means algorithm to run.

    init : {'k-means++', 'random', ndarray, or a callable}, optional
        Method for initialization, defaults to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': generate k centroids from a Gaussian with mean and
        variance estimated from the data.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments X, k and a
        random state and return an initialization.

    tol : float, optional
        The relative increment in the results before declaring convergence.

    verbose : boolean, optional
        Verbosity mode.

    x_squared_norms : array
        Precomputed x_squared_norms.

    random_state : int, RandomState instance or None, optional, default: None
        If int, random_state is the seed used by the random number
        generator; if RandomState instance, random_state is the random
        number generator; if None, the random number generator is the
        RandomState instance used by `np.random`.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances
        to the closest centroid for all observations in the training set).

    n_iter : int
        Number of iterations run.
    """
    sample_weight = np.ones(X.shape[0])
    random_state = check_random_state(random_state)
    n_samples = X.shape[0]

    best_labels, best_inertia, best_centers = None, None, None

    # init
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closest center, for reallocation in case of ties
    distances = np.zeros(shape=(n_samples,), dtype=X.dtype)

    # Determine min and max sizes if none given
    if size_min is None:
        size_min = 0
    if size_max is None:
        size_max = n_samples  # Number of data points

    # Check size min and max
    if not ((0 <= size_min <= n_samples) and (0 <= size_max <= n_samples)):
        raise ValueError(
            "size_min and size_max must be a positive number smaller "
            "than the number of data points or `None`")
    if size_max < size_min:
        raise ValueError("size_max must be greater than or equal to size_min")
    if size_min * n_clusters > n_samples:
        raise ValueError(
            "The product of size_min and n_clusters cannot exceed the "
            "number of samples (X)")

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # labels assignment is also called the E-step of EM
        labels, inertia = _labels_constrained(X, centers, size_min, size_max,
                                              distances=distances)

        # computation of the means is also called the M-step of EM
        if sp.issparse(X):
            centers = _centers_sparse(X, sample_weight, labels, n_clusters,
                                      distances)
        else:
            centers = _centers_dense(X, sample_weight, labels, n_clusters,
                                     distances)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match the returned (best) cluster centers
        best_labels, best_inertia = _labels_constrained(
            X, best_centers, size_min, size_max, distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
def _kmeans_single(X, n_clusters, x_squared_norms, max_iter=300,
                   init='k-means++', verbose=False, random_state=None,
                   tol=1e-4, precompute_distances=True, sample_weight=None):
    """A single run of k-means, assumes preparation completed prior.

    Parameters
    ----------
    X : array-like of floats, shape (n_samples, n_features)
        The observations to cluster.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    max_iter : int, optional, default: 300
        Maximum number of iterations of the k-means algorithm to run.

    init : {'k-means++', 'random', ndarray, or a callable}, optional
        Method for initialization, defaults to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': generate k centroids from a Gaussian with mean and
        variance estimated from the data.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments X, k and a
        random state and return an initialization.

    tol : float, optional
        The relative increment in the results before declaring convergence.

    verbose : boolean, optional
        Verbosity mode.

    x_squared_norms : array
        Precomputed x_squared_norms.

    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances
        to the closest centroid for all observations in the training set).

    n_iter : int
        Number of iterations run.
    """
    if sample_weight is None:
        sample_weight = np.ones(X.shape[0])
    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = k_means_._init_centroids(X, n_clusters, init,
                                       random_state=random_state,
                                       x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closest center, for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=np.float64)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, inertia = \
            _labels_inertia(X, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

        # computation of the means is also called the M-step of EM.
        # Note: sample_weight is set once before the loop; re-creating it
        # here would silently discard any user-provided weights.
        if sp.issparse(X):
            centers = _k_means._centers_sparse(X, sample_weight, labels,
                                               n_clusters, distances)
        else:
            centers = _k_means._centers_dense(X, sample_weight, labels,
                                              n_clusters, distances)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        shift = squared_norm(centers_old - centers)
        if shift <= tol:
            if verbose:
                print("Converged at iteration %d" % i)
            break

    if shift > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
def _spherical_kmeans_single_lloyd(X, n_clusters, max_iter=300,
                                   init='k-means++', verbose=False,
                                   x_squared_norms=None, random_state=None,
                                   tol=1e-4, precompute_distances=True):
    '''Modified from sklearn.cluster.k_means_.k_means_single_lloyd.'''
    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None

    # init
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closest center, for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # labels assignment
        # TODO: _labels_inertia should be done with cosine distance
        #       since ||a - b||^2 = 2(1 - cos(a, b)) when a, b are unit
        #       normalized, this doesn't really matter.
        labels, inertia = \
            _labels_inertia(X, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

        # computation of the means
        if sp.issparse(X):
            centers = _k_means._centers_sparse(X, labels, n_clusters,
                                               distances)
        else:
            centers = _k_means._centers_dense(X, labels, n_clusters,
                                              distances)

        # l2-normalize centers (this is the main contribution here)
        centers = normalize(centers)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
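
# ---------------------------------------------------------------------------
# Quick numerical check of the identity the TODO above relies on: for
# unit-norm a and b, ||a - b||^2 = 2 * (1 - cos(a, b)), so after the
# `normalize` call, assigning by Euclidean distance is equivalent to
# assigning by cosine similarity. Illustrative helper, not part of the
# module.
def _check_cosine_identity(seed=0):
    import numpy as np

    rng = np.random.RandomState(seed)
    a = rng.randn(5)
    a /= np.linalg.norm(a)
    b = rng.randn(5)
    b /= np.linalg.norm(b)
    # a @ b is cos(a, b) because both vectors are unit-normalized
    assert np.isclose(np.sum((a - b) ** 2), 2 * (1 - a @ b))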
def _kmeans_single_lloyd(X, sample_weight, n_clusters, max_iter=300,
                         init='k-means++', verbose=False,
                         x_squared_norms=None, random_state=None, tol=1e-4,
                         precompute_distances=True, group=None):
    """A single run of k-means, assumes preparation completed prior.

    Parameters
    ----------
    X : array-like of floats, shape (n_samples, n_features)
        The observations to cluster.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.

    max_iter : int, optional, default: 300
        Maximum number of iterations of the k-means algorithm to run.

    init : {'k-means++', 'random', ndarray, or a callable}, optional
        Method for initialization, defaults to 'k-means++':

        'k-means++' : selects initial cluster centers for k-means
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments X, k and a
        random state and return an initialization.

    tol : float, optional
        The relative increment in the results before declaring convergence.

    verbose : boolean, optional
        Verbosity mode.

    x_squared_norms : array
        Precomputed x_squared_norms.

    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).

    random_state : int, RandomState instance or None (default)
        Determines random number generation for centroid initialization.
        Use an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    group : optional, default: None
        Group information forwarded to `_labels_inertia` during the
        label assignment step.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances
        to the closest centroid for all observations in the training set).

    n_iter : int
        Number of iterations run.
    """
    random_state = check_random_state(random_state)
    sample_weight = _check_sample_weight(X, sample_weight)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closest center, for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances, group=group)

        # computation of the means is also called the M-step of EM
        if sp.issparse(X):
            centers = _k_means._centers_sparse(X, sample_weight, labels,
                                               n_clusters, distances)
        else:
            centers = _k_means._centers_dense(X, sample_weight, labels,
                                              n_clusters, distances)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances, group=group)

    return best_labels, best_inertia, best_centers, i + 1
def _kmeans_single_with_weights(X, weights, n_clusters, x_squared_norms,
                                max_iter=300, init='kmeans++_with_weights',
                                verbose=False, random_state=None, tol=1e-4,
                                precompute_distances=True):
    """A single run of k-means with weights, assumes preparation completed prior.

    Parameters
    ----------
    X : array-like of floats, shape (n_samples, n_features)
        The observations to cluster.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    weights : array, shape (n_samples,)
        The weight of each observation in X.

    max_iter : int, optional, default: 300
        Maximum number of iterations of the k-means algorithm to run.

    init : {'kmeans++_with_weights', 'random', ndarray, or a callable}, optional
        Method for initialization, defaults to 'kmeans++_with_weights':

        'kmeans++_with_weights' : selects initial cluster centers for
        k-means clustering in a smart way to speed up convergence. See
        section Notes in k_init_with_weights for more details.

        'random': generate k centroids from a Gaussian with mean and
        variance estimated from the data.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments X, k and a
        random state and return an initialization.

    tol : float, optional
        The relative increment in the results before declaring convergence.

    verbose : boolean, optional
        Verbosity mode.

    x_squared_norms : array
        Precomputed x_squared_norms.

    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (weighted sum of squared
        distances to the closest centroid for all observations in the
        training set).

    n_iter : int
        Number of iterations run.
    """
    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids_with_weights(X, weights, n_clusters, init,
                                           random_state=random_state,
                                           x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closest center, for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=np.float64)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, _ = \
            _labels_inertia(X, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

        # computation of the weighted means is also called the M-step of EM.
        # _centers_* return unweighted means of the pre-weighted points
        # (sum(w * x) / n_k), so the result is rescaled below by
        # n_k / W_k to obtain the weighted means sum(w * x) / W_k.
        if sp.issparse(X):
            centers = _k_means._centers_sparse(
                np.multiply(X, weights.reshape(len(weights), 1)),
                labels, n_clusters, distances)
        else:
            centers = _k_means._centers_dense(
                np.multiply(X, weights.reshape(len(weights), 1)),
                labels, n_clusters, distances)

        # per-cluster weight totals and sizes; np.bincount replaces the
        # original per-sample loop, which reused the outer loop variable `i`
        # and thereby clobbered the iteration counter
        weights_clusters = np.bincount(labels, weights=weights,
                                       minlength=n_clusters)
        weights_clusters[weights_clusters == 0] = 1
        n_samples_in_cluster = np.bincount(labels, minlength=n_clusters)
        centers /= weights_clusters[:, np.newaxis]
        centers *= n_samples_in_cluster[:, np.newaxis]

        # weighted inertia is computed here
        centers_labelled = centers[labels]
        row_norms_diff = row_norms(X - centers_labelled, squared=True)
        inertia = np.sum(weights * row_norms_diff)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        shift = squared_norm(centers_old - centers)
        if shift <= tol:
            if verbose:
                print("Converged at iteration %d" % i)
            break

    if shift > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, _ = \
            _labels_inertia(X, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances)
        # weighted best_inertia is recomputed for the final labels
        best_centers_labelled = best_centers[best_labels]
        best_row_norms_diff = row_norms(X - best_centers_labelled,
                                        squared=True)
        best_inertia = np.sum(weights * best_row_norms_diff)

    return best_labels, best_inertia, best_centers, i + 1
def _kmeans_step(X, x_squared_norms, centers, distances,
                 precompute_distances, n_clusters, random_state=None):
    """Incremental update of the centers for the Minibatch K-Means algorithm.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The original data array.

    x_squared_norms : array, shape (n_samples,)
        Squared euclidean norm of each data point.

    centers : array, shape (k, n_features)
        The cluster centers. This array is MODIFIED IN PLACE.

    distances : array, dtype float64, shape (n_samples,), optional
        If not None, should be a pre-allocated array that will be used to
        store the distances of each sample to its closest center.

    precompute_distances : boolean
        Precompute distances (faster but takes more memory).

    n_clusters : int
        The number of clusters.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    Returns
    -------
    centers : array, shape (k, n_features)
        The updated cluster centers.

    inertia : float
        Sum of squared distances of samples to their closest cluster center.

    shift : float
        Squared norm of the difference between the previous and updated
        cluster centers.
    """
    centers_old = centers.copy()
    # labels assignment is also called the E-step of EM
    labels, inertia = k_means_._labels_inertia(
        X, x_squared_norms, centers,
        precompute_distances=precompute_distances, distances=distances)

    # computation of the means is also called the M-step of EM
    if sp.issparse(X):
        centers = _k_means._centers_sparse(X, labels, n_clusters, distances)
    else:
        centers = _k_means._centers_dense(X, labels, n_clusters, distances)

    shift = squared_norm(centers_old - centers)

    return centers, inertia, shift
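
# ---------------------------------------------------------------------------
# A possible driver loop for `_kmeans_step`: shuffle each epoch, feed
# mini-batches, and stop once the squared center shift drops below `tol`.
# The batching scheme and the name `_minibatch_driver_sketch` are assumptions
# for illustration, not part of the module.
def _minibatch_driver_sketch(X, centers, n_clusters, batch_size=256,
                             tol=1e-4, max_epochs=10, seed=0):
    import numpy as np
    from sklearn.utils.extmath import row_norms

    rng = np.random.RandomState(seed)
    for _ in range(max_epochs):
        order = rng.permutation(X.shape[0])
        for start in range(0, X.shape[0], batch_size):
            batch = X[order[start:start + batch_size]]
            distances = np.zeros(batch.shape[0], dtype=np.float64)
            centers, inertia, shift = _kmeans_step(
                batch, row_norms(batch, squared=True), centers, distances,
                precompute_distances=True, n_clusters=n_clusters)
            if shift <= tol:  # centers have stopped moving
                return centers
    return centers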