def test_x_squared_norms_init_centroids():
    """Check that ``x_squared_norms`` can be None in ``_init_centroids``.

    Centroids obtained with explicitly supplied squared row norms must
    match the ones obtained when the argument is left out.
    """
    from sklearn.cluster._kmeans import _init_centroids

    shared_args = (X, 3, "k-means++")

    # Reference run: squared norms are computed by hand and passed in.
    row_sq_norms = np.sum(X**2, axis=1)
    with_norms = _init_centroids(
        *shared_args, random_state=0, x_squared_norms=row_sq_norms
    )

    # Same seed, but the norms argument is omitted.
    without_norms = _init_centroids(*shared_args, random_state=0)

    assert_array_almost_equal(with_norms, without_norms)
def _fit_single(X, y=None, n_clusters=2, init='random', random_state=None,
                metric='riemann', max_iter=100, tol=1e-4, n_jobs=1):
    """Helper performing a single centroid-fitting run.

    An ``MDM`` model is seeded with centroids produced by
    ``_init_centroids`` and is then repeatedly refit on its own label
    assignments. The loop ends once the fraction of unchanged labels
    exceeds ``1 - tol`` or the iteration counter exceeds ``max_iter``.

    Returns
    -------
    labels
        Class assigned to each element of ``X`` after the last iteration.
    inertia
        Sum, over classes, of the values returned by
        ``_predict_distances`` for the samples assigned to that class.
    mdm
        The fitted ``MDM`` instance.
    """
    model = MDM(metric=metric, n_jobs=n_jobs)

    # Squared Frobenius norm of every element of X, handed to the
    # centroid initialiser.
    frobenius_sq = [np.linalg.norm(mat, ord='fro') ** 2 for mat in X]
    model.covmeans_ = _init_centroids(
        X,
        n_clusters,
        init,
        random_state=random_state,
        x_squared_norms=frobenius_sq,
    )

    # Class identifiers come from y when given, else 0..n_clusters-1.
    model.classes_ = np.arange(n_clusters) if y is None else np.unique(y)

    labels = model.predict(X)
    n_rounds = 0
    while True:
        previous = labels.copy()
        model.fit(X, previous)
        dist = model._predict_distances(X)
        labels = model.classes_[dist.argmin(axis=1)]
        n_rounds += 1

        unchanged_ratio = np.mean(labels == previous)
        if n_rounds > max_iter or unchanged_ratio > (1 - tol):
            break

    # Accumulate, column by column, the distances of the samples that
    # ended up assigned to the matching class.
    inertia = 0
    for column, cls in enumerate(model.classes_):
        inertia += sum(dist[labels == cls, column])

    return labels, inertia, model
def _fuzzykmeans_single_elkan(X, m, sample_weight, n_clusters, max_iter=300,
                              init='k-means++', verbose=False,
                              x_squared_norms=None, random_state=None,
                              tol=1e-4, precompute_distances=True):
    """Single fuzzy k-means run built around ``k_means_elkan``.

    Centers are initialised with ``_init_centroids``, passed through
    ``m_step`` together with random memberships, refined by
    ``k_means_elkan`` and finally post-processed with one ``e_step`` /
    ``m_step`` pair.

    ``precompute_distances`` is accepted but not used by this function.

    Returns
    -------
    fuzzy_labels, labels, inertia, centers, n_iter

    Raises
    ------
    TypeError
        If ``X`` is a sparse matrix.
    """
    if sp.issparse(X):
        raise TypeError("algorithm='elkan' not supported for sparse input X")

    n_samples, _ = X.shape
    rng = check_random_state(random_state)

    # Random memberships, normalised so that each row sums to one. They are
    # drawn before the centroid initialisation, which uses the same
    # generator afterwards.
    memberships = rng.rand(n_samples, n_clusters)
    memberships /= memberships.sum(axis=1)[:, np.newaxis]

    if x_squared_norms is None:
        x_squared_norms = row_norms(X, squared=True)

    # init
    start_centers = _init_centroids(
        X, n_clusters, init,
        random_state=rng,
        x_squared_norms=x_squared_norms,
    )
    # NOTE(review): m_step is given the initial centers here but X after
    # the Elkan run below -- confirm this is intended.
    start_centers = m_step(start_centers, memberships, m)
    start_centers = np.ascontiguousarray(start_centers)
    if verbose:
        print('Initialization complete')

    weights = _check_normalize_sample_weight(sample_weight, X)
    centers, labels, n_iter = k_means_elkan(
        X, weights, n_clusters, start_centers,
        tol=tol, max_iter=max_iter, verbose=verbose,
    )

    fuzzy_labels, labels = e_step(X, centers, m)
    centers = m_step(X, fuzzy_labels, m)

    # Squared residual of every sample w.r.t. the center of its label.
    residual_sq = (X - centers[labels]) ** 2
    if sample_weight is None:
        inertia = np.sum(residual_sq, dtype=np.float64)
    else:
        weighted = np.sum(residual_sq, axis=1, dtype=np.float64) * weights
        inertia = np.sum(weighted, dtype=np.float64)

    return fuzzy_labels, labels, inertia, centers, n_iter
def _init_and_run(
    X: np.ndarray,
    y: np.ndarray,
    init,
    init_size,
    r_mat: np.ndarray,
    max_iter,
    verbose,
    tol,
    init_advanced,
    working_memory,
):
    """Initialise cluster centers and run one constrained k-means pass.

    Parameters
    ----------
    X : np.ndarray
        Samples; ``X.shape[1]`` gives the width of each center.
    y : np.ndarray
        Labels; entries equal to ``-1`` mark the free objects used to seed
        the clusters that have no labeled samples.
    init, init_size
        Forwarded to ``_init_centroids``.
    r_mat : np.ndarray
        Rejection matrix; its second dimension is the number of clusters.
    max_iter, verbose, tol, working_memory
        Forwarded to ``_constrained_kmeans_single``.
    init_advanced : bool
        When true, leading clusters that own labeled samples (as decoded
        by ``_r_mat_to_y``) start at the mean of those samples and only
        the remaining clusters are initialised by ``_init_centroids``.
        Otherwise all centers come from ``_init_centroids``.

    Returns
    -------
    tuple
        ``(labels, sample_distances, inertia, centers, n_iter_)`` as
        produced by ``_constrained_kmeans_single``.
    """
    n_clusters = r_mat.shape[1]

    if init_advanced:
        print("Advanced initialization.")
        centers = np.empty((n_clusters, X.shape[1]), dtype=X.dtype)

        # Decode rejection matrix
        y_enc = _r_mat_to_y(r_mat)

        # Initialize centers for determined clusters. The number of
        # labeled clusters is counted explicitly: relying on the loop
        # variable after the loop is off by one when every cluster has
        # labeled samples (the loop then ends without `break`), which
        # would overwrite the last labeled center.
        n_labeled = 0
        for cluster in range(n_clusters):
            mask = y_enc == cluster
            if mask.sum() == 0:
                break
            centers[cluster] = X[mask].mean(axis=0)
            n_labeled += 1
        n_free = n_clusters - n_labeled

        if verbose:
            print(
                f"Initializing {n_labeled} labeled and {n_free} unlabeled cluster centers..."
            )
            print("init_size:", init_size)

        if n_free > 0:
            # Initialize centers for remaining free clusters from free
            # objects (y == -1)
            centers[n_labeled:] = _init_centroids(
                X[y == -1], n_free, init=init, init_size=init_size
            )
    else:
        centers = _init_centroids(X, n_clusters, init=init, init_size=init_size)

    assert centers.shape[0] == r_mat.shape[1]

    (
        labels,
        sample_distances,
        inertia,
        centers,
        n_iter_,
    ) = _constrained_kmeans_single(
        X,
        r_mat,
        centers_init=centers,
        max_iter=max_iter,
        verbose=verbose,
        tol=tol,
        working_memory=working_memory,
    )

    return (
        labels,
        sample_distances,
        inertia,
        centers,
        n_iter_,
    )
def calc_sampling_distribution(self):
    """Compute the sampling distribution and store it in ``self.p``.

    Initial centers are obtained from ``_init_centroids``; the values
    returned by ``sensitivity.kmeans_sensitivity`` for those centers are
    then divided by their sum.
    """
    squared_norms = row_norms(self.X, squared=True)
    initial_centers = _init_centroids(
        self.X,
        self.n_clusters,
        self.init,
        random_state=self.random_state,
        x_squared_norms=squared_norms,
    )

    # Last argument to the sensitivity routine: log(n_clusters), floored at 1.
    log_k_floor = max(np.log(self.n_clusters), 1)
    scores = sensitivity.kmeans_sensitivity(
        self.X, self.w, initial_centers, log_k_floor
    )

    self.p = scores / np.sum(scores)
def _fuzzykmeans_single_lloyd(X, m, sample_weight, n_clusters, max_iter=300,
                              init='k-means++', verbose=False,
                              x_squared_norms=None, random_state=None,
                              tol=1e-4, precompute_distances=True):
    """A single run of fuzzy k-means, assumes preparation completed prior.

    Parameters
    ----------
    X : array-like of floats, shape (n_samples, n_features)
        The observations to cluster.

    m : object
        Forwarded unchanged to ``e_step`` and ``m_step``.
        NOTE(review): presumably the fuzziness exponent -- confirm against
        those helpers.

    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X. Passed through
        ``_check_normalize_sample_weight`` before use.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    max_iter : int, optional, default 300
        Maximum number of iterations to run.

    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, forwarded to ``_init_centroids``.

        'k-means++' : selects initial cluster centers in a smart way to
        speed up convergence.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments X, k and a
        random state and return an initialization.

    verbose : boolean, optional
        Verbosity mode.

    x_squared_norms : array
        Precomputed x_squared_norms, forwarded to ``_init_centroids`` and
        ``_labels_inertia``.

    random_state : int, RandomState instance or None (default)
        Determines random number generation for the initial fuzzy labels
        and for centroid initialization. Use an int to make the
        randomness deterministic.
        See :term:`Glossary <random_state>`.

    tol : float, optional
        Iteration stops as soon as ``squared_norm`` of the difference
        between consecutive centers is at most this value.

    precompute_distances : boolean, default: True
        Forwarded to ``_labels_inertia``.

    Returns
    -------
    fuzzy_labels : ndarray
        First value returned by ``e_step`` for the best centers.

    labels : ndarray
        Second value returned by ``e_step`` for the best centers.

    inertia : float
        Inertia reported by ``_labels_inertia`` for the best centers.

    centers : ndarray
        Centers of the iteration with the lowest inertia.

    n_iter : int
        Number of iterations run.
    """
    n_samples, _ = X.shape
    random_state = check_random_state(random_state)

    # Random initial memberships, each row normalised to sum to one. They
    # are drawn before centroid initialisation, which reuses the generator.
    fuzzy_labels = random_state.rand(n_samples, n_clusters)
    fuzzy_labels /= fuzzy_labels.sum(axis=1)[:, np.newaxis]

    sample_weight = _check_normalize_sample_weight(sample_weight, X)

    best_fuzzy_labels, best_labels = None, None
    best_inertia, best_centers = None, None

    # init
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    # NOTE(review): m_step is given the initial centers here but X inside
    # the loop below -- confirm this is intended.
    centers = m_step(centers, fuzzy_labels, m)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # Only the inertia of the hard assignment is kept; the labels
        # themselves come from the fuzzy E-step just below.
        _, inertia = _labels_inertia(
            X, sample_weight, x_squared_norms, centers,
            precompute_distances=precompute_distances,
            distances=distances)

        # labels assignment is also called the E-step of EM
        fuzzy_labels, labels = e_step(X, centers, m)

        # computation of the means is also called the M-step of EM
        centers = m_step(X, fuzzy_labels, m)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_fuzzy_labels = fuzzy_labels.copy()
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        _, best_inertia = _labels_inertia(
            X, sample_weight, x_squared_norms, best_centers,
            precompute_distances=precompute_distances,
            distances=distances)
        # Use best_centers (the centers actually returned) rather than the
        # centers of the last iteration, so that the returned memberships
        # and labels are consistent with the returned centers.
        best_fuzzy_labels, best_labels = e_step(X, best_centers, m)

    return best_fuzzy_labels, best_labels, best_inertia, best_centers, i + 1