def test_labels_assignment_and_inertia():
    # pure numpy implementation as easily auditable reference gold
    # implementation
    rng = np.random.RandomState(42)
    noisy_centers = centers + rng.normal(size=centers.shape)
    labels_gold = -np.ones(n_samples, dtype=int)
    mindist = np.empty(n_samples)
    mindist.fill(np.inf)
    for center_id in range(n_clusters):
        dist = np.sum((X - noisy_centers[center_id]) ** 2, axis=1)
        labels_gold[dist < mindist] = center_id
        mindist = np.minimum(dist, mindist)
    inertia_gold = mindist.sum()
    assert_true((mindist >= 0.0).all())
    assert_true((labels_gold != -1).all())

    # perform label assignment using the dense array input
    x_squared_norms = (X ** 2).sum(axis=1)
    labels_array, inertia_array = _labels_inertia(
        X, x_squared_norms, noisy_centers)
    assert_array_almost_equal(inertia_array, inertia_gold)
    assert_array_equal(labels_array, labels_gold)

    # perform label assignment using the sparse CSR input
    x_squared_norms_from_csr = row_norms(X_csr, squared=True)
    labels_csr, inertia_csr = _labels_inertia(
        X_csr, x_squared_norms_from_csr, noisy_centers)
    assert_array_almost_equal(inertia_csr, inertia_gold)
    assert_array_equal(labels_csr, labels_gold)
def test_labels_assignment_and_inertia():
    # pure numpy implementation as easily auditable reference gold
    # implementation
    rng = np.random.RandomState(42)
    noisy_centers = centers + rng.normal(size=centers.shape)
    labels_gold = np.full(n_samples, -1, dtype=int)
    mindist = np.empty(n_samples)
    mindist.fill(np.inf)
    for center_id in range(n_clusters):
        dist = np.sum((X - noisy_centers[center_id]) ** 2, axis=1)
        labels_gold[dist < mindist] = center_id
        mindist = np.minimum(dist, mindist)
    inertia_gold = mindist.sum()
    assert (mindist >= 0.0).all()
    assert (labels_gold != -1).all()

    sample_weight = None

    # perform label assignment using the dense array input
    x_squared_norms = (X ** 2).sum(axis=1)
    labels_array, inertia_array = _labels_inertia(
        X, sample_weight, x_squared_norms, noisy_centers)
    assert_array_almost_equal(inertia_array, inertia_gold)
    assert_array_equal(labels_array, labels_gold)

    # perform label assignment using the sparse CSR input
    x_squared_norms_from_csr = row_norms(X_csr, squared=True)
    labels_csr, inertia_csr = _labels_inertia(
        X_csr, sample_weight, x_squared_norms_from_csr, noisy_centers)
    assert_array_almost_equal(inertia_csr, inertia_gold)
    assert_array_equal(labels_csr, labels_gold)
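# Hedged aside (not part of the tests above): a self-contained NumPy sketch
# of what the "gold" reference computes -- each label is the index of the
# nearest center and the inertia is the sum of squared distances to it.
# All names here (demo_X, demo_centers, demo_labels_inertia) are
# hypothetical, not from the original code.
import numpy as np

def demo_labels_inertia(demo_X, demo_centers):
    # pairwise squared distances, shape (n_samples, n_clusters)
    dists = ((demo_X[:, None, :] - demo_centers[None, :, :]) ** 2).sum(axis=2)
    return dists.argmin(axis=1), dists.min(axis=1).sum()

demo_rng = np.random.RandomState(0)
demo_X = demo_rng.normal(size=(20, 2))
demo_centers = demo_rng.normal(size=(3, 2))
demo_labels, demo_inertia = demo_labels_inertia(demo_X, demo_centers)
assert demo_labels.shape == (20,) and demo_inertia >= 0.0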
def test__labels_constrained_kmeans_parity():
    X = np.array([
        [0, 0], [1, 2], [1, 4], [1, 0],
        [4, 2], [4, 4], [4, 0], [4, 4]
    ]).astype('float')
    centers = np.array([
        [0, 0], [4, 4]
    ]).astype('float')
    # No restrictions and so should be the same as K-means
    size_min, size_max = 0, len(X)

    x_squared_norms = row_norms(X, squared=True)

    distances_constrained = np.zeros(shape=(X.shape[0],), dtype=X.dtype)
    labels_constrained, inertia_constrained = _labels_constrained(
        X, centers, size_min, size_max, distances_constrained)

    distances_kmeans = np.zeros(shape=(X.shape[0],), dtype=X.dtype)
    labels_kmeans, inertia_kmeans = \
        _labels_inertia(X, x_squared_norms, centers,
                        precompute_distances=False,
                        distances=distances_kmeans)

    assert_array_equal(labels_constrained, labels_kmeans)
    assert_almost_equal(distances_constrained, distances_kmeans)
    assert inertia_constrained == inertia_kmeans
def predict(self, X, sample_weight=None):
    """Predict the closest cluster each sample in X belongs to.

    In the vector quantization literature, `cluster_centers_` is called
    the code book and each value returned by `predict` is the index of
    the closest code in the code book.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        New data to predict.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None)

    Returns
    -------
    labels : array, shape [n_samples,]
        Index of the cluster each sample belongs to.
    """
    check_is_fitted(self, 'cluster_centers_')

    X = self._check_test_data(X)

    daal_ready = sample_weight is None and hasattr(X, '__array__')
    # or sp.isspmatrix_csr(X)
    if daal_ready:
        return _daal4py_k_means_dense(
            X, self.n_clusters, 0, 0.0, self.cluster_centers_, 1, None)[1]
    else:
        x_squared_norms = row_norms(X, squared=True)
        return _labels_inertia(X, sample_weight, x_squared_norms,
                               self.cluster_centers_)[0]
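# Hedged aside on why x_squared_norms is precomputed throughout these
# snippets: by the expansion ||x - c||^2 = ||x||^2 - 2 x.c + ||c||^2, the
# per-sample squared norms never change across iterations, so they can be
# computed once and reused. Purely illustrative; all names are hypothetical.
import numpy as np

rng_demo = np.random.RandomState(1)
X_demo = rng_demo.normal(size=(50, 4))
C_demo = rng_demo.normal(size=(3, 4))

x_sq = (X_demo ** 2).sum(axis=1)       # reusable across iterations
c_sq = (C_demo ** 2).sum(axis=1)
expanded = x_sq[:, None] - 2 * X_demo @ C_demo.T + c_sq[None, :]
direct = ((X_demo[:, None, :] - C_demo[None, :, :]) ** 2).sum(axis=2)
assert np.allclose(expanded, direct)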
def fit_predict_score(self, X, weights, init, maxIter=1000):
    self.init_fit(X, weights, init)
    for i in range(maxIter):
        self._e_step()
        self._m_step()
    labels, base_inertia = _labels_inertia(self.X, self.x_squared_norms,
                                           self.ukList)
    # inertia = base_inertia + len(self.X) * self.K * self.X.shape[1] * self.PENALIZATION_CLUSTER
    inertia = 2 * np.log(base_inertia) - np.log(len(self.X)) * self.K
    return labels, -inertia, self.ukList
def inertie(self, uk):
    _, base_inertie = _labels_inertia(self.X, self.x_squared_norms, uk,
                                      precompute_distances=True)
    # s = 0
    # for u in uk:
    #     s += np.square(uk - u).sum(axis=1).sum()
    s = set([s for u in uk for s in np.square(uk - u).sum(axis=1)])
    if len(s) == 1:
        s = 0
    else:
        s.discard(0)
        s = min(s)
    base_inertie -= s / 200
    return base_inertie
def Subspace_iter(X, n_clusters, init='k-means++', max_iter=300, tol=1e-4,
                  tol_eig=-1e-10, x_squared_norms=None, random_state=None):
    random_state = check_random_state(random_state)
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    new_labels, new_inertia, new_centers = None, None, None
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)
    d_shape = X.shape[1]
    randomval = random_state.random_sample(d_shape ** 2).reshape(d_shape,
                                                                 d_shape)
    V_val, _ = np.linalg.qr(randomval, mode='complete')
    m_val = d_shape // 2
    S_D = np.dot(X.T, X)
    P_Cluster = np.eye(m_val, M=d_shape).T
    for i in range(max_iter):
        centers_old = centers.copy()
        X_values = np.dot(np.dot(X, V_val), P_Cluster)
        centers_c = np.dot(np.dot(centers, V_val), P_Cluster)
        labels, _ = pairwise_distances_argmin_min(
            X=X_values, Y=centers_c, metric='euclidean',
            metric_kwargs={'squared': True})
        labels = labels.astype(np.int32)
        centers = _k_means._centers_dense(X, labels, n_clusters, distances)
        S = np.zeros((d_shape, d_shape))
        for it in range(n_clusters):
            X_it = X[:][labels == it] - centers[:][it]
            S += np.dot(X_it.T, X_it)
        Sigma = S - S_D
        EV, _ = np.linalg.eigh(Sigma)
        m = len(np.where(EV < tol_eig)[0])
        P_Cluster = np.eye(m, M=d_shape).T
        inertia = 0.0
        for j in range(n_clusters):
            inertia += row_norms(X[:][labels == j] - centers[:][j],
                                 squared=True).sum()
        if new_inertia is None or inertia < new_inertia:
            new_labels = labels.copy()
            new_centers = centers.copy()
            new_inertia = inertia
        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            break
    if center_shift_total > 0:
        new_labels, new_inertia = _labels_inertia(
            X, x_squared_norms, new_centers,
            precompute_distances=False, distances=distances)
    return new_labels, new_inertia, new_centers, i + 1
def predict(self, X):
    """Predict the closest cluster each sample in X belongs to.

    In the vector quantization literature, `cluster_centers_` is called
    the code book and each value returned by `predict` is the index of
    the closest code in the code book.

    Parameters
    ----------
    X : {array-like, sparse matrix}, shape = [n_samples, n_features]
        New data to predict.

    Returns
    -------
    labels : array, shape [n_samples,]
        Index of the cluster each sample belongs to.
    """
    check_is_fitted(self, 'cluster_centers_')

    # X = self._check_test_data(X)
    x_squared_norms = row_norms(X, squared=True)
    return _labels_inertia(X, x_squared_norms, self.cluster_centers_)[0]
def _mini_batch_step(self, X, x_squared_norms, X_weighted, weights, centers,
                     counts, distances):
    nearest_center, inertia = _labels_inertia(X, np.ones(X.shape[0]),
                                              x_squared_norms, centers,
                                              distances=distances)
    loss = 4 * np.sum((centers[nearest_center] - X) ** 2, axis=1)
    k = centers.shape[0]
    for center_idx in range(k):
        center_mask = nearest_center == center_idx
        count = (center_mask * weights).sum()
        if count > 0:
            centers[center_idx] *= counts[center_idx]
            centers[center_idx] += np.sum(X_weighted[center_mask], axis=0)
            counts[center_idx] += count
            centers[center_idx] /= counts[center_idx]
    return inertia, loss
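# Hedged aside on the in-place center update above: unscaling by the old
# count, adding the new members, and rescaling by the new count is just a
# weighted running mean. Small numeric check; values are illustrative only.
import numpy as np

old_points = np.array([[0.0, 0.0], [2.0, 2.0], [1.0, 0.0], [1.0, 2.0]])
center_demo = old_points.mean(axis=0)          # current center, count = 4
count_demo = len(old_points)
batch_demo = np.array([[2.0, 0.0], [4.0, 2.0]])

center_demo = center_demo * count_demo         # remove previous count scaling
center_demo = center_demo + batch_demo.sum(axis=0)
count_demo = count_demo + len(batch_demo)
center_demo = center_demo / count_demo         # rescale to the new mean

assert np.allclose(center_demo,
                   np.vstack([old_points, batch_demo]).mean(axis=0))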
def _spherical_kmeans_single_lloyd(X, n_clusters, max_iter=300,
                                   init='k-means++', verbose=False,
                                   x_squared_norms=None, random_state=None,
                                   tol=1e-4, precompute_distances=True):
    '''
    Modified from sklearn.cluster.k_means_.k_means_single_lloyd.
    '''
    random_state = check_random_state(random_state)

    best_labels, best_inertia, best_centers = None, None, None

    # init
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # labels assignment
        # TODO: _labels_inertia should be done with cosine distance
        #       since ||a - b|| = 2(1 - cos(a,b)) when a,b are unit
        #       normalized; this doesn't really matter.
        labels, inertia = \
            _labels_inertia(X, x_squared_norms, centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

        # computation of the means
        if sp.issparse(X):
            centers = _k_means._centers_sparse(X, labels, n_clusters,
                                               distances)
        else:
            centers = _k_means._centers_dense(X, labels, n_clusters,
                                              distances)

        # l2-normalize centers (this is the main contribution here)
        centers = normalize(centers)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, x_squared_norms, best_centers,
                            precompute_distances=precompute_distances,
                            distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
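# Hedged numeric check of the identity quoted in the TODO above: for
# unit-norm vectors a and b, ||a - b||^2 = 2 * (1 - cos(a, b)), so
# Euclidean assignment on normalized data matches cosine assignment.
# Illustrative only; names are hypothetical.
import numpy as np

rng_sph = np.random.RandomState(2)
a = rng_sph.normal(size=5)
a /= np.linalg.norm(a)
b = rng_sph.normal(size=5)
b /= np.linalg.norm(b)
assert np.isclose(((a - b) ** 2).sum(), 2.0 * (1.0 - a @ b))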
def partial_fit(self, D):
    """
    Apply one iteration of VR_MBKM

    Input: self, dataset
    Output: self
    Updated:
        - self.curr_iter
        - self.curr_inner_iter
        - self.tot_inner_iter
        - self.cluster_centers_
    """
    ## perform checks on dataset
    D = check_array(D, accept_sparse='csr')
    if hasattr(self.init, '__array__'):
        self.init = np.ascontiguousarray(self.init, dtype=np.float64)

    if self.curr_inner_iter == 0:
        self.inner_loop = 0  # was `==`, a no-op comparison

    if self.curr_iter == 0 or self.inner_loop == 0 or self.update_freq == 0:
        ## OUTER LOOP
        # use the entire dataset
        X = D
        x_squared_norms = row_norms(X, squared=True)
        self.random_state_ = getattr(self, "random_state_",
                                     check_random_state(self.random_state))
        if self.curr_iter == 0:
            ## initialize centers
            if hasattr(self.init, '__array__'):
                self.cluster_centers_ = self.init
            else:
                self.cluster_centers_ = k_means_._init_centroids(
                    X, self.n_clusters, self.init,
                    random_state=self.random_state_,
                    x_squared_norms=x_squared_norms,
                    init_size=self.init_size)
            _, cost = k_means_._labels_inertia(X, x_squared_norms,
                                               self.cluster_centers_)
            # print("Cost of current initial centers on the mini-batch is %r" % cost)
            ## initialize counts
            self.counts_ = np.zeros(self.n_clusters, dtype=np.int32)

        ## this ensures the benchmark centers are either the seeds
        ## or obtained from the last iterate of inner loop
        self.benchmark_centers = self.cluster_centers_.copy()

        ## run Lloyd's update with entire data
        distances = np.zeros(X.shape[0], dtype=np.float64)
        self.benchmark_updates, _, self.squared_diff = _kmeans_step(
            X=X, x_squared_norms=x_squared_norms,
            centers=self.benchmark_centers.copy(),
            distances=distances,
            precompute_distances=self.precompute_distances,
            n_clusters=self.n_clusters)
        self.cluster_centers_ = self.benchmark_updates.copy()
        self.curr_outer_iter += 1
        self.inner_loop = 1
    else:
        ## INNER LOOP:
        # use a mini-batch of data
        sample_idx = random.sample(range(D.shape[0]), self.mbsize)
        X = D[sample_idx, :]
        # x_squared_norms = row_norms(X, squared=True)
        self.set_eta()
        ## run VRMB_step with entire data
        distances = np.zeros(X.shape[0], dtype=np.float64)
        self.cluster_centers_, self.squared_diff, _ = VR_MB_step(
            X, None, self.cluster_centers_.copy(),
            self.benchmark_centers.copy(), self.benchmark_updates.copy(),
            self.counts_, self.curr_iter, np.zeros(0, np.double), 0,
            distances, random_reassign=False,
            random_state=self.random_state_,
            reassignment_ratio=self.reassignment_ratio,
            verbose=self.verbose, learn_rate=self.set_eta())
        # increment inner loop counts
        self.curr_inner_iter = (self.curr_inner_iter + 1) % self.update_freq

    # increment global loop count
    self.curr_iter += 1
def test_minibatch_update_consistency():
    """Check that dense and sparse minibatch update give the same results"""
    rng = np.random.RandomState(42)
    old_centers = centers + rng.normal(size=centers.shape)

    new_centers = old_centers.copy()
    new_centers_csr = old_centers.copy()

    counts = np.zeros(new_centers.shape[0], dtype=np.int32)
    counts_csr = np.zeros(new_centers.shape[0], dtype=np.int32)

    x_squared_norms = (X ** 2).sum(axis=1)
    x_squared_norms_csr = csr_row_norm_l2(X_csr, squared=True)

    buffer = np.zeros(centers.shape[1], dtype=np.double)
    buffer_csr = np.zeros(centers.shape[1], dtype=np.double)

    # extract a small minibatch
    X_mb = X[:10]
    X_mb_csr = X_csr[:10]
    x_mb_squared_norms = x_squared_norms[:10]
    x_mb_squared_norms_csr = x_squared_norms_csr[:10]

    # step 1: compute the dense minibatch update
    old_inertia, incremental_diff = _mini_batch_step(
        X_mb, x_mb_squared_norms, new_centers, counts, buffer, 1)
    assert_true(old_inertia > 0.0)

    # compute the new inertia on the same batch to check that it decreased
    labels, new_inertia = _labels_inertia(
        X_mb, x_mb_squared_norms, new_centers)
    assert_true(new_inertia > 0.0)
    assert_true(new_inertia < old_inertia)

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers - old_centers) ** 2)
    assert_almost_equal(incremental_diff, effective_diff)

    # step 2: compute the sparse minibatch update
    old_inertia_csr, incremental_diff_csr = _mini_batch_step(
        X_mb_csr, x_mb_squared_norms_csr, new_centers_csr, counts_csr,
        buffer_csr, 1)
    assert_true(old_inertia_csr > 0.0)

    # compute the new inertia on the same batch to check that it decreased
    labels_csr, new_inertia_csr = _labels_inertia(
        X_mb_csr, x_mb_squared_norms_csr, new_centers_csr)
    assert_true(new_inertia_csr > 0.0)
    assert_true(new_inertia_csr < old_inertia_csr)

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers_csr - old_centers) ** 2)
    assert_almost_equal(incremental_diff_csr, effective_diff)

    # step 3: check that sparse and dense updates lead to the same results
    assert_array_equal(labels, labels_csr)
    assert_array_almost_equal(new_centers, new_centers_csr)
    assert_almost_equal(incremental_diff, incremental_diff_csr)
    assert_almost_equal(old_inertia, old_inertia_csr)
    assert_almost_equal(new_inertia, new_inertia_csr)
def sub_kmeans_single_(self, X, sample_weight, x_squared_norms, tol,
                       random_state):
    random_state = check_random_state(random_state)
    sample_weight = _check_sample_weight(X, sample_weight)
    best_labels, best_inertia, best_centers = None, None, None
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)
    centers = _init_centroids(X, self.n_clusters, init='k-means++',
                              random_state=random_state,
                              x_squared_norms=x_squared_norms)
    d = X.shape[1]  # dimensionality of original space
    m = d // 2      # dimensionality of clustered space
    SD = np.dot(X.T, X)  # scatter matrix of the dataset in the original space
    # orthonormal matrix of a rigid transformation
    V, _ = np.linalg.qr(random_state.random_sample(d ** 2).reshape(d, d),
                        mode='complete')
    for i in range(self.max_iter):
        centers_old = centers.copy()
        # get the clusters' labels
        labels = self.assignment_step_(X=X, V=V, centers=centers, m=m)
        # compute new centers and sum the clusters' scatter matrices
        centers = _k_means._centers_dense(X, sample_weight, labels,
                                          self.n_clusters, distances)
        S = self.update_step_(X, centers, labels)
        # sorted eigenvalues and eigenvectors of SIGMA=S-SD
        V, m = self.eigen_decomposition_(S - SD)
        if m == 0:
            raise ValueError('Might be a single cluster (m = 0).')
        # inertia - sum of squared distances of samples to their closest
        # cluster center
        inertia = sum([
            row_norms(X[labels == j] - centers[j], squared=True).sum()
            for j in range(self.n_clusters)
        ])
        # print("Iteration %2d, inertia %.3f" % (i, inertia))
        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia
        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            # print("Converged at iteration %d: center shift %e within tolerance %e" % (i, center_shift_total, tol))
            break
    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = _labels_inertia(
            X, sample_weight, x_squared_norms, best_centers,
            precompute_distances=False, distances=distances)
    return best_centers, best_labels, best_inertia
def _k_means_minus_minus(
    X,
    sample_weight,
    n_clusters,
    prop_outliers,
    max_iter=300,
    init="k-means++",
    verbose=False,
    x_squared_norms=None,
    random_state=None,
    tol=1e-4,
    precompute_distances=True,
):
    """A single run of k-means, assumes preparation completed prior.

    Parameters
    ----------
    X : array-like of floats, shape (n_samples, n_features)
        The observations to cluster.

    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.

    n_clusters : int
        The number of clusters to form as well as the number of
        centroids to generate.

    prop_outliers : float
        What proportion of the training dataset X to treat as outliers,
        and to exclude in each iteration of Lloyd's algorithm.

    max_iter : int, optional, default 300
        Maximum number of iterations of the k-means algorithm to run.

    init : {'k-means++', 'random', or ndarray, or a callable}, optional
        Method for initialization, default to 'k-means++':

        'k-means++' : selects initial cluster centers for k-mean
        clustering in a smart way to speed up convergence. See section
        Notes in k_init for more details.

        'random': choose k observations (rows) at random from data for
        the initial centroids.

        If an ndarray is passed, it should be of shape (k, p) and gives
        the initial centers.

        If a callable is passed, it should take arguments X, k and a
        random state and return an initialization.

    tol : float, optional
        The relative increment in the results before declaring convergence.

    verbose : boolean, optional
        Verbosity mode

    x_squared_norms : array
        Precomputed x_squared_norms.

    precompute_distances : boolean, default: True
        Precompute distances (faster but takes more memory).

    random_state : int, RandomState instance or None (default)
        Determines random number generation for centroid initialization.
        Use an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    Returns
    -------
    centroid : float ndarray with shape (k, n_features)
        Centroids found at the last iteration of k-means.

    label : integer ndarray with shape (n_samples,)
        label[i] is the code or index of the centroid the
        i'th observation is closest to.

    inertia : float
        The final value of the inertia criterion (sum of squared distances
        to the closest centroid for all observations in the training set).

    n_iter : int
        Number of iterations run.
    """
    n_outliers = int(X.shape[0] * prop_outliers)
    random_state = check_random_state(random_state)
    sample_weight = _check_normalize_sample_weight(sample_weight, X)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()
        # labels assignment is also called the E-step of EM
        labels, inertia = _labels_inertia(
            X,
            sample_weight,
            x_squared_norms,
            centers,
            precompute_distances=precompute_distances,
            distances=distances,
        )

        # the "minus-minus" modification step - filter out n_outliers of
        # datapoints that are farthest from their assigned cluster centers
        X_subset, sample_weight_subset, labels_subset, distances_subset = (
            X,
            sample_weight,
            labels,
            distances,
        )
        if n_outliers > 0:
            # ~20x faster than np.argsort()
            outlier_indices = np.argpartition(
                distances, -n_outliers)[-n_outliers:]
            X_subset, sample_weight_subset, labels_subset, distances_subset = (
                np.delete(X, outlier_indices, axis=0),
                np.delete(sample_weight, outlier_indices, axis=0),
                np.delete(labels, outlier_indices, axis=0),
                np.delete(distances, outlier_indices, axis=0),
            )
            # indices_to_refit = np.argsort(distances) < (X.shape[0] - n_outliers)
            # X_subset, sample_weight_subset = X[indices_to_refit], sample_weight[indices_to_refit]

        # computation of the means is also called the M-step of EM
        if sp.issparse(X):
            centers = _k_means._centers_sparse(X_subset, sample_weight_subset,
                                               labels_subset, n_clusters,
                                               distances_subset)
        else:
            centers = _k_means._centers_dense(X_subset, sample_weight_subset,
                                              labels_subset, n_clusters,
                                              distances_subset)

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = _labels_inertia(
            X,
            sample_weight,
            x_squared_norms,
            best_centers,
            precompute_distances=precompute_distances,
            distances=distances,
        )

    return best_labels, best_inertia, best_centers, i + 1
def mbkmean(self, options, n_clusters, n_init, batch_size, n_iter,
            n_samples, labels_true, k_means, X):
    # to do with online MBK_mean
    # Compute clustering with MiniBatchKMeans
    mbk = cluster.MiniBatchKMeans(init=self.init, n_clusters=n_clusters,
                                  batch_size=batch_size, n_init=10,
                                  max_no_improvement=n_iter, verbose=0)
    # INIT THREADs
    try:
        if options[2] == '-pp' or options[3] == '-pp':
            thread_1 = afficheur('starting threads', labels_true, mbk,
                                 k_means, X, n_clusters)
            thread_1.start()
    except IndexError:
        pass

    try:
        if options[2] == '-s':
            # init state
            n_batches = int(np.ceil(float(n_samples) / batch_size))
            max_iter = 100
            tol = 0
            _, n_features = X.shape
            old_center_buffer = np.zeros(n_features, dtype=X.dtype)
            random_state = check_random_state(None)
            init_size = 3 * batch_size
            if init_size > n_samples:
                init_size = n_samples
            validation_indices = random_state.randint(0, n_samples,
                                                      init_size)
            X_valid = X[validation_indices]
            x_squared_norms = row_norms(X, squared=True)
            x_squared_norms_valid = x_squared_norms[validation_indices]
            counts = np.zeros(n_clusters, dtype=np.int32)
            best_inertia = None
            cluster_centers = None
            for init_idx in range(n_init):
                cluster_centers = cluster._init_centroids(
                    X, n_clusters, self.init, random_state=random_state,
                    x_squared_norms=x_squared_norms, init_size=init_size)
                batch_inertia, centers_squared_diff = \
                    cluster._mini_batch_step(
                        X_valid, x_squared_norms[validation_indices],
                        cluster_centers, counts, old_center_buffer, False,
                        distances=None, verbose=False)
                _, inertia = cluster._labels_inertia(
                    X_valid, x_squared_norms_valid, cluster_centers)
                if best_inertia is None or inertia < best_inertia:
                    mbk.cluster_centers_ = cluster_centers
                    mbk.counts_ = counts
                    best_inertia = inertia
                    print('best inertia %d' % best_inertia)
            while True:
                thread_1 = afficheur('starting threads', labels_true, mbk,
                                     k_means, X, n_clusters)
                thread_1.start()
                t0 = time.time()
                for iteration_idx in range(n_iter):
                    minibatch_indices = random_state.randint(
                        0, n_samples, batch_size)
                    mbk = mbk.partial_fit(X[minibatch_indices])
                    thread_1.update(mbk)
                t_mini_batch = time.time() - t0
                thread_1.stop()
                thread_1.join()
                n_iter = self.input_num("Next iterations: ")
                if n_iter == "stop":
                    return mbk, t_mini_batch
                if not isinstance(n_iter, int):
                    print('error: an integer is required!\ntype %s'
                          % type(n_iter))
                    break
    except IndexError:
        pass

    try:
        if options[2] == '-pp':
            random_state = check_random_state(None)
            t0 = time.time()
            # Sample a minibatch from the full dataset
            for iteration_idx in range(n_iter - 1):
                minibatch_indices = random_state.randint(
                    0, n_samples, batch_size)
                mbk = mbk.partial_fit(X[minibatch_indices])
                thread_1.update(mbk)
            t_mini_batch = time.time() - t0
            thread_1.stop()
            thread_1.join()
            return mbk, t_mini_batch
    except IndexError:
        pass

    try:
        if options[2] == '-p':
            random_state = check_random_state(None)
            t0 = time.time()
            for iteration_idx in range(n_iter):
                minibatch_indices = random_state.randint(
                    0, n_samples, batch_size)
                mbk = mbk.partial_fit(X[minibatch_indices])
            t_mini_batch = time.time() - t0
            return mbk, t_mini_batch
    except IndexError:
        pass

    try:
        if options[2] == '-n':
            t0 = time.time()
            mbk = mbk.fit(X)
            t_mini_batch = time.time() - t0
            return mbk, t_mini_batch
    except IndexError:
        pass

    try:
        if options[2] is None:
            random_state = check_random_state(None)
            # Sample a minibatch from the full dataset
            t0 = time.time()
            for iteration_idx in range(n_iter - 1):
                minibatch_indices = random_state.randint(
                    0, n_samples, self.batch_size)
                mbk = mbk.partial_fit(X, minibatch_indices=minibatch_indices)
            t_mini_batch = time.time() - t0
            return mbk, t_mini_batch
    except IndexError:
        pass

    try:
        if options[2] == '-o':
            n_batches = int(np.ceil(float(n_samples) / batch_size))
            max_iter = 100
            n_iter = int(max_iter * n_batches)
            tol = 0
            _, n_features = X.shape
            old_center_buffer = np.zeros(n_features, dtype=X.dtype)
            try:
                # print('self.max_iter %d , n_batches %d' % (n_iter, n_batches))
                if options[3] == '-pp':
                    # init state
                    random_state = check_random_state(None)
                    init_size = 3 * batch_size
                    if init_size > n_samples:
                        init_size = n_samples
                    validation_indices = random_state.randint(
                        0, n_samples, init_size)
                    X_valid = X[validation_indices]
                    x_squared_norms = row_norms(X, squared=True)
                    x_squared_norms_valid = \
                        x_squared_norms[validation_indices]
                    counts = np.zeros(n_clusters, dtype=np.int32)
                    best_inertia = None
                    cluster_centers = None
                    # Random init with minimum inertia
                    for init_idx in range(n_init):
                        cluster_centers = cluster._init_centroids(
                            X, n_clusters, self.init,
                            random_state=random_state,
                            x_squared_norms=x_squared_norms,
                            init_size=init_size)
                        batch_inertia, centers_squared_diff = \
                            cluster._mini_batch_step(
                                X_valid,
                                x_squared_norms[validation_indices],
                                cluster_centers, counts, old_center_buffer,
                                False, distances=None, verbose=False)
                        _, inertia = cluster._labels_inertia(
                            X_valid, x_squared_norms_valid, cluster_centers)
                        if best_inertia is None or inertia < best_inertia:
                            mbk.cluster_centers_ = cluster_centers
                            mbk.counts_ = counts
                            best_inertia = inertia
                            print('best inertia %d' % best_inertia)
                    convergence_context = {}
                    mbk.batch_inertia = batch_inertia
                    mbk.centers_squared_diff = centers_squared_diff
                    t0 = time.time()
                    for iteration_idx in range(n_iter):
                        minibatch_indices = random_state.randint(
                            0, n_samples, batch_size)
                        mbk = mbk.partial_fit(X[minibatch_indices])
                        tol = self._tolerance(X, tol)
                        thread_1.update(mbk)
                        # Monitor convergence and do early stopping if
                        # necessary
                        if cluster._mini_batch_convergence(
                                mbk, iteration_idx, n_iter, tol, n_samples,
                                mbk.centers_squared_diff, mbk.batch_inertia,
                                convergence_context, verbose=mbk.verbose):
                            t_mini_batch = time.time() - t0
                            thread_1.stop()
                            thread_1.join()
                            return mbk, t_mini_batch
                elif options[3] == '-p':
                    random_state = check_random_state(None)
                    convergence_context = {}
                    t0 = time.time()
                    for iteration_idx in range(n_iter):
                        minibatch_indices = random_state.randint(
                            0, n_samples, batch_size)
                        mbk = mbk.partial_fit(X[minibatch_indices])
                        tol = self._tolerance(X, tol)
                        # Monitor convergence and do early stopping if
                        # necessary
                        if cluster._mini_batch_convergence(
                                mbk, iteration_idx, n_iter, tol, n_samples,
                                mbk.centers_squared_diff, mbk.batch_inertia,
                                convergence_context, verbose=False):
                            t_mini_batch = time.time() - t0
                            return mbk, t_mini_batch
            except IndexError:
                pass
    except IndexError:
        pass
def test_minibatch_update_consistency():
    # Check that dense and sparse minibatch update give the same results
    rng = np.random.RandomState(42)
    old_centers = centers + rng.normal(size=centers.shape)

    new_centers = old_centers.copy()
    new_centers_csr = old_centers.copy()

    weight_sums = np.zeros(new_centers.shape[0], dtype=np.double)
    weight_sums_csr = np.zeros(new_centers.shape[0], dtype=np.double)

    x_squared_norms = (X ** 2).sum(axis=1)
    x_squared_norms_csr = row_norms(X_csr, squared=True)

    buffer = np.zeros(centers.shape[1], dtype=np.double)
    buffer_csr = np.zeros(centers.shape[1], dtype=np.double)

    # extract a small minibatch
    X_mb = X[:10]
    X_mb_csr = X_csr[:10]
    x_mb_squared_norms = x_squared_norms[:10]
    x_mb_squared_norms_csr = x_squared_norms_csr[:10]

    sample_weight_mb = np.ones(X_mb.shape[0], dtype=np.double)

    # step 1: compute the dense minibatch update
    old_inertia, incremental_diff = _mini_batch_step(
        X_mb, sample_weight_mb, x_mb_squared_norms, new_centers,
        weight_sums, buffer, 1, None, random_reassign=False)
    assert_greater(old_inertia, 0.0)

    # compute the new inertia on the same batch to check that it decreased
    labels, new_inertia = _labels_inertia(
        X_mb, sample_weight_mb, x_mb_squared_norms, new_centers)
    assert_greater(new_inertia, 0.0)
    assert_less(new_inertia, old_inertia)

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers - old_centers) ** 2)
    assert_almost_equal(incremental_diff, effective_diff)

    # step 2: compute the sparse minibatch update
    old_inertia_csr, incremental_diff_csr = _mini_batch_step(
        X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr,
        weight_sums_csr, buffer_csr, 1, None, random_reassign=False)
    assert_greater(old_inertia_csr, 0.0)

    # compute the new inertia on the same batch to check that it decreased
    labels_csr, new_inertia_csr = _labels_inertia(
        X_mb_csr, sample_weight_mb, x_mb_squared_norms_csr, new_centers_csr)
    assert_greater(new_inertia_csr, 0.0)
    assert_less(new_inertia_csr, old_inertia_csr)

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers_csr - old_centers) ** 2)
    assert_almost_equal(incremental_diff_csr, effective_diff)

    # step 3: check that sparse and dense updates lead to the same results
    assert_array_equal(labels, labels_csr)
    assert_array_almost_equal(new_centers, new_centers_csr)
    assert_almost_equal(incremental_diff, incremental_diff_csr)
    assert_almost_equal(old_inertia, old_inertia_csr)
    assert_almost_equal(new_inertia, new_inertia_csr)
def partial_fit(self, X, y=None, sample_weight=None):
    """Update k means estimate on a single mini-batch X.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Coordinates of the data points to cluster. It must be noted that
        X will be copied if it is not C-contiguous.

    y : Ignored
        Not used, present here for API consistency by convention.

    sample_weight : array-like, shape (n_samples,), optional
        The weights for each observation in X. If None, all observations
        are assigned equal weight (default: None).

    Returns
    -------
    self
    """
    X = check_array(X, accept_sparse="csr", order="C",
                    dtype=[np.float64, np.float32])
    n_samples, n_features = X.shape
    if hasattr(self.init, '__array__'):
        self.init = np.ascontiguousarray(self.init, dtype=X.dtype)

    if n_samples == 0:
        return self

    # unit-normalize for spherical k-means
    X = normalize(X)

    sample_weight = _check_normalize_sample_weight(sample_weight, X)

    x_squared_norms = row_norms(X, squared=True)
    self.random_state_ = getattr(self, "random_state_",
                                 check_random_state(self.random_state))
    if (not hasattr(self, 'counts_')
            or not hasattr(self, 'cluster_centers_')):
        # this is the first call partial_fit on this object:
        # initialize the cluster centers
        self.cluster_centers_ = _init_centroids(
            X, self.n_clusters, self.init,
            random_state=self.random_state_,
            x_squared_norms=x_squared_norms, init_size=self.init_size)

        self.counts_ = np.zeros(self.n_clusters, dtype=sample_weight.dtype)
        random_reassign = False
        distances = None
    else:
        # The lower the minimum count is, the more we do random
        # reassignment, however, we don't want to do random
        # reassignment too often, to allow for building up counts
        random_reassign = self.random_state_.randint(
            10 * (1 + self.counts_.min())) == 0
        distances = np.zeros(X.shape[0], dtype=X.dtype)

    self.cluster_centers_ = normalize(self.cluster_centers_)
    _mini_batch_spherical_step(X, sample_weight, x_squared_norms,
                               self.cluster_centers_, self.counts_,
                               np.zeros(0, dtype=X.dtype), 0,
                               random_reassign=random_reassign,
                               distances=distances,
                               random_state=self.random_state_,
                               reassignment_ratio=self.reassignment_ratio,
                               verbose=self.verbose)
    self.cluster_centers_ = normalize(self.cluster_centers_)

    if self.compute_labels:
        self.labels_, self.inertia_ = _labels_inertia(
            X, sample_weight, x_squared_norms, self.cluster_centers_)

    return self
def _mini_batch_spherical_step(X, sample_weight, x_squared_norms, centers,
                               weight_sums, old_center_buffer,
                               compute_squared_diff, distances,
                               random_reassign=False, random_state=None,
                               reassignment_ratio=.01, verbose=False):
    """Incremental update of the centers for the Minibatch K-Means algorithm.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The original data array.

    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.

    x_squared_norms : array, shape (n_samples,)
        Squared euclidean norm of each data point.

    centers : array, shape (k, n_features)
        The cluster centers. This array is MODIFIED IN PLACE

    weight_sums : array, shape (k,)
        The vector in which we keep track of the numbers of elements in a
        cluster. This array is MODIFIED IN PLACE

    distances : array, dtype float, shape (n_samples), optional
        If not None, should be a pre-allocated array that will be used to
        store the distances of each sample to its closest center.
        May not be None when random_reassign is True.

    random_state : int, RandomState instance or None (default)
        Determines random number generation for centroid initialization and
        to pick new clusters amongst observations with uniform probability.
        Use an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    random_reassign : boolean, optional
        If True, centers with very low counts are randomly reassigned
        to observations.

    reassignment_ratio : float, optional
        Control the fraction of the maximum number of counts for a
        center to be reassigned. A higher value means that low count
        centers are more likely to be reassigned, which means that the
        model will take longer to converge, but should converge in a
        better clustering.

    verbose : bool, optional, default False
        Controls the verbosity.

    compute_squared_diff : bool
        If set to False, the squared diff computation is skipped.

    old_center_buffer : int
        Copy of old centers for monitoring convergence.

    Returns
    -------
    inertia : float
        Sum of squared distances of samples to their closest cluster center.

    squared_diff : numpy array, shape (n_clusters,)
        Squared distances between previous and updated cluster centers.
    """
    # Perform label assignment to nearest centers
    nearest_center, inertia = _labels_inertia(X, sample_weight,
                                              x_squared_norms, centers,
                                              distances=distances)

    if random_reassign and reassignment_ratio > 0:
        random_state = check_random_state(random_state)
        # Reassign clusters that have very low weight
        to_reassign = weight_sums < reassignment_ratio * weight_sums.max()
        # pick at most .5 * batch_size samples as new centers
        if to_reassign.sum() > .5 * X.shape[0]:
            indices_dont_reassign = \
                np.argsort(weight_sums)[int(.5 * X.shape[0]):]
            to_reassign[indices_dont_reassign] = False
        n_reassigns = to_reassign.sum()
        if n_reassigns:
            # Pick new clusters amongst observations with uniform probability
            new_centers = random_state.choice(X.shape[0], replace=False,
                                              size=n_reassigns)
            if verbose:
                print("[MiniBatchKMeans] Reassigning %i cluster centers."
                      % n_reassigns)

            if sp.issparse(X) and not sp.issparse(centers):
                assign_rows_csr(
                    X, new_centers.astype(np.intp, copy=False),
                    np.where(to_reassign)[0].astype(np.intp, copy=False),
                    centers)
            else:
                centers[to_reassign] = X[new_centers]
        # reset counts of reassigned centers, but don't reset them too small
        # to avoid instant reassignment. This is a pretty dirty hack as it
        # also modifies the learning rates.
        weight_sums[to_reassign] = np.min(weight_sums[~to_reassign])

    # implementation for the sparse CSR representation completely written in
    # cython
    if sp.issparse(X):
        return inertia, _mini_batch_update_csr(
            X, sample_weight, x_squared_norms, centers, weight_sums,
            nearest_center, old_center_buffer, compute_squared_diff)

    # dense variant in mostly numpy (not as memory efficient though)
    k = centers.shape[0]
    squared_diff = 0.0
    for center_idx in range(k):
        # find points from minibatch that are assigned to this center
        center_mask = nearest_center == center_idx
        wsum = sample_weight[center_mask].sum()

        if wsum > 0:
            if compute_squared_diff:
                old_center_buffer[:] = centers[center_idx]

            # inplace remove previous count scaling
            centers[center_idx] *= weight_sums[center_idx]

            # inplace sum with new points members of this cluster
            centers[center_idx] += \
                np.sum(X[center_mask] *
                       sample_weight[center_mask, np.newaxis], axis=0)

            # unit-normalize for spherical k-means
            centers[center_idx] = normalize(centers[center_idx, None])[:, 0]

            # update the squared diff if necessary
            if compute_squared_diff:
                diff = centers[center_idx].ravel() - old_center_buffer.ravel()
                squared_diff += np.dot(diff, diff)

    return inertia, squared_diff
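# Hedged note on the per-center normalization above: normalize(v[None, :])
# rescales the row to unit L2 norm (the same as v / ||v||), which is what
# keeps the centers on the unit sphere between updates. Illustrative values.
import numpy as np
from sklearn.preprocessing import normalize

v = np.array([3.0, 4.0])
u = normalize(v[None, :])[0]
assert np.allclose(u, v / 5.0) and np.isclose(np.linalg.norm(u), 1.0)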
def partial_fit(self, X, y=None):
    """Override partial_fit() in MiniBatchKMeans class

    (Jan-16: added a return var: squared_diff)
    (April-16: changed set_eta as an internal step)

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        Coordinates of the data points to cluster.
    """
    X = check_array(X, accept_sparse="csr")
    n_samples, n_features = X.shape
    if hasattr(self.init, '__array__'):
        self.init = np.ascontiguousarray(self.init, dtype=np.float64)

    if n_samples == 0:
        return self

    x_squared_norms = row_norms(X, squared=True)
    self.random_state_ = getattr(self, "random_state_",
                                 check_random_state(self.random_state))
    if (not hasattr(self, 'counts_')
            or not hasattr(self, 'cluster_centers_')):
        # this is the first call partial_fit on this object:
        # initialize the cluster centers
        # pdb.set_trace()
        if hasattr(self.init, '__array__'):
            self.cluster_centers_ = self.init
        else:
            self.cluster_centers_ = k_means_._init_centroids(
                X, self.n_clusters, self.init,
                random_state=self.random_state_,
                x_squared_norms=x_squared_norms,
                init_size=self.init_size)
        _, cost = k_means_._labels_inertia(X, x_squared_norms,
                                           self.cluster_centers_)
        print("Cost of current initial centers on the mini-batch is %r "
              % cost)
        self.counts_ = np.zeros(self.n_clusters, dtype=np.int32)
        # random_reassign = False
        distances = None
        self.curr_iter = 1
    else:
        # The lower the minimum count is, the more we do random
        # reassignment, however, we don't want to do random
        # reassignment too often, to allow for building up counts
        # random_reassign = self.random_state_.randint(
        #     10 * (1 + self.counts_.min())) == 0
        distances = np.zeros(X.shape[0], dtype=np.float64)

        # modification HERE
        # self.set_eta()
        self.cluster_centers_, self.squared_diff, _ = MB_step(
            X, x_squared_norms, self.cluster_centers_, self.counts_,
            self.curr_iter, np.zeros(0, np.double), 0,
            random_reassign=False, distances=distances,
            random_state=self.random_state_,
            reassignment_ratio=self.reassignment_ratio,
            verbose=self.verbose, learn_rate=self.set_eta())
        self.curr_iter = self.curr_iter + 1

    if self.compute_labels:
        self.labels_, self.inertia_ = k_means_._labels_inertia(
            X, x_squared_norms, self.cluster_centers_)

    return self
def _kmeans_step(X, x_squared_norms, centers, distances,
                 precompute_distances, n_clusters, random_state=None):
    """Incremental update of the centers for the Minibatch K-Means algorithm.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The original data array.

    x_squared_norms : array, shape (n_samples,)
        Squared euclidean norm of each data point.

    centers : array, shape (k, n_features)
        The cluster centers. This array is MODIFIED IN PLACE

    distances : array, dtype float64, shape (n_samples), optional
        If not None, should be a pre-allocated array that will be used to
        store the distances of each sample to its closest center.
        May not be None when random_reassign is True.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    Returns
    -------
    inertia : float
        Sum of distances of samples to their closest cluster center.

    squared_diff : numpy array, shape (n_clusters,)
        Squared distances between previous and updated cluster centers.
    """
    centers_old = centers.copy()
    # labels assignment is also called the E-step of EM
    labels, inertia = k_means_._labels_inertia(
        X, x_squared_norms, centers,
        precompute_distances=precompute_distances, distances=distances)

    # computation of the means is also called the M-step of EM
    if sp.issparse(X):
        centers = _k_means._centers_sparse(X, labels, n_clusters, distances)
    else:
        centers = _k_means._centers_dense(X, labels, n_clusters, distances)

    # if best_inertia is None or inertia < best_inertia:
    #     best_labels = labels.copy()
    #     best_centers = centers.copy()
    #     best_inertia = inertia

    shift = squared_norm(centers_old - centers)

    # if shift <= tol:
    #     if verbose:
    #         print("Converged at iteration %d" % i)
    #     break
    #
    # if shift > 0:
    #     # rerun E-step in case of non-convergence so that predicted labels
    #     # match cluster centers
    #     best_labels, best_inertia = \
    #         _labels_inertia(X, x_squared_norms, best_centers,
    #                         precompute_distances=precompute_distances,
    #                         distances=distances)

    return centers, inertia, shift
def subspace_kmeans_single(X, sample_weight, n_clusters, init='k-means++',
                           max_iter=300, tol=1e-4, tol_eig=-1e-10,
                           verbose=False, x_squared_norms=None,
                           random_state=None):
    random_state = check_random_state(random_state)
    sample_weight = _check_sample_weight(X, sample_weight)

    best_labels, best_inertia, best_centers = None, None, None
    # init
    centers = _init_centroids(X, n_clusters, init, random_state=random_state,
                              x_squared_norms=x_squared_norms)
    if verbose:
        print("Initialization complete")

    # Allocate memory to store the distances for each sample to its
    # closer center for reallocation in case of ties
    distances = np.zeros(shape=(X.shape[0],), dtype=X.dtype)

    # === Beginning of original implementation of initialization ===

    # Dimensionality of original space
    d = X.shape[1]

    # Set initial V as QR-decomposed Q of random matrix
    rand_vals = random_state.random_sample(d ** 2).reshape(d, d)
    V, _ = np.linalg.qr(rand_vals, mode='complete')

    # Set initial m as d/2
    m = d // 2

    # Scatter matrix of the dataset in the original space
    S_D = np.dot(X.T, X)

    # Projection onto the first m attributes
    P_C = np.eye(m, M=d).T

    # === End of original implementation of initialization ===

    # iterations
    for i in range(max_iter):
        centers_old = centers.copy()

        # === Beginning of original implementation of E-step of EM ===
        X_C = np.dot(np.dot(X, V), P_C)
        mu_C = np.dot(np.dot(centers, V), P_C)
        labels, _ = pairwise_distances_argmin_min(
            X=X_C, Y=mu_C, metric='euclidean',
            metric_kwargs={'squared': True})
        labels = labels.astype(np.int32)
        # === End of original implementation of E-step of EM ===

        # computation of the means is also called the M-step of EM
        centers = _k_means._centers_dense(X, sample_weight, labels,
                                          n_clusters, distances)

        # === Beginning of original implementation of M-step of EM ===
        # NOTE: use `k` for the per-cluster loops so the outer iteration
        # counter `i` is not shadowed (the original reused `i` here).
        S = np.zeros((d, d))
        for k in range(n_clusters):
            X_k = X[:][labels == k] - centers[:][k]
            S += np.dot(X_k.T, X_k)
        Sigma = S - S_D
        evals, evecs = np.linalg.eigh(Sigma)
        idx = np.argsort(evals)[::1]
        V = evecs[:, idx]
        m = len(np.where(evals < tol_eig)[0])
        if m == 0:
            raise ValueError(
                'Dimensionality of clustered space is 0. '
                'The dataset is better explained by a single cluster.')
        P_C = np.eye(m, M=d).T
        inertia = 0.0
        for k in range(n_clusters):
            inertia += row_norms(X[:][labels == k] - centers[:][k],
                                 squared=True).sum()
        # === End of original implementation of M-step of EM ===

        if verbose:
            print("Iteration %2d, inertia %.3f" % (i, inertia))

        if best_inertia is None or inertia < best_inertia:
            best_labels = labels.copy()
            best_centers = centers.copy()
            best_inertia = inertia

        center_shift_total = squared_norm(centers_old - centers)
        if center_shift_total <= tol:
            if verbose:
                print("Converged at iteration %d: "
                      "center shift %e within tolerance %e"
                      % (i, center_shift_total, tol))
            break

    if center_shift_total > 0:
        # rerun E-step in case of non-convergence so that predicted labels
        # match cluster centers
        best_labels, best_inertia = \
            _labels_inertia(X, sample_weight, x_squared_norms, best_centers,
                            precompute_distances=False, distances=distances)

    return best_labels, best_inertia, best_centers, i + 1
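# Hedged illustration of the eigendecomposition step above: the clustered
# space keeps the eigenvectors of Sigma = S - S_D whose eigenvalues fall
# below tol_eig, and P_C projects onto those first m directions. The
# matrix below is synthetic and purely illustrative.
import numpy as np

tol_eig_demo = -1e-10
Sigma_demo = np.diag([-3.0, -1.0, 0.5, 2.0])   # symmetric by construction
evals_demo, evecs_demo = np.linalg.eigh(Sigma_demo)
m_demo = int((evals_demo < tol_eig_demo).sum())
V_demo = evecs_demo[:, np.argsort(evals_demo)]
P_C_demo = np.eye(m_demo, M=Sigma_demo.shape[0]).T
assert m_demo == 2 and P_C_demo.shape == (4, 2)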
def MB_step(X, x_squared_norms, centers, counts, curr_iter,
            old_center_buffer, compute_squared_diff, distances,
            random_reassign=False, random_state=None,
            reassignment_ratio=.01, verbose=False, learn_rate=0.0):
    """Incremental update of the centers for the Minibatch K-Means algorithm.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The original data array.

    x_squared_norms : array, shape (n_samples,)
        Squared euclidean norm of each data point.

    centers : array, shape (k, n_features)
        The cluster centers. This array is MODIFIED IN PLACE

    counts : array, shape (k,)
        The vector in which we keep track of the numbers of elements in a
        cluster. This array is MODIFIED IN PLACE

    distances : array, dtype float64, shape (n_samples), optional
        If not None, should be a pre-allocated array that will be used to
        store the distances of each sample to its closest center.
        May not be None when random_reassign is True.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    random_reassign : boolean, optional
        If True, centers with very low counts are randomly reassigned
        to observations.

    reassignment_ratio : float, optional
        Control the fraction of the maximum number of counts for a
        center to be reassigned. A higher value means that low count
        centers are more likely to be reassigned, which means that the
        model will take longer to converge, but should converge in a
        better clustering.

    verbose : bool, optional, default False
        Controls the verbosity.

    compute_squared_diff : bool
        If set to False, the squared diff computation is skipped.

    old_center_buffer : int
        Copy of old centers for monitoring convergence.

    learn_rate : float
        Learning rate.

    Returns
    -------
    centers : array
        Updated centers.

    squared_diff : numpy array, shape (n_clusters,)
        Squared distances between previous and updated cluster centers.

    inertia : float
        Sum of distances of samples to their closest cluster center.
    """
    # Perform label assignment to nearest centers
    nearest_center, inertia = k_means_._labels_inertia(X, x_squared_norms,
                                                       centers,
                                                       distances=distances)
    if random_reassign and reassignment_ratio > 0:
        random_state = check_random_state(random_state)
        # Reassign clusters that have very low counts
        to_reassign = counts < reassignment_ratio * counts.max()
        # pick at most .5 * batch_size samples as new centers
        if to_reassign.sum() > .5 * X.shape[0]:
            indices_dont_reassign = np.argsort(counts)[int(.5 * X.shape[0]):]
            to_reassign[indices_dont_reassign] = False
        n_reassigns = to_reassign.sum()
        if n_reassigns:
            # Pick new clusters amongst observations with uniform probability
            new_centers = choice(X.shape[0], replace=False, size=n_reassigns,
                                 random_state=random_state)
            if verbose:
                print("[MiniBatchKMeans] Reassigning %i cluster centers."
                      % n_reassigns)
            if sp.issparse(X) and not sp.issparse(centers):
                assign_rows_csr(X, astype(new_centers, np.intp),
                                astype(np.where(to_reassign)[0], np.intp),
                                centers)
            else:
                centers[to_reassign] = X[new_centers]
        # reset counts of reassigned centers, but don't reset them too small
        # to avoid instant reassignment. This is a pretty dirty hack as it
        # also modifies the learning rates.
        counts[to_reassign] = np.min(counts[~to_reassign])

    squared_diff = 0.0

    ## implementation for the sparse CSR representation completely written in
    # cython
    if sp.issparse(X):
        if compute_squared_diff:
            old_center_buffer = centers
        # rand_vec = make_rand_vector(X.shape[1])
        # learn_rate = 0.0
        centers = _MB_step._mini_batch_update_csr(
            X, x_squared_norms, centers, counts, nearest_center,
            old_center_buffer, compute_squared_diff, curr_iter, learn_rate)
        if compute_squared_diff:
            diff = centers - old_center_buffer
            squared_diff = row_norms(diff, squared=True).sum()
        return centers, squared_diff, inertia

    ## dense variant in mostly numpy (not as memory efficient though)
    k = centers.shape[0]
    for center_idx in range(k):
        # find points from minibatch that are assigned to this center
        center_mask = nearest_center == center_idx
        old_count = counts[center_idx]
        this_count = center_mask.sum()
        counts[center_idx] += this_count  # update counts

        if this_count > 0:
            new_count = counts[center_idx]
            if compute_squared_diff:
                old_center_buffer[:] = centers[center_idx]

            # inplace remove previous count scaling
            # centers[center_idx] *= counts[center_idx]
            # inplace sum with new points members of this cluster
            # centers[center_idx] += np.sum(X[center_mask], axis=0)
            # update the count statistics for this center
            # counts[center_idx] += count
            # inplace rescale to compute mean of all points (old and new)
            # centers[center_idx] /= counts[center_idx]
            new_center = np.sum(X[center_mask], axis=0)
            if learn_rate == 0.0:
                learn_rate = (new_count - old_count) / float(new_count)
            centers[center_idx] = centers[center_idx] + learn_rate * (
                new_center / (new_count - old_count) - centers[center_idx])

            # update the squared diff if necessary
            if compute_squared_diff:
                diff = centers[center_idx].ravel() - old_center_buffer.ravel()
                squared_diff += np.dot(diff, diff)

    return centers, squared_diff, inertia
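# Hedged numeric check of the convex update in MB_step above: with the
# default learning rate (new_count - old_count) / new_count, the update
# c <- c + lr * (batch_mean - c) reduces exactly to the running mean of
# all points seen so far. Values are illustrative only.
import numpy as np

old_center = np.array([1.0, 3.0])
old_count = 6.0
batch = np.array([[4.0, 0.0], [2.0, 2.0], [0.0, 4.0]])
new_count = old_count + len(batch)

lr = (new_count - old_count) / new_count
updated = old_center + lr * (batch.mean(axis=0) - old_center)
running_mean = (old_center * old_count + batch.sum(axis=0)) / new_count
assert np.allclose(updated, running_mean)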
def func(dat_matrix):
    x_squared_norms = row_norms(dat_matrix, squared=True)
    inertias = _labels_inertia(dat_matrix, x_squared_norms,
                               km.cluster_centers_)[1]
    return inertias
def fit(self, X, y=None):
    """Compute the centroids on X by chunking it into mini-batches.

    Parameters
    ----------
    X : array-like or sparse matrix, shape=(n_samples, n_features)
        Training instances to cluster.

    y : Ignored
    """
    random_state = check_random_state(self.random_state)
    X = check_array(X, accept_sparse="csr", order='C',
                    dtype=[np.float64, np.float32])
    n_samples, n_features = X.shape
    if n_samples < self.n_clusters:
        raise ValueError("Number of samples smaller than number "
                         "of clusters.")

    n_init = self.n_init
    if hasattr(self.init, '__array__'):
        self.init = np.ascontiguousarray(self.init, dtype=X.dtype)
        if n_init != 1:
            warnings.warn(
                'Explicit initial center position passed: '
                'performing only one init in MiniBatchKMeans instead of '
                'n_init=%d' % self.n_init, RuntimeWarning, stacklevel=2)
            n_init = 1

    x_squared_norms = k_means_.row_norms(X, squared=True)

    if self.tol > 0.0:
        tol = k_means_._tolerance(X, self.tol)

        # using tol-based early stopping needs the allocation of a
        # dedicated buffer which can be expensive for high dim data:
        # hence we allocate it outside of the main loop
        old_center_buffer = np.zeros(n_features, dtype=X.dtype)
    else:
        tol = 0.0
        # no need for the center buffer if tol-based early stopping is
        # disabled
        old_center_buffer = np.zeros(0, dtype=X.dtype)

    distances = np.zeros(self.batch_size, dtype=X.dtype)
    n_batches = int(np.ceil(float(n_samples) / self.batch_size))
    n_iter = int(self.max_iter * n_batches)

    init_size = self.init_size
    if init_size is None:
        init_size = 3 * self.batch_size
    if init_size > n_samples:
        init_size = n_samples
    self.init_size_ = init_size

    validation_indices = random_state.randint(0, n_samples, init_size)
    X_valid = X[validation_indices]
    x_squared_norms_valid = x_squared_norms[validation_indices]

    # perform several inits with random sub-sets
    best_inertia = None
    for init_idx in range(n_init):
        if self.verbose:
            print("Init %d/%d with method: %s"
                  % (init_idx + 1, n_init, self.init))
        counts = np.zeros(self.n_clusters, dtype=np.int32)

        # TODO: once the `k_means` function works with sparse input we
        # should refactor the following init to use it instead.

        # Initialize the centers using only a fraction of the data as we
        # expect n_samples to be very large when using MiniBatchKMeans
        cluster_centers = k_means_._init_centroids(
            X, self.n_clusters, self.init,
            random_state=random_state,
            x_squared_norms=x_squared_norms,
            init_size=init_size)

        # Compute the label assignment on the init dataset
        batch_inertia, centers_squared_diff = k_means_._mini_batch_step(
            X_valid, x_squared_norms[validation_indices],
            cluster_centers, counts, old_center_buffer, False,
            distances=None, verbose=self.verbose)

        # Keep only the best cluster centers across independent inits on
        # the common validation set
        _, inertia = k_means_._labels_inertia(X_valid,
                                              x_squared_norms_valid,
                                              cluster_centers)
        if self.verbose:
            print("Inertia for init %d/%d: %f"
                  % (init_idx + 1, n_init, inertia))
        if best_inertia is None or inertia < best_inertia:
            self.cluster_centers_ = cluster_centers
            self.counts_ = counts
            best_inertia = inertia

    # Empty context to be used inplace by the convergence check routine
    convergence_context = {}

    # Perform the iterative optimization until the final convergence
    # criterion
    for iteration_idx in range(n_iter):
        # Sample a minibatch from the full dataset
        minibatch_indices = random_state.randint(0, n_samples,
                                                 self.batch_size)

        # Perform the actual update step on the minibatch data
        batch_inertia, centers_squared_diff = k_means_._mini_batch_step(
            X[minibatch_indices],
            x_squared_norms[minibatch_indices],
            self.cluster_centers_, self.counts_,
            old_center_buffer, tol > 0.0, distances=distances,
            # Here we randomly choose whether to perform
            # random reassignment: the choice is done as a function
            # of the iteration index, and the minimum number of
            # counts, in order to force this reassignment to happen
            # every once in a while
            random_reassign=((iteration_idx + 1)
                             % (10 + self.counts_.min()) == 0),
            random_state=random_state,
            reassignment_ratio=self.reassignment_ratio,
            verbose=self.verbose)

        # Monitor convergence and do early stopping if necessary
        if k_means_._mini_batch_convergence(
                self, iteration_idx, n_iter, tol, n_samples,
                centers_squared_diff, batch_inertia, convergence_context,
                verbose=self.verbose):
            break

    self.n_iter_ = iteration_idx + 1

    if self.compute_labels:
        self.labels_, self.inertia_ = self._labels_inertia_minibatch(X)

    return self
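# Hedged usage sketch for a fit() like the one above, via the public
# scikit-learn API (which wraps the same init / mini-batch-step /
# convergence loop). Parameter values are illustrative.
import numpy as np
from sklearn.cluster import MiniBatchKMeans

rng_fit = np.random.RandomState(3)
X_fit = rng_fit.normal(size=(500, 8))
mbk_demo = MiniBatchKMeans(n_clusters=4, batch_size=64, n_init=3,
                           random_state=3).fit(X_fit)
assert mbk_demo.cluster_centers_.shape == (4, 8)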
def _e_step(self):
    labels, _ = _labels_inertia(self.X, self.x_squared_norms, self.ukList,
                                precompute_distances=True)
    self.rkn = np.array([labels == k for k in range(self.K)])
def test_minibatch_update_consistency():
    # Check that dense and sparse minibatch update give the same results
    rng = np.random.RandomState(42)
    old_centers = centers + rng.normal(size=centers.shape)

    new_centers = old_centers.copy()
    new_centers_csr = old_centers.copy()

    counts = np.zeros(new_centers.shape[0], dtype=np.int32)
    counts_csr = np.zeros(new_centers.shape[0], dtype=np.int32)

    x_squared_norms = (X ** 2).sum(axis=1)
    x_squared_norms_csr = row_norms(X_csr, squared=True)

    buffer = np.zeros(centers.shape[1], dtype=np.double)
    buffer_csr = np.zeros(centers.shape[1], dtype=np.double)

    # extract a small minibatch
    X_mb = X[:10]
    X_mb_csr = X_csr[:10]
    x_mb_squared_norms = x_squared_norms[:10]
    x_mb_squared_norms_csr = x_squared_norms_csr[:10]

    # step 1: compute the dense minibatch update
    old_inertia, incremental_diff = _mini_batch_step(
        X_mb, x_mb_squared_norms, new_centers, counts,
        buffer, 1, None, random_reassign=False)
    assert_greater(old_inertia, 0.0)

    # compute the new inertia on the same batch to check that it decreased
    labels, new_inertia = _labels_inertia(
        X_mb, x_mb_squared_norms, new_centers)
    assert_greater(new_inertia, 0.0)
    assert_less(new_inertia, old_inertia)

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers - old_centers) ** 2)
    assert_almost_equal(incremental_diff, effective_diff)

    # step 2: compute the sparse minibatch update
    old_inertia_csr, incremental_diff_csr = _mini_batch_step(
        X_mb_csr, x_mb_squared_norms_csr, new_centers_csr, counts_csr,
        buffer_csr, 1, None, random_reassign=False)
    assert_greater(old_inertia_csr, 0.0)

    # compute the new inertia on the same batch to check that it decreased
    labels_csr, new_inertia_csr = _labels_inertia(
        X_mb_csr, x_mb_squared_norms_csr, new_centers_csr)
    assert_greater(new_inertia_csr, 0.0)
    assert_less(new_inertia_csr, old_inertia_csr)

    # check that the incremental difference computation is matching the
    # final observed value
    effective_diff = np.sum((new_centers_csr - old_centers) ** 2)
    assert_almost_equal(incremental_diff_csr, effective_diff)

    # step 3: check that sparse and dense updates lead to the same results
    assert_array_equal(labels, labels_csr)
    assert_array_almost_equal(new_centers, new_centers_csr)
    assert_almost_equal(incremental_diff, incremental_diff_csr)
    assert_almost_equal(old_inertia, old_inertia_csr)
    assert_almost_equal(new_inertia, new_inertia_csr)
## X = np.random.normal(size=(n_samples, n_features))
tol = 1e-4

## print("\n-- scipy.cluster.vq")
## ratio = 1.
## np.random.seed(random_state)
## sc, _ = utils.timeit(profile(kmeans))(X, n_clusters, iter=2,
##                                       thresh=tol / ratio)
## ## utils.cache_value(sc, 'prof_kmeans/scipy_kmeans_%d_%d'
## ##                   % (n_samples, n_features))
## inertia1 = _labels_inertia(X, (X ** 2).sum(axis=-1), sc)[1]
## print('scipy inertia: %.1f' % np.sqrt(inertia1))

print("\n-- sklearn.cluster")
ratio = 1.  # np.mean(np.var(X, axis=0)) -- just to make the comparison fair.
np.random.seed(random_state)
sk, _, _ = utils.timeit(profile(k_means))(X, n_clusters, n_init=2,
                                          tol=tol / ratio, init="random",
                                          random_state=random_state)
## utils.cache_value(sk, 'prof_kmeans/sklearn_kmeans_%d_%d' %
##                   (n_samples, n_features))
inertia2 = _labels_inertia(X, (X ** 2).sum(axis=-1), sk)[1]
print('inertia: %.1f' % np.sqrt(inertia2))

## print('\nsklearn - scipy inertia: %.1f. Relative variation: %.1e' %
##       ((inertia2 - inertia1), (inertia2 - inertia1) / (
##           2. * (inertia1 + inertia2))))