import numpy as np
import scipy.sparse as sp
from numpy.testing import assert_array_equal

from sklearn.utils.sparsefuncs_fast import assign_rows_csr


def test_densify_rows():
    X = sp.csr_matrix([[0, 3, 0],
                       [2, 4, 0],
                       [0, 0, 0],
                       [9, 8, 7],
                       [4, 0, 5]], dtype=np.float64)
    rows = np.array([0, 2, 3], dtype=np.intp)
    out = np.ones((rows.shape[0], X.shape[1]), dtype=np.float64)

    # Densify the selected rows of X into `out` in reverse order.
    assign_rows_csr(X, rows,
                    np.arange(out.shape[0], dtype=np.intp)[::-1], out)
    assert_array_equal(out, X[rows].toarray()[::-1])
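# For reference, the semantics this test relies on, assuming scikit-learn's
# sklearn.utils.sparsefuncs_fast.assign_rows_csr: each target row of `out`
# is zeroed and then overwritten with the densified source row of X, while
# rows not listed in the target index are left untouched. A minimal sketch:
import numpy as np
import scipy.sparse as sp
from sklearn.utils.sparsefuncs_fast import assign_rows_csr

X = sp.csr_matrix([[0, 3, 0], [9, 8, 7]], dtype=np.float64)
out = np.ones((3, 3), dtype=np.float64)
assign_rows_csr(X, np.array([0, 1], dtype=np.intp),
                np.array([2, 0], dtype=np.intp), out)
# out is now [[9, 8, 7], [1, 1, 1], [0, 3, 0]]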
def maximization(X, labels, distances, centers, samples):
    # `centers` is assumed to be zero-initialized by the caller; `samples`
    # is the number of rows in X.
    X_indptr = X.indptr
    X_data = X.data
    X_indices = X.indices
    n_clusters = centers.shape[0]

    # Uniform sample weights.
    sample_weight = numpy.ones(samples, dtype=X.dtype)
    weight_cluster = numpy.zeros(n_clusters, dtype=float)
    for i in range(samples):
        c = labels[i]
        weight_cluster[c] += sample_weight[i]

    # Relocate empty clusters to the points farthest from their centers.
    empty_clusters = numpy.where(weight_cluster == 0)[0]
    n_empty_clusters = empty_clusters.shape[0]
    if n_empty_clusters > 0:
        far_points = distances.argsort()[::-1][:n_empty_clusters]
        assign_rows_csr(X, far_points.astype(numpy.intp),
                        empty_clusters.astype(numpy.intp), centers)
        for i in range(n_empty_clusters):
            weight_cluster[empty_clusters[i]] = 1

    # Accumulate the weighted feature sums per cluster, then divide by the
    # per-cluster weights to obtain the means.
    for i in range(len(labels)):
        curr_label = labels[i]
        for index in range(X_indptr[i], X_indptr[i + 1]):
            j = X_indices[index]
            centers[curr_label, j] += X_data[index] * sample_weight[i]
    numpy.true_divide(centers, weight_cluster[:, numpy.newaxis],
                      out=centers, casting="unsafe")
    return centers
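# A minimal usage sketch for `maximization`, assuming `centers` is
# zero-initialized by the caller (as noted above) and that assign_rows_csr
# is in scope. All inputs below are illustrative.
import numpy
import scipy.sparse as sp
from sklearn.utils.sparsefuncs_fast import assign_rows_csr

X = sp.csr_matrix(numpy.array([[0., 3., 0.],
                               [2., 4., 0.],
                               [9., 8., 7.]]))
labels = numpy.array([0, 0, 1], dtype=numpy.intp)
distances = numpy.zeros(3)                    # distance to closest center
centers = numpy.zeros((2, 3), dtype=X.dtype)
centers = maximization(X, labels, distances, centers, samples=3)
# centers[0] == [1.0, 3.5, 0.0] (mean of rows 0 and 1); centers[1] == row 2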
def _centers_sparse(X, sample_weight, labels, n_clusters, distances):
    """M step of the K-means EM algorithm

    Computation of cluster centers / means.

    Parameters
    ----------
    X : scipy.sparse.csr_matrix, shape (n_samples, n_features)

    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.

    labels : array of integers, shape (n_samples,)
        Current label assignment

    n_clusters : int
        Number of desired clusters

    distances : array-like, shape (n_samples,)
        Distance to closest cluster for each sample.

    Returns
    -------
    centers : array, shape (n_clusters, n_features)
        The resulting centers
    """
    n_samples = X.shape[0]
    n_features = X.shape[1]

    data = X.data
    indices = X.indices
    indptr = X.indptr

    dtype = X.dtype
    centers = numpy.zeros((n_clusters, n_features), dtype=dtype)
    weight_in_cluster = numpy.zeros((n_clusters,), dtype=dtype)
    for i in range(n_samples):
        c = labels[i]
        weight_in_cluster[c] += sample_weight[i]

    empty_clusters = numpy.where(weight_in_cluster == 0)[0]
    n_empty_clusters = empty_clusters.shape[0]

    # maybe also relocate small clusters?
    if n_empty_clusters > 0:
        # find points to reassign empty clusters to
        far_from_centers = distances.argsort()[::-1][:n_empty_clusters]
        assign_rows_csr(X, far_from_centers, empty_clusters, centers)

        for i in range(n_empty_clusters):
            weight_in_cluster[empty_clusters[i]] = 1

    for i in range(labels.shape[0]):
        curr_label = labels[i]
        for ind in range(indptr[i], indptr[i + 1]):
            j = indices[ind]
            centers[curr_label, j] += data[ind] * sample_weight[i]
    centers /= weight_in_cluster[:, numpy.newaxis]

    return centers
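# For intuition: the M step above computes, for every cluster c, the weighted
# mean of its members, centers[c] = sum_i(w_i * x_i * 1[labels_i == c]) /
# sum_i(w_i * 1[labels_i == c]). A dense NumPy equivalent (ignoring the
# empty-cluster relocation) could look like this illustrative sketch:
import numpy

def dense_centers(X_dense, sample_weight, labels, n_clusters):
    # Rows of W hold per-cluster sample weights: W[c, i] = w_i iff
    # labels[i] == c, else 0.
    W = numpy.zeros((n_clusters, X_dense.shape[0]), dtype=X_dense.dtype)
    W[labels, numpy.arange(X_dense.shape[0])] = sample_weight
    # Weighted sums per cluster, divided by the per-cluster total weight.
    return W @ X_dense / W.sum(axis=1)[:, numpy.newaxis]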
def test_densify_rows():
    X = sp.csr_matrix([[0, 3, 0],
                       [2, 4, 0],
                       [0, 0, 0],
                       [9, 8, 7],
                       [4, 0, 5]], dtype=np.float64)
    X_rows = np.array([0, 2, 3], dtype=np.intp)
    out = np.ones((6, X.shape[1]), dtype=np.float64)
    out_rows = np.array([1, 3, 4], dtype=np.intp)

    expect = np.ones_like(out)
    expect[out_rows] = X[X_rows, :].toarray()

    assign_rows_csr(X, X_rows, out_rows, out)
    assert_array_equal(out, expect)
def MB_step(X, x_squared_norms, centers, counts, curr_iter,
            old_center_buffer, compute_squared_diff, distances,
            random_reassign=False, random_state=None,
            reassignment_ratio=.01, verbose=False, learn_rate=0.0):
    """Incremental update of the centers for the Minibatch K-Means algorithm.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The original data array.

    x_squared_norms : array, shape (n_samples,)
        Squared euclidean norm of each data point.

    centers : array, shape (k, n_features)
        The cluster centers. This array is MODIFIED IN PLACE

    counts : array, shape (k,)
        The vector in which we keep track of the numbers of elements in a
        cluster. This array is MODIFIED IN PLACE

    curr_iter : int
        Index of the current minibatch iteration.

    distances : array, dtype float64, shape (n_samples,), optional
        If not None, should be a pre-allocated array that will be used to
        store the distances of each sample to its closest center.
        May not be None when random_reassign is True.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    random_reassign : boolean, optional
        If True, centers with very low counts are randomly reassigned
        to observations.

    reassignment_ratio : float, optional
        Control the fraction of the maximum number of counts for a
        center to be reassigned. A higher value means that low count
        centers are more likely to be reassigned, which means that the
        model will take longer to converge, but should converge in a
        better clustering.

    verbose : bool, optional, default False
        Controls the verbosity.

    compute_squared_diff : bool
        If set to False, the squared diff computation is skipped.

    old_center_buffer : array, shape (k, n_features)
        Copy of old centers for monitoring convergence.

    learn_rate : float, optional, default 0.0
        Learning rate for the center updates. If 0.0, a per-center rate is
        derived from the counts (batch count / total count).

    Returns
    -------
    centers : array, shape (k, n_features)
        The updated centers.

    squared_diff : float
        Sum of squared distances between previous and updated cluster
        centers.

    inertia : float
        Sum of distances of samples to their closest cluster center.
    """
    # Perform label assignment to nearest centers
    nearest_center, inertia = k_means_._labels_inertia(X, x_squared_norms,
                                                       centers,
                                                       distances=distances)

    if random_reassign and reassignment_ratio > 0:
        random_state = check_random_state(random_state)
        # Reassign clusters that have very low counts
        to_reassign = counts < reassignment_ratio * counts.max()
        # pick at most .5 * batch_size samples as new centers
        if to_reassign.sum() > .5 * X.shape[0]:
            indices_dont_reassign = np.argsort(counts)[int(.5 * X.shape[0]):]
            to_reassign[indices_dont_reassign] = False
        n_reassigns = to_reassign.sum()

        if n_reassigns:
            # Pick new clusters amongst observations with uniform probability
            new_centers = choice(X.shape[0], replace=False, size=n_reassigns,
                                 random_state=random_state)
            if verbose:
                print("[MiniBatchKMeans] Reassigning %i cluster centers."
                      % n_reassigns)

            if sp.issparse(X) and not sp.issparse(centers):
                assign_rows_csr(X, astype(new_centers, np.intp),
                                astype(np.where(to_reassign)[0], np.intp),
                                centers)
            else:
                centers[to_reassign] = X[new_centers]
        # reset counts of reassigned centers, but don't reset them too small
        # to avoid instant reassignment. This is a pretty dirty hack as it
        # also modifies the learning rates.
        counts[to_reassign] = np.min(counts[~to_reassign])

    squared_diff = 0.0

    # implementation for the sparse CSR representation completely written in
    # cython
    if sp.issparse(X):
        if compute_squared_diff:
            # Copy (rather than alias) the previous centers so the squared
            # diff below compares against the pre-update values.
            old_center_buffer = centers.copy()

        centers = _MB_step._mini_batch_update_csr(
            X, x_squared_norms, centers, counts, nearest_center,
            old_center_buffer, compute_squared_diff, curr_iter, learn_rate)

        if compute_squared_diff:
            diff = centers - old_center_buffer
            squared_diff = row_norms(diff, squared=True).sum()

        return centers, squared_diff, inertia

    # dense variant in mostly numpy (not as memory efficient though)
    k = centers.shape[0]
    for center_idx in range(k):
        # find points from minibatch that are assigned to this center
        center_mask = nearest_center == center_idx
        old_count = counts[center_idx]
        this_count = center_mask.sum()

        # update the count statistics for this center
        counts[center_idx] += this_count

        if this_count > 0:
            new_count = counts[center_idx]
            if compute_squared_diff:
                old_center_buffer[:] = centers[center_idx]

            # Move the center towards the mean of the batch points assigned
            # to it. With rate = this_count / new_count this is exactly the
            # classic running-mean minibatch update (the earlier in-place
            # rescale-by-counts implementation computed the same thing).
            # A local rate is used so a default of 0.0 does not leak a
            # computed rate into the following centers.
            new_center = np.sum(X[center_mask], axis=0)
            rate = learn_rate
            if rate == 0.0:
                rate = (new_count - old_count) / float(new_count)
            centers[center_idx] = centers[center_idx] + rate * (
                new_center / (new_count - old_count) - centers[center_idx])

            # update the squared diff if necessary
            if compute_squared_diff:
                diff = centers[center_idx].ravel() - old_center_buffer.ravel()
                squared_diff += np.dot(diff, diff)

    return centers, squared_diff, inertia
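# A sketch of how MB_step might be driven over successive minibatches in the
# dense case, with compute_squared_diff disabled. The data, batch size, and
# buffer setup are illustrative assumptions, and the call assumes MB_step's
# own module-level dependencies (k_means_, check_random_state, choice,
# astype, row_norms) are importable as above.
import numpy as np
from sklearn.utils.extmath import row_norms

X_all = np.random.RandomState(0).rand(100, 5)
x_sq_norms = row_norms(X_all, squared=True)
centers = X_all[:3].copy()                   # naive init: first 3 points
counts = np.zeros(3, dtype=np.int64)
old_center_buffer = np.zeros(X_all.shape[1], dtype=X_all.dtype)

for it, start in enumerate(range(0, X_all.shape[0], 10)):
    batch = slice(start, start + 10)
    centers, squared_diff, inertia = MB_step(
        X_all[batch], x_sq_norms[batch], centers, counts, curr_iter=it,
        old_center_buffer=old_center_buffer, compute_squared_diff=False,
        distances=None)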
def _mini_batch_spherical_step(X, sample_weight, x_squared_norms, centers,
                               weight_sums, old_center_buffer,
                               compute_squared_diff, distances,
                               random_reassign=False, random_state=None,
                               reassignment_ratio=.01, verbose=False):
    """Incremental update of the centers for the Minibatch K-Means algorithm.

    Parameters
    ----------
    X : array, shape (n_samples, n_features)
        The original data array.

    sample_weight : array-like, shape (n_samples,)
        The weights for each observation in X.

    x_squared_norms : array, shape (n_samples,)
        Squared euclidean norm of each data point.

    centers : array, shape (k, n_features)
        The cluster centers. This array is MODIFIED IN PLACE

    weight_sums : array, shape (k,)
        The vector in which we keep track of the accumulated sample weight
        in each cluster. This array is MODIFIED IN PLACE

    distances : array, dtype float, shape (n_samples,), optional
        If not None, should be a pre-allocated array that will be used to
        store the distances of each sample to its closest center.
        May not be None when random_reassign is True.

    random_state : int, RandomState instance or None (default)
        Determines random number generation for centroid initialization and
        to pick new clusters amongst observations with uniform probability.
        Use an int to make the randomness deterministic.
        See :term:`Glossary <random_state>`.

    random_reassign : boolean, optional
        If True, centers with very low weight sums are randomly reassigned
        to observations.

    reassignment_ratio : float, optional
        Control the fraction of the maximum weight sum for a center to be
        reassigned. A higher value means that centers with low weight are
        more likely to be reassigned, which means that the model will take
        longer to converge, but should converge in a better clustering.

    verbose : bool, optional, default False
        Controls the verbosity.

    compute_squared_diff : bool
        If set to False, the squared diff computation is skipped.

    old_center_buffer : array, shape (k, n_features)
        Copy of old centers for monitoring convergence.

    Returns
    -------
    inertia : float
        Sum of squared distances of samples to their closest cluster
        center.

    squared_diff : float
        Sum of squared distances between previous and updated cluster
        centers.
    """
    # Perform label assignment to nearest centers
    nearest_center, inertia = _labels_inertia(X, sample_weight,
                                              x_squared_norms, centers,
                                              distances=distances)

    if random_reassign and reassignment_ratio > 0:
        random_state = check_random_state(random_state)
        # Reassign clusters that have very low weight
        to_reassign = weight_sums < reassignment_ratio * weight_sums.max()
        # pick at most .5 * batch_size samples as new centers
        if to_reassign.sum() > .5 * X.shape[0]:
            indices_dont_reassign = \
                np.argsort(weight_sums)[int(.5 * X.shape[0]):]
            to_reassign[indices_dont_reassign] = False
        n_reassigns = to_reassign.sum()

        if n_reassigns:
            # Pick new clusters amongst observations with uniform probability
            new_centers = random_state.choice(X.shape[0], replace=False,
                                              size=n_reassigns)
            if verbose:
                print("[MiniBatchKMeans] Reassigning %i cluster centers."
                      % n_reassigns)

            if sp.issparse(X) and not sp.issparse(centers):
                assign_rows_csr(
                    X, new_centers.astype(np.intp, copy=False),
                    np.where(to_reassign)[0].astype(np.intp, copy=False),
                    centers)
            else:
                centers[to_reassign] = X[new_centers]
        # reset counts of reassigned centers, but don't reset them too small
        # to avoid instant reassignment. This is a pretty dirty hack as it
        # also modifies the learning rates.
        weight_sums[to_reassign] = np.min(weight_sums[~to_reassign])

    # implementation for the sparse CSR representation completely written in
    # cython
    if sp.issparse(X):
        return inertia, _mini_batch_update_csr(
            X, sample_weight, x_squared_norms, centers, weight_sums,
            nearest_center, old_center_buffer, compute_squared_diff)

    # dense variant in mostly numpy (not as memory efficient though)
    k = centers.shape[0]
    squared_diff = 0.0
    for center_idx in range(k):
        # find points from minibatch that are assigned to this center
        center_mask = nearest_center == center_idx
        wsum = sample_weight[center_mask].sum()

        if wsum > 0:
            if compute_squared_diff:
                old_center_buffer[:] = centers[center_idx]

            # inplace remove previous count scaling
            centers[center_idx] *= weight_sums[center_idx]

            # inplace sum with new points members of this cluster
            centers[center_idx] += \
                np.sum(X[center_mask] *
                       sample_weight[center_mask, np.newaxis], axis=0)

            # update the accumulated weight for this center
            weight_sums[center_idx] += wsum

            # unit-normalize for spherical k-means; projecting back onto the
            # sphere replaces the division by the weight sum
            centers[center_idx] = normalize(centers[center_idx, None])[0]

            # update the squared diff if necessary
            if compute_squared_diff:
                diff = centers[center_idx].ravel() - old_center_buffer.ravel()
                squared_diff += np.dot(diff, diff)

    return inertia, squared_diff
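# The unit normalization above is what makes this the *spherical* variant:
# centers stay on the unit sphere, so cosine similarity against a center
# needs no per-center norm. A small, self-contained illustration of that
# invariant (uses scikit-learn's normalize; the numbers are arbitrary):
import numpy as np
from sklearn.preprocessing import normalize

center = normalize(np.array([[3.0, 4.0]]))[0]   # unit vector [0.6, 0.8]
x = np.array([1.0, 1.0])
lhs = x @ center / np.linalg.norm(x)            # cosine, center norm omitted
rhs = x @ center / (np.linalg.norm(x) * np.linalg.norm(center))
print(np.isclose(lhs, rhs))                     # True: ||center|| == 1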