def transform(self, X):
    """Map X into cluster-distance space.

    Every fitted centroid contributes one block of output columns holding
    the distance from each sample to that centroid. The distance for a
    centroid is measured only over the features picked by its matching
    entry in ``self.filters_``.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        New data to transform. It must support ``X[:, selector]``
        indexing. Rows are normalized first when
        ``self._needs_normalization()`` is true.

    Returns
    -------
    X_new : array, shape [n_samples, n_centroids]
        Distances to the centroids, stacked horizontally in the order of
        ``self.centroids_``.
    """
    check_is_fitted(self)
    samples = normalize_rows(X) if self._needs_normalization() else X

    columns = []
    for selector, centroid in zip(self.filters_, self.centroids_):
        # cdist needs two 2-D inputs, so the centroid (presumably 1-D) is
        # given a leading axis while its features are being selected.
        reference = centroid[np.newaxis, selector]
        columns.append(dist.cdist(samples[:, selector], reference, self.distance))
    return np.hstack(columns)
def predict(self, X):
    """Assign every sample in X to a cluster label.

    Each element obtained by iterating over X is handed to
    ``_predict_path`` together with the fitted ``self.result_``. The path
    that comes back is then looked up in ``self.reverse_paths_`` to obtain
    the label of the sample.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        New data to predict. Rows are normalized first when
        ``self._needs_normalization()`` is true.

    Returns
    -------
    labels : array of int32, shape [n_samples,]
        Label of the cluster each sample belongs to.
    """
    check_is_fitted(self)
    samples = normalize_rows(X) if self._needs_normalization() else X

    # Bind the fitted result once so the pool only has to ship the sample.
    path_of = partial(_predict_path, result=self.result_)
    with maybe_pool(self.n_jobs) as pool:
        sample_paths = pool.map(path_of, samples)
        cluster_ids = [self.reverse_paths_[path] for path in sample_paths]
    return np.array(cluster_ids, dtype=np.int32)
def _sampled_dispersion(
    seed: int, sampler: BaseSampler, kmeans: KMeans, fit: bool = True
) -> float:
    """Cluster a freshly drawn sample and return its mean dispersion.

    The sample comes from ``sampler.get_sample(seed)``. When ``kmeans`` has
    a truthy ``normalize_rows`` attribute the sample rows are normalized
    before clustering. The dispersion of one cluster is the mean pairwise
    distance (``kmeans.distance``) between its members; the result is the
    mean of those values over all clusters that do not have exactly one
    member.

    Parameters
    ----------
    seed : int
        Seed forwarded to the sampler.
    sampler : BaseSampler
        Object providing ``get_sample(seed)``.
    kmeans : KMeans
        Clustering object providing ``distance`` plus ``fit_predict`` and
        ``predict``.
    fit : bool
        If true, labels come from ``kmeans.fit_predict``; otherwise from
        ``kmeans.predict``.

    Returns
    -------
    float
        Mean within-cluster dispersion of the sample.
    """
    logging.debug(f"Sampling with seed {seed}.")
    sample = sampler.get_sample(seed)
    logging.debug(f"Sample shape {sample.shape}")

    if getattr(kmeans, "normalize_rows", False):
        logging.debug("Normalizing rows.")
        sample = normalize_rows(sample)

    if not fit:
        logging.debug("Predicting labels for sample.")
        labels = kmeans.predict(sample)
    else:
        logging.debug("Fitting kmeans for sample.")
        labels = kmeans.fit_predict(sample)

    logging.debug("Computing dispersion for clustered sample.")
    per_cluster = []
    for _, members in pd.DataFrame(sample).groupby(labels):
        # A single point has no pairwise distances, so it is left out.
        if members.shape[0] == 1:
            continue
        per_cluster.append(np.mean(dist.pdist(members.values, kmeans.distance)))
    return float(np.mean(per_cluster))
def predict(self, X):
    """Return the index of the nearest cluster center for every sample.

    Distances between the samples and ``self.cluster_centers_`` are
    computed with ``self.distance``; each sample is labelled with the
    position of the center at the smallest distance.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        New data to predict. Rows are normalized first when
        ``self.normalize_rows`` is truthy.

    Returns
    -------
    labels : array, shape [n_samples,]
        Index of the cluster each sample belongs to.
    """
    check_is_fitted(self)
    samples = normalize_rows(X) if self.normalize_rows else X
    distances = dst.cdist(samples, self.cluster_centers_, self.distance)
    # One row per sample, one column per center: pick the closest column.
    return distances.argmin(axis=1)
def __call__(self, data: Data, number_of_clusters: int) -> Tuple[IntLabels, Centroids]:
    """Cluster ``data`` into ``number_of_clusters`` groups.

    Centroids are first chosen by ``self.initialize`` and samples are
    labelled by ``self.labeling``. Centroids are then recomputed from the
    labels and samples relabelled, for at most
    ``self.number_of_iterations`` rounds or until the labels stop changing.
    Whenever fewer than ``number_of_clusters`` distinct labels are in use,
    ``self._fix_labels`` is asked to repair centroids and labels.

    Parameters
    ----------
    data : Data
        Samples to cluster, one row per sample.
    number_of_clusters : int
        Number of clusters to produce.

    Returns
    -------
    labels : IntLabels
        1-D array with one cluster index per sample.
    centroids : Centroids
        Cluster centers. For a single cluster this is the mean of ``data``
        with shape ``(1, n_features)``.

    Notes
    -----
    The single-cluster shortcut returns before row normalization, so its
    centroid is the mean of the data exactly as passed in.
    """
    _validate_kmeans_input(data, number_of_clusters)
    if number_of_clusters == 1:
        # Labels must be 1-D (one entry per sample), the same shape the
        # iterative path below produces; a (n_samples, 1) array cannot be
        # used interchangeably with it by callers.
        return (
            np.zeros(data.shape[0], dtype=int),
            np.mean(data, axis=0, keepdims=True),
        )
    data = data.reshape(data.shape, order="C")
    if self.normalize_rows:
        _validate_normalizable(data)
        data = normalize_rows(data)
    label_set = np.arange(number_of_clusters)
    logging.debug("Initializing KMeans centroids.")
    centroids = self.initialize(data, number_of_clusters)
    logging.debug("First centroids found.")
    # NaN never compares equal to a label, so the first stability check
    # cannot succeed by accident.
    old_labels = np.full(data.shape[0], np.nan)
    labels = self.labeling(data, centroids)
    logging.debug("Labels assigned.")
    for _ in range(self.number_of_iterations):
        if np.unique(labels).size != number_of_clusters:
            centroids, labels = self._fix_labels(
                data, centroids, labels, number_of_clusters
            )
        if np.all(labels == old_labels):
            logging.debug("Stability achieved.")
            break
        old_labels = labels
        centroids = redefine_centroids(data, old_labels, label_set, self.allow_dask)
        labels = self.labeling(data, centroids)
    return labels, centroids
def _dispersion(data: Data, kmeans: KMeans) -> float:
    """Return the mean within-cluster dispersion of already clustered data.

    ``data`` is grouped by ``kmeans.labels_``. The dispersion of one
    cluster is the mean pairwise distance (``kmeans.distance``) between its
    members; the result is the mean of those values over all clusters that
    do not have exactly one member. When ``kmeans`` has a truthy
    ``normalize_rows`` attribute the rows are normalized first.

    Parameters
    ----------
    data : Data
        Samples, one row per sample; the row count must equal the number
        of labels in ``kmeans.labels_``.
    kmeans : KMeans
        Fitted clustering object providing ``labels_`` and ``distance``.

    Returns
    -------
    float
        Mean within-cluster dispersion.

    Raises
    ------
    AssertionError
        If the number of rows differs from the number of labels.
    """
    assert data.shape[0] == kmeans.labels_.size, "kmeans not fit on this data"

    points = normalize_rows(data) if getattr(kmeans, "normalize_rows", False) else data

    per_cluster = []
    for _, members in pd.DataFrame(points).groupby(kmeans.labels_):
        # A single point has no pairwise distances, so it is left out.
        if members.shape[0] == 1:
            continue
        per_cluster.append(np.mean(dist.pdist(members.values, kmeans.distance)))
    return float(np.mean(per_cluster))
def transform(self, X):
    """Map X into cluster-distance space.

    Each output column holds the distance, measured with
    ``self.distance``, from every sample to one row of
    ``self.cluster_centers_``.

    Parameters
    ----------
    X : array-like, shape = [n_samples, n_features]
        New data to transform. Rows are normalized first when
        ``self.normalize_rows`` is truthy.

    Returns
    -------
    X_new : array, shape [n_samples, k]
        Distances from each sample to each of the k cluster centers.
    """
    check_is_fitted(self)
    samples = normalize_rows(X) if self.normalize_rows else X
    distances = dst.cdist(samples, self.cluster_centers_, self.distance)
    return distances