def predict(self, data):
    """Assign every sample to its nearest cluster center.

    Distances are computed one pair at a time with ``self.metric.dist``
    against each entry of ``self.cluster_centers_``.

    Parameters
    ----------
    data : array-like, shape=[n_samples, dim,]
        Samples to label, where n_samples is the number of samples
        and dim is the number of dimensions.

    Returns
    -------
    labels : array-like, shape=[n_samples,]
        Index of the cluster each sample belongs to. The array is
        allocated with ``gs.zeros``, so the indices are stored with
        that array's dtype.
    """
    labels = gs.zeros(len(data))
    for sample_idx, sample in enumerate(data):
        centers = self.cluster_centers_
        dists_to_centers = gs.zeros(len(centers))
        for center_idx, center in enumerate(centers):
            dists_to_centers[center_idx] = self.metric.dist(sample, center)
        labels[sample_idx] = gs.argmin(dists_to_centers)
    return labels
def find_normalization_factor(
        variances, variances_range, normalization_factor_var):
    """Look up the normalization factor of each given variance.

    Each entry of ``variances`` is matched to the closest entry of
    ``variances_range`` (in absolute difference) and the factor stored
    at the same position of ``normalization_factor_var`` is returned.

    Parameters
    ----------
    variances : array-like, shape=[n_gaussians,]
        Array of standard deviations for each component of some GMM.
    variances_range : array-like, shape=[n_variances,]
        Array of standard deviations.
    normalization_factor_var : array-like, shape=[n_variances,]
        Array of computed normalization factor.

    Returns
    -------
    norm_factor : array-like, shape=[n_gaussians,]
        Array of normalization factors for the given variances.
    """
    n_gaussians = variances.shape[0]
    precision = variances_range.shape[0]

    # Tile both arrays to a common [n_gaussians, n_variances] grid.
    tabulated = gs.repeat(
        gs.expand_dims(variances_range, 0), n_gaussians, axis=0)
    requested = gs.repeat(
        gs.expand_dims(variances, 1), precision, axis=1)

    nearest = gs.argmin(gs.abs(tabulated - requested), axis=-1)
    return normalization_factor_var[nearest]
def closest_neighbor_index(self, point, neighbors):
    """Closest neighbor of point among neighbors.

    When several points and several neighbors are given, every
    (point, neighbor) pair is formed explicitly before calling
    ``self.dist``, and the result is arranged as one row per point.

    Parameters
    ----------
    point : array-like, shape=[..., dim]
        Point.
    neighbors : array-like, shape=[n_neighbors, dim]
        Neighbors.

    Returns
    -------
    closest_neighbor_index : int or array-like, shape=[n_points,]
        Index of closest neighbor. A single index is returned when
        only one point is given.
    """
    n_points = 1
    if gs.ndim(point) == gs.ndim(neighbors):
        n_points = point.shape[0]
    n_neighbors = neighbors.shape[0]

    if n_points > 1 and n_neighbors > 1:
        # Neighbor-major pairing: each neighbor is repeated n_points
        # times while the block of points is tiled n_neighbors times.
        neighbors = gs.repeat(neighbors, n_points, axis=0)
        point = gs.concatenate([point] * n_neighbors)

    dists = gs.reshape(self.dist(point, neighbors), (n_neighbors, n_points))
    indices = gs.argmin(gs.transpose(dists), axis=1)

    return indices[0] if n_points == 1 else indices
def predict(self, X):
    """Predict the labels for each data point.

    Label each data point with the cluster having the nearest
    centroid using metric distance.

    Parameters
    ----------
    X : array-like, shape=[..., n_features]
        Input data.

    Returns
    -------
    labels : array-like, shape=[...,]
        Array of predicted cluster indices for each sample.

    Raises
    ------
    RuntimeError
        If ``self.centroids`` is None, i.e. fit was not called.
    """
    centroids = self.centroids
    if centroids is None:
        raise RuntimeError("fit needs to be called first.")

    # One distance array per centroid, laid out along axis 1.
    per_centroid = []
    for centroid in centroids:
        per_centroid.append(self.metric.dist(centroid, X))

    dists = gs.squeeze(gs.stack(per_centroid, axis=1))
    return gs.argmin(dists, -1)
def _circle_mean(points):
    """Determine the mean on a circle.

    Data are expected in radians in the range [-pi, pi). The mean is
    returned in the same range. If the mean is unique, this algorithm is
    guaranteed to find it. It is not vulnerable to local minima of the
    Frechet function. If the mean is not unique, the algorithm only
    returns one of the means. Which mean is returned depends on numerical
    rounding errors.

    Inputs with more than one axis are first converted to angles with
    ``Hypersphere.extrinsic_to_angle``; one-dimensional inputs are copied
    and used as angles directly.

    References
    ----------
    .. [HH15] Hotz, T. and S. F. Huckemann (2015), "Intrinsic means on the
        circle: Uniqueness, locus and asymptotics", Annals of the Institute
        of Statistical Mathematics 67 (1), 177–193.
        https://arxiv.org/abs/1108.2141
    """
    if points.ndim > 1:
        angles = Hypersphere.extrinsic_to_angle(points)
    else:
        angles = gs.copy(points)

    n_angles = angles.shape[0]
    linear_mean = gs.mean(angles)
    linear_var = gs.sum((angles - linear_mean) ** 2)

    # Each candidate row is read as (value, criterion): the value in
    # column 0 of the row minimizing column 1 is returned.
    # presumably column 1 holds the variance of each candidate mean --
    # verify against _circle_variances.
    candidates = _circle_variances(
        linear_mean, linear_var, n_angles, gs.sort(angles))
    best_row = gs.argmin(candidates[:, 1])
    return candidates[best_row, 0]
def predict(self, X, fuzzy_predictions=False):
    """Predict the labels for each data point.

    Label each data point with the cluster having the nearest
    centroid using metric distance. In fuzzy mode, return instead a
    membership weight of each data point for every cluster.

    Parameters
    ----------
    X : array-like, shape=[..., n_features]
        Input data.
    fuzzy_predictions : bool
        If True, return membership weights instead of hard labels.
        Optional, default: False.

    Returns
    -------
    belongs : array-like
        If ``fuzzy_predictions`` is False, array of predicted cluster
        indices for each sample, shape=[...,]. Otherwise, array of
        shape=[n_samples, n_clusters] whose rows are proportional to
        the inverse distances to the centroids and sum to one.

    Raises
    ------
    RuntimeError
        If ``self.centroids`` is None, i.e. fit was not called.
    """
    if self.centroids is None:
        raise RuntimeError('fit needs to be called first.')

    dists = gs.squeeze(gs.stack(
        [self.metric.dist(centroid, X) for centroid in self.centroids],
        axis=1))

    if not fuzzy_predictions:
        return gs.argmin(dists, -1)

    # A sample lying exactly on a centroid would divide by zero below, so
    # exact zeros are clamped to a small positive value. np.where builds
    # a new floating-point array instead of writing into dists in place,
    # so the clamp is not truncated back to zero on integer distances.
    dists = np.where(dists == 0, 0.00001, dists)
    inverse_sum = np.sum(1 / dists, axis=1)[:, None]
    return 1 / (dists * inverse_sum)
def _update_medoid_indexes(self, distances, labels, medoid_indices):
    """Move each medoid to the cheapest member of its cluster, in place.

    For every cluster, the cost of a member is the sum of its distances
    to the members of the same cluster. The medoid is replaced only when
    some member has a strictly lower cost than the current medoid.

    Parameters
    ----------
    distances : array-like, shape=[n_samples, n_samples]
        Pairwise distances between samples.
    labels : array-like, shape=[n_samples,]
        Cluster index of each sample.
    medoid_indices : array-like, shape=[n_clusters,]
        Sample index of each medoid. Modified in place.
    """
    for cluster_id in range(self.n_clusters):
        members = gs.where(labels == cluster_id)[0]
        if len(members) == 0:
            logging.warning('One cluster is empty.')
            continue

        # Square sub-matrix of distances restricted to this cluster.
        within = distances[members, gs.expand_dims(members, axis=-1)]
        member_costs = gs.sum(within, axis=1)

        best_position = gs.argmin(member_costs)
        # Position of the current medoid among the members; argmax of an
        # all-False mask falls back to position 0.
        current_position = gs.argmax(members == medoid_indices[cluster_id])

        if member_costs[best_position] < member_costs[current_position]:
            medoid_indices[cluster_id] = members[best_position]
def closest_neighbor_index(self, point, neighbors):
    """Closest neighbor of point among neighbors.

    Parameters
    ----------
    point : array-like
        Point.
    neighbors : array-like
        Neighbors.

    Returns
    -------
    closest_neighbor_index : int
        Position of the smallest value returned by
        ``self.dist(point, neighbors)``.
    """
    return gs.argmin(self.dist(point, neighbors))
def fit(self, X, max_iter=100):
    """Compute the cluster centroids.

    Alternate between assigning each data point to its closest centroid
    in terms of riemannian_metric distance and recomputing each centroid
    as the riemannian_metric mean of its assigned points, until the mean
    displacement of the centroids drops below ``self.tol`` or
    ``max_iter`` iterations have been run.

    Parameters
    ----------
    X : array-like, shape=[n_samples, n_features]
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    max_iter : int
        Maximum number of iterations. Optional, default: 100.

    Returns
    -------
    centroids : array-like
        Copy of the centroids array, one row per cluster.
    """
    n_samples = X.shape[0]

    # Initial centroids are data points drawn at random.
    self.centroids = gs.concatenate([
        gs.expand_dims(X[randint(0, n_samples - 1)], 0)
        for _ in range(self.n_clusters)
    ])

    for index in range(1, max_iter + 1):
        dists = gs.hstack([
            gs.to_ndarray(
                self.riemannian_metric.dist(self.centroids[i], X), 2, 1)
            for i in range(self.n_clusters)
        ])
        belongs = gs.argmin(dists, 1)

        old_centroids = gs.copy(self.centroids)
        for i in range(self.n_clusters):
            # NOTE(review): squeeze turns a single-member cluster into a
            # 1-D array, so len(fold) then counts features, not samples
            # -- confirm that riemannian_metric.mean copes with this.
            fold = gs.squeeze(X[belongs == i])
            if len(fold) > 0:
                self.centroids[i] = self.riemannian_metric.mean(fold)
            else:
                # Empty cluster: re-seed it from a random data point.
                self.centroids[i] = X[randint(0, n_samples - 1)]

        centroids_distances = self.riemannian_metric.dist(
            old_centroids, self.centroids)
        if gs.mean(centroids_distances) < self.tol:
            if self.verbose > 0:
                print("Convergence Reached after ", index, " iterations")
            break

    return gs.copy(self.centroids)
def closest_neighbor_index(self, point, neighbors):
    """Closest neighbor of point among neighbors.

    Parameters
    ----------
    point : array-like
        Point whose closest neighbor is searched.
    neighbors : array-like
        Candidate neighbors.

    Returns
    -------
    closest_neighbor_index : int
        Position of the smallest value returned by
        ``self.dist(point, neighbors)``.
    """
    return gs.argmin(self.dist(point, neighbors))
def fit(self, data, max_iter=100):
    """Provide clusters centroids and data labels.

    Labels data by minimizing the distance between data points
    and cluster centroids chosen from the data points.
    Minimization is performed by swapping the centroids and data points.

    Sets ``cluster_centers_``, ``labels_`` and ``medoid_indices_`` on
    the estimator. The stored labels always refer to the final medoids,
    including when ``max_iter`` is reached before convergence or when
    ``max_iter`` is 0.

    Parameters
    ----------
    data : array-like, shape=[n_samples, dim]
        Training data, where n_samples is the number of samples and
        dim is the number of dimensions.
    max_iter : int
        Maximum number of iterations.
        Optional, default: 100.

    Returns
    -------
    cluster_centers_ : array-like, shape=[n_clusters, dim]
        Centroids, i.e. the rows of ``data`` selected as medoids.
    """
    distances = self.metric.dist_pairwise(data)
    medoids_indices = self._initialize_medoids(distances)

    # Label against the initial medoids up front so that labels is
    # defined even if the loop body never runs.
    labels = gs.argmin(distances[medoids_indices, :], axis=0)

    for _ in range(max_iter):
        old_medoids_indices = gs.copy(medoids_indices)
        self._update_medoid_indexes(distances, labels, medoids_indices)

        if gs.all(old_medoids_indices == medoids_indices):
            break

        # The medoids moved: refresh the labels so that they never lag
        # behind the medoids that are eventually stored.
        labels = gs.argmin(distances[medoids_indices, :], axis=0)
    else:
        # Reached only when the loop ran out of iterations without break.
        logging.warning('Maximum number of iteration reached before '
                        'convergence. Consider increasing max_iter to '
                        'improve the fit.')

    self.cluster_centers_ = data[medoids_indices]
    self.labels_ = labels
    self.medoid_indices_ = medoids_indices

    return self.cluster_centers_
def find_variance_from_index(weighted_distances, variances_range, phi_inv_var):
    r"""Return the variance given weighted distances.

    Each weighted distance is matched to the closest entry of
    ``phi_inv_var`` (in absolute difference) and the entry of
    ``variances_range`` at the same position is returned.

    Parameters
    ----------
    weighted_distances : array-like, shape=[n_gaussians,]
        Mean of the weighted distances between training data
        and current barycentres. The weights of each data sample
        corresponds to the probability of belonging to a component
        of the Gaussian mixture model.
    variances_range : array-like, shape=[n_variances,]
        Array of standard deviations.
    phi_inv_var : array-like, shape=[n_variances,]
        Array of the computed inverse of a function phi
        whose expression is closed-form
        :math:`\sigma\mapsto \sigma^3 \times \frac{d}
        {\mathstrut d\sigma}\log \zeta_m(\sigma)`
        where :math:`\sigma` denotes the variance
        and :math:`\zeta` the normalization coefficient
        and :math:`m` the dimension.

    Returns
    -------
    var : array-like, shape=[n_gaussians,]
        Estimated variances for each component of the GMM.
    """
    n_gaussians = weighted_distances.shape[0]
    precision = variances_range.shape[0]

    # Tile both arrays to a common [n_gaussians, n_variances] grid.
    tabulated = gs.repeat(
        gs.expand_dims(phi_inv_var, 0), n_gaussians, axis=0)
    observed = gs.repeat(
        gs.expand_dims(weighted_distances, 1), precision, axis=1)

    nearest = gs.argmin(gs.abs(tabulated - observed), -1)
    return variances_range[nearest]
def closest_neighbor_index(self, point, neighbors):
    """Closest neighbor of point among neighbors.

    Parameters
    ----------
    point : array-like, shape=[..., dim]
        Point.
    neighbors : array-like, shape=[..., dim]
        Neighbors.

    Returns
    -------
    closest_neighbor_index : int
        Index of closest neighbor, i.e. the position of the smallest
        value returned by ``self.dist(point, neighbors)``.
    """
    return gs.argmin(self.dist(point, neighbors))
def predict(self, X):
    """Predict for each data point the closest center.

    Closeness is measured with the riemannian_metric distance to each
    of the ``n_clusters`` entries of ``self.centroids``.

    Parameters
    ----------
    X : array-like, shape=[n_samples, n_features]
        Data, where n_samples is the number of samples
        and n_features is the number of features.

    Returns
    -------
    belongs : array-like
        Array containing for each point the cluster associated.
    """
    per_centroid = []
    for cluster in range(self.n_clusters):
        per_centroid.append(
            self.riemannian_metric.dist(self.centroids[cluster], X))

    # assumes dist returns one column per call so that hstack yields an
    # [n_samples, n_clusters] array -- TODO confirm
    return gs.argmin(gs.hstack(per_centroid), -1)
def predict(self, X):
    """Predict the labels for each data point.

    Label each data point with the cluster having the nearest
    centroid using riemannian_metric distance.

    Parameters
    ----------
    X : array-like, shape=[n_samples, n_features]
        Input data.

    Returns
    -------
    belongs : array-like, shape=[n_samples,]
        Array of predicted cluster indices for each sample.

    Raises
    ------
    AssertionError
        If ``self.centroids`` is None, i.e. fit was not called.
    """
    # NOTE(review): this guard is an assert, so it is skipped when
    # Python runs with -O; kept as is to preserve the exception type.
    assert self.centroids is not None, 'fit needs to be called first'

    per_centroid = []
    for centroid in self.centroids:
        per_centroid.append(self.riemannian_metric.dist(centroid, X))

    return gs.argmin(gs.hstack(per_centroid), -1)
def predict(self, X):
    """Classify input data.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The input samples.

    Returns
    -------
    y : ndarray, shape (n_samples,)
        The label for each sample is the label of the closest sample
        seen during fit.
    """
    # Check that fit has been called (X_ and y_ are set).
    check_is_fitted(self, ["X_", "y_"])

    # Input validation.
    X = check_array(X)

    # Row i holds the distances from sample i to every stored sample.
    pairwise = euclidean_distances(X, self.X_)
    nearest = gs.argmin(pairwise, axis=1)
    return self.y_[nearest]
def predict(self, X):
    """Predict the label of each input sample.

    A reference implementation of a prediction for a classifier.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        The input samples.

    Returns
    -------
    y : ndarray, shape (n_samples,)
        The label for each sample is the label of the closest sample
        seen during fit.
    """
    # Check that fit has been called (X_ and y_ are set).
    check_is_fitted(self, ['X_', 'y_'])

    # Input validation.
    X = check_array(X)

    # Row i holds the distances from sample i to every stored sample.
    pairwise = euclidean_distances(X, self.X_)
    nearest = gs.argmin(pairwise, axis=1)
    return self.y_[nearest]
def fit(self, X, max_iter=100):
    """Provide clusters centroids and data labels.

    Alternate between computing the mean of each cluster
    and labelling data according to the new positions of the centroids.
    Iteration stops when the mean displacement of the centroids drops
    below ``self.tol`` or after ``max_iter`` iterations, in which case
    a warning is logged.

    Parameters
    ----------
    X : array-like, shape=[n_samples, n_features]
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    max_iter : int
        Maximum number of iterations. Optional, default: 100.

    Returns
    -------
    centroids : array-like
        Copy of the centroids array, one row per cluster.
    """
    n_samples = X.shape[0]

    # Initial centroids are data points drawn at random.
    self.centroids = gs.concatenate([
        gs.expand_dims(X[randint(0, n_samples - 1)], 0)
        for _ in range(self.n_clusters)
    ])

    for index in range(1, max_iter + 1):
        dists = gs.hstack([
            gs.to_ndarray(
                self.riemannian_metric.dist(self.centroids[i], X), 2, 1)
            for i in range(self.n_clusters)
        ])
        belongs = gs.argmin(dists, 1)

        old_centroids = gs.copy(self.centroids)
        for i in range(self.n_clusters):
            # NOTE(review): squeeze turns a single-member cluster into a
            # 1-D array, so len(fold) then counts features, not samples
            # -- confirm that FrechetMean.fit copes with this.
            fold = gs.squeeze(X[belongs == i])
            if len(fold) > 0:
                mean = FrechetMean(metric=self.riemannian_metric,
                                   method=self.mean_method,
                                   max_iter=150)
                mean.fit(fold)
                self.centroids[i] = mean.estimate_
            else:
                # Empty cluster: re-seed it from a random data point.
                self.centroids[i] = X[randint(0, n_samples - 1)]

        centroids_distances = self.riemannian_metric.dist(
            old_centroids, self.centroids)
        if gs.mean(centroids_distances) < self.tol:
            if self.verbose > 0:
                logging.info('Convergence reached after {} '
                             'iterations'.format(index))
            return gs.copy(self.centroids)
    else:
        # Reached only when the loop ran out of iterations.
        logging.warning('K-means maximum number of iterations {} reached. '
                        'The mean may be inaccurate'.format(max_iter))

    return gs.copy(self.centroids)
def fit(self, X, max_iter=100):
    """Provide clusters centroids and data labels.

    Alternate between computing the mean of each cluster
    and labelling data according to the new positions of the centroids.

    When ``self.fuzzy`` is true, no hard labels are computed: every
    centroid is refit on all of X with per-sample weights derived from
    the inverse squared distances to the centroids.

    Parameters
    ----------
    X : array-like, shape=[..., n_features]
        Training data, where n_samples is the number of samples
        and n_features is the number of features.
    max_iter : int
        Maximum number of iterations.
        Optional, default: 100.

    Returns
    -------
    centroids : array-like
        Copy of the centroids. When ``self.n_clusters`` is 1 the
        leading axis is squeezed out before returning, except when the
        loop body never runs (``max_iter`` below 1).
    """
    n_samples = X.shape[0]
    # Initial centroids are data points drawn at random.
    self.centroids = [gs.expand_dims(X[randint(0, n_samples - 1)], 0)
                      for i in range(self.n_clusters)]
    self.centroids = gs.concatenate(self.centroids, axis=0)
    index = 0
    while index < max_iter:
        index += 1

        # Squared distance of every sample to every centroid, one column
        # per centroid.
        dists = [gs.to_ndarray(
                 self.metric.dist(self.centroids[i], X), 2, 1)**2
                 for i in range(self.n_clusters)]
        dists = gs.hstack(dists)
        if self.fuzzy:
            # Exact zeros are replaced before inversion, presumably to
            # avoid a division by zero.
            dists[np.where(dists == 0)] = 0.00001
            # Row-normalized inverse squared distances.
            weights = 1 / (dists * np.sum(1 / dists, axis=1)[:, None])
        else:
            belongs = gs.argmin(dists, 1)
        old_centroids = gs.copy(self.centroids)
        for i in range(self.n_clusters):
            if self.fuzzy:
                # Weighted mean over all samples.
                mean = FrechetMean(
                    metric=self.metric,
                    method=self.mean_method,
                    max_iter=150,
                    lr=self.lr,
                    point_type=self.point_type,
                )
                mean.fit(X, weights=weights[:, i])
                self.centroids[i] = mean.estimate_
            else:
                # NOTE(review): squeeze turns a single-member cluster
                # into a 1-D array, so len(fold) then counts features,
                # not samples -- confirm FrechetMean.fit copes with this.
                fold = gs.squeeze(X[belongs == i])
                if len(fold) > 0:
                    mean = FrechetMean(
                        metric=self.metric,
                        method=self.mean_method,
                        max_iter=150,
                        lr=self.lr,
                        point_type=self.point_type)
                    mean.fit(fold)
                    self.centroids[i] = mean.estimate_
                else:
                    # Empty cluster: re-seed it from a random data point.
                    self.centroids[i] = X[randint(0, n_samples - 1)]

        # Convergence criterion: mean displacement of the centroids.
        centroids_distances = self.metric.dist(
            old_centroids, self.centroids)

        if gs.mean(centroids_distances) < self.tol:
            if self.verbose > 0:
                logging.info('Convergence reached after {} '
                             'iterations'.format(index))
            if self.n_clusters == 1:
                self.centroids = gs.squeeze(self.centroids, axis=0)
            return gs.copy(self.centroids)

        if index == max_iter:
            logging.warning('K-means maximum number of iterations {} reached. '
                            'The mean may be inaccurate'.format(max_iter))
            if self.n_clusters == 1:
                self.centroids = gs.squeeze(self.centroids, axis=0)
    return gs.copy(self.centroids)
def fit(self, X):
    """Provide clusters centroids and data labels.

    Alternate between computing the mean of each cluster
    and labelling data according to the new positions of the centroids.

    The number of iterations is bounded by ``self.max_iter`` and the
    initialization scheme is selected by ``self.init``. Besides
    ``self.centroids``, this sets ``self.init_centroids``,
    ``self.labels`` and ``self.inertia`` (the latter only once the
    loop body has run).

    Parameters
    ----------
    X : array-like, shape=[..., n_features]
        Training data, where n_samples is the number of samples
        and n_features is the number of features.

    Returns
    -------
    centroids : array-like
        Copy of the centroids. When ``self.n_clusters`` is 1 the
        leading axis is squeezed out before returning, except when the
        loop body never runs (``self.max_iter`` below 1).
    """
    n_samples = X.shape[0]
    if self.verbose > 0:
        logging.info("Initializing...")
    if self.init == "kmeans++":
        # First centroid: a data point drawn at random.
        centroids = [gs.expand_dims(X[randint(0, n_samples - 1)], 0)]
        for i in range(self.n_clusters - 1):
            dists = [
                gs.to_ndarray(self.metric.dist(centroids[j], X), 2, 1)
                for j in range(i + 1)
            ]
            dists = gs.hstack(dists)
            dists_to_closest_centroid = gs.amin(dists, 1)
            indices = gs.arange(n_samples)
            # Sampling weights are proportional to the (unsquared)
            # distance to the closest centroid chosen so far.
            weights = dists_to_closest_centroid / gs.sum(
                dists_to_closest_centroid)
            # presumably scipy.stats.rv_discrete: draws one sample index
            # according to weights -- verify against the file's imports.
            index = rv_discrete(values=(indices, weights)).rvs()
            centroids.append(gs.expand_dims(X[index], 0))
    else:
        # Any other value of self.init: all centroids drawn at random.
        centroids = [
            gs.expand_dims(X[randint(0, n_samples - 1)], 0)
            for i in range(self.n_clusters)
        ]
    self.centroids = gs.concatenate(centroids, axis=0)
    # Separate concatenation, so later in-place updates of
    # self.centroids do not touch the stored initial centroids.
    self.init_centroids = gs.concatenate(centroids, axis=0)

    # Initial labelling against the initial centroids.
    dists = [
        gs.to_ndarray(self.metric.dist(self.centroids[i], X), 2, 1)
        for i in range(self.n_clusters)
    ]
    dists = gs.hstack(dists)
    self.labels = gs.argmin(dists, 1)
    index = 0
    while index < self.max_iter:
        index += 1
        if self.verbose > 0:
            logging.info(f"Iteration {index}...")
        old_centroids = gs.copy(self.centroids)
        for i in range(self.n_clusters):
            # NOTE(review): squeeze turns a single-member cluster into a
            # 1-D array, so len(fold) then counts features, not samples
            # -- confirm that FrechetMean.fit copes with this.
            fold = gs.squeeze(X[self.labels == i])
            if len(fold) > 0:
                mean = FrechetMean(
                    metric=self.metric,
                    max_iter=self.max_iter_mean,
                    point_type=self.point_type,
                    method=self.mean_method,
                    init_step_size=self.init_step_size,
                )
                mean.fit(fold)
                self.centroids[i] = mean.estimate_
            else:
                # Empty cluster: re-seed it from a random data point.
                self.centroids[i] = X[randint(0, n_samples - 1)]

        # Relabel against the updated centroids.
        dists = [
            gs.to_ndarray(self.metric.dist(self.centroids[i], X), 2, 1)
            for i in range(self.n_clusters)
        ]
        dists = gs.hstack(dists)
        self.labels = gs.argmin(dists, 1)
        dists_to_closest_centroid = gs.amin(dists, 1)
        # Sum of squared distances to the closest centroid.
        self.inertia = gs.sum(dists_to_closest_centroid**2)
        # Convergence criterion: mean displacement of the centroids.
        centroids_distances = self.metric.dist(old_centroids, self.centroids)
        if self.verbose > 0:
            logging.info(
                f"Convergence criterion at the end of iteration {index} "
                f"is {gs.mean(centroids_distances)}.")
        if gs.mean(centroids_distances) < self.tol:
            if self.verbose > 0:
                logging.info(
                    f"Convergence reached after {index} iterations.")
            if self.n_clusters == 1:
                self.centroids = gs.squeeze(self.centroids, axis=0)
            return gs.copy(self.centroids)
        if index == self.max_iter:
            logging.warning(
                f"K-means maximum number of iterations {self.max_iter} reached. "
                "The mean may be inaccurate.")
            if self.n_clusters == 1:
                self.centroids = gs.squeeze(self.centroids, axis=0)
    return gs.copy(self.centroids)