Esempio n. 1
0
    def predict(self, data):
        """Assign every sample to its nearest cluster center.

        Parameters
        ----------
        data : array-like, shape=[n_samples, dim,]
            Samples to label, where n_samples is the number of samples and
            dim is the number of dimensions.

        Returns
        -------
        labels : array-like, shape=[n_samples,]
            For each sample, the position in ``self.cluster_centers_`` of
            the center with the smallest ``self.metric.dist``. The values
            are stored in an array allocated with ``gs.zeros`` and therefore
            carry that array's dtype.
        """
        labels = gs.zeros(len(data))

        for sample_idx, sample in enumerate(data):
            centers = self.cluster_centers_
            # Distance from this sample to each center, filled one by one.
            to_centers = gs.zeros(len(centers))
            for center_idx, center in enumerate(centers):
                to_centers[center_idx] = self.metric.dist(sample, center)

            labels[sample_idx] = gs.argmin(to_centers)

        return labels
def find_normalization_factor(variances, variances_range,
                              normalization_factor_var):
    """Find the normalization factor given some variances.

    For every entry of ``variances`` the closest value (in absolute
    difference) of ``variances_range`` is located, and the entry of
    ``normalization_factor_var`` stored at that same position is returned.

    Parameters
    ----------
    variances : array-like, shape=[n_gaussians,]
        Array of standard deviations for each component
        of some GMM.
    variances_range : array-like, shape=[n_variances,]
        Array of standard deviations at which the normalization
        factor was tabulated.
    normalization_factor_var : array-like, shape=[n_variances,]
        Array of computed normalization factor, aligned with
        ``variances_range``.

    Returns
    -------
    norm_factor : array-like, shape=[n_gaussians,]
        Array of normalization factors for the given
        variances.
    """
    # A (1, n_variances) row broadcast against an (n_gaussians, 1) column
    # gives the full (n_gaussians, n_variances) difference table without
    # materialising tiled copies of either operand.
    tabulated = gs.expand_dims(variances_range, 0)
    requested = gs.expand_dims(variances, 1)
    difference = gs.abs(tabulated - requested)

    # Position of the nearest tabulated value for each requested one.
    index = gs.argmin(difference, axis=-1)

    return normalization_factor_var[index]
Esempio n. 3
0
    def closest_neighbor_index(self, point, neighbors):
        """Find the index of the nearest neighbor of one or several points.

        Parameters
        ----------
        point : array-like, shape=[..., dim]
            Point, or batch of points when it has as many dimensions as
            ``neighbors``.
        neighbors : array-like, shape=[n_neighbors, dim]
            Neighbors.

        Returns
        -------
        closest_neighbor_index : int or array-like, shape=[n_points,]
            Index of closest neighbor. A single index is returned when only
            one point is processed, otherwise one index per point.
        """
        same_rank = gs.ndim(point) == gs.ndim(neighbors)
        n_points = point.shape[0] if same_rank else 1
        n_neighbors = neighbors.shape[0]

        if n_points > 1 and n_neighbors > 1:
            # Build every (point, neighbor) pair: each neighbor is repeated
            # n_points times in a row while the block of points is tiled
            # n_neighbors times, so both arrays line up row by row.
            neighbors = gs.repeat(neighbors, n_points, axis=0)
            point = gs.concatenate([point] * n_neighbors)

        pair_dists = self.dist(point, neighbors)
        # Rows index points and columns index neighbors after the transpose.
        per_point = gs.transpose(
            gs.reshape(pair_dists, (n_neighbors, n_points)))
        nearest = gs.argmin(per_point, axis=1)

        return nearest[0] if n_points == 1 else nearest
Esempio n. 4
0
    def predict(self, X):
        """Label each data point with the index of its nearest centroid.

        Distances are measured with ``self.metric.dist`` between every
        centroid in ``self.centroids`` and the input data.

        Parameters
        ----------
        X : array-like, shape=[..., n_features]
            Input data.

        Returns
        -------
        labels : array-like, shape=[...,]
            Array of predicted cluster indices for each sample.

        Raises
        ------
        RuntimeError
            If ``self.centroids`` is None, i.e. no centroids are available.
        """
        if self.centroids is None:
            raise RuntimeError("fit needs to be called first.")

        per_centroid = []
        for centroid in self.centroids:
            per_centroid.append(self.metric.dist(centroid, X))

        # NOTE(review): squeeze removes every singleton axis, so with a
        # single centroid or a single sample the last axis may no longer be
        # the cluster axis - confirm against callers.
        table = gs.squeeze(gs.stack(per_centroid, axis=1))

        return gs.argmin(table, -1)
Esempio n. 5
0
def _circle_mean(points):
    """Determine the mean on a circle.

    Data are expected in radians in the range [-pi, pi). The mean is returned
    in the same range. If the mean is unique, this algorithm is guaranteed to
    find it. It is not vulnerable to local minima of the Frechet function. If
    the mean is not unique, the algorithm only returns one of the means. Which
    mean is returned depends on numerical rounding errors.

    Inputs with more than one dimension are first converted with
    ``Hypersphere.extrinsic_to_angle``; one-dimensional inputs are copied and
    used as they are.

    Reference
    ---------
    ..[HH15]     Hotz, T. and S. F. Huckemann (2015), "Intrinsic means on the circle:
                 Uniqueness, locus and asymptotics", Annals of the Institute of
                 Statistical Mathematics 67 (1), 177–193.
                 https://arxiv.org/abs/1108.2141
    """
    angles = (
        Hypersphere.extrinsic_to_angle(points)
        if points.ndim > 1
        else gs.copy(points)
    )
    n_angles = angles.shape[0]
    plain_mean = gs.mean(angles)
    sum_sq_dev = gs.sum((angles - plain_mean) ** 2)

    # Presumably each row is (candidate mean, its variance) - the row whose
    # second column is smallest is selected and its first column returned.
    candidates = _circle_variances(
        plain_mean, sum_sq_dev, n_angles, gs.sort(angles))
    best_row = gs.argmin(candidates[:, 1])
    return candidates[best_row, 0]
Esempio n. 6
0
    def predict(self, X, fuzzy_predictions=False):
        """Predict the labels for each data point.

        Label each data point with the cluster having the nearest
        centroid using metric distance, or, in fuzzy mode, return the
        membership weight of each data point for every cluster.

        Parameters
        ----------
        X : array-like, shape=[..., n_features]
            Input data.
        fuzzy_predictions : bool
            If True, return membership weights instead of hard labels.
            Optional, default: False.

        Returns
        -------
        belongs : array-like
            With ``fuzzy_predictions=False``, array of predicted cluster
            indices for each sample. With ``fuzzy_predictions=True``, array
            with one row per sample and one column per centroid holding
            weights inversely proportional to the distances and normalised
            to sum to one along each row.

        Raises
        ------
        RuntimeError
            If ``self.centroids`` is None, i.e. no centroids are available.
        """
        if self.centroids is None:
            raise RuntimeError('fit needs to be called first.')
        dists = gs.stack(
            [self.metric.dist(centroid, X)
             for centroid in self.centroids],
            axis=1)
        dists = gs.squeeze(dists)

        if not fuzzy_predictions:
            return gs.argmin(dists, -1)

        # A sample lying exactly on a centroid would divide by zero below;
        # substitute a small positive distance instead.
        dists = np.where(dists == 0, 0.00001, dists)
        return 1 / (dists * np.sum(1 / dists, axis=1)[:, None])
Esempio n. 7
0
    def _update_medoid_indexes(self, distances, labels, medoid_indices):
        """Move each cluster's medoid to its cheapest member, in place.

        For every cluster, the sub-matrix of ``distances`` restricted to the
        cluster's members is summed along axis 1 to give one cost per member.
        When some member has a strictly lower cost than the current medoid,
        the matching entry of ``medoid_indices`` is overwritten. Empty
        clusters are skipped with a warning. Nothing is returned.

        Parameters
        ----------
        distances : array-like
            Distances between samples, indexed by pairs of sample indices
            (presumably the square pairwise distance matrix - confirm).
        labels : array-like
            Cluster index of each sample.
        medoid_indices : array-like, shape=[n_clusters,]
            Sample index of the current medoid of each cluster; modified
            in place.
        """
        for cluster_id in range(self.n_clusters):
            members = gs.where(labels == cluster_id)[0]

            if len(members) == 0:
                logging.warning('One cluster is empty.')
                continue

            # Distances restricted to the members of this cluster.
            within = distances[members, gs.expand_dims(members, axis=-1)]
            costs = gs.sum(within, axis=1)

            best_position = gs.argmin(costs)
            # argmax over the boolean mask gives the position of the current
            # medoid among the members (position 0 if it is not a member).
            current_position = gs.argmax(
                members == medoid_indices[cluster_id])

            if costs[best_position] < costs[current_position]:
                medoid_indices[cluster_id] = members[best_position]
Esempio n. 8
0
    def closest_neighbor_index(self, point, neighbors):
        """Return the index of the neighbor closest to ``point``.

        The distances from ``point`` to ``neighbors`` are computed with
        ``self.dist`` and the position of the smallest one is returned
        (``gs.argmin`` is called without an axis argument).
        """
        return gs.argmin(self.dist(point, neighbors))
Esempio n. 9
0
    def fit(self, X, max_iter=100):
        """Compute the cluster centroids of the training data.

        Centroids are initialised on randomly drawn samples. Each iteration
        assigns every sample to its closest centroid in terms of
        riemannian_metric distance, then replaces each centroid by
        ``riemannian_metric.mean`` of its samples (or by a randomly drawn
        sample when the cluster is empty). Iteration stops once the mean
        displacement of the centroids falls below ``self.tol`` or after
        ``max_iter`` iterations.

        Parameters
        ----------
        X : array-like, shape=[n_samples, n_features]
            Training data, where n_samples is the number of samples
            and n_features is the number of features.

        max_iter : int
            Maximum number of iterations.
            Optional, default: 100.

        Returns
        -------
        centroids : array-like
            Copy of the centroids array, which is also stored in
            ``self.centroids``.
        """
        n_samples = X.shape[0]
        # NOTE(review): assumes randint has inclusive bounds like
        # random.randint - confirm against the file's imports.
        initial = [
            gs.expand_dims(X[randint(0, n_samples - 1)], 0)
            for _ in range(self.n_clusters)
        ]
        self.centroids = gs.concatenate(initial)

        index = 0
        while index < max_iter:
            index += 1

            # One column of distances per centroid.
            dists = gs.hstack([
                gs.to_ndarray(
                    self.riemannian_metric.dist(self.centroids[i], X), 2, 1)
                for i in range(self.n_clusters)
            ])
            belongs = gs.argmin(dists, 1)

            old_centroids = gs.copy(self.centroids)
            for i in range(self.n_clusters):
                fold = gs.squeeze(X[belongs == i])
                if len(fold) > 0:
                    self.centroids[i] = self.riemannian_metric.mean(fold)
                else:
                    # Empty cluster: restart it on a random sample.
                    self.centroids[i] = X[randint(0, n_samples - 1)]

            centroids_distances = self.riemannian_metric.dist(
                old_centroids, self.centroids)

            if gs.mean(centroids_distances) < self.tol:
                if self.verbose > 0:
                    print("Convergence Reached after ", index, " iterations")
                break

        return gs.copy(self.centroids)
Esempio n. 10
0
    def closest_neighbor_index(self, point, neighbors):
        """Closest neighbor of point among neighbors.

        Parameters
        ----------
        point : array-like
            Point whose nearest neighbor is searched.
        neighbors : array-like
            Candidate neighbors, passed together with ``point`` to
            ``self.dist``.

        Returns
        -------
        closest_neighbor_index
            Result of ``gs.argmin`` over the distances between ``point``
            and ``neighbors``.
        """
        return gs.argmin(self.dist(point, neighbors))
Esempio n. 11
0
    def fit(self, data, max_iter=100):
        """Provide clusters centroids and data labels.

        Labels data by minimizing the distance between data points
        and cluster centroids chosen from the data points.
        Minimization is performed by swapping the centroids and data points.

        The results are stored in ``self.cluster_centers_``,
        ``self.labels_`` and ``self.medoid_indices_``.

        Parameters
        ----------
        data : array-like, shape=[n_samples, dim]
            Training data, where n_samples is the number of samples and
            dim is the number of dimensions.
        max_iter : int
            Maximum number of iterations.
            Optional, default: 100.

        Returns
        -------
        cluster_centers_ : array-like, shape=[n_clusters, dim]
            Centroids, i.e. the rows of ``data`` selected as medoids.
        """
        distances = self.metric.dist_pairwise(data)

        medoids_indices = self._initialize_medoids(distances)

        for iteration in range(max_iter):
            old_medoids_indices = gs.copy(medoids_indices)

            # Assign each sample to its closest current medoid.
            labels = gs.argmin(distances[medoids_indices, :], axis=0)

            # Updates medoids_indices in place.
            self._update_medoid_indexes(distances, labels, medoids_indices)

            if gs.all(old_medoids_indices == medoids_indices):
                break
            if iteration == max_iter - 1:
                logging.warning('Maximum number of iteration reached before '
                                'convergence. Consider increasing max_iter to '
                                'improve the fit.')

        # Label against the final medoids. This keeps labels_ consistent
        # with cluster_centers_ when the loop stops without converging, and
        # defines labels when max_iter is 0 and the loop never runs. After
        # convergence the medoids are unchanged, so the labels are too.
        labels = gs.argmin(distances[medoids_indices, :], axis=0)

        self.cluster_centers_ = data[medoids_indices]
        self.labels_ = labels
        self.medoid_indices_ = medoids_indices

        return self.cluster_centers_
Esempio n. 12
0
    def find_variance_from_index(weighted_distances, variances_range,
                                 phi_inv_var):
        r"""Return the variance given weighted distances.

        For every weighted distance the closest entry (in absolute
        difference) of ``phi_inv_var`` is located, and the entry of
        ``variances_range`` at that same position is returned.

        Parameters
        ----------
        weighted_distances : array-like, shape=[n_gaussians,]
            Mean of the weighted distances between training data
            and current barycentres. The weights of each data sample
            corresponds to the probability of belonging to a component
            of the Gaussian mixture model.
        variances_range : array-like, shape=[n_variances,]
            Array of standard deviations.
        phi_inv_var : array-like, shape=[n_variances,]
            Array of the computed inverse of a function phi
            whose expression is closed-form
            :math:`\sigma \mapsto \sigma^3 \times \frac{d}{d\sigma}
            \log \zeta_m(\sigma)`
            where :math:`\sigma` denotes the variance
            and :math:`\zeta` the normalization coefficient
            and :math:`m` the dimension.

        Returns
        -------
        var : array-like, shape=[n_gaussians,]
            Estimated variances for each component of the GMM.
        """
        n_gaussians = weighted_distances.shape[0]
        precision = variances_range.shape[0]

        # Tile both operands onto a common (n_gaussians, precision) grid.
        tabulated = gs.repeat(
            gs.expand_dims(phi_inv_var, 0), n_gaussians, axis=0)
        observed = gs.repeat(
            gs.expand_dims(weighted_distances, 1), precision, axis=1)

        nearest = gs.argmin(gs.abs(tabulated - observed), -1)

        return variances_range[nearest]
Esempio n. 13
0
    def closest_neighbor_index(self, point, neighbors):
        """Find the neighbor closest to a point.

        Parameters
        ----------
        point : array-like, shape=[..., dim]
            Point.
        neighbors : array-like, shape=[..., dim]
            Neighbors.

        Returns
        -------
        closest_neighbor_index : int
            Index of closest neighbor, i.e. the position of the smallest
            value returned by ``self.dist(point, neighbors)``.
        """
        return gs.argmin(self.dist(point, neighbors))
Esempio n. 14
0
    def predict(self, X):
        """Predict for each data point the closest center.

        Closeness is measured with the riemannian_metric distance to each
        of the first ``self.n_clusters`` entries of ``self.centroids``.

        Parameters
        ----------
        X : array-like, shape=[n_samples, n_features]
            data, where n_samples is the number of samples
            and n_features is the number of features.

        Returns
        -------
        belongs : array-like
            Array containing for each point the index of the associated
            cluster.
        """
        columns = []
        for cluster in range(self.n_clusters):
            columns.append(
                self.riemannian_metric.dist(self.centroids[cluster], X))

        # Presumably each distance is a column vector, so hstack yields one
        # column per cluster - confirm against riemannian_metric.dist.
        return gs.argmin(gs.hstack(columns), -1)
Esempio n. 15
0
    def predict(self, X):
        """Predict the labels for each data point.

        Label each data point with the cluster having the nearest
        centroid using riemannian_metric distance.

        Parameters
        ----------
        X : array-like, shape=[n_samples, n_features]
            Input data

        Returns
        -------
        belongs : array-like, shape=[n_samples,]
            Array of predicted cluster indices for each sample

        Raises
        ------
        AssertionError
            If ``self.centroids`` is None, i.e. no centroids are available.
        """
        # Raised explicitly rather than through an ``assert`` statement so
        # the check is not stripped when Python runs with -O; the exception
        # type is kept for callers that already catch it.
        if self.centroids is None:
            raise AssertionError('fit needs to be called first')

        dists = gs.hstack([self.riemannian_metric.dist(centroid, X)
                           for centroid in self.centroids])
        return gs.argmin(dists, -1)
Esempio n. 16
0
    def predict(self, X):
        """Classify input data by its nearest training sample.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        y : ndarray, shape (n_samples,)
            The label for each sample is the label of the closest sample
            seen during fit.
        """
        # Refuse to predict unless the fitted attributes exist.
        check_is_fitted(self, ["X_", "y_"])

        # Validate the input before measuring distances.
        validated = check_array(X)

        # One row per input sample, one column per stored training sample.
        to_training = euclidean_distances(validated, self.X_)
        nearest = gs.argmin(to_training, axis=1)
        return self.y_[nearest]
Esempio n. 17
0
    def predict(self, X):
        """Predict the label of each sample from its nearest training sample.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        y : ndarray, shape (n_samples,)
            The label for each sample is the label of the closest sample
            seen during fit.
        """
        # Fail early when fit has not stored X_ and y_ yet.
        check_is_fitted(self, ['X_', 'y_'])

        # Input validation.
        checked = check_array(X)

        # Index of the closest stored sample for every input row.
        pairwise = euclidean_distances(checked, self.X_)
        closest_rows = gs.argmin(pairwise, axis=1)
        return self.y_[closest_rows]
Esempio n. 18
0
    def fit(self, X, max_iter=100):
        """Provide clusters centroids and data labels.

        Alternate between computing the mean of each cluster
        and labelling data according to the new positions of the centroids.
        Centroids start on randomly drawn samples; a cluster left without
        samples is restarted on a randomly drawn sample. Iteration stops once
        the mean displacement of the centroids falls below ``self.tol``.

        Parameters
        ----------
        X : array-like, shape=[n_samples, n_features]
            Training data, where n_samples is the number of samples and
            n_features is the number of features.

        max_iter : int
            Maximum number of iterations.
            Optional, default: 100.

        Returns
        -------
        centroids : array-like
            Copy of the centroids array, which is also stored in
            ``self.centroids``.
        """
        n_samples = X.shape[0]
        # NOTE(review): assumes randint has inclusive bounds like
        # random.randint - confirm against the file's imports.
        initial = [
            gs.expand_dims(X[randint(0, n_samples - 1)], 0)
            for _ in range(self.n_clusters)
        ]
        self.centroids = gs.concatenate(initial)

        index = 0
        while index < max_iter:
            index += 1

            # One column of distances per centroid.
            dists = gs.hstack([
                gs.to_ndarray(
                    self.riemannian_metric.dist(self.centroids[i], X), 2, 1)
                for i in range(self.n_clusters)
            ])
            belongs = gs.argmin(dists, 1)

            old_centroids = gs.copy(self.centroids)
            for i in range(self.n_clusters):
                fold = gs.squeeze(X[belongs == i])

                if len(fold) > 0:
                    mean = FrechetMean(metric=self.riemannian_metric,
                                       method=self.mean_method,
                                       max_iter=150)
                    mean.fit(fold)

                    self.centroids[i] = mean.estimate_
                else:
                    # Empty cluster: restart it on a random sample.
                    self.centroids[i] = X[randint(0, n_samples - 1)]

            centroids_distances = self.riemannian_metric.dist(
                old_centroids, self.centroids)

            if gs.mean(centroids_distances) < self.tol:
                if self.verbose > 0:
                    logging.info('Convergence reached after {} '
                                 'iterations'.format(index))

                return gs.copy(self.centroids)

        # Only reached when the loop ended without meeting the tolerance.
        if index == max_iter:
            logging.warning('K-means maximum number of iterations {} reached. '
                            'The mean may be inaccurate'.format(max_iter))

        return gs.copy(self.centroids)
Esempio n. 19
0
    def fit(self, X, max_iter=100):
        """Provide clusters centroids and data labels.

        Alternate between computing the mean of each cluster
        and labelling data according to the new positions of the centroids.
        When ``self.fuzzy`` is set, every centroid is instead the weighted
        mean of all samples, with weights inversely proportional to the
        squared distances. When ``self.n_clusters`` is 1 the leading axis of
        the centroids array is squeezed out before returning.

        Parameters
        ----------
        X : array-like, shape=[..., n_features]
            Training data, where n_samples is the number of samples and
            n_features is the number of features.
        max_iter : int
            Maximum number of iterations.
            Optional, default: 100.

        Returns
        -------
        centroids : array-like
            Copy of the centroids, which are also stored in
            ``self.centroids``.
        """
        def _new_mean_estimator():
            # Same estimator configuration for the fuzzy and hard variants.
            return FrechetMean(
                metric=self.metric,
                method=self.mean_method,
                max_iter=150,
                lr=self.lr,
                point_type=self.point_type)

        n_samples = X.shape[0]
        seeds = [gs.expand_dims(X[randint(0, n_samples - 1)], 0)
                 for _ in range(self.n_clusters)]
        self.centroids = gs.concatenate(seeds, axis=0)

        converged = False
        index = 0
        while index < max_iter:
            index += 1

            # One column of squared distances per centroid.
            dists = gs.hstack([
                gs.to_ndarray(
                    self.metric.dist(self.centroids[i], X), 2, 1)**2
                for i in range(self.n_clusters)])

            if self.fuzzy:
                # Avoid dividing by zero for samples lying on a centroid.
                dists[np.where(dists == 0)] = 0.00001
                weights = 1 / (dists * np.sum(1 / dists, axis=1)[:, None])
            else:
                belongs = gs.argmin(dists, 1)

            old_centroids = gs.copy(self.centroids)
            for i in range(self.n_clusters):
                if self.fuzzy:
                    mean = _new_mean_estimator()
                    mean.fit(X, weights=weights[:, i])
                    self.centroids[i] = mean.estimate_
                    continue

                fold = gs.squeeze(X[belongs == i])
                if len(fold) > 0:
                    mean = _new_mean_estimator()
                    mean.fit(fold)
                    self.centroids[i] = mean.estimate_
                else:
                    # Empty cluster: restart it on a random sample.
                    self.centroids[i] = X[randint(0, n_samples - 1)]

            centroids_distances = self.metric.dist(
                old_centroids, self.centroids)

            if gs.mean(centroids_distances) < self.tol:
                converged = True
                break

        if converged:
            if self.verbose > 0:
                logging.info('Convergence reached after {} '
                             'iterations'.format(index))
        elif index == max_iter:
            logging.warning('K-means maximum number of iterations {} reached. '
                            'The mean may be inaccurate'.format(max_iter))

        if self.n_clusters == 1:
            self.centroids = gs.squeeze(self.centroids, axis=0)
        return gs.copy(self.centroids)
Esempio n. 20
0
    def fit(self, X):
        """Provide clusters centroids and data labels.

        Alternate between computing the mean of each cluster
        and labelling data according to the new positions of the centroids.

        With ``self.init == "kmeans++"`` the first centroid is a randomly
        drawn sample and each further one is drawn with probability
        proportional to its distance to the closest centroid chosen so far;
        otherwise every centroid is a randomly drawn sample. The number of
        iterations is bounded by ``self.max_iter`` and iteration stops early
        once the mean displacement of the centroids falls below ``self.tol``.

        The method stores ``self.centroids``, ``self.init_centroids``,
        ``self.labels`` and ``self.inertia``. When ``self.n_clusters`` is 1
        the leading axis of the centroids array is squeezed out before
        returning.

        Parameters
        ----------
        X : array-like, shape=[..., n_features]
            Training data, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        centroids : array-like
            Copy of the centroids, which are also stored in
            ``self.centroids``.
        """
        n_samples = X.shape[0]

        def _dists_to(centers):
            # One column of distances from every sample to each center.
            return gs.hstack([
                gs.to_ndarray(self.metric.dist(center, X), 2, 1)
                for center in centers
            ])

        def _current_centroids():
            return [self.centroids[i] for i in range(self.n_clusters)]

        if self.verbose > 0:
            logging.info("Initializing...")
        if self.init == "kmeans++":
            centroids = [gs.expand_dims(X[randint(0, n_samples - 1)], 0)]
            for _ in range(self.n_clusters - 1):
                dists_to_closest_centroid = gs.amin(_dists_to(centroids), 1)
                indices = gs.arange(n_samples)
                weights = dists_to_closest_centroid / gs.sum(
                    dists_to_closest_centroid)
                # NOTE(review): presumably scipy.stats.rv_discrete, sampling
                # one index according to the weights - confirm.
                index = rv_discrete(values=(indices, weights)).rvs()
                centroids.append(gs.expand_dims(X[index], 0))
        else:
            centroids = [
                gs.expand_dims(X[randint(0, n_samples - 1)], 0)
                for _ in range(self.n_clusters)
            ]
        self.centroids = gs.concatenate(centroids, axis=0)
        self.init_centroids = gs.concatenate(centroids, axis=0)

        self.labels = gs.argmin(_dists_to(_current_centroids()), 1)
        index = 0
        while index < self.max_iter:
            index += 1
            if self.verbose > 0:
                logging.info(f"Iteration {index}...")

            old_centroids = gs.copy(self.centroids)
            for i in range(self.n_clusters):
                fold = gs.squeeze(X[self.labels == i])

                if len(fold) > 0:
                    mean = FrechetMean(
                        metric=self.metric,
                        max_iter=self.max_iter_mean,
                        point_type=self.point_type,
                        method=self.mean_method,
                        init_step_size=self.init_step_size,
                    )
                    mean.fit(fold)

                    self.centroids[i] = mean.estimate_
                else:
                    # Empty cluster: restart it on a random sample.
                    self.centroids[i] = X[randint(0, n_samples - 1)]

            dists = _dists_to(_current_centroids())
            self.labels = gs.argmin(dists, 1)
            dists_to_closest_centroid = gs.amin(dists, 1)
            self.inertia = gs.sum(dists_to_closest_centroid**2)
            centroids_distances = self.metric.dist(old_centroids,
                                                   self.centroids)
            if self.verbose > 0:
                logging.info(
                    f"Convergence criterion at the end of iteration {index} "
                    f"is {gs.mean(centroids_distances)}.")

            if gs.mean(centroids_distances) < self.tol:
                if self.verbose > 0:
                    logging.info(
                        f"Convergence reached after {index} iterations.")

                if self.n_clusters == 1:
                    self.centroids = gs.squeeze(self.centroids, axis=0)

                return gs.copy(self.centroids)

        # Only reached when the loop ended without meeting the tolerance.
        if index == self.max_iter:
            logging.warning(
                f"K-means maximum number of iterations {self.max_iter} reached. "
                "The mean may be inaccurate.")

        if self.n_clusters == 1:
            self.centroids = gs.squeeze(self.centroids, axis=0)
        return gs.copy(self.centroids)