Esempio n. 1
0
    def _update_centroids(self, x: DNDarray, matching_centroids: DNDarray):
        """
        Recompute every cluster center as the mean of the data points in ``x`` currently assigned to it.

        Parameters
        ----------
        x :  DNDarray
            Input data
        matching_centroids : DNDarray
            Array filled with indices ``i`` indicating to which cluster ``ci`` each sample point in ``x`` is assigned

        Returns
        -------
        DNDarray
            A copy of ``self._cluster_centers`` in which row ``i`` has been overwritten with the new center of
            cluster ``i``. The attribute ``self._cluster_centers`` is not reassigned by this method.
        """
        # upper clip bound for the member count; the lower bound of 1 avoids a division by zero
        # for clusters that currently have no points assigned
        max_count = ht.iinfo(ht.int64).max
        updated_centers = self._cluster_centers.copy()

        for cluster in range(self.n_clusters):
            # 0/1 mask marking the points that belong to this cluster
            membership = (matching_centroids == cluster).astype(ht.int64)

            # zero out all foreign points and count the members of the cluster
            masked_points = x * membership
            member_count = membership.sum(axis=0, keepdim=True).clip(1.0, max_count)

            # scale each contribution by the cluster size and add them up to obtain the mean
            contributions = masked_points / member_count
            updated_centers[cluster:cluster + 1, :] = contributions.sum(axis=0, keepdim=True)

        return updated_centers
Esempio n. 2
0
    def fit(self, X):
        """
        Run the k-means iteration on ``X`` and store the resulting centroids and labels on the estimator.

        The loop stops after ``self.max_iter`` iterations, or earlier once the summed squared centroid
        movement is at most ``self.tol`` (only checked if ``self.tol`` is not ``None``).

        Parameters
        ----------
        X : ht.DNDarray, shape=(n_samples, n_features)
            Training instances to cluster.

        Returns
        -------
        self
            The fitted estimator; ``_cluster_centers``, ``_labels``, ``_inertia`` and ``_n_iter`` are set
            as a side effect.

        Raises
        ------
        ValueError
            If ``X`` is not a ``ht.DNDarray``.
        """
        # input sanitation
        if not isinstance(X, ht.DNDarray):
            message = "input needs to be a ht.DNDarray, but was {}".format(type(X))
            raise ValueError(message)

        # set up the initial centroids and reset the iteration counter
        self._initialize_cluster_centers(X)
        self._n_iter = 0
        # placeholder labels; they only survive if the loop below performs no iteration at all
        labels = ht.zeros(X.shape[0], split=X.split, device=X.device, comm=X.comm)

        # add a trailing axis to the samples; the centroids are indexed by cluster along their third axis below
        X = X.expand_dims(axis=2)
        updated_centers = self._cluster_centers.copy()
        # upper clip bound for the member count; the lower bound of 1 avoids dividing by zero for empty clusters
        max_count = ht.iinfo(ht.int64).max

        for _ in range(self.max_iter):
            self._n_iter += 1
            # assign every sample to a centroid
            labels = self._fit_to_cluster(X)

            # recompute each centroid as the mean of its assigned samples
            for cluster in range(self.n_clusters):
                # 0/1 mask marking the samples that belong to this cluster
                membership = (labels == cluster).astype(ht.int64)

                member_sum = (X * membership).sum(axis=0, keepdim=True)
                member_count = membership.sum(axis=0, keepdim=True).clip(1.0, max_count)

                updated_centers[:, :, cluster:cluster + 1] = member_sum / member_count

            # summed squared centroid movement serves as the convergence measure
            shift = self._cluster_centers - updated_centers
            self._inertia = (shift ** 2).sum()
            self._cluster_centers = updated_centers.copy()

            converged = self.tol is not None and self._inertia <= self.tol
            if converged:
                break

        self._labels = labels.squeeze()

        return self
Esempio n. 3
0
    def test_iinfo(self):
        """Check the limits ``ht.iinfo`` reports for ``ht.int32`` and that invalid arguments raise ``TypeError``."""
        int32_info = ht.iinfo(ht.int32)
        self.assertEqual(int32_info.bits, 32)
        self.assertEqual(int32_info.max, 2 ** 31 - 1)
        self.assertEqual(int32_info.min, -(2 ** 31))

        # a float value, a floating-point heat type and a type name passed as a string must all be rejected
        for invalid in (1.0, ht.float64, 'int16'):
            with self.assertRaises(TypeError):
                ht.iinfo(invalid)