Example #1
0
    def test_sparse(self):
        """ Checks that K-means yields identical centers and labels when
        fitted on sparse and dense representations of the same data. """
        svm_file = "tests/files/libsvm/2"

        # Load the same file twice: once sparse, once dense.
        sparse_x, _ = ds.load_svmlight_file(svm_file, (10, 300), 780, True)
        dense_x, _ = ds.load_svmlight_file(svm_file, (10, 300), 780, False)

        model_sp = KMeans(random_state=170)
        labels_sp = model_sp.fit_predict(sparse_x).collect()
        centers_sp = model_sp.centers.toarray()

        model_ds = KMeans(random_state=170)
        labels_ds = model_ds.fit_predict(dense_x).collect()
        centers_ds = model_ds.centers

        # Same centroids (up to float tolerance) and identical assignments.
        self.assertTrue(np.allclose(centers_sp, centers_ds))
        self.assertTrue(np.array_equal(labels_sp, labels_ds))
Example #2
0
    def test_kmeans(self):
        """ Fits K-means on a regular ds-array and on a Hecuba-backed
        ds-array and verifies both produce the same result. """
        # Reset Hecuba storage so the test starts from a clean state.
        config.session.execute("TRUNCATE TABLE hecuba.istorage")
        config.session.execute("DROP KEYSPACE IF EXISTS hecuba_dislib")

        samples, classes = make_blobs(n_samples=1500, random_state=170)
        # Unevenly sized clusters: 500 / 100 / 10 samples.
        filtered = np.vstack((samples[classes == 0][:500],
                              samples[classes == 1][:100],
                              samples[classes == 2][:10]))

        bsize = (filtered.shape[0] // 10, filtered.shape[1])

        data = ds.array(filtered, block_size=bsize)
        data_hecuba = ds.array(x=filtered, block_size=bsize)
        data_hecuba.make_persistent(name="hecuba_dislib.test_array")

        model = KMeans(n_clusters=3, random_state=170)
        labels = model.fit_predict(data).collect()

        model_hecuba = KMeans(n_clusters=3, random_state=170)
        labels_hecuba = model_hecuba.fit_predict(data_hecuba).collect()

        self.assertTrue(np.allclose(model.centers, model_hecuba.centers))
        self.assertTrue(np.allclose(labels, labels_hecuba))
Example #3
0
    def test_fit_predict(self):
        """ Checks fit_predict against scikit-learn's K-means and against
        known-good cluster centers. """
        samples, classes = make_blobs(n_samples=1500, random_state=170)
        # Unevenly sized clusters: 500 / 100 / 10 samples.
        filtered = np.vstack((samples[classes == 0][:500],
                              samples[classes == 1][:100],
                              samples[classes == 2][:10]))

        data = ds.array(filtered, block_size=(300, 2))

        model = KMeans(n_clusters=3, random_state=170)
        labels = model.fit_predict(data).collect()

        reference = SKMeans(n_clusters=3, random_state=170)
        ref_labels = reference.fit_predict(filtered)

        # Precomputed centers for this dataset and random_state.
        expected_centers = np.array(
            [[-8.941375656533449, -5.481371322614891],
             [-4.524023204953875, 0.06235042593214654],
             [2.332994701667008, 0.37681003933082696]])

        self.assertTrue(np.allclose(expected_centers, model.centers))
        self.assertTrue(np.allclose(labels, ref_labels))
Example #4
0
def _fit_and_plot(subplot, points, n_clusters, random_state, title):
    """Fit dislib K-means on *points* and draw the clustering on *subplot*.

    Parameters
    ----------
    subplot : int
        Three-digit matplotlib subplot code (e.g. 221).
    points : ndarray, shape=(n_samples, 2)
        2-D samples to cluster and scatter-plot.
    n_clusters : int
        Number of clusters for K-means.
    random_state : int
        Seed forwarded to KMeans for reproducibility.
    title : str
        Subplot title.
    """
    dis_points = ds.array(points, block_size=(300, 2))

    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    y_pred = kmeans.fit_predict(dis_points).collect()

    plt.subplot(subplot)
    plt.scatter(points[:, 0], points[:, 1], c=y_pred)
    centers = kmeans.centers
    # Highlight the fitted centroids in red on top of the samples.
    plt.scatter(centers[:, 0], centers[:, 1], c="red")
    plt.title(title)


def main():
    """
    Usage example copied from scikit-learn's webpage.

    Fits dislib K-means on four synthetic datasets — wrong cluster count,
    anisotropic data, unequal variance, and unevenly sized blobs — and
    plots each result in one quadrant of a 2x2 figure.
    """
    plt.figure(figsize=(12, 12))

    n_samples = 1500
    random_state = 170
    x, y = make_blobs(n_samples=n_samples, random_state=random_state)

    # Incorrect number of clusters
    _fit_and_plot(221, x, 2, random_state, "Incorrect Number of Blobs")

    # Anisotropicly distributed data
    transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
    x_aniso = np.dot(x, transformation)
    _fit_and_plot(222, x_aniso, 3, random_state,
                  "Anisotropicly Distributed Blobs")

    # Different variance
    x_varied, _ = make_blobs(n_samples=n_samples,
                             cluster_std=[1.0, 2.5, 0.5],
                             random_state=random_state)
    _fit_and_plot(223, x_varied, 3, random_state, "Unequal Variance")

    # Unevenly sized blobs: 500 / 100 / 10 samples per cluster
    x_filtered = np.vstack((x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]))
    _fit_and_plot(224, x_filtered, 3, random_state, "Unevenly Sized Blobs")

    plt.show()
Example #5
0
    def _initialize_parameters(self, x, random_state):
        """Initialization of the Gaussian mixture parameters.

        User-supplied ``weights_init``, ``means_init`` and
        ``precisions_init`` are taken as-is; any parameter the user did
        not supply is estimated from an initial responsibility matrix
        built either with K-means or at random (``self.init_params``).

        Parameters
        ----------
        x : ds-array, shape=(n_samples, n_features)
            Data points.

        random_state : RandomState
            A random number generator instance.
        """
        # User-provided weights are normalized so they sum to 1.
        if self.weights_init is not None:
            self.weights_ = self.weights_init / np.sum(self.weights_init)
        if self.means_init is not None:
            self.means_ = self.means_init
        if self.precisions_init is not None:
            # Precisions are stored as Cholesky factors; the layout depends
            # on the covariance type.
            if self.covariance_type == 'full':
                self.precisions_cholesky_ = np.array(
                    [linalg.cholesky(prec_init, lower=True)
                     for prec_init in self.precisions_init])
            elif self.covariance_type == 'tied':
                self.precisions_cholesky_ = linalg.cholesky(
                    self.precisions_init, lower=True)
            else:
                # Other types ('diag'/'spherical'?) store the input directly
                # — presumably element-wise values needing no factorization;
                # TODO confirm against the covariance-type convention used
                # elsewhere in this class.
                self.precisions_cholesky_ = self.precisions_init
        # Estimate whichever parameters the user did not provide.
        initialize_params = (self.weights_init is None or
                             self.means_init is None or
                             self.precisions_init is None)
        if initialize_params:
            n_components = self.n_components
            resp_blocks = []
            if self.init_params == 'kmeans':
                if self.verbose:
                    print("KMeans initialization start")
                # Derive a reproducible seed for K-means from random_state.
                seed = random_state.randint(0, int(1e8))
                kmeans = KMeans(n_clusters=n_components, random_state=seed,
                                verbose=self.verbose)
                y = kmeans.fit_predict(x)
                self.kmeans = kmeans
                # Build one responsibility block per row-block of labels.
                for y_part in y._iterator(axis=0):
                    resp_blocks.append([_resp_subset(y_part._blocks,
                                                     n_components)])

            elif self.init_params == 'random':
                chunks = x._n_blocks[0]
                # One independent seed per row-block so blocks are generated
                # deterministically even if computed out of order.
                seeds = random_state.randint(np.iinfo(np.int32).max,
                                             size=chunks)
                for i, x_row in enumerate(x._iterator(axis=0)):
                    resp_blocks.append([_random_resp_subset(x_row.shape[0],
                                                            n_components,
                                                            seeds[i])])
            else:
                raise ValueError("Unimplemented initialization method '%s'"
                                 % self.init_params)
            # Assemble the per-block responsibilities into one
            # (n_samples, n_components) ds-array matching x's row blocking.
            resp = Array(blocks=resp_blocks,
                         top_left_shape=(x._top_left_shape[0], n_components),
                         reg_shape=(x._reg_shape[0], n_components),
                         shape=(x.shape[0], n_components), sparse=False)
            weights, nk, means = self._estimate_parameters(x, resp)
            # Only fill in what the user did not supply explicitly.
            if self.means_init is None:
                self.means_ = means
            if self.weights_init is None:
                self.weights_ = weights

            if self.precisions_init is None:
                cov, p_c = _estimate_covariances(x, resp, nk,
                                                 self.means_, self.reg_covar,
                                                 self.covariance_type,
                                                 self.arity)
                self.covariances_ = cov
                self.precisions_cholesky_ = p_c

            # resp is temporary; release its blocks through COMPSs.
            # NOTE(review): resp._blocks is iterated at the row level here —
            # verify compss_delete_object handles a list of blocks.
            for resp_block in resp._blocks:
                compss_delete_object(resp_block)