Esempio n. 1
0
def single_chunk_blobs():
    """Return an ``(X, y)`` blob dataset for clustering tests.

    The data is generated with ``chunks=100`` and a fixed seed, so both
    arrays are intended to sit in one block (this assumes the default
    sample count of ``make_blobs`` does not exceed 100 -- confirm against
    the library).  Useful for testing ``partial_fit`` methods.
    """
    features, labels = make_blobs(random_state=0, chunks=100)
    return features, labels
Esempio n. 2
0
def Xl_blobs_easy():
    """
    Build an ``(X, labels)`` tuple of well-separated blobs.

    Three centers are placed at (-7, -7), (0, 0) and (7, 7) with a tiny
    standard deviation (0.1), so telling the clusters apart is easy.
    The arrays are generated with a chunk size of 50 and a fixed seed.
    """
    blob_options = {
        "centers": np.array([[-7, -7], [0, 0], [7, 7]]),
        "cluster_std": 0.1,
        "chunks": 50,
        "random_state": 0,
    }
    features, labels = make_blobs(**blob_options)
    return features, labels
Esempio n. 3
0
    def _prep_data(self, reg=False):
        """Generate the dataset and store it on the instance.

        Sets ``n_samples``, ``chunk_size`` and ``n_chunks`` and then fills
        ``self.x`` / ``self.y`` with generated data.

        Parameters
        ----------
        reg : bool, default False
            When true, the data comes from ``make_regression``; otherwise
            it comes from ``make_blobs`` with two centers and a cluster
            standard deviation of 100.

        Returns
        -------
        self
            The instance itself, so calls can be chained.
        """
        self.n_samples = int(1e5)
        self.chunk_size = int(1e4)
        # Number of chunks needed to hold every sample, rounded up.
        chunk_ratio = self.n_samples / self.chunk_size
        self.n_chunks = np.ceil(chunk_ratio).astype(int)

        # Keyword arguments shared by both data generators.
        shared_kwargs = dict(
            n_samples=self.n_samples,
            chunks=self.chunk_size,
            random_state=0,
            n_features=40,
        )

        if not reg:
            self.x, self.y = make_blobs(
                centers=2, cluster_std=100, **shared_kwargs
            )
            return self

        self.x, self.y = make_regression(**shared_kwargs)
        return self
Esempio n. 4
0
from functools import partial

import numpy as np
import pytest
import sklearn.cluster

from dask_ml import metrics
from dask_ml.cluster import SpectralClustering
from dask_ml.datasets import make_blobs

# Module-level dataset shared by tests in this module (e.g. ``test_basic``):
# 200 samples generated with a chunk size of 100 and a fixed seed so that
# runs are reproducible.
X, y = make_blobs(n_samples=200, chunks=100, random_state=0)


@pytest.mark.parametrize("as_ndarray", [False, True])
@pytest.mark.parametrize("persist_embedding", [True, False])
def test_basic(as_ndarray, persist_embedding):
    """Fit ``SpectralClustering`` and check one label is produced per sample.

    The test runs for every combination of ``persist_embedding`` and input
    type: the module-level ``X`` as-is, or the result of ``X.compute()``
    when ``as_ndarray`` is true.
    """
    model = SpectralClustering(
        n_components=25, random_state=0, persist_embedding=persist_embedding
    )
    # Materialize the collection only when the concrete-array path is tested.
    data = X.compute() if as_ndarray else X
    model.fit(data)
    assert len(model.labels_) == len(data)


@pytest.mark.parametrize(
    "assign_labels", [sklearn.cluster.KMeans(n_init=2), "sklearn-kmeans"]
)
def test_sklearn_kmeans(assign_labels):