Ejemplo n.º 1
0
def test_score(nrows, ncols, nclusters, n_parts, input_type, client):
    """Compare the distributed KMeans score with the combined model's score.

    Fits a dask KMeans on generated blobs and checks that ``score`` on the
    distributed input is within 1e-3 of the score reported by the model
    returned from ``get_combined_model()`` on the computed input.

    ``nrows``, ``ncols``, ``nclusters`` and ``n_parts`` are forwarded to
    ``make_blobs``. ``input_type`` selects "dataframe" (converted with
    ``to_dask_cudf``) or "array" (used as generated). ``client`` is not
    referenced in the body (presumably a fixture that keeps a dask client
    active -- confirm against the fixture definition).
    """
    from cuml.dask.cluster import KMeans as cumlKMeans

    from cuml.dask.datasets import make_blobs

    seed = 10

    X, y = make_blobs(
        n_samples=int(nrows),
        n_features=ncols,
        centers=nclusters,
        n_parts=n_parts,
        cluster_std=0.01,
        shuffle=False,
        random_state=seed,
    )

    if input_type == "dataframe":
        X_train, y_train = to_dask_cudf(X), to_dask_cudf(y)
        y = y_train
    elif input_type == "array":
        X_train, y_train = X, y

    dist_model = cumlKMeans(
        init="k-means||", n_clusters=nclusters, random_state=seed
    )
    dist_model.fit(X_train)

    distributed_score = dist_model.score(X_train)

    # Score the same rows through the model returned by
    # get_combined_model(), using the fully computed input.
    combined_model = dist_model.get_combined_model()
    combined_score = combined_model.score(X_train.compute())

    score_gap = abs(distributed_score - combined_score)
    assert score_gap < 1e-3
Ejemplo n.º 2
0
def test_transform(nrows, ncols, nclusters, n_parts, input_type, cluster):
    """Check the shape of ``transform`` output and its argmin labelling.

    Fits a dask KMeans on generated blobs, transforms the training data and
    asserts that the result has shape ``(nrows, nclusters)`` (or ``(nrows,)``
    when there is a single cluster). The per-row argmin of the transformed
    values is then scored against the generated labels with the adjusted
    Rand score.

    ``nrows``, ``ncols``, ``nclusters`` and ``n_parts`` are forwarded to
    ``make_blobs``. ``input_type`` selects "dataframe" (converted with
    ``to_dask_cudf``) or "array". ``cluster`` is passed to ``Client``.
    """
    # Connect before entering the try block: if the connection fails, its
    # own exception propagates instead of being masked by an AttributeError
    # from calling ``close()`` on an unset client in ``finally``.
    client = Client(cluster)

    try:

        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(n_samples=int(nrows),
                          n_features=ncols,
                          centers=nclusters,
                          n_parts=n_parts,
                          cluster_std=0.01,
                          shuffle=False,
                          random_state=10)
        y = y.astype('int64')

        wait(X)
        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            y_train = to_dask_cudf(y)
            labels = cp.squeeze(y_train.compute().to_pandas().values)
        elif input_type == "array":
            X_train, y_train = X, y
            labels = cp.squeeze(y_train.compute())

        cumlModel = cumlKMeans(init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_train)

        xformed = cumlModel.transform(X_train).compute()
        if input_type == "dataframe":
            # Bring the dataframe/series result into a cupy array so the
            # shape checks and argmin below work for both input types.
            xformed = cp.array(xformed
                               if len(xformed.shape) == 1
                               else xformed.as_gpu_matrix())

        if nclusters == 1:
            # series shape is (nrows,) not (nrows, 1) but both are valid
            # and equivalent for this test
            assert xformed.shape in [(nrows, nclusters), (nrows,)]
        else:
            assert xformed.shape == (nrows, nclusters)

        # The argmin of the transformed values should be equal to the labels
        # reshape is a quick manner of dealing with (nrows,) is not (nrows, 1)
        xformed_labels = cp.argmin(xformed.reshape((int(nrows),
                                                    int(nclusters))), axis=1)

        # NOTE(review): this only asserts that the score is truthy
        # (non-zero), not that it equals 1.0 -- confirm that is intended.
        assert sk_adjusted_rand_score(cp.asnumpy(labels),
                                      cp.asnumpy(xformed_labels))

    finally:
        client.close()
Ejemplo n.º 3
0
def test_weighted_kmeans(nrows, ncols, nclusters, n_parts, client):
    """Check that heavily weighted samples end up as the cluster centroids.

    Every sample gets a tiny weight except one sample per label, which gets
    a weight of 5000. After fitting with these sample weights, each row's
    L1 distance to its assigned center must be below 1.0 for the heavily
    weighted rows and above 1000.0 for all others.

    ``nrows``, ``ncols`` and ``n_parts`` are forwarded to ``make_blobs``;
    ``nclusters`` sets both the number of generated centers and
    ``n_clusters``. ``client`` is not referenced in the body (presumably a
    fixture that keeps a dask client active -- confirm).
    """
    # Using fairly high variance between points in clusters
    cluster_std = 10000.0
    from cuml.dask.cluster import KMeans as cumlKMeans

    from cuml.dask.datasets import make_blobs

    # Start every sample at a negligible weight.
    wt = cp.full(nrows, 0.00001)

    bound = nclusters * 100000

    # Open the space really large
    centers = cp.random.uniform(-bound, bound, size=(nclusters, ncols))

    X_cudf, y = make_blobs(n_samples=nrows,
                           n_features=ncols,
                           centers=centers,
                           n_parts=n_parts,
                           cluster_std=cluster_std,
                           shuffle=False,
                           verbose=False,
                           random_state=10)

    # Materialize the labels once; the loop below only reads them.
    true_labels = cp.array(y.compute())

    # Choose one sample from each label and increase its weight
    for i in range(nclusters):
        wt[cp.argmax(true_labels == i).item()] = 5000.0

    cumlModel = cumlKMeans(verbose=0,
                           init="k-means||",
                           n_clusters=nclusters,
                           random_state=10)

    chunk_parts = int(nrows / n_parts)
    sample_weights = da.from_array(wt, chunks=(chunk_parts, ))
    cumlModel.fit(X_cudf, sample_weight=sample_weights)

    X = X_cudf.compute()

    labels_ = cumlModel.predict(X_cudf).compute()
    cluster_centers_ = cumlModel.cluster_centers_

    for i in range(nrows):

        label = labels_[i]
        actual_center = cluster_centers_[label]

        diff = sum(abs(X[i] - actual_center))

        # The large weight should be the centroid
        if wt[i] > 1.0:
            assert diff < 1.0

        # Otherwise it should be pretty far away
        else:
            assert diff > 1000.0
Ejemplo n.º 4
0
def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict,
                    cluster):
    """Fit and predict with dask KMeans and require a perfect clustering.

    Generates blobs, fits the model, predicts on the training data and
    checks the partition count of the predictions, the range of predicted
    labels, and that the adjusted Rand score against the generated labels
    is exactly 1.0.

    ``nrows``, ``ncols``, ``nclusters`` and ``n_parts`` are forwarded
    positionally to ``make_blobs``. ``delayed_predict`` is passed as the
    second positional argument of ``predict``. ``cluster`` is passed to
    ``Client``.
    """
    # Connect before entering the try block: if the connection fails, its
    # own exception propagates instead of being masked by an AttributeError
    # from calling ``close()`` on an unset client in ``finally``.
    client = Client(cluster)

    try:
        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X_cudf, y = make_blobs(nrows,
                               ncols,
                               nclusters,
                               n_parts,
                               cluster_std=0.01,
                               verbose=False,
                               random_state=10)

        wait(X_cudf)

        cumlModel = cumlKMeans(verbose=0,
                               init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_cudf)
        cumlLabels = cumlModel.predict(X_cudf, delayed_predict)

        n_workers = len(list(client.has_what().keys()))

        # Verifying we are grouping partitions. This should be changed soon.
        if n_parts is not None and n_parts < n_workers:
            assert cumlLabels.npartitions == n_parts
        else:
            assert cumlLabels.npartitions == n_workers

        cumlPred = cp.array(cumlLabels.compute())

        assert cumlPred.shape[0] == nrows
        assert np.max(cumlPred) == nclusters - 1
        assert np.min(cumlPred) == 0

        labels = np.squeeze(y.compute().to_pandas().values)

        score = adjusted_rand_score(labels, cp.squeeze(cumlPred.get()))

        print(str(score))

        assert 1.0 == score

    finally:
        client.close()
Ejemplo n.º 5
0
def test_transform(nrows, ncols, nclusters, n_parts, cluster):
    """Check the shape of ``transform`` output and its argmin labelling.

    Fits a dask KMeans on generated blobs, transforms the training data and
    asserts that the result has shape ``(nrows, nclusters)`` (or ``(nrows,)``
    when there is a single cluster). The per-row argmin of the transformed
    values is then scored against the generated labels with the adjusted
    Rand score.

    ``nrows``, ``ncols``, ``nclusters`` and ``n_parts`` are forwarded
    positionally to ``make_blobs``. ``cluster`` is passed to ``Client``.
    """
    # Connect before entering the try block: if the connection fails, its
    # own exception propagates instead of being masked by an AttributeError
    # from calling ``close()`` on an unset client in ``finally``.
    client = Client(cluster)

    try:

        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X_cudf, y = make_blobs(nrows,
                               ncols,
                               nclusters,
                               n_parts,
                               cluster_std=0.01,
                               verbose=False,
                               shuffle=False,
                               random_state=10)

        wait(X_cudf)

        cumlModel = cumlKMeans(verbose=0,
                               init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_cudf)

        labels = np.squeeze(y.compute().to_pandas().values)

        xformed = cumlModel.transform(X_cudf).compute()

        if nclusters == 1:
            # series shape is (nrows,) not (nrows, 1) but both are valid
            # and equivalent for this test
            assert xformed.shape in [(nrows, nclusters), (nrows, )]
        else:
            assert xformed.shape == (nrows, nclusters)

        # Bring the result into a cupy array; one-dimensional results are
        # wrapped directly, anything else goes through as_gpu_matrix().
        xformed = cp.array(xformed if len(xformed.shape) ==
                           1 else xformed.as_gpu_matrix())

        # The argmin of the transformed values should be equal to the labels
        # reshape is a quick manner of dealing with (nrows,) is not (nrows, 1)
        xformed_labels = cp.argmin(xformed.reshape(
            (int(nrows), int(nclusters))),
                                   axis=1)

        # NOTE(review): this only asserts that the score is truthy
        # (non-zero), not that it equals 1.0 -- confirm that is intended.
        assert adjusted_rand_score(labels, cp.squeeze(xformed_labels.get()))

    finally:
        client.close()
Ejemplo n.º 6
0
def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict,
                    input_type, client):
    """Fit and predict with dask KMeans and require a perfect clustering.

    Generates blobs, fits the model on either a dask-cudf dataframe or the
    generated array (chosen by ``input_type``), predicts on the training
    data and checks the partition/chunk count of the predictions, the range
    of predicted labels, and that the adjusted Rand score against the
    generated labels is exactly 1.0.

    ``nrows``, ``ncols``, ``nclusters`` and ``n_parts`` are forwarded to
    ``make_blobs``. ``delayed_predict`` is passed to ``predict`` as
    ``delayed``. ``client`` is queried with ``has_what()`` to count workers.
    """
    from cuml.dask.cluster import KMeans as cumlKMeans

    from cuml.dask.datasets import make_blobs

    seed = 10

    X, y = make_blobs(
        n_samples=int(nrows),
        n_features=ncols,
        centers=nclusters,
        n_parts=n_parts,
        cluster_std=0.01,
        random_state=seed,
    )

    if input_type == "dataframe":
        X_train, y_train = to_dask_cudf(X), to_dask_cudf(y)
    elif input_type == "array":
        X_train, y_train = X, y

    kmeans = cumlKMeans(
        init="k-means||", n_clusters=nclusters, random_state=seed
    )
    kmeans.fit(X_train)
    predicted = kmeans.predict(X_train, delayed=delayed_predict)

    worker_map = client.has_what()
    n_workers = len(list(worker_map.keys()))

    # The output is expected to keep the requested partition count, or one
    # partition per worker when no count was requested.
    expected_parts = n_workers if n_parts is None else n_parts

    if input_type == "dataframe":
        assert predicted.npartitions == expected_parts
        cumlPred = predicted.compute().values
        labels = y_train.compute().values
    elif input_type == "array":
        row_chunks = predicted.chunks[0]
        assert len(row_chunks) == expected_parts
        cumlPred = cp.array(predicted.compute())
        labels = cp.squeeze(y_train.compute())

    # Every cluster id from 0 to nclusters - 1 must bound the predictions.
    assert cumlPred.shape[0] == nrows
    assert cp.max(cumlPred) == nclusters - 1
    assert cp.min(cumlPred) == 0

    score = adjusted_rand_score(labels, cumlPred)
    print(str(score))

    assert 1.0 == score
Ejemplo n.º 7
0
def test_score(nrows, ncols, nclusters, n_parts, cluster):
    """Compare ``KMeans.score`` with a manually accumulated score.

    Fits a dask KMeans on generated blobs, then sums the squared Euclidean
    distance from every row to the center of its predicted cluster. The
    negated sum must lie within ``SCORE_EPS`` of the value returned by
    ``score``.

    ``nrows``, ``ncols``, ``nclusters`` and ``n_parts`` are forwarded
    positionally to ``make_blobs``. ``cluster`` is passed to ``Client``.
    """
    # Connect before entering the try block: if the connection fails, its
    # own exception propagates instead of being masked by an AttributeError
    # from calling ``close()`` on an unset client in ``finally``.
    client = Client(cluster)

    try:
        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X_cudf, y = make_blobs(nrows,
                               ncols,
                               nclusters,
                               n_parts,
                               cluster_std=0.01,
                               verbose=False,
                               shuffle=False,
                               random_state=10)

        wait(X_cudf)

        cumlModel = cumlKMeans(verbose=0,
                               init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_cudf)

        actual_score = cumlModel.score(X_cudf)

        X = cp.array(X_cudf.compute().as_gpu_matrix())

        predictions = cumlModel.predict(X_cudf).compute()
        predictions = cp.array(predictions)

        centers = cp.array(cumlModel.cluster_centers_.as_gpu_matrix())

        expected_score = 0
        for idx, label in enumerate(predictions):

            # Named ``point``/``center`` so the generated labels ``y`` are
            # not shadowed inside the loop.
            point = X[idx]
            center = centers[label]

            dist = np.sqrt(np.sum((point - center)**2))
            expected_score += dist**2

        # ``score`` is compared against the negated sum of squared
        # distances, within a tolerance of SCORE_EPS on either side.
        assert actual_score + SCORE_EPS \
            >= (-1 * expected_score) \
            >= actual_score - SCORE_EPS

    finally:
        client.close()
Ejemplo n.º 8
0
def test_end_to_end(nrows, ncols, nclusters, n_parts, cluster):
    """Fit and predict with dask KMeans and require a perfect clustering.

    Generates blobs, fits the model, predicts on the training data and
    checks the partition count of the predictions, the range of predicted
    labels, and that the adjusted Rand score against the generated labels
    is exactly 1.0.

    ``nrows``, ``ncols``, ``nclusters`` and ``n_parts`` are forwarded
    positionally to ``make_blobs``. ``cluster`` is passed to ``Client``; the
    client is closed when the test body finishes or raises.
    """
    client = Client(cluster)

    try:
        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        seed = 10

        X_cudf, y = make_blobs(
            nrows,
            ncols,
            nclusters,
            n_parts,
            cluster_std=0.01,
            verbose=True,
            random_state=seed,
        )

        wait(X_cudf)

        kmeans = cumlKMeans(
            verbose=1,
            init="k-means||",
            n_clusters=nclusters,
            random_state=seed,
        )
        kmeans.fit(X_cudf)
        predicted = kmeans.predict(X_cudf)

        worker_map = client.has_what()
        n_workers = len(list(worker_map.keys()))

        # The predictions keep n_parts partitions only when that is fewer
        # than the worker count; otherwise one partition per worker.
        if n_parts is not None and n_parts < n_workers:
            expected_parts = n_parts
        else:
            expected_parts = n_workers
        assert predicted.npartitions == expected_parts

        from sklearn.metrics import adjusted_rand_score

        cumlPred = predicted.compute().to_pandas().values

        assert cumlPred.shape[0] == nrows
        assert np.max(cumlPred) == nclusters - 1
        assert np.min(cumlPred) == 0

        # Flatten the generated labels to one dimension before scoring.
        raw_labels = y.compute().to_pandas().values
        flat_labels = raw_labels.reshape(raw_labels.shape[0])

        score = adjusted_rand_score(flat_labels, cumlPred)

        assert 1.0 == score

    finally:
        client.close()
Ejemplo n.º 9
0
def test_end_to_end(nrows, ncols, nclusters, n_parts, client=None):
    """Require cuML dask KMeans and dask-ml KMeans to agree exactly.

    Fits both implementations on the same generated blobs, predicts on the
    training data with each, and asserts that the adjusted Rand score
    between the two sets of predictions is exactly 1.0.

    ``nrows``, ``ncols``, ``nclusters`` and ``n_parts`` are forwarded
    positionally to ``dask_make_blobs``. When ``client`` is None the test
    starts its own ``LocalCUDACluster`` and ``Client`` and closes both on
    exit, even if the body raises; a client passed in by the caller is left
    open.
    """
    owns_cluster = client is None
    cluster = None

    try:
        if owns_cluster:
            cluster = LocalCUDACluster(threads_per_worker=1)
            client = Client(cluster)

        from cuml.dask.cluster import KMeans as cumlKMeans
        from dask_ml.cluster import KMeans as dmlKMeans

        from cuml.test.dask.utils import dask_make_blobs

        X_df, X_cudf = dask_make_blobs(nrows,
                                       ncols,
                                       nclusters,
                                       n_parts,
                                       cluster_std=0.1,
                                       verbose=True,
                                       random_state=10)

        wait(X_cudf)

        cumlModel = cumlKMeans(verbose=0,
                               init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)
        daskmlModel1 = dmlKMeans(init="k-means||",
                                 n_clusters=nclusters,
                                 random_state=10)

        cumlModel.fit(X_cudf)
        daskmlModel1.fit(X_df)

        cumlLabels = cumlModel.predict(X_cudf)
        daskmlLabels1 = daskmlModel1.predict(X_df)

        from sklearn.metrics import adjusted_rand_score

        cumlPred = cumlLabels.compute().to_pandas().values
        daskmlPred1 = daskmlLabels1.compute()

        score = adjusted_rand_score(cumlPred, daskmlPred1)

    finally:
        # Tear down only what this test created, and only the parts that
        # were actually constructed before any failure.
        if owns_cluster:
            if client is not None:
                client.close()
            if cluster is not None:
                cluster.close()

    assert 1.0 == score
Ejemplo n.º 10
0
def test_transform(nrows, ncols, nclusters, n_parts, cluster):
    """Check the shape of ``transform`` output and its argmin labelling.

    Fits a dask KMeans on generated blobs, transforms the training data and
    asserts that the result has shape ``(nrows, nclusters)``. The per-row
    argmin of the transformed values is then scored against the generated
    labels with the adjusted Rand score.

    ``nrows``, ``ncols``, ``nclusters`` and ``n_parts`` are forwarded
    positionally to ``make_blobs``. ``cluster`` is passed to ``Client``; the
    client is closed when the test body finishes or raises.
    """
    client = Client(cluster)

    try:

        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        seed = 10

        X_cudf, y = make_blobs(
            nrows,
            ncols,
            nclusters,
            n_parts,
            cluster_std=0.01,
            verbose=True,
            random_state=seed,
        )

        wait(X_cudf)

        kmeans = cumlKMeans(
            verbose=0,
            init="k-means||",
            n_clusters=nclusters,
            random_state=seed,
        )
        kmeans.fit(X_cudf)

        # Flatten the generated labels to one dimension.
        raw_labels = y.compute().to_pandas().values
        labels = raw_labels.reshape(raw_labels.shape[0])

        xformed = kmeans.transform(X_cudf).compute()

        assert xformed.shape == (nrows, nclusters)

        # Each row is labelled with the column holding its smallest
        # transformed value.
        distances = xformed.to_pandas().to_numpy()
        xformed_labels = np.argmin(distances, axis=1)

        from sklearn.metrics import adjusted_rand_score
        agreement = adjusted_rand_score(labels, xformed_labels)
        assert agreement

    finally:
        client.close()
Ejemplo n.º 11
0
def test_score(nrows, ncols, nclusters, n_parts, input_type, cluster):
    """Compare ``KMeans.score`` with a manually accumulated score.

    Fits a dask KMeans on generated blobs, then sums the squared Euclidean
    distance from every row to the center of its predicted cluster. The
    negated sum must lie within ``SCORE_EPS`` of the value returned by
    ``score``.

    ``nrows``, ``ncols``, ``nclusters`` and ``n_parts`` are forwarded to
    ``make_blobs``. ``input_type`` selects "dataframe" (converted with
    ``to_dask_cudf``) or "array". ``cluster`` is passed to ``Client``.
    """
    # Connect before entering the try block: if the connection fails, its
    # own exception propagates instead of being masked by an AttributeError
    # from calling ``close()`` on an unset client in ``finally``.
    client = Client(cluster)

    try:
        from cuml.dask.cluster import KMeans as cumlKMeans

        from cuml.dask.datasets import make_blobs

        X, y = make_blobs(n_samples=int(nrows),
                          n_features=ncols,
                          centers=nclusters,
                          n_parts=n_parts,
                          cluster_std=0.01,
                          shuffle=False,
                          random_state=10)

        wait(X)
        if input_type == "dataframe":
            X_train = to_dask_cudf(X)
            y_train = to_dask_cudf(y)
        elif input_type == "array":
            X_train, y_train = X, y

        cumlModel = cumlKMeans(init="k-means||",
                               n_clusters=nclusters,
                               random_state=10)

        cumlModel.fit(X_train)

        actual_score = cumlModel.score(X_train)

        predictions = cumlModel.predict(X_train).compute()

        if input_type == "dataframe":
            # Dataframe results are converted to cupy arrays so the loop
            # below can index rows and centers the same way for both types.
            X = cp.array(X_train.compute().as_gpu_matrix())
            predictions = cp.array(predictions)

            centers = cp.array(cumlModel.cluster_centers_.as_gpu_matrix())
        elif input_type == "array":
            X = X_train.compute()
            centers = cumlModel.cluster_centers_

        expected_score = 0
        for idx, label in enumerate(predictions):

            # Named ``point``/``center`` so the generated labels ``y`` are
            # not shadowed inside the loop.
            point = X[idx]
            center = centers[label]

            dist = cp.sqrt(cp.sum((point - center)**2))
            expected_score += dist**2

        # ``score`` is compared against the negated sum of squared
        # distances, within a tolerance of SCORE_EPS on either side.
        assert actual_score + SCORE_EPS \
            >= (-1 * expected_score) \
            >= actual_score - SCORE_EPS

    finally:
        client.close()