Example #1
def test_single_linkage_sklearn_compare(nrows, ncols, nclusters, k,
                                        connectivity):

    X, y = make_blobs(n_samples=int(nrows),
                      n_features=ncols,
                      centers=nclusters,
                      cluster_std=1.0,
                      shuffle=False,
                      random_state=42)

    cuml_agg = AgglomerativeClustering(n_clusters=nclusters,
                                       affinity='euclidean',
                                       linkage='single',
                                       n_neighbors=k,
                                       connectivity=connectivity)

    # retry the fit once if it raises, to guard against transient failures
    try:
        cuml_agg.fit(X)
    except Exception:
        cuml_agg.fit(X)

    sk_agg = cluster.AgglomerativeClustering(n_clusters=nclusters,
                                             affinity='euclidean',
                                             linkage='single')
    sk_agg.fit(cp.asnumpy(X))

    # Cluster assignments should be exact, even though the actual
    # labels may differ
    assert (adjusted_rand_score(cuml_agg.labels_, sk_agg.labels_) == 1.0)
    assert (cuml_agg.n_connected_components_ == sk_agg.n_connected_components_)
    assert (cuml_agg.n_leaves_ == sk_agg.n_leaves_)
    assert (cuml_agg.n_clusters_ == sk_agg.n_clusters_)
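
The assertions above lean on a property worth spelling out: adjusted_rand_score
is invariant to label permutation, so two clusterings that induce the same
partition score exactly 1.0 even when the label names differ. A minimal sketch
using sklearn's implementation:

from sklearn.metrics import adjusted_rand_score

# same partition, different label names -> perfect agreement
print(adjusted_rand_score([0, 0, 1, 1], [1, 1, 0, 0]))  # 1.0

# genuinely different partition -> score drops (here -0.5)
print(adjusted_rand_score([0, 0, 1, 1], [0, 1, 0, 1]))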
Example #2
def test_hdbscan_cluster_patterns_extract_clusters(
        dataset, nrows, connectivity, cluster_selection_epsilon,
        cluster_selection_method, min_cluster_size, allow_single_cluster,
        max_cluster_size, min_samples):

    # This also tests duplicate data points
    X, y = get_pattern(dataset, nrows)[0]

    cuml_agg = HDBSCAN(verbose=logger.level_info,
                       allow_single_cluster=allow_single_cluster,
                       min_samples=min_samples,
                       max_cluster_size=max_cluster_size,
                       min_cluster_size=min_cluster_size,
                       cluster_selection_epsilon=cluster_selection_epsilon,
                       cluster_selection_method=cluster_selection_method)

    sk_agg = hdbscan.HDBSCAN(
        allow_single_cluster=allow_single_cluster,
        approx_min_span_tree=False,
        gen_min_span_tree=True,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
        algorithm="generic")

    sk_agg.fit(cp.asnumpy(X))

    # feed the reference implementation's condensed tree into cuML's
    # cluster-extraction step so that only this stage is being compared
    cuml_agg._extract_clusters(sk_agg.condensed_tree_)

    assert adjusted_rand_score(cuml_agg.labels_test, sk_agg.labels_) == 1.0
    assert np.allclose(cp.asnumpy(cuml_agg.probabilities_test),
                       sk_agg.probabilities_)
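
get_pattern is a helper from the test module and is not shown in these
examples. A hypothetical sketch of what it might look like, assuming it wraps
sklearn's synthetic-dataset generators and returns a list whose first element
is the (X, y) tuple (the pattern names and noise values below are illustrative,
not the real ones):

from sklearn import datasets

def get_pattern_sketch(name, nrows):
    # hypothetical reconstruction for illustration only
    if name == "noisy_moons":
        return [datasets.make_moons(n_samples=nrows, noise=0.05,
                                    random_state=42), {}]
    if name == "noisy_circles":
        return [datasets.make_circles(n_samples=nrows, factor=0.5,
                                      noise=0.05, random_state=42), {}]
    if name == "blobs":
        return [datasets.make_blobs(n_samples=nrows, random_state=42), {}]
    raise ValueError(f"unknown pattern: {name}")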
Example #3
def test_end_to_end(nrows, ncols, nclusters, n_parts, delayed_predict,
                    input_type, client):

    from cuml.dask.cluster import KMeans as cumlKMeans

    from cuml.dask.datasets import make_blobs

    X, y = make_blobs(n_samples=int(nrows),
                      n_features=ncols,
                      centers=nclusters,
                      n_parts=n_parts,
                      cluster_std=0.01,
                      random_state=10)

    if input_type == "dataframe":
        X_train = to_dask_cudf(X)
        y_train = to_dask_cudf(y)
    elif input_type == "array":
        X_train, y_train = X, y

    cumlModel = cumlKMeans(init="k-means||",
                           n_clusters=nclusters,
                           random_state=10)

    cumlModel.fit(X_train)
    cumlLabels = cumlModel.predict(X_train, delayed=delayed_predict)

    n_workers = len(client.has_what())

    # Verify the prediction output keeps one partition per input part
    # (falling back to one partition per worker when n_parts is None).
    # This behavior is expected to change.
    if n_parts is not None:
        parts_len = n_parts
    else:
        parts_len = n_workers

    if input_type == "dataframe":
        assert cumlLabels.npartitions == parts_len
        cumlPred = cumlLabels.compute().values
        labels = y_train.compute().values
    elif input_type == "array":
        assert len(cumlLabels.chunks[0]) == parts_len
        cumlPred = cp.array(cumlLabels.compute())
        labels = cp.squeeze(y_train.compute())

    assert cumlPred.shape[0] == nrows
    assert cp.max(cumlPred) == nclusters - 1
    assert cp.min(cumlPred) == 0

    score = adjusted_rand_score(labels, cumlPred)

    print(score)

    assert score == 1.0
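
to_dask_cudf is another unlisted test utility. A rough, eager sketch of the
conversion it performs, assuming the input is a Dask array backed by CuPy
chunks (the real helper presumably converts chunk-by-chunk without
materializing the whole array; this simplified version computes it first):

import cupy
import cudf
import dask_cudf

def to_dask_cudf_sketch(dask_array, npartitions=None):
    # gather the chunks into a single cudf object on the GPU ...
    arr = cupy.asarray(dask_array.compute())
    obj = cudf.Series(arr) if arr.ndim == 1 else cudf.DataFrame(arr)
    # ... then re-split it into a partitioned dask_cudf collection
    n = npartitions or len(dask_array.chunks[0])
    return dask_cudf.from_cudf(obj, npartitions=n)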
Example #4
def test_hdbscan_core_dists_bug_4054():
    """
    This test explicitly verifies that the MRE from
    https://github.com/rapidsai/cuml/issues/4054
    matches the reference impl
    """

    X, y = datasets.make_moons(n_samples=10000, noise=0.12, random_state=0)

    cu_labels_ = HDBSCAN(min_samples=25, min_cluster_size=25).fit_predict(X)
    sk_labels_ = hdbscan.HDBSCAN(min_samples=25,
                                 min_cluster_size=25,
                                 approx_min_span_tree=False).fit_predict(X)

    assert adjusted_rand_score(cu_labels_, sk_labels_) > 0.99
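
fit_predict here is the usual scikit-learn shorthand for fitting and then
reading labels_, which is why both libraries can be exercised in a single
expression each. The equivalence, as a sketch:

model = HDBSCAN(min_samples=25, min_cluster_size=25)
assert (model.fit_predict(X) == model.labels_).all()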
Example #5
def test_duplicate_distances(connectivity):
    X = cp.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [2.0, 2.0, 2.0]])

    cuml_agg = AgglomerativeClustering(n_clusters=2,
                                       affinity="euclidean",
                                       linkage="single",
                                       n_neighbors=3,
                                       connectivity=connectivity)

    sk_agg = cluster.AgglomerativeClustering(n_clusters=2,
                                             affinity="euclidean",
                                             linkage="single")

    cuml_agg.fit(X)
    sk_agg.fit(X.get())

    assert (adjusted_rand_score(cuml_agg.labels_, sk_agg.labels_) == 1.0)
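
The first two rows of X are identical, so the condensed distance matrix
contains a zero alongside two tied distances; the test checks that duplicate
points and tied merges still yield the same single-linkage partition in both
libraries. The distances involved:

import numpy as np
from scipy.spatial.distance import pdist

X_host = np.array([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [2.0, 2.0, 2.0]])
print(pdist(X_host))  # [0.0, 3.4641, 3.4641]: one duplicate pair, one tie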
Example #6
def test_hdbscan_sklearn_datasets(dataset, connectivity,
                                  cluster_selection_epsilon,
                                  cluster_selection_method,
                                  min_samples_cluster_size_bounds,
                                  allow_single_cluster):

    min_samples, min_cluster_size, max_cluster_size = \
        min_samples_cluster_size_bounds

    X = dataset.data

    cuml_agg = HDBSCAN(verbose=logger.level_info,
                       allow_single_cluster=allow_single_cluster,
                       gen_min_span_tree=True,
                       min_samples=min_samples,
                       max_cluster_size=max_cluster_size,
                       min_cluster_size=min_cluster_size,
                       cluster_selection_epsilon=cluster_selection_epsilon,
                       cluster_selection_method=cluster_selection_method)

    cuml_agg.fit(X)

    sk_agg = hdbscan.HDBSCAN(
        allow_single_cluster=allow_single_cluster,
        approx_min_span_tree=False,
        gen_min_span_tree=True,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
        algorithm="generic")

    sk_agg.fit(cp.asnumpy(X))

    assert_condensed_trees(sk_agg, min_cluster_size)
    assert_cluster_counts(sk_agg, cuml_agg)

    assert (len(np.unique(sk_agg.labels_)) == len(cp.unique(cuml_agg.labels_)))
    assert (adjusted_rand_score(cuml_agg.labels_, sk_agg.labels_) > 0.85)

    assert np.allclose(np.sort(sk_agg.cluster_persistence_),
                       np.sort(cuml_agg.cluster_persistence_),
                       rtol=0.1,
                       atol=0.1)
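
assert_condensed_trees and assert_cluster_counts are helpers defined elsewhere
in the test module. As a hypothetical sketch, the first one might verify the
defining invariant of a condensed tree (illustrative only; the real helper may
check considerably more):

def assert_condensed_trees_sketch(sk_agg, min_cluster_size):
    # in a condensed tree, every child is either a single point (size 1)
    # or a cluster carrying at least min_cluster_size points
    tree = sk_agg.condensed_tree_.to_numpy()
    sizes = tree["child_size"]
    assert ((sizes == 1) | (sizes >= min_cluster_size)).all()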
Example #7
def test_hdbscan_blobs(nrows, ncols, nclusters, connectivity,
                       cluster_selection_epsilon, cluster_selection_method,
                       allow_single_cluster, min_cluster_size,
                       max_cluster_size, min_samples):

    X, y = make_blobs(n_samples=int(nrows),
                      n_features=ncols,
                      centers=nclusters,
                      cluster_std=0.7,
                      shuffle=False,
                      random_state=42)

    cuml_agg = HDBSCAN(verbose=logger.level_info,
                       allow_single_cluster=allow_single_cluster,
                       min_samples=min_samples,
                       max_cluster_size=max_cluster_size,
                       min_cluster_size=min_cluster_size,
                       cluster_selection_epsilon=cluster_selection_epsilon,
                       cluster_selection_method=cluster_selection_method)

    cuml_agg.fit(X)
    sk_agg = hdbscan.HDBSCAN(
        allow_single_cluster=allow_single_cluster,
        approx_min_span_tree=False,
        gen_min_span_tree=True,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
        algorithm="generic")

    sk_agg.fit(cp.asnumpy(X))

    assert_condensed_trees(sk_agg, min_cluster_size)
    assert_cluster_counts(sk_agg, cuml_agg)

    assert (adjusted_rand_score(cuml_agg.labels_, sk_agg.labels_) >= 0.95)
    assert (len(np.unique(sk_agg.labels_)) == len(cp.unique(cuml_agg.labels_)))

    assert np.allclose(np.sort(sk_agg.cluster_persistence_),
                       np.sort(cuml_agg.cluster_persistence_),
                       rtol=0.01,
                       atol=0.01)
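
The persistence check uses np.allclose with both tolerances: it passes when
|a - b| <= atol + rtol * |b| holds elementwise, and sorting both arrays first
makes the comparison independent of cluster ordering. For example:

import numpy as np

a = np.array([0.500, 0.300])
b = np.array([0.504, 0.299])
# |0.500 - 0.504| = 0.004 <= 0.01 + 0.01 * 0.504, so this passes
print(np.allclose(a, b, rtol=0.01, atol=0.01))  # True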
Example #8
def test_hdbscan_cluster_patterns(dataset, nrows, connectivity,
                                  cluster_selection_epsilon,
                                  cluster_selection_method, min_cluster_size,
                                  allow_single_cluster, max_cluster_size,
                                  min_samples):

    # This also tests duplicate data points
    X, y = get_pattern(dataset, nrows)[0]

    cuml_agg = HDBSCAN(verbose=logger.level_info,
                       allow_single_cluster=allow_single_cluster,
                       min_samples=min_samples,
                       max_cluster_size=max_cluster_size,
                       min_cluster_size=min_cluster_size,
                       cluster_selection_epsilon=cluster_selection_epsilon,
                       cluster_selection_method=cluster_selection_method)

    cuml_agg.fit(X)

    sk_agg = hdbscan.HDBSCAN(
        allow_single_cluster=allow_single_cluster,
        approx_min_span_tree=False,
        gen_min_span_tree=True,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
        algorithm="generic")

    sk_agg.fit(cp.asnumpy(X))

    assert_condensed_trees(sk_agg, min_cluster_size)
    assert_cluster_counts(sk_agg, cuml_agg)

    assert (len(np.unique(sk_agg.labels_)) == len(cp.unique(cuml_agg.labels_)))
    assert (adjusted_rand_score(cuml_agg.labels_, sk_agg.labels_) > 0.95)

    assert np.allclose(np.sort(sk_agg.cluster_persistence_),
                       np.sort(cuml_agg.cluster_persistence_),
                       rtol=0.1,
                       atol=0.1)
# plot the sklearn kmeans centers with blue filled circles
centers_skl = kmeans_skl.cluster_centers_
plt.scatter(centers_skl[:, 0], centers_skl[:, 1], c='blue', s=100, alpha=.5)
plt.show()


# plot the cuml kmeans centers with red circle outlines
centers_cuml = kmeans_cuml.cluster_centers_
plt.scatter(cupy.asnumpy(centers_cuml[0].values),
            cupy.asnumpy(centers_cuml[1].values),
            facecolors='none', edgecolors='red', s=100)

plt.title('cuml and sklearn kmeans clustering')
plt.show()
"""## Compare Results"""

from cuml.metrics import adjusted_rand_score as cuml_adjusted_rand_score
from sklearn.metrics import adjusted_rand_score as skl_adjusted_rand_score

score_cuml = cuml_adjusted_rand_score(host_labels, kmeans_cuml.labels_)
score_skl = skl_adjusted_rand_score(host_labels, kmeans_skl.labels_)

threshold = 1e-4

passed = abs(score_cuml - score_skl) < threshold
print('compare kmeans: cuml vs sklearn adjusted_rand_score values are ' + ('equal' if passed else 'NOT equal'))
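
Note that the two implementations are compared through their adjusted Rand
scores against the same reference labels rather than label-by-label: identical
partitions can still carry permuted cluster ids between the GPU and CPU runs,
so comparing labels_ directly would be too strict.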

fig = plt.figure(figsize=(16, 10))
plt.scatter(host_data.iloc[:, 0], host_data.iloc[:, 1], c=host_labels, s=50, cmap='viridis')
plt.show()