def test_single_linkage_sklearn_compare(nrows, ncols, nclusters,
                                        k, connectivity):
    """Check single-linkage clustering against the ``cluster`` reference.

    Fits ``AgglomerativeClustering`` and ``cluster.AgglomerativeClustering``
    on the same blob data, then requires an identical partition and matching
    ``n_connected_components_``, ``n_leaves_`` and ``n_clusters_``.
    """
    X, _ = make_blobs(int(nrows), ncols, nclusters,
                      cluster_std=1.0, shuffle=False, random_state=42)

    # Settings passed to both implementations.
    common = dict(n_clusters=nclusters,
                  affinity='euclidean',
                  linkage='single')

    gpu_model = AgglomerativeClustering(n_neighbors=k,
                                        connectivity=connectivity,
                                        **common)

    # A first failing fit is retried once; a second failure propagates.
    try:
        gpu_model.fit(X)
    except Exception:
        gpu_model.fit(X)

    ref_model = cluster.AgglomerativeClustering(**common)
    ref_model.fit(cp.asnumpy(X))

    # Label ids may differ between the two implementations, but the
    # cluster assignments themselves must be exactly the same.
    ari = adjusted_rand_score(gpu_model.labels_, ref_model.labels_)
    assert ari == 1.0

    for attr in ("n_connected_components_", "n_leaves_", "n_clusters_"):
        assert getattr(gpu_model, attr) == getattr(ref_model, attr)
def test_hdbscan_cluster_patterns_extract_clusters(
        dataset, nrows, connectivity, cluster_selection_epsilon,
        cluster_selection_method, min_cluster_size, allow_single_cluster,
        max_cluster_size, min_samples):
    """Check ``HDBSCAN._extract_clusters`` on a reference condensed tree.

    The ``hdbscan`` reference model is fitted on the pattern data; its
    ``condensed_tree_`` is handed to the (unfitted) ``HDBSCAN`` instance,
    whose ``labels_test`` / ``probabilities_test`` must then match the
    reference ``labels_`` / ``probabilities_``.
    """
    # This also tests duplicate data points
    X, _ = get_pattern(dataset, nrows)[0]

    # Hyper-parameters given to both estimators.
    shared = dict(
        allow_single_cluster=allow_single_cluster,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
    )

    gpu_model = HDBSCAN(verbose=logger.level_info,
                        max_cluster_size=max_cluster_size,
                        **shared)

    ref_model = hdbscan.HDBSCAN(approx_min_span_tree=False,
                                gen_min_span_tree=True,
                                algorithm="generic",
                                **shared)
    ref_model.fit(cp.asnumpy(X))

    gpu_model._extract_clusters(ref_model.condensed_tree_)

    ari = adjusted_rand_score(gpu_model.labels_test, ref_model.labels_)
    assert ari == 1.0

    gpu_probabilities = cp.asnumpy(gpu_model.probabilities_test)
    assert np.allclose(gpu_probabilities, ref_model.probabilities_)
def test_end_to_end(nrows, ncols, nclusters, n_parts,
                    delayed_predict, input_type, client):
    """Fit and predict with dask KMeans on blob data, end to end.

    Verifies the partitioning of the predicted labels, their value range
    (``0`` .. ``nclusters - 1``), the number of predictions, and that the
    adjusted Rand score against the generated labels is exactly 1.0.
    ``input_type`` selects ``"dataframe"`` or ``"array"`` inputs.
    """
    from cuml.dask.cluster import KMeans as DaskKMeans
    from cuml.dask.datasets import make_blobs

    X, y = make_blobs(n_samples=int(nrows),
                      n_features=ncols,
                      centers=nclusters,
                      n_parts=n_parts,
                      cluster_std=0.01,
                      random_state=10)

    if input_type == "dataframe":
        X_train = to_dask_cudf(X)
        y_train = to_dask_cudf(y)
    elif input_type == "array":
        X_train, y_train = X, y

    model = DaskKMeans(init="k-means||",
                       n_clusters=nclusters,
                       random_state=10)
    model.fit(X_train)

    predicted = model.predict(X_train, delayed=delayed_predict)

    worker_count = len(client.has_what())

    # Verifying we are grouping partitions. This should be changed soon.
    expected_parts = n_parts if n_parts is not None else worker_count

    if input_type == "dataframe":
        assert predicted.npartitions == expected_parts
        cumlPred = predicted.compute().values
        labels = y_train.compute().values
    elif input_type == "array":
        assert len(predicted.chunks[0]) == expected_parts
        cumlPred = cp.array(predicted.compute())
        labels = cp.squeeze(y_train.compute())

    assert cumlPred.shape[0] == nrows
    assert cp.max(cumlPred) == nclusters - 1
    assert cp.min(cumlPred) == 0

    score = adjusted_rand_score(labels, cumlPred)
    print(score)

    assert 1.0 == score
def test_hdbscan_core_dists_bug_4054():
    """
    Regression test for https://github.com/rapidsai/cuml/issues/4054:
    the minimal reproducer from that issue must agree with the reference
    implementation (adjusted Rand score above 0.99).
    """
    X, _ = datasets.make_moons(n_samples=10000,
                               noise=0.12,
                               random_state=0)

    gpu_model = HDBSCAN(min_samples=25, min_cluster_size=25)
    cu_labels_ = gpu_model.fit_predict(X)

    ref_model = hdbscan.HDBSCAN(min_samples=25,
                                min_cluster_size=25,
                                approx_min_span_tree=False)
    sk_labels_ = ref_model.fit_predict(X)

    ari = adjusted_rand_score(cu_labels_, sk_labels_)
    assert ari > 0.99
def test_duplicate_distances(connectivity):
    """Single-linkage clustering of three points, two of them identical.

    The result of ``AgglomerativeClustering`` must describe the same
    partition as ``cluster.AgglomerativeClustering`` on the same data.
    """
    # The first two rows are the same point.
    points = cp.array([[0.0, 0.0, 0.0],
                       [0.0, 0.0, 0.0],
                       [2.0, 2.0, 2.0]])

    common = dict(n_clusters=2, affinity="euclidean", linkage="single")

    gpu_model = AgglomerativeClustering(n_neighbors=3,
                                        connectivity=connectivity,
                                        **common)
    ref_model = cluster.AgglomerativeClustering(**common)

    gpu_model.fit(points)
    ref_model.fit(points.get())

    ari = adjusted_rand_score(gpu_model.labels_, ref_model.labels_)
    assert ari == 1.0
def test_hdbscan_sklearn_datasets(dataset, connectivity,
                                  cluster_selection_epsilon,
                                  cluster_selection_method,
                                  min_samples_cluster_size_bounds,
                                  allow_single_cluster):
    """Compare ``HDBSCAN`` with the ``hdbscan`` reference on ``dataset.data``.

    ``min_samples_cluster_size_bounds`` unpacks to
    ``(min_samples, min_cluster_size, max_cluster_size)``. Checks the
    condensed tree and cluster counts via the helper assertions, the number
    of distinct labels, an adjusted Rand score above 0.85, and the sorted
    cluster persistences within ``rtol=0.1`` / ``atol=0.1``.
    """
    (min_samples,
     min_cluster_size,
     max_cluster_size) = min_samples_cluster_size_bounds

    X = dataset.data

    # Hyper-parameters given to both estimators.
    shared = dict(
        allow_single_cluster=allow_single_cluster,
        gen_min_span_tree=True,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
    )

    gpu_model = HDBSCAN(verbose=logger.level_info,
                        max_cluster_size=max_cluster_size,
                        **shared)
    gpu_model.fit(X)

    ref_model = hdbscan.HDBSCAN(approx_min_span_tree=False,
                                algorithm="generic",
                                **shared)
    ref_model.fit(cp.asnumpy(X))

    assert_condensed_trees(ref_model, min_cluster_size)
    assert_cluster_counts(ref_model, gpu_model)

    n_ref_labels = len(np.unique(ref_model.labels_))
    n_gpu_labels = len(cp.unique(gpu_model.labels_))
    assert n_ref_labels == n_gpu_labels

    ari = adjusted_rand_score(gpu_model.labels_, ref_model.labels_)
    assert ari > 0.85

    ref_persistence = np.sort(ref_model.cluster_persistence_)
    gpu_persistence = np.sort(gpu_model.cluster_persistence_)
    assert np.allclose(ref_persistence, gpu_persistence,
                       rtol=0.1, atol=0.1)
def test_hdbscan_blobs(nrows, ncols, nclusters, connectivity,
                       cluster_selection_epsilon, cluster_selection_method,
                       allow_single_cluster, min_cluster_size,
                       max_cluster_size, min_samples):
    """Compare ``HDBSCAN`` with the ``hdbscan`` reference on blob data.

    Checks the condensed tree and cluster counts via the helper assertions,
    an adjusted Rand score of at least 0.95, the number of distinct labels,
    and the sorted cluster persistences within ``rtol=0.01`` / ``atol=0.01``.
    """
    X, _ = make_blobs(n_samples=int(nrows),
                      n_features=ncols,
                      centers=nclusters,
                      cluster_std=0.7,
                      shuffle=False,
                      random_state=42)

    # Hyper-parameters given to both estimators.
    shared = dict(
        allow_single_cluster=allow_single_cluster,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
    )

    gpu_model = HDBSCAN(verbose=logger.level_info,
                        max_cluster_size=max_cluster_size,
                        **shared)
    gpu_model.fit(X)

    ref_model = hdbscan.HDBSCAN(approx_min_span_tree=False,
                                gen_min_span_tree=True,
                                algorithm="generic",
                                **shared)
    ref_model.fit(cp.asnumpy(X))

    assert_condensed_trees(ref_model, min_cluster_size)
    assert_cluster_counts(ref_model, gpu_model)

    ari = adjusted_rand_score(gpu_model.labels_, ref_model.labels_)
    assert ari >= 0.95

    n_ref_labels = len(np.unique(ref_model.labels_))
    n_gpu_labels = len(cp.unique(gpu_model.labels_))
    assert n_ref_labels == n_gpu_labels

    ref_persistence = np.sort(ref_model.cluster_persistence_)
    gpu_persistence = np.sort(gpu_model.cluster_persistence_)
    assert np.allclose(ref_persistence, gpu_persistence,
                       rtol=0.01, atol=0.01)
def test_hdbscan_cluster_patterns(dataset, nrows, connectivity,
                                  cluster_selection_epsilon,
                                  cluster_selection_method,
                                  min_cluster_size, allow_single_cluster,
                                  max_cluster_size, min_samples):
    """Compare ``HDBSCAN`` with the ``hdbscan`` reference on pattern data.

    Checks the condensed tree and cluster counts via the helper assertions,
    the number of distinct labels, an adjusted Rand score above 0.95, and
    the sorted cluster persistences within ``rtol=0.1`` / ``atol=0.1``.
    """
    # This also tests duplicate data points
    X, _ = get_pattern(dataset, nrows)[0]

    # Hyper-parameters given to both estimators.
    shared = dict(
        allow_single_cluster=allow_single_cluster,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
    )

    gpu_model = HDBSCAN(verbose=logger.level_info,
                        max_cluster_size=max_cluster_size,
                        **shared)
    gpu_model.fit(X)

    ref_model = hdbscan.HDBSCAN(approx_min_span_tree=False,
                                gen_min_span_tree=True,
                                algorithm="generic",
                                **shared)
    ref_model.fit(cp.asnumpy(X))

    assert_condensed_trees(ref_model, min_cluster_size)
    assert_cluster_counts(ref_model, gpu_model)

    n_ref_labels = len(np.unique(ref_model.labels_))
    n_gpu_labels = len(cp.unique(gpu_model.labels_))
    assert n_ref_labels == n_gpu_labels

    ari = adjusted_rand_score(gpu_model.labels_, ref_model.labels_)
    assert ari > 0.95

    ref_persistence = np.sort(ref_model.cluster_persistence_)
    gpu_persistence = np.sort(gpu_model.cluster_persistence_)
    assert np.allclose(ref_persistence, gpu_persistence,
                       rtol=0.1, atol=0.1)
plt.scatter(centers_skl[:,0], centers_skl[:,1], c='blue', s=100, alpha=.5)
plt.show()

# plot the cuml kmeans centers with red circle outlines
centers_cuml = kmeans_cuml.cluster_centers_
plt.scatter(cupy.asnumpy(centers_cuml[0].values),
            cupy.asnumpy(centers_cuml[1].values),
            facecolors = 'none', edgecolors='red', s=100)

plt.title('cuml and sklearn kmeans clustering')
plt.show()

"""## Compare Results"""

# Score each model's labels against host_labels with that library's own
# adjusted_rand_score. The sklearn import below rebinds the name, so the
# cuml score has to be computed before it.
from cuml.metrics import adjusted_rand_score
score_cuml = adjusted_rand_score(host_labels, kmeans_cuml.labels_)

from sklearn.metrics import adjusted_rand_score
score_skl = adjusted_rand_score(host_labels, kmeans_skl.labels_)

threshold = 1e-4

# Compare the magnitude of the gap: a signed difference would report
# "equal" whenever the cuml score is lower than the sklearn score,
# no matter by how much.
passed = abs(score_cuml - score_skl) < threshold
print('compare kmeans: cuml vs sklearn labels_ are ' + ('equal' if passed else 'NOT equal'))

# Scatter the first two columns of host_data, coloured by host_labels.
fig = plt.figure(figsize=(16, 10))
plt.scatter(host_data.iloc[:, 0], host_data.iloc[:, 1], c=host_labels, s=50, cmap='viridis')
plt.show()