def test_hdbscan_cluster_patterns_extract_clusters(
        dataset, nrows, connectivity, cluster_selection_epsilon,
        cluster_selection_method, min_cluster_size, allow_single_cluster,
        max_cluster_size, min_samples):
    """Extract clusters from the reference library's condensed tree and
    require an exact match with the reference labels/probabilities.

    The generated patterns also contain duplicate data points, so this
    doubles as a duplicate-handling test.
    """
    X, y = get_pattern(dataset, nrows)[0]

    # Parameters shared verbatim between the cuML and reference estimators.
    common = dict(
        allow_single_cluster=allow_single_cluster,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
    )

    cuml_agg = HDBSCAN(verbose=logger.level_info,
                       max_cluster_size=max_cluster_size,
                       **common)

    sk_agg = hdbscan.HDBSCAN(approx_min_span_tree=False,
                             gen_min_span_tree=True,
                             algorithm="generic",
                             **common)
    sk_agg.fit(cp.asnumpy(X))

    # Run only the cluster-extraction stage on the reference tree.
    cuml_agg._extract_clusters(sk_agg.condensed_tree_)

    assert adjusted_rand_score(cuml_agg.labels_test, sk_agg.labels_) == 1.0
    assert np.allclose(cp.asnumpy(cuml_agg.probabilities_test),
                       sk_agg.probabilities_)
def test_hdbscan_sklearn_datasets(dataset, connectivity,
                                  cluster_selection_epsilon,
                                  cluster_selection_method,
                                  min_samples_cluster_size_bounds,
                                  allow_single_cluster):
    """Compare cuML HDBSCAN with the reference implementation on the
    bundled sklearn datasets: condensed trees, cluster counts, label
    cardinality, label agreement, and cluster persistence."""
    (min_samples,
     min_cluster_size,
     max_cluster_size) = min_samples_cluster_size_bounds

    X = dataset.data

    common = dict(
        allow_single_cluster=allow_single_cluster,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
    )

    cuml_agg = HDBSCAN(verbose=logger.level_info,
                       gen_min_span_tree=True,
                       max_cluster_size=max_cluster_size,
                       **common)
    cuml_agg.fit(X)

    sk_agg = hdbscan.HDBSCAN(approx_min_span_tree=False,
                             gen_min_span_tree=True,
                             algorithm="generic",
                             **common)
    sk_agg.fit(cp.asnumpy(X))

    assert_condensed_trees(sk_agg, min_cluster_size)
    assert_cluster_counts(sk_agg, cuml_agg)

    # Same number of distinct labels (noise included) on both sides.
    assert len(np.unique(sk_agg.labels_)) == len(cp.unique(cuml_agg.labels_))
    assert adjusted_rand_score(cuml_agg.labels_, sk_agg.labels_) > 0.85

    assert np.allclose(np.sort(sk_agg.cluster_persistence_),
                       np.sort(cuml_agg.cluster_persistence_),
                       rtol=0.1, atol=0.1)
def test_hdbscan_blobs(nrows, ncols, nclusters, connectivity,
                       cluster_selection_epsilon, cluster_selection_method,
                       allow_single_cluster, min_cluster_size,
                       max_cluster_size, min_samples):
    """On well-separated Gaussian blobs, cuML and the reference
    implementation should agree closely on labels and persistence."""
    X, y = make_blobs(n_samples=int(nrows),
                      n_features=ncols,
                      centers=nclusters,
                      cluster_std=0.7,
                      shuffle=False,
                      random_state=42)

    common = dict(
        allow_single_cluster=allow_single_cluster,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
    )

    cuml_agg = HDBSCAN(verbose=logger.level_info,
                       max_cluster_size=max_cluster_size,
                       **common)
    cuml_agg.fit(X)

    sk_agg = hdbscan.HDBSCAN(approx_min_span_tree=False,
                             gen_min_span_tree=True,
                             algorithm="generic",
                             **common)
    sk_agg.fit(cp.asnumpy(X))

    assert_condensed_trees(sk_agg, min_cluster_size)
    assert_cluster_counts(sk_agg, cuml_agg)

    assert adjusted_rand_score(cuml_agg.labels_, sk_agg.labels_) >= 0.95
    # Identical label cardinality (noise label counts as one).
    assert len(np.unique(sk_agg.labels_)) == len(cp.unique(cuml_agg.labels_))

    assert np.allclose(np.sort(sk_agg.cluster_persistence_),
                       np.sort(cuml_agg.cluster_persistence_),
                       rtol=0.01, atol=0.01)
def test_hdbscan_empty_cluster_tree():
    """A degenerate condensed tree whose five leaves all hang directly off
    the root must assign every point to that single root cluster."""
    n_points = 5
    raw_tree = np.recarray(shape=(n_points,),
                           formats=[np.intp, np.intp, float, np.intp],
                           names=('parent', 'child', 'lambda_val',
                                  'child_size'))
    raw_tree['parent'] = np.asarray([n_points] * n_points)
    raw_tree['child'] = list(range(n_points))
    raw_tree['lambda_val'] = [1.0] * n_points
    raw_tree['child_size'] = [1] * n_points

    condensed_tree = CondensedTree(raw_tree, 0.0, True)

    cuml_agg = HDBSCAN(allow_single_cluster=True,
                       cluster_selection_method="eom")
    cuml_agg._extract_clusters(condensed_tree)

    # All labels must be 0 (the root cluster), so their sum is 0.
    assert np.sum(cuml_agg.labels_test.to_output("numpy")) == 0
def test_hdbscan_cluster_patterns(dataset, nrows, connectivity,
                                  cluster_selection_epsilon,
                                  cluster_selection_method, min_cluster_size,
                                  allow_single_cluster, max_cluster_size,
                                  min_samples):
    """End-to-end comparison against the reference implementation on the
    generated 2D cluster patterns (which also contain duplicate points)."""
    X, y = get_pattern(dataset, nrows)[0]

    common = dict(
        allow_single_cluster=allow_single_cluster,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
    )

    cuml_agg = HDBSCAN(verbose=logger.level_info,
                       max_cluster_size=max_cluster_size,
                       **common)
    cuml_agg.fit(X)

    sk_agg = hdbscan.HDBSCAN(approx_min_span_tree=False,
                             gen_min_span_tree=True,
                             algorithm="generic",
                             **common)
    sk_agg.fit(cp.asnumpy(X))

    assert_condensed_trees(sk_agg, min_cluster_size)
    assert_cluster_counts(sk_agg, cuml_agg)

    # Same number of distinct labels, and strong label agreement.
    assert len(np.unique(sk_agg.labels_)) == len(cp.unique(cuml_agg.labels_))
    assert adjusted_rand_score(cuml_agg.labels_, sk_agg.labels_) > 0.95

    assert np.allclose(np.sort(sk_agg.cluster_persistence_),
                       np.sort(cuml_agg.cluster_persistence_),
                       rtol=0.1, atol=0.1)
def test_hdbscan_core_dists_bug_4054():
    """Regression test for rapidsai/cuml#4054.

    Runs the minimal reproducer from the issue and verifies that the
    cuML labels agree with the reference implementation.
    """
    X, y = datasets.make_moons(n_samples=10000, noise=0.12, random_state=0)

    cu_labels = HDBSCAN(min_samples=25, min_cluster_size=25).fit_predict(X)
    sk_labels = hdbscan.HDBSCAN(min_samples=25,
                                min_cluster_size=25,
                                approx_min_span_tree=False).fit_predict(X)

    assert adjusted_rand_score(cu_labels, sk_labels) > 0.99
def test_hdbscan_plots():
    """Tree attributes used for plotting are populated after fit, and the
    minimum spanning tree is omitted when gen_min_span_tree is off."""
    X, y = make_blobs(n_samples=100,
                      n_features=100,
                      centers=10,
                      cluster_std=0.7,
                      shuffle=False,
                      random_state=42)

    model = HDBSCAN(gen_min_span_tree=True)
    model.fit(X)
    assert model.condensed_tree_ is not None
    assert model.minimum_spanning_tree_ is not None
    assert model.single_linkage_tree_ is not None

    # Without gen_min_span_tree the MST must not be built.
    model = HDBSCAN(gen_min_span_tree=False)
    model.fit(X)
    assert model.minimum_spanning_tree_ is None