def test_hdbscan_sklearn_datasets(dataset,
                                  connectivity,
                                  cluster_selection_epsilon,
                                  cluster_selection_method,
                                  min_samples_cluster_size_bounds,
                                  allow_single_cluster):
    """Check the HDBSCAN under test against ``hdbscan.HDBSCAN`` on a dataset.

    Both estimators are fitted on ``dataset.data`` with matching
    hyper-parameters, then compared on condensed tree, cluster counts,
    number of distinct labels, adjusted Rand score (> 0.85) and sorted
    cluster persistence (rtol = atol = 0.1).

    ``min_samples_cluster_size_bounds`` is unpacked as
    ``(min_samples, min_cluster_size, max_cluster_size)``.
    ``connectivity`` is accepted but not read by this body.
    """
    (min_samples,
     min_cluster_size,
     max_cluster_size) = min_samples_cluster_size_bounds

    X = dataset.data

    # Keyword arguments that both estimators receive unchanged.
    common_kwargs = dict(
        allow_single_cluster=allow_single_cluster,
        gen_min_span_tree=True,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
    )

    # max_cluster_size is only handed to the estimator under test.
    cuml_model = HDBSCAN(verbose=logger.level_info,
                         max_cluster_size=max_cluster_size,
                         **common_kwargs)
    cuml_model.fit(X)

    ref_model = hdbscan.HDBSCAN(approx_min_span_tree=False,
                                algorithm="generic",
                                **common_kwargs)
    ref_model.fit(cp.asnumpy(X))

    assert_condensed_trees(ref_model, min_cluster_size)
    assert_cluster_counts(ref_model, cuml_model)

    n_ref_labels = len(np.unique(ref_model.labels_))
    n_cuml_labels = len(cp.unique(cuml_model.labels_))
    assert n_ref_labels == n_cuml_labels

    ari = adjusted_rand_score(cuml_model.labels_, ref_model.labels_)
    assert ari > 0.85

    # Persistence values are sorted before comparing.
    ref_persistence = np.sort(ref_model.cluster_persistence_)
    cuml_persistence = np.sort(cuml_model.cluster_persistence_)
    assert np.allclose(ref_persistence, cuml_persistence,
                       rtol=0.1, atol=0.1)
def test_hdbscan_blobs(nrows,
                       ncols,
                       nclusters,
                       connectivity,
                       cluster_selection_epsilon,
                       cluster_selection_method,
                       allow_single_cluster,
                       min_cluster_size,
                       max_cluster_size,
                       min_samples):
    """Check the HDBSCAN under test against ``hdbscan.HDBSCAN`` on blobs.

    Data comes from ``make_blobs`` with ``int(nrows)`` samples, ``ncols``
    features and ``nclusters`` centers (``cluster_std=0.7``, unshuffled,
    ``random_state=42``). The two fitted estimators are compared on
    condensed tree, cluster counts, adjusted Rand score (>= 0.95), number
    of distinct labels and sorted cluster persistence
    (rtol = atol = 0.01).

    ``connectivity`` is accepted but not read by this body.
    """
    X, _ = make_blobs(n_samples=int(nrows),
                      n_features=ncols,
                      centers=nclusters,
                      cluster_std=0.7,
                      shuffle=False,
                      random_state=42)

    # Keyword arguments that both estimators receive unchanged.
    common_kwargs = dict(
        allow_single_cluster=allow_single_cluster,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
    )

    # max_cluster_size is only handed to the estimator under test.
    cuml_model = HDBSCAN(verbose=logger.level_info,
                         max_cluster_size=max_cluster_size,
                         **common_kwargs)
    cuml_model.fit(X)

    ref_model = hdbscan.HDBSCAN(approx_min_span_tree=False,
                                gen_min_span_tree=True,
                                algorithm="generic",
                                **common_kwargs)
    ref_model.fit(cp.asnumpy(X))

    assert_condensed_trees(ref_model, min_cluster_size)
    assert_cluster_counts(ref_model, cuml_model)

    ari = adjusted_rand_score(cuml_model.labels_, ref_model.labels_)
    assert ari >= 0.95

    n_ref_labels = len(np.unique(ref_model.labels_))
    n_cuml_labels = len(cp.unique(cuml_model.labels_))
    assert n_ref_labels == n_cuml_labels

    # Persistence values are sorted before comparing.
    ref_persistence = np.sort(ref_model.cluster_persistence_)
    cuml_persistence = np.sort(cuml_model.cluster_persistence_)
    assert np.allclose(ref_persistence, cuml_persistence,
                       rtol=0.01, atol=0.01)
def test_hdbscan_plots():
    """Check which tree attributes are populated after ``fit``.

    With ``gen_min_span_tree=True`` the condensed tree, minimum spanning
    tree and single linkage tree attributes must all be non-None; with
    ``gen_min_span_tree=False`` the minimum spanning tree must be None.
    """
    X, _ = make_blobs(n_samples=100,
                      n_features=100,
                      centers=10,
                      cluster_std=0.7,
                      shuffle=False,
                      random_state=42)

    model = HDBSCAN(gen_min_span_tree=True)
    model.fit(X)

    # Attributes are read in the same order as the original assertions.
    populated_attrs = ("condensed_tree_",
                       "minimum_spanning_tree_",
                       "single_linkage_tree_")
    for attr_name in populated_attrs:
        assert getattr(model, attr_name) is not None

    model = HDBSCAN(gen_min_span_tree=False)
    model.fit(X)
    assert model.minimum_spanning_tree_ is None
def test_hdbscan_cluster_patterns(dataset,
                                  nrows,
                                  connectivity,
                                  cluster_selection_epsilon,
                                  cluster_selection_method,
                                  min_cluster_size,
                                  allow_single_cluster,
                                  max_cluster_size,
                                  min_samples):
    """Check the HDBSCAN under test against ``hdbscan.HDBSCAN`` on a pattern.

    The data is the first element returned by
    ``get_pattern(dataset, nrows)``. The two fitted estimators are compared
    on condensed tree, cluster counts, number of distinct labels, adjusted
    Rand score (> 0.95) and sorted cluster persistence
    (rtol = atol = 0.1).

    ``connectivity`` is accepted but not read by this body.
    """
    # This also tests duplicate data points
    pattern = get_pattern(dataset, nrows)
    X, _ = pattern[0]

    # Keyword arguments that both estimators receive unchanged.
    common_kwargs = dict(
        allow_single_cluster=allow_single_cluster,
        min_samples=min_samples,
        min_cluster_size=min_cluster_size,
        cluster_selection_epsilon=cluster_selection_epsilon,
        cluster_selection_method=cluster_selection_method,
    )

    # max_cluster_size is only handed to the estimator under test.
    cuml_model = HDBSCAN(verbose=logger.level_info,
                         max_cluster_size=max_cluster_size,
                         **common_kwargs)
    cuml_model.fit(X)

    ref_model = hdbscan.HDBSCAN(approx_min_span_tree=False,
                                gen_min_span_tree=True,
                                algorithm="generic",
                                **common_kwargs)
    ref_model.fit(cp.asnumpy(X))

    assert_condensed_trees(ref_model, min_cluster_size)
    assert_cluster_counts(ref_model, cuml_model)

    n_ref_labels = len(np.unique(ref_model.labels_))
    n_cuml_labels = len(cp.unique(cuml_model.labels_))
    assert n_ref_labels == n_cuml_labels

    ari = adjusted_rand_score(cuml_model.labels_, ref_model.labels_)
    assert ari > 0.95

    # Persistence values are sorted before comparing.
    ref_persistence = np.sort(ref_model.cluster_persistence_)
    cuml_persistence = np.sort(cuml_model.cluster_persistence_)
    assert np.allclose(ref_persistence, cuml_persistence,
                       rtol=0.1, atol=0.1)