def test_correct_number_of_clusters():
    """Check that OPTICS recovers the generated cluster count.

    Fits OPTICS on data generated with three clusters, compares the number
    of distinct labels (the noise label -1 excluded) with the requested
    count, then checks that the fitted per-sample attributes have one entry
    per sample and the expected dtype kind.
    """
    # in 'auto' mode
    expected_clusters = 3
    data = generate_clustered_data(n_clusters=expected_clusters)

    # Parameters chosen specifically for this task.
    model = OPTICS(max_eps=5.0 * 6.0, min_samples=4, xi=.1)
    model.fit(data)

    # Count clusters, dropping the noise label if it occurs.
    found_labels = model.labels_
    found_clusters = len(set(found_labels)) - int(-1 in found_labels)
    assert found_clusters == expected_clusters

    # Every per-sample attribute must be 1-D with one entry per sample and
    # carry the expected dtype kind ('i' integer, 'f' float).
    per_sample_shape = (len(data),)
    expected_kinds = (
        ("labels_", 'i'),
        ("reachability_", 'f'),
        ("core_distances_", 'f'),
        ("ordering_", 'i'),
    )
    for attr_name, dtype_kind in expected_kinds:
        values = getattr(model, attr_name)
        assert values.shape == per_sample_shape
        assert values.dtype.kind == dtype_kind

    # The ordering must contain every sample index exactly once.
    assert set(model.ordering_) == set(range(len(data)))
def test_correct_number_of_clusters():
    """Check the cluster count and fitted attributes of OPTICS.

    Fits OPTICS on data generated with three clusters, asserts that the
    number of distinct labels (noise label -1 excluded) equals the requested
    count, and validates the shape and dtype kind of the fitted attributes,
    including ``core_sample_indices_``.
    """
    # in 'auto' mode
    wanted = 3
    samples = generate_clustered_data(n_clusters=wanted)
    n_samples = len(samples)

    # Parameters chosen specifically for this task.
    optics = OPTICS(max_eps=5.0 * 6.0, min_samples=4)
    optics.fit(samples)

    # number of clusters, ignoring noise if present
    noise_present = int(-1 in optics.labels_)
    n_found = len(set(optics.labels_)) - noise_present
    assert_equal(n_found, wanted)

    # Core sample indices: a non-empty 1-D integer array.
    assert optics.core_sample_indices_.ndim == 1
    assert optics.core_sample_indices_.size > 0
    assert optics.core_sample_indices_.dtype.kind == 'i'

    # Per-sample attributes: one entry per sample, expected dtype kind.
    assert optics.labels_.shape == (n_samples,)
    assert optics.labels_.dtype.kind == 'i'

    assert optics.reachability_.shape == (n_samples,)
    assert optics.reachability_.dtype.kind == 'f'

    assert optics.core_distances_.shape == (n_samples,)
    assert optics.core_distances_.dtype.kind == 'f'

    assert optics.ordering_.shape == (n_samples,)
    assert optics.ordering_.dtype.kind == 'i'

    # The ordering covers every sample index.
    assert set(optics.ordering_) == set(range(n_samples))
def test_correct_number_of_clusters():
    """Check that OPTICS finds as many clusters as were generated.

    The data is generated with three clusters; the number of distinct
    labels after fitting (noise label -1 excluded) must match.
    """
    # in 'auto' mode
    target = 3
    points = generate_clustered_data(n_clusters=target)

    # Parameters chosen specifically for this task.
    estimator = OPTICS(max_bound=5.0 * 6.0, min_samples=4,
                       metric='euclidean')
    estimator.fit(points)

    # Distinct labels, minus one if the noise label is among them.
    distinct = set(estimator.labels_)
    has_noise = int(-1 in estimator.labels_)
    assert_equal(len(distinct) - has_noise, target)
def test_optics():
    """Test that OPTICS recovers the generated number of clusters.

    Tests the optics clustering method ('auto' mode, per the original
    comment): fits OPTICS on data generated with three clusters and asserts
    that the number of distinct labels, with the noise label -1 excluded,
    equals the requested count.
    """
    n_clusters = 3
    X = generate_clustered_data(n_clusters=n_clusters)

    # Parameters chosen specifically for this task.
    clust = OPTICS(max_bound=5.0 * 6.0, min_samples=4, metric='euclidean')
    clust.fit(X)

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(clust.labels_)) - int(-1 in clust.labels_)
    assert_equal(n_clusters_1, n_clusters)
def test_dbscan():
    """Test chaining RadiusNeighborsTransformer and DBSCAN.

    A pipeline that feeds the transformer's output into a DBSCAN configured
    with ``metric='precomputed'`` must produce the same labels as a plain
    DBSCAN using the same radius as ``eps``.
    """
    radius = 0.3
    data = generate_clustered_data(n_clusters=3)

    # Chained version: transformer output fed into a precomputed DBSCAN.
    neighbors_step = RadiusNeighborsTransformer(radius=radius,
                                                mode='distance')
    clustering_step = DBSCAN(metric='precomputed', eps=radius)
    chained = make_pipeline(neighbors_step, clustering_step)

    # Compact version: DBSCAN applied directly to the data.
    compact = DBSCAN(eps=radius)

    chained_labels = chained.fit_predict(data)
    compact_labels = compact.fit_predict(data)
    assert_array_almost_equal(chained_labels, compact_labels)
def test_birch_predict():
    """Test the predict method predicts the nearest centroid.

    Fits Birch on shuffled generated data and checks that ``predict`` on
    the training data reproduces ``labels_``, and that assigning each sample
    to its nearest subcluster centre agrees with ``labels_`` (v-measure
    of 1.0).
    """
    n_generated_clusters = 3
    n_per_cluster = 10
    data = generate_clustered_data(n_clusters=n_generated_clusters,
                                   n_features=3,
                                   n_samples_per_cluster=n_per_cluster)

    # Shuffle the row order with a fixed seed. The index range is
    # n_clusters * n_samples_per_cluster = 30, which presumably matches the
    # number of generated rows -- confirm against generate_clustered_data.
    rng = np.random.RandomState(0)
    order = np.arange(n_generated_clusters * n_per_cluster)
    rng.shuffle(order)
    shuffled = data[order, :]

    model = Birch(n_clusters=4, threshold=1.)
    model.fit(shuffled)
    centers = model.subcluster_centers_

    # predict() on the training data must reproduce the fitted labels.
    assert_array_equal(model.labels_, model.predict(shuffled))

    # Nearest-centre assignment must induce the same partition as labels_.
    closest = pairwise_distances_argmin(shuffled, centers)
    agreement = v_measure_score(closest, model.labels_)
    assert_almost_equal(agreement, 1.0)
from scipy.spatial import distance
from scipy import sparse

from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_in
from sklearn.utils.testing import assert_not_in

from sklearn.cluster.dbscan_ import DBSCAN
from sklearn.cluster.dbscan_ import dbscan
from sklearn.cluster.tests.common import generate_clustered_data
from sklearn.metrics.pairwise import pairwise_distances

# Shared fixture: data generated with three clusters, used by the tests below.
n_clusters = 3
X = generate_clustered_data(n_clusters=n_clusters)


def test_dbscan_similarity():
    """Tests the DBSCAN algorithm with a similarity array.

    Builds the pairwise distance matrix of the module-level ``X``, scales it
    by its maximum, runs ``dbscan`` with ``metric="precomputed"`` and asserts
    that the number of distinct labels (noise label -1 excluded) equals the
    module-level ``n_clusters``.
    """
    # Parameters chosen specifically for this task.
    eps = 0.15
    min_samples = 10

    # Compute similarities: square pairwise distances, scaled by the maximum.
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)

    # Compute DBSCAN
    core_samples, labels = dbscan(D, metric="precomputed", eps=eps,
                                  min_samples=min_samples)

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0)

    # Without this check the count was computed but never verified, so the
    # test could not fail on a wrong clustering.
    assert_equal(n_clusters_1, n_clusters)
from scipy import sparse

from sklearn.utils.testing import assert_equal
from sklearn.utils.testing import assert_array_equal
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_in
from sklearn.utils.testing import assert_not_in
from sklearn.utils.testing import assert_no_warnings
from sklearn.utils.testing import if_matplotlib

from hdbscan import RobustSingleLinkage
from hdbscan import robust_single_linkage

from sklearn.cluster.tests.common import generate_clustered_data
from sklearn import datasets

# Shared fixture: three generated clusters with 50 samples each.
n_clusters = 3
X = generate_clustered_data(n_clusters=n_clusters, n_samples_per_cluster=50)


def test_rsl_distance_matrix():
    """Run robust single linkage on a precomputed distance matrix.

    Exercises both the ``robust_single_linkage`` function and the
    ``RobustSingleLinkage`` estimator with ``metric='precomputed'``. The
    cluster counts are computed, but the comparisons against ``n_clusters``
    are disabled in the original, so this only checks that both calls run.
    """
    # Square pairwise distance matrix of X, scaled by its maximum.
    dist_matrix = distance.squareform(distance.pdist(X))
    dist_matrix /= np.max(dist_matrix)

    # Function interface.
    labels, tree = robust_single_linkage(dist_matrix, 0.25,
                                         metric='precomputed')
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    # Disabled in the original: assert_equal(n_clusters_1, n_clusters)

    # Estimator interface.
    estimator = RobustSingleLinkage(metric="precomputed")
    labels = estimator.fit(dist_matrix).labels_
    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    # Disabled in the original: assert_equal(n_clusters_2, n_clusters)


# NOTE(review): the source is cut off here right after the header
# `def test_rsl_feature_vector():` -- its body is not visible, so the
# definition is not reproduced rather than guessed at.
def clustered_data(n_clusters):
    """Return the data produced by ``generate_clustered_data``.

    Thin wrapper: ``n_clusters`` is forwarded as a keyword argument and the
    result is returned unchanged.
    """
    dataset = generate_clustered_data(n_clusters=n_clusters)
    return dataset
from sklearn.utils.testing import assert_raises
from sklearn.utils.testing import assert_in
from sklearn.utils.testing import assert_not_in
from sklearn.utils.testing import assert_no_warnings
from sklearn.utils.testing import if_matplotlib

from hdbscan import HDBSCAN
from hdbscan import hdbscan

from sklearn.cluster.tests.common import generate_clustered_data
from scipy.stats import mode
from tempfile import mkdtemp
from sklearn import datasets

# Shared fixture: three generated clusters with 50 samples each.
n_clusters = 3
X = generate_clustered_data(n_clusters=n_clusters, n_samples_per_cluster=50)


def relabel(labels):
    """Renumber labels in order of first appearance.

    The first distinct value met while scanning ``labels`` becomes 0, the
    next new value 1, and so on. ``labels`` is indexed with a boolean mask
    and read through ``labels.shape[0]``, so it is expected to be a NumPy
    array (presumably 1-D -- confirm against callers).

    Returns a new array from ``np.zeros(labels.shape[0])`` (float by
    default) holding the renumbered labels; ``labels`` is not modified.
    """
    renumbered = np.zeros(labels.shape[0])
    unseen = set(labels)
    next_id = 0
    for value in labels:
        # Stop as soon as every distinct label has been assigned.
        if not unseen:
            break
        if value in unseen:
            # First occurrence: give every sample with this label the next id.
            renumbered[labels == value] = next_id
            next_id += 1
            unseen.remove(value)
    return renumbered