def test_agglomerative_clustering_with_distance_threshold(linkage): # Check that we obtain the correct number of clusters with # agglomerative clustering with distance_threshold. rng = np.random.RandomState(0) mask = np.ones([10, 10], dtype=np.bool) n_samples = 100 X = rng.randn(n_samples, 50) connectivity = grid_to_graph(*mask.shape) # test when distance threshold is set to 10 distance_threshold = 10 for conn in [None, connectivity]: clustering = AgglomerativeClustering( n_clusters=None, distance_threshold=distance_threshold, connectivity=conn, linkage=linkage) clustering.fit(X) clusters_produced = clustering.labels_ num_clusters_produced = len(np.unique(clustering.labels_)) # test if the clusters produced match the point in the linkage tree # where the distance exceeds the threshold tree_builder = _TREE_BUILDERS[linkage] children, n_components, n_leaves, parent, distances = \ tree_builder(X, connectivity=conn, n_clusters=None, return_distance=True) num_clusters_at_threshold = np.count_nonzero( distances >= distance_threshold) + 1 # test number of clusters produced assert num_clusters_at_threshold == num_clusters_produced # test clusters produced clusters_at_threshold = _hc_cut(n_clusters=num_clusters_produced, children=children, n_leaves=n_leaves) assert np.array_equiv(clusters_produced, clusters_at_threshold)
def test_agglomerative_clustering_with_distance_threshold_edge_case( linkage, threshold, y_true): # test boundary case of distance_threshold matching the distance X = [[0], [1]] clusterer = AgglomerativeClustering( n_clusters=None, distance_threshold=threshold, linkage=linkage) y_pred = clusterer.fit_predict(X) assert adjusted_rand_score(y_true, y_pred) == 1
def test_agglomerative_clustering_wrong_arg_memory(): # Test either if an error is raised when memory is not # either a str or a joblib.Memory instance rng = np.random.RandomState(0) n_samples = 100 X = rng.randn(n_samples, 50) memory = 5 clustering = AgglomerativeClustering(memory=memory) with pytest.raises(ValueError): clustering.fit(X)
def test_compute_full_tree(): # Test that the full tree is computed if n_clusters is small rng = np.random.RandomState(0) X = rng.randn(10, 2) connectivity = kneighbors_graph(X, 5, include_self=False) # When n_clusters is less, the full tree should be built # that is the number of merges should be n_samples - 1 agc = AgglomerativeClustering(n_clusters=2, connectivity=connectivity) agc.fit(X) n_samples = X.shape[0] n_nodes = agc.children_.shape[0] assert n_nodes == n_samples - 1 # When n_clusters is large, greater than max of 100 and 0.02 * n_samples. # we should stop when there are n_clusters. n_clusters = 101 X = rng.randn(200, 2) connectivity = kneighbors_graph(X, 10, include_self=False) agc = AgglomerativeClustering(n_clusters=n_clusters, connectivity=connectivity) agc.fit(X) n_samples = X.shape[0] n_nodes = agc.children_.shape[0] assert n_nodes == n_samples - n_clusters
def test_connectivity_propagation(): # Check that connectivity in the ward tree is propagated correctly during # merging. X = np.array([(.014, .120), (.014, .099), (.014, .097), (.017, .153), (.017, .153), (.018, .153), (.018, .153), (.018, .153), (.018, .153), (.018, .153), (.018, .153), (.018, .153), (.018, .152), (.018, .149), (.018, .144)]) connectivity = kneighbors_graph(X, 10, include_self=False) ward = AgglomerativeClustering( n_clusters=4, connectivity=connectivity, linkage='ward') # If changes are not propagated correctly, fit crashes with an # IndexError ward.fit(X)
def test_dist_threshold_invalid_parameters(): X = [[0], [1]] with pytest.raises(ValueError, match="Exactly one of "): AgglomerativeClustering(n_clusters=None, distance_threshold=None).fit(X) with pytest.raises(ValueError, match="Exactly one of "): AgglomerativeClustering(n_clusters=2, distance_threshold=1).fit(X) X = [[0], [1]] with pytest.raises(ValueError, match="compute_full_tree must be True if"): AgglomerativeClustering(n_clusters=None, distance_threshold=1, compute_full_tree=False).fit(X)
def test_linkage_misc(): # Misc tests on linkage rng = np.random.RandomState(42) X = rng.normal(size=(5, 5)) with pytest.raises(ValueError): AgglomerativeClustering(linkage='foo').fit(X) with pytest.raises(ValueError): linkage_tree(X, linkage='foo') with pytest.raises(ValueError): linkage_tree(X, connectivity=np.ones((4, 4))) # Smoke test FeatureAgglomeration FeatureAgglomeration().fit(X) # test hierarchical clustering on a precomputed distances matrix dis = cosine_distances(X) res = linkage_tree(dis, affinity="precomputed") assert_array_equal(res[0], linkage_tree(X, affinity="cosine")[0]) # test hierarchical clustering on a precomputed distances matrix res = linkage_tree(X, affinity=manhattan_distances) assert_array_equal(res[0], linkage_tree(X, affinity="manhattan")[0])
def test_n_clusters(): # Test that n_clusters param works properly X, y = make_blobs(n_samples=100, centers=10) brc1 = Birch(n_clusters=10) brc1.fit(X) assert len(brc1.subcluster_centers_) > 10 assert len(np.unique(brc1.labels_)) == 10 # Test that n_clusters = Agglomerative Clustering gives # the same results. gc = AgglomerativeClustering(n_clusters=10) brc2 = Birch(n_clusters=gc) brc2.fit(X) assert_array_equal(brc1.subcluster_labels_, brc2.subcluster_labels_) assert_array_equal(brc1.labels_, brc2.labels_) # Test that the wrong global clustering step raises an Error. clf = ElasticNet() brc3 = Birch(n_clusters=clf) with pytest.raises(ValueError): brc3.fit(X) # Test that a small number of clusters raises a warning. brc4 = Birch(threshold=10000.) assert_warns(ConvergenceWarning, brc4.fit, X)
def test_connectivity_ignores_diagonal(): rng = np.random.RandomState(0) X = rng.rand(20, 5) connectivity = kneighbors_graph(X, 3, include_self=False) connectivity_include_self = kneighbors_graph(X, 3, include_self=True) aglc1 = AgglomerativeClustering(connectivity=connectivity) aglc2 = AgglomerativeClustering(connectivity=connectivity_include_self) aglc1.fit(X) aglc2.fit(X) assert_array_equal(aglc1.labels_, aglc2.labels_)
def test_connectivity_callable(): rng = np.random.RandomState(0) X = rng.rand(20, 5) connectivity = kneighbors_graph(X, 3, include_self=False) aglc1 = AgglomerativeClustering(connectivity=connectivity) aglc2 = AgglomerativeClustering( connectivity=partial(kneighbors_graph, n_neighbors=3, include_self=False)) aglc1.fit(X) aglc2.fit(X) assert_array_equal(aglc1.labels_, aglc2.labels_)
def test_agg_n_clusters(): # Test that an error is raised when n_clusters <= 0 rng = np.random.RandomState(0) X = rng.rand(20, 10) for n_clus in [-1, 0]: agc = AgglomerativeClustering(n_clusters=n_clus) msg = ("n_clusters should be an integer greater than 0." " %s was provided." % str(agc.n_clusters)) assert_raise_message(ValueError, msg, agc.fit, X)
def test_connectivity_fixing_non_lil(): # Check non regression of a bug if a non item assignable connectivity is # provided with more than one component. # create dummy data x = np.array([[0, 0], [1, 1]]) # create a mask with several components to force connectivity fixing m = np.array([[True, False], [False, True]]) c = grid_to_graph(n_x=2, n_y=2, mask=m) w = AgglomerativeClustering(connectivity=c, linkage='ward') assert_warns(UserWarning, w.fit, x)
def test_n_components_deprecation(): # Test that a Deprecation warning is thrown when n_components_ # attribute is accessed X = np.array([[1, 2], [1, 4], [1, 0], [4, 2]]) agc = AgglomerativeClustering().fit(X) match = ("``n_components_`` attribute was deprecated " "in favor of ``n_connected_components_``") with pytest.warns(FutureWarning, match=match): n = agc.n_components_ assert n == agc.n_connected_components_
def test_identical_points(): # Ensure identical points are handled correctly when using mst with # a sparse connectivity matrix X = np.array([[0, 0, 0], [0, 0, 0], [1, 1, 1], [1, 1, 1], [2, 2, 2], [2, 2, 2]]) true_labels = np.array([0, 0, 1, 1, 2, 2]) connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False) connectivity = 0.5 * (connectivity + connectivity.T) connectivity, n_components = _fix_connectivity(X, connectivity, 'euclidean') for linkage in ('single', 'average', 'average', 'ward'): clustering = AgglomerativeClustering(n_clusters=3, linkage=linkage, connectivity=connectivity) clustering.fit(X) assert_almost_equal(normalized_mutual_info_score(clustering.labels_, true_labels), 1)
def test_single_linkage_clustering(): # Check that we get the correct result in two emblematic cases moons, moon_labels = make_moons(noise=0.05, random_state=42) clustering = AgglomerativeClustering(n_clusters=2, linkage='single') clustering.fit(moons) assert_almost_equal(normalized_mutual_info_score(clustering.labels_, moon_labels), 1) circles, circle_labels = make_circles(factor=0.5, noise=0.025, random_state=42) clustering = AgglomerativeClustering(n_clusters=2, linkage='single') clustering.fit(circles) assert_almost_equal(normalized_mutual_info_score(clustering.labels_, circle_labels), 1)
def test_small_distance_threshold(): rng = np.random.RandomState(0) n_samples = 10 X = rng.randint(-300, 300, size=(n_samples, 3)) # this should result in all data in their own clusters, given that # their pairwise distances are bigger than .1 (which may not be the case # with a different random seed). clustering = AgglomerativeClustering( n_clusters=None, distance_threshold=1., linkage="single").fit(X) # check that the pairwise distances are indeed all larger than .1 all_distances = pairwise_distances(X, metric='minkowski', p=2) np.fill_diagonal(all_distances, np.inf) assert np.all(all_distances > .1) assert clustering.n_clusters_ == n_samples
def test_cluster_distances_with_distance_threshold(): rng = np.random.RandomState(0) n_samples = 100 X = rng.randint(-10, 10, size=(n_samples, 3)) # check the distances within the clusters and with other clusters distance_threshold = 4 clustering = AgglomerativeClustering( n_clusters=None, distance_threshold=distance_threshold, linkage="single").fit(X) labels = clustering.labels_ D = pairwise_distances(X, metric="minkowski", p=2) # to avoid taking the 0 diagonal in min() np.fill_diagonal(D, np.inf) for label in np.unique(labels): in_cluster_mask = labels == label max_in_cluster_distance = (D[in_cluster_mask][:, in_cluster_mask] .min(axis=0).max()) min_out_cluster_distance = (D[in_cluster_mask][:, ~in_cluster_mask] .min(axis=0).min()) # single data point clusters only have that inf diagonal here if in_cluster_mask.sum() > 1: assert max_in_cluster_distance < distance_threshold assert min_out_cluster_distance >= distance_threshold
def test_agglomerative_clustering(): # Check that we obtain the correct number of clusters with # agglomerative clustering. rng = np.random.RandomState(0) mask = np.ones([10, 10], dtype=np.bool) n_samples = 100 X = rng.randn(n_samples, 50) connectivity = grid_to_graph(*mask.shape) for linkage in ("ward", "complete", "average", "single"): clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage=linkage) clustering.fit(X) # test caching try: tempdir = mkdtemp() clustering = AgglomerativeClustering( n_clusters=10, connectivity=connectivity, memory=tempdir, linkage=linkage) clustering.fit(X) labels = clustering.labels_ assert np.size(np.unique(labels)) == 10 finally: shutil.rmtree(tempdir) # Turn caching off now clustering = AgglomerativeClustering( n_clusters=10, connectivity=connectivity, linkage=linkage) # Check that we obtain the same solution with early-stopping of the # tree building clustering.compute_full_tree = False clustering.fit(X) assert_almost_equal(normalized_mutual_info_score(clustering.labels_, labels), 1) clustering.connectivity = None clustering.fit(X) assert np.size(np.unique(clustering.labels_)) == 10 # Check that we raise a TypeError on dense matrices clustering = AgglomerativeClustering( n_clusters=10, connectivity=sparse.lil_matrix( connectivity.toarray()[:10, :10]), linkage=linkage) with pytest.raises(ValueError): clustering.fit(X) # Test that using ward with another metric than euclidean raises an # exception clustering = AgglomerativeClustering( n_clusters=10, connectivity=connectivity.toarray(), affinity="manhattan", linkage="ward") with pytest.raises(ValueError): clustering.fit(X) # Test using another metric than euclidean works with linkage complete for affinity in PAIRED_DISTANCES.keys(): # Compare our (structured) implementation to scipy clustering = AgglomerativeClustering( n_clusters=10, connectivity=np.ones((n_samples, n_samples)), affinity=affinity, linkage="complete") clustering.fit(X) clustering2 = AgglomerativeClustering( n_clusters=10, connectivity=None, affinity=affinity, linkage="complete") clustering2.fit(X) assert_almost_equal(normalized_mutual_info_score(clustering2.labels_, clustering.labels_), 1) # Test that using a distance matrix (affinity = 'precomputed') has same # results (with connectivity constraints) clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage="complete") clustering.fit(X) X_dist = pairwise_distances(X) clustering2 = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, affinity='precomputed', linkage="complete") clustering2.fit(X_dist) assert_array_equal(clustering.labels_, clustering2.labels_)