def test_compute_full_tree(): # Test that the full tree is computed if n_clusters is small rng = np.random.RandomState(0) X = rng.randn(10, 2) connectivity = kneighbors_graph(X, 5, include_self=False) # When n_clusters is less, the full tree should be built # that is the number of merges should be n_samples - 1 agc = AgglomerativeClustering(n_clusters=2, connectivity=connectivity) agc.fit(X) n_samples = X.shape[0] n_nodes = agc.children_.shape[0] assert n_nodes == n_samples - 1 # When n_clusters is large, greater than max of 100 and 0.02 * n_samples. # we should stop when there are n_clusters. n_clusters = 101 X = rng.randn(200, 2) connectivity = kneighbors_graph(X, 10, include_self=False) agc = AgglomerativeClustering(n_clusters=n_clusters, connectivity=connectivity) agc.fit(X) n_samples = X.shape[0] n_nodes = agc.children_.shape[0] assert n_nodes == n_samples - n_clusters
def test_agglomerative_clustering_with_distance_threshold(linkage): # Check that we obtain the correct number of clusters with # agglomerative clustering with distance_threshold. rng = np.random.RandomState(0) mask = np.ones([10, 10], dtype=np.bool) n_samples = 100 X = rng.randn(n_samples, 50) connectivity = grid_to_graph(*mask.shape) # test when distance threshold is set to 10 distance_threshold = 10 for conn in [None, connectivity]: clustering = AgglomerativeClustering( n_clusters=None, distance_threshold=distance_threshold, connectivity=conn, linkage=linkage) clustering.fit(X) clusters_produced = clustering.labels_ num_clusters_produced = len(np.unique(clustering.labels_)) # test if the clusters produced match the point in the linkage tree # where the distance exceeds the threshold tree_builder = _TREE_BUILDERS[linkage] children, n_components, n_leaves, parent, distances = \ tree_builder(X, connectivity=conn, n_clusters=None, return_distance=True) num_clusters_at_threshold = np.count_nonzero( distances >= distance_threshold) + 1 # test number of clusters produced assert num_clusters_at_threshold == num_clusters_produced # test clusters produced clusters_at_threshold = _hc_cut(n_clusters=num_clusters_produced, children=children, n_leaves=n_leaves) assert np.array_equiv(clusters_produced, clusters_at_threshold)
def test_connectivity_callable(): rng = np.random.RandomState(0) X = rng.rand(20, 5) connectivity = kneighbors_graph(X, 3, include_self=False) aglc1 = AgglomerativeClustering(connectivity=connectivity) aglc2 = AgglomerativeClustering(connectivity=partial( kneighbors_graph, n_neighbors=3, include_self=False)) aglc1.fit(X) aglc2.fit(X) assert_array_equal(aglc1.labels_, aglc2.labels_)
def test_connectivity_ignores_diagonal(): rng = np.random.RandomState(0) X = rng.rand(20, 5) connectivity = kneighbors_graph(X, 3, include_self=False) connectivity_include_self = kneighbors_graph(X, 3, include_self=True) aglc1 = AgglomerativeClustering(connectivity=connectivity) aglc2 = AgglomerativeClustering(connectivity=connectivity_include_self) aglc1.fit(X) aglc2.fit(X) assert_array_equal(aglc1.labels_, aglc2.labels_)
def test_agglomerative_clustering_wrong_arg_memory(): # Test either if an error is raised when memory is not # either a str or a joblib.Memory instance rng = np.random.RandomState(0) n_samples = 100 X = rng.randn(n_samples, 50) memory = 5 clustering = AgglomerativeClustering(memory=memory) with pytest.raises(ValueError): clustering.fit(X)
def test_connectivity_propagation(): # Check that connectivity in the ward tree is propagated correctly during # merging. X = np.array([(.014, .120), (.014, .099), (.014, .097), (.017, .153), (.017, .153), (.018, .153), (.018, .153), (.018, .153), (.018, .153), (.018, .153), (.018, .153), (.018, .153), (.018, .152), (.018, .149), (.018, .144)]) connectivity = kneighbors_graph(X, 10, include_self=False) ward = AgglomerativeClustering(n_clusters=4, connectivity=connectivity, linkage='ward') # If changes are not propagated correctly, fit crashes with an # IndexError ward.fit(X)
def test_single_linkage_clustering(): # Check that we get the correct result in two emblematic cases moons, moon_labels = make_moons(noise=0.05, random_state=42) clustering = AgglomerativeClustering(n_clusters=2, linkage='single') clustering.fit(moons) assert_almost_equal( normalized_mutual_info_score(clustering.labels_, moon_labels), 1) circles, circle_labels = make_circles(factor=0.5, noise=0.025, random_state=42) clustering = AgglomerativeClustering(n_clusters=2, linkage='single') clustering.fit(circles) assert_almost_equal( normalized_mutual_info_score(clustering.labels_, circle_labels), 1)
def test_identical_points(): # Ensure identical points are handled correctly when using mst with # a sparse connectivity matrix X = np.array([[0, 0, 0], [0, 0, 0], [1, 1, 1], [1, 1, 1], [2, 2, 2], [2, 2, 2]]) true_labels = np.array([0, 0, 1, 1, 2, 2]) connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False) connectivity = 0.5 * (connectivity + connectivity.T) connectivity, n_components = _fix_connectivity(X, connectivity, 'euclidean') for linkage in ('single', 'average', 'average', 'ward'): clustering = AgglomerativeClustering(n_clusters=3, linkage=linkage, connectivity=connectivity) clustering.fit(X) assert_almost_equal( normalized_mutual_info_score(clustering.labels_, true_labels), 1)
def test_agglomerative_clustering(): # Check that we obtain the correct number of clusters with # agglomerative clustering. rng = np.random.RandomState(0) mask = np.ones([10, 10], dtype=np.bool) n_samples = 100 X = rng.randn(n_samples, 50) connectivity = grid_to_graph(*mask.shape) for linkage in ("ward", "complete", "average", "single"): clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage=linkage) clustering.fit(X) # test caching try: tempdir = mkdtemp() clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, memory=tempdir, linkage=linkage) clustering.fit(X) labels = clustering.labels_ assert np.size(np.unique(labels)) == 10 finally: shutil.rmtree(tempdir) # Turn caching off now clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage=linkage) # Check that we obtain the same solution with early-stopping of the # tree building clustering.compute_full_tree = False clustering.fit(X) assert_almost_equal( normalized_mutual_info_score(clustering.labels_, labels), 1) clustering.connectivity = None clustering.fit(X) assert np.size(np.unique(clustering.labels_)) == 10 # Check that we raise a TypeError on dense matrices clustering = AgglomerativeClustering( n_clusters=10, connectivity=sparse.lil_matrix(connectivity.toarray()[:10, :10]), linkage=linkage) with pytest.raises(ValueError): clustering.fit(X) # Test that using ward with another metric than euclidean raises an # exception clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity.toarray(), affinity="manhattan", linkage="ward") with pytest.raises(ValueError): clustering.fit(X) # Test using another metric than euclidean works with linkage complete for affinity in PAIRED_DISTANCES.keys(): # Compare our (structured) implementation to scipy clustering = AgglomerativeClustering(n_clusters=10, connectivity=np.ones( (n_samples, n_samples)), affinity=affinity, linkage="complete") clustering.fit(X) clustering2 = AgglomerativeClustering(n_clusters=10, connectivity=None, affinity=affinity, linkage="complete") clustering2.fit(X) assert_almost_equal( normalized_mutual_info_score(clustering2.labels_, clustering.labels_), 1) # Test that using a distance matrix (affinity = 'precomputed') has same # results (with connectivity constraints) clustering = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, linkage="complete") clustering.fit(X) X_dist = pairwise_distances(X) clustering2 = AgglomerativeClustering(n_clusters=10, connectivity=connectivity, affinity='precomputed', linkage="complete") clustering2.fit(X_dist) assert_array_equal(clustering.labels_, clustering2.labels_)
from mrex.cluster import AgglomerativeClustering ward = AgglomerativeClustering(n_clusters=3, linkage='ward') n_samples = np.logspace(.5, 3, 9) n_features = np.logspace(1, 3.5, 7) N_samples, N_features = np.meshgrid(n_samples, n_features) scikits_time = np.zeros(N_samples.shape) scipy_time = np.zeros(N_samples.shape) for i, n in enumerate(n_samples): for j, p in enumerate(n_features): X = np.random.normal(size=(n, p)) t0 = time.time() ward.fit(X) scikits_time[j, i] = time.time() - t0 t0 = time.time() hierarchy.ward(X) scipy_time[j, i] = time.time() - t0 ratio = scikits_time / scipy_time plt.figure("scikit-learn Ward's method benchmark results") plt.imshow(np.log(ratio), aspect='auto', origin="lower") plt.colorbar() plt.contour(ratio, levels=[ 1, ], colors='k') plt.yticks(range(len(n_features)), n_features.astype(np.int)) plt.ylabel('N features')
for i in range(n_clusters): for j in range(n_clusters): plt.text(i, j, '%5.3f' % avg_dist[i, j], verticalalignment='center', horizontalalignment='center') plt.imshow(avg_dist, interpolation='nearest', cmap=plt.cm.gnuplot2, vmin=0) plt.xticks(range(n_clusters), labels, rotation=45) plt.yticks(range(n_clusters), labels) plt.colorbar() plt.suptitle("Interclass %s distances" % metric, size=18) plt.tight_layout() # Plot clustering results for index, metric in enumerate(["cosine", "euclidean", "cityblock"]): model = AgglomerativeClustering(n_clusters=n_clusters, linkage="average", affinity=metric) model.fit(X) plt.figure() plt.axes([0, 0, 1, 1]) for l, c in zip(np.arange(model.n_clusters), 'rgbk'): plt.plot(X[model.labels_ == l].T, c=c, alpha=.5) plt.axis('tight') plt.axis('off') plt.suptitle("AgglomerativeClustering(affinity=%s)" % metric, size=20) plt.show()
counts = np.zeros(model.children_.shape[0]) n_samples = len(model.labels_) for i, merge in enumerate(model.children_): current_count = 0 for child_idx in merge: if child_idx < n_samples: current_count += 1 # leaf node else: current_count += counts[child_idx - n_samples] counts[i] = current_count linkage_matrix = np.column_stack( [model.children_, model.distances_, counts]).astype(float) # Plot the corresponding dendrogram dendrogram(linkage_matrix, **kwargs) iris = load_iris() X = iris.data # setting distance_threshold=0 ensures we compute the full tree. model = AgglomerativeClustering(distance_threshold=0, n_clusters=None) model = model.fit(X) plt.title('Hierarchical Clustering Dendrogram') # plot the top three levels of the dendrogram plot_dendrogram(model, truncate_mode='level', p=3) plt.xlabel("Number of points in node (or index of point if no parenthesis).") plt.show()
fontdict={ 'weight': 'bold', 'size': 9 }) plt.xticks([]) plt.yticks([]) if title is not None: plt.title(title, size=17) plt.axis('off') plt.tight_layout(rect=[0, 0.03, 1, 0.95]) #---------------------------------------------------------------------- # 2D embedding of the digits dataset print("Computing embedding") X_red = manifold.SpectralEmbedding(n_components=2).fit_transform(X) print("Done.") from mrex.cluster import AgglomerativeClustering for linkage in ('ward', 'average', 'complete', 'single'): clustering = AgglomerativeClustering(linkage=linkage, n_clusters=10) t0 = time() clustering.fit(X_red) print("%s :\t%.2fs" % (linkage, time() - t0)) plot_clustering(X_red, clustering.labels_, "%s linkage" % linkage) plt.show()