def test_compute_full_tree(): # Test that the full tree is computed if n_clusters is small rng = np.random.RandomState(0) X = rng.randn(10, 2) connectivity = kneighbors_graph(X, 5, include_self=False) # When n_clusters is less, the full tree should be built # that is the number of merges should be n_samples - 1 agc = AgglomerativeClustering(n_clusters=2, connectivity=connectivity) agc.fit(X) n_samples = X.shape[0] n_nodes = agc.children_.shape[0] assert n_nodes == n_samples - 1 # When n_clusters is large, greater than max of 100 and 0.02 * n_samples. # we should stop when there are n_clusters. n_clusters = 101 X = rng.randn(200, 2) connectivity = kneighbors_graph(X, 10, include_self=False) agc = AgglomerativeClustering(n_clusters=n_clusters, connectivity=connectivity) agc.fit(X) n_samples = X.shape[0] n_nodes = agc.children_.shape[0] assert n_nodes == n_samples - n_clusters
def test_connectivity_ignores_diagonal(): rng = np.random.RandomState(0) X = rng.rand(20, 5) connectivity = kneighbors_graph(X, 3, include_self=False) connectivity_include_self = kneighbors_graph(X, 3, include_self=True) aglc1 = AgglomerativeClustering(connectivity=connectivity) aglc2 = AgglomerativeClustering(connectivity=connectivity_include_self) aglc1.fit(X) aglc2.fit(X) assert_array_equal(aglc1.labels_, aglc2.labels_)
def test_connectivity_callable(): rng = np.random.RandomState(0) X = rng.rand(20, 5) connectivity = kneighbors_graph(X, 3, include_self=False) aglc1 = AgglomerativeClustering(connectivity=connectivity) aglc2 = AgglomerativeClustering( connectivity=partial(kneighbors_graph, n_neighbors=3, include_self=False)) aglc1.fit(X) aglc2.fit(X) assert_array_equal(aglc1.labels_, aglc2.labels_)
def test_isomap_reconstruction_error(): # Same setup as in test_isomap_simple_grid, with an added dimension N_per_side = 5 Npts = N_per_side**2 n_neighbors = Npts - 1 # grid of equidistant points in 2D, n_components = n_dim X = np.array(list(product(range(N_per_side), repeat=2))) # add noise in a third dimension rng = np.random.RandomState(0) noise = 0.1 * rng.randn(Npts, 1) X = np.concatenate((X, noise), 1) # compute input kernel G = neighbors.kneighbors_graph(X, n_neighbors, mode='distance').toarray() centerer = preprocessing.KernelCenterer() K = centerer.fit_transform(-0.5 * G**2) for eigen_solver in eigen_solvers: for path_method in path_methods: clf = manifold.Isomap(n_neighbors=n_neighbors, n_components=2, eigen_solver=eigen_solver, path_method=path_method) clf.fit(X) # compute output kernel G_iso = neighbors.kneighbors_graph(clf.embedding_, n_neighbors, mode='distance').toarray() K_iso = centerer.fit_transform(-0.5 * G_iso**2) # make sure error agrees reconstruction_error = np.linalg.norm(K - K_iso) / Npts assert_almost_equal(reconstruction_error, clf.reconstruction_error())
def test_connectivity_propagation(): # Check that connectivity in the ward tree is propagated correctly during # merging. X = np.array([(.014, .120), (.014, .099), (.014, .097), (.017, .153), (.017, .153), (.018, .153), (.018, .153), (.018, .153), (.018, .153), (.018, .153), (.018, .153), (.018, .153), (.018, .152), (.018, .149), (.018, .144)]) connectivity = kneighbors_graph(X, 10, include_self=False) ward = AgglomerativeClustering( n_clusters=4, connectivity=connectivity, linkage='ward') # If changes are not propagated correctly, fit crashes with an # IndexError ward.fit(X)
def test_isomap_simple_grid(): # Isomap should preserve distances when all neighbors are used N_per_side = 5 Npts = N_per_side**2 n_neighbors = Npts - 1 # grid of equidistant points in 2D, n_components = n_dim X = np.array(list(product(range(N_per_side), repeat=2))) # distances from each point to all others G = neighbors.kneighbors_graph(X, n_neighbors, mode='distance').toarray() for eigen_solver in eigen_solvers: for path_method in path_methods: clf = manifold.Isomap(n_neighbors=n_neighbors, n_components=2, eigen_solver=eigen_solver, path_method=path_method) clf.fit(X) G_iso = neighbors.kneighbors_graph(clf.embedding_, n_neighbors, mode='distance').toarray() assert_array_almost_equal(G, G_iso)
def test_sparse_precomputed_distance(): """Make sure that TSNE works identically for sparse and dense matrix""" random_state = check_random_state(0) X = random_state.randn(100, 2) D_sparse = kneighbors_graph(X, n_neighbors=100, mode='distance', include_self=True) D = pairwise_distances(X) assert sp.issparse(D_sparse) assert_almost_equal(D_sparse.A, D) tsne = TSNE(metric="precomputed", random_state=0) Xt_dense = tsne.fit_transform(D) for fmt in ['csr', 'lil']: Xt_sparse = tsne.fit_transform(D_sparse.asformat(fmt)) assert_almost_equal(Xt_dense, Xt_sparse)
def test_identical_points(): # Ensure identical points are handled correctly when using mst with # a sparse connectivity matrix X = np.array([[0, 0, 0], [0, 0, 0], [1, 1, 1], [1, 1, 1], [2, 2, 2], [2, 2, 2]]) true_labels = np.array([0, 0, 1, 1, 2, 2]) connectivity = kneighbors_graph(X, n_neighbors=3, include_self=False) connectivity = 0.5 * (connectivity + connectivity.T) connectivity, n_components = _fix_connectivity(X, connectivity, 'euclidean') for linkage in ('single', 'average', 'average', 'ward'): clustering = AgglomerativeClustering(n_clusters=3, linkage=linkage, connectivity=connectivity) clustering.fit(X) assert_almost_equal(normalized_mutual_info_score(clustering.labels_, true_labels), 1)