def test_scikit_vs_scipy():
    """Compare scikit linkage trees with scipy's under full connectivity.

    For every linkage registered in ``_TREE_BUILDERS``, five random datasets
    are drawn and the merge tree built with a dense all-ones connectivity
    matrix is compared against ``scipy.cluster.hierarchy.linkage``. Both the
    children arrays and the labellings obtained by cutting each tree into
    ``k`` clusters must agree. Finally, ``_hc_cut`` is checked to raise
    ``ValueError`` when asked for more clusters than there are leaves.
    """
    # Test scikit linkage with full connectivity (i.e. unstructured) vs scipy
    n, p, k = 10, 5, 3
    rng = np.random.RandomState(0)

    # Not using a lil_matrix here, just to check that non sparse
    # matrices are well handled
    connectivity = np.ones((n, n))
    for linkage in _TREE_BUILDERS:
        for _ in range(5):
            X = .1 * rng.normal(size=(n, p))
            X -= 4. * np.arange(n)[:, np.newaxis]
            X -= X.mean(axis=1)[:, np.newaxis]

            out = hierarchy.linkage(X, method=linkage)

            # The first two columns of scipy's output hold the merged node
            # ids. Use the builtin ``int``: the ``np.int`` alias was
            # deprecated in NumPy 1.20 and removed in NumPy 1.24.
            children_ = out[:, :2].astype(int, copy=False)
            children, _, n_leaves, _ = _TREE_BUILDERS[linkage](X, connectivity)

            # Sort the order of child nodes per row for consistency
            children.sort(axis=1)
            assert_array_equal(children, children_, 'linkage tree differs'
                                                    ' from scipy impl for'
                                                    ' linkage: ' + linkage)

            cut = _hc_cut(k, children, n_leaves)
            cut_ = _hc_cut(k, children_, n_leaves)
            assess_same_labelling(cut, cut_)

    # Test error management in _hc_cut
    with pytest.raises(ValueError):
        _hc_cut(n_leaves + 1, children, n_leaves)
def test_agglomerative_clustering_with_distance_threshold(linkage):
    """Check the cluster count obtained with ``distance_threshold``.

    The estimator is fitted with ``n_clusters=None`` and a fixed distance
    threshold, both without connectivity and with a grid connectivity. The
    result is compared with the tree returned by the matching builder in
    ``_TREE_BUILDERS``: the number of clusters must equal the number of merge
    distances at or above the threshold plus one, and the labels must match
    those produced by cutting that tree with ``_hc_cut``.

    Parameters
    ----------
    linkage : str
        Key into ``_TREE_BUILDERS`` selecting the linkage under test
        (presumably supplied by a parametrize decorator -- confirm).
    """
    # Check that we obtain the correct number of clusters with
    # agglomerative clustering with distance_threshold.
    rng = np.random.RandomState(0)
    # Builtin ``bool`` replaces the ``np.bool`` alias, which was deprecated
    # in NumPy 1.20 and removed in NumPy 1.24.
    mask = np.ones([10, 10], dtype=bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)
    # test when distance threshold is set to 10
    distance_threshold = 10
    for conn in [None, connectivity]:
        clustering = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=distance_threshold,
            connectivity=conn, linkage=linkage)
        clustering.fit(X)
        clusters_produced = clustering.labels_
        num_clusters_produced = len(np.unique(clustering.labels_))
        # test if the clusters produced match the point in the linkage tree
        # where the distance exceeds the threshold
        tree_builder = _TREE_BUILDERS[linkage]
        children, _, n_leaves, _, distances = tree_builder(
            X, connectivity=conn, n_clusters=None, return_distance=True)
        num_clusters_at_threshold = np.count_nonzero(
            distances >= distance_threshold) + 1
        # test number of clusters produced
        assert num_clusters_at_threshold == num_clusters_produced
        # test clusters produced
        clusters_at_threshold = _hc_cut(n_clusters=num_clusters_produced,
                                        children=children,
                                        n_leaves=n_leaves)
        assert np.array_equiv(clusters_produced,
                              clusters_at_threshold)
def test_vector_scikit_single_vs_scipy_single(seed):
    """Compare the scikit single-linkage tree with scipy's on vector data.

    A small random dataset is generated from ``seed``; the children array
    built by ``_TREE_BUILDERS['single']`` must equal the one derived from
    ``scipy.cluster.hierarchy.linkage(method='single')``, and cutting both
    trees into ``n_clusters`` clusters must give the same labelling.

    Parameters
    ----------
    seed : int
        Seed for the ``RandomState`` used to generate the data (presumably
        supplied by a parametrize decorator -- confirm).
    """
    n_samples, n_features, n_clusters = 10, 5, 3
    rng = np.random.RandomState(seed)
    X = .1 * rng.normal(size=(n_samples, n_features))
    X -= 4. * np.arange(n_samples)[:, np.newaxis]
    X -= X.mean(axis=1)[:, np.newaxis]

    out = hierarchy.linkage(X, method='single')
    # The first two columns of scipy's output hold the merged node ids.
    # Use the builtin ``int``: the ``np.int`` alias was deprecated in
    # NumPy 1.20 and removed in NumPy 1.24.
    children_scipy = out[:, :2].astype(int)

    children, _, n_leaves, _ = _TREE_BUILDERS['single'](X)

    # Sort the order of child nodes per row for consistency
    children.sort(axis=1)
    assert_array_equal(children, children_scipy,
                       'linkage tree differs'
                       ' from scipy impl for'
                       ' single linkage.')

    cut = _hc_cut(n_clusters, children, n_leaves)
    cut_scipy = _hc_cut(n_clusters, children_scipy, n_leaves)
    assess_same_labelling(cut, cut_scipy)
def fit(self, X, y=None):
    """Fit the agglomerative clustering from features or distance matrix.

    The stopping rule is used to determine :attr:`n_clusters_`, and the
    full dendrogram is cut there to compute :attr:`labels_`.

    A single-sample input is handled without building a tree: it is
    assigned to one cluster labelled ``0``.

    Parameters
    ----------
    X : ndarray, shape (n_samples, n_features) or (n_samples, n_samples)
        Training instances to cluster, or distances between instances if
        ``affinity='precomputed'``.

    y : ignored
        Not used, present here for API consistency by convention.

    Returns
    -------
    self
    """
    X = check_array(X)

    if X.shape[0] == 1:
        # Nothing to merge with a single sample. Set n_clusters_ together
        # with labels_ so both fitted attributes exist after this early
        # return.
        self.n_clusters_ = 1
        self.labels_ = np.array([0])
        return self

    # NOTE(review): _build_tree presumably populates distances_, children_
    # and n_leaves_, which are read below -- confirm against its definition.
    self._build_tree(X)

    self.n_clusters_ = _num_clusters_histogram(self.distances_,
                                               self.freq_threshold,
                                               self.n_bins_start)

    # Cut the tree to find labels
    # TODO verify whether Daniel Mullner's implementation of this step
    # offers any advantage
    self.labels_ = _hc_cut(self.n_clusters_, self.children_,
                           self.n_leaves_)
    return self