Example #1
def test_sparse_scikit_vs_scipy():
    # Test scikit linkage with full connectivity (i.e. unstructured) vs scipy
    n, p, k = 10, 5, 3
    rng = np.random.RandomState(0)

    # Deliberately use a dense array rather than a lil_matrix, to check
    # that non-sparse connectivity matrices are handled correctly
    connectivity = np.ones((n, n))
    for linkage in _TREE_BUILDERS.keys():
        for i in range(5):
            X = 0.1 * rng.normal(size=(n, p))
            X -= 4.0 * np.arange(n)[:, np.newaxis]
            X -= X.mean(axis=1)[:, np.newaxis]

            out = hierarchy.linkage(X, method=linkage)

            # scipy returns merge indices as floats; cast to the builtin int
            children_ = out[:, :2].astype(int, copy=False)
            children, _, n_leaves, _ = _TREE_BUILDERS[linkage](X, connectivity)

            # Sort the order of child nodes per row for consistency
            children.sort(axis=1)
            assert_array_equal(
                children, children_,
                "linkage tree differs from scipy impl for linkage: " + linkage)

            cut = _hc_cut(k, children, n_leaves)
            cut_ = _hc_cut(k, children_, n_leaves)
            assess_same_labelling(cut, cut_)

    # Test error management in _hc_cut
    with pytest.raises(ValueError):
        _hc_cut(n_leaves + 1, children, n_leaves)
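
These test snippets omit their imports. A minimal set that would make Example #1 self-contained could look like the sketch below; the private module path sklearn.cluster._agglomerative is an assumption (older scikit-learn releases exposed these helpers from sklearn.cluster.hierarchical), and assess_same_labelling is reconstructed here as a plausible permutation-invariant label check.

import numpy as np
import pytest
from numpy.testing import assert_array_equal
from scipy.cluster import hierarchy
# Assumed path; older scikit-learn releases used sklearn.cluster.hierarchical.
from sklearn.cluster._agglomerative import _TREE_BUILDERS, _hc_cut


def assess_same_labelling(cut1, cut2):
    # Plausible reconstruction of the test helper: two labellings agree
    # if they induce the same co-clustering matrix (labels may be permuted).
    co_clust = []
    for cut in (cut1, cut2):
        n = len(cut)
        k = cut.max() + 1
        encoded = np.zeros((n, k))
        encoded[np.arange(n), cut] = 1
        co_clust.append(np.dot(encoded, encoded.T))
    assert (co_clust[0] == co_clust[1]).all()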
Example #2
def test_vector_scikit_single_vs_scipy_single(seed):
    n_samples, n_features, n_clusters = 10, 5, 3
    rng = np.random.RandomState(seed)
    X = 0.1 * rng.normal(size=(n_samples, n_features))
    X -= 4.0 * np.arange(n_samples)[:, np.newaxis]
    X -= X.mean(axis=1)[:, np.newaxis]

    out = hierarchy.linkage(X, method="single")
    children_scipy = out[:, :2].astype(int)

    children, _, n_leaves, _ = _TREE_BUILDERS["single"](X)

    # Sort the order of child nodes per row for consistency
    children.sort(axis=1)
    assert_array_equal(
        children,
        children_scipy,
        "linkage tree differs from scipy impl for single linkage.",
    )

    cut = _hc_cut(n_clusters, children, n_leaves)
    cut_scipy = _hc_cut(n_clusters, children_scipy, n_leaves)
    assess_same_labelling(cut, cut_scipy)
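
The bare seed argument implies this test is parametrized by pytest; the decorator is not shown in the snippet, but a plausible form (the actual seed values are an assumption) would be:

# Assumed parametrization; the seed range used in the test suite may differ.
@pytest.mark.parametrize("seed", range(5))
def test_vector_scikit_single_vs_scipy_single(seed):
    ...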
Example #3
def test_agglomerative_clustering_with_distance_threshold(linkage):
    # Check that we obtain the correct number of clusters with
    # agglomerative clustering with distance_threshold.
    rng = np.random.RandomState(0)
    mask = np.ones([10, 10], dtype=bool)
    n_samples = 100
    X = rng.randn(n_samples, 50)
    connectivity = grid_to_graph(*mask.shape)
    # test when distance threshold is set to 10
    distance_threshold = 10
    for conn in [None, connectivity]:
        clustering = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=distance_threshold,
            connectivity=conn,
            linkage=linkage)
        clustering.fit(X)
        clusters_produced = clustering.labels_
        num_clusters_produced = len(np.unique(clustering.labels_))
        # test if the clusters produced match the point in the linkage tree
        # where the distance exceeds the threshold
        tree_builder = _TREE_BUILDERS[linkage]
        children, n_components, n_leaves, parent, distances = tree_builder(
            X, connectivity=conn, n_clusters=None, return_distance=True)
        num_clusters_at_threshold = np.count_nonzero(
            distances >= distance_threshold) + 1
        # test number of clusters produced
        assert num_clusters_at_threshold == num_clusters_produced
        # test clusters produced
        clusters_at_threshold = _hc_cut(n_clusters=num_clusters_produced,
                                        children=children,
                                        n_leaves=n_leaves)
        assert np.array_equiv(clusters_produced, clusters_at_threshold)
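
Like Example #2, this test is presumably parametrized, here over linkage. The invariant it checks is that cutting the dendrogram at distance_threshold yields one more cluster than the number of merges whose height meets the threshold. A minimal standalone illustration using only scipy (hypothetical data, not part of the test suite):

import numpy as np
from scipy.cluster import hierarchy

rng = np.random.RandomState(0)
X = rng.randn(20, 5)
Z = hierarchy.linkage(X, method="ward")
threshold = 5.0
# Each merge at or above the threshold is cut, leaving one extra cluster.
n_clusters = np.count_nonzero(Z[:, 2] >= threshold) + 1
labels = hierarchy.fcluster(Z, t=threshold, criterion="distance")
assert n_clusters == len(np.unique(labels))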
Example #4
    def fit(self, X, y=None):
        """Fit the agglomerative clustering from features or distance matrix.

        The stopping rule is used to determine :attr:`n_clusters_`, and the
        full dendrogram is cut there to compute :attr:`labels_`.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)
            Training instances to cluster, or distances between instances if
            ``affinity='precomputed'``.

        y : ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self

        """
        _max_fraction = 1.0 if self.max_fraction is None else self.max_fraction
        validate_params(
            {
                'freq_threshold': self.freq_threshold,
                'max_fraction': _max_fraction,
                'n_bins_start': self.n_bins_start
            }, self._hyperparameters)
        X = check_array(X)
        if X.shape[0] == 1:
            self.labels_ = np.array([0])
            self.n_clusters_ = 1
            return self

        self._build_tree(X)

        self.n_clusters_ = _num_clusters_histogram(self.distances_,
                                                   self.freq_threshold,
                                                   self.n_bins_start,
                                                   self.max_fraction)

        # Cut the tree to find labels
        # TODO: Verify whether Daniel Müllner's implementation of this step
        #  offers any advantage
        self.labels_ = _hc_cut(self.n_clusters_, self.children_,
                               self.n_leaves_)
        return self
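
A minimal usage sketch for an estimator exposing the fit method above, assuming scikit-learn-style conventions; the class name FirstHistogramGap and its constructor defaults are assumptions, not confirmed by the snippet:

import numpy as np

rng = np.random.RandomState(0)
# Two well-separated blobs, so the histogram of merge distances has a gap.
X = np.vstack([rng.randn(30, 2), rng.randn(30, 2) + 10.0])

clusterer = FirstHistogramGap()  # hypothetical estimator wrapping the fit above
clusterer.fit(X)
print(clusterer.n_clusters_)  # likely 2 for well-separated blobs
print(clusterer.labels_[:5])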
Example #5
    def fit(self, X, y=None):
        """Fit the agglomerative clustering from features or distance matrix.

        The stopping rule is used to determine :attr:`n_clusters_`, and the
        full dendrogram is cut there to compute :attr:`labels_`.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features) or (n_samples, n_samples)
            Training instances to cluster, or distances between instances if
            ``affinity='precomputed'``.

        y : ignored
            Not used, present here for API consistency by convention.

        Returns
        -------
        self

        """
        X = check_array(X)
        validate_params(self.get_params(),
                        self._hyperparameters,
                        exclude=['memory'])

        if X.shape[0] == 1:
            self.labels_ = np.array([0])
            self.n_clusters_ = 1
            return self

        self._build_tree(X)

        min_gap_size = self.relative_gap_size * self.distances_[-1]
        self.n_clusters_ = _num_clusters_simple(self.distances_, min_gap_size,
                                                self.max_fraction)

        # Cut the tree to find labels
        # TODO: Verify whether Daniel Müllner's implementation of this step
        #  offers any advantage
        self.labels_ = _hc_cut(self.n_clusters_, self.children_,
                               self.n_leaves_)
        return self
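
The stopping rule here differs from Example #4: instead of a histogram, it looks for the first gap in the merge distances that is at least relative_gap_size times the final merge distance. A hedged sketch of what _num_clusters_simple might compute under that reading (the real implementation may differ):

import numpy as np

def _num_clusters_simple_sketch(distances, min_gap_size, max_fraction):
    # distances: merge heights of the dendrogram in ascending order
    # (n_samples - 1 values, as stored in self.distances_).
    gaps = np.diff(distances)
    candidates = np.flatnonzero(gaps >= min_gap_size)
    # Cap the admissible number of clusters at a fraction of the leaves.
    max_clusters = max(1, int(max_fraction * (len(distances) + 1)))
    if len(candidates) == 0:
        return 1
    # Cutting just above merge i leaves len(distances) - i clusters.
    return min(len(distances) - candidates[0], max_clusters)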