Exemple #1
0
    def fit(self, adjacency: Union[sparse.csr_matrix, np.ndarray]) -> 'KMeans':
        """Embed the graph, then cluster the embedded nodes with K-means.

        Parameters
        ----------
        adjacency:
            Adjacency matrix of the graph.

        Returns
        -------
        self: :class:`KMeans`
        """
        # The requested number of clusters is checked against the number of rows.
        check_n_clusters(self.n_clusters, adjacency.shape[0])

        node_vectors = self.embedding_method.fit_transform(adjacency)
        dense_kmeans = KMeansDense(self.n_clusters)
        dense_kmeans.fit(node_vectors)

        # Labels are optionally reindexed before being exposed on the instance.
        raw_labels = dense_kmeans.labels_
        self.labels_ = reindex_labels(raw_labels) if self.sort_clusters else raw_labels
        self._secondary_outputs(adjacency)

        return self
def cut_straight(dendrogram: np.ndarray, n_clusters: Optional[int] = None, threshold: Optional[float] = None,
                 sort_clusters: bool = True, return_dendrogram: bool = False) \
                -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    """Cut a dendrogram and return the corresponding clustering.

    Merges are applied when their height (third column) is strictly below the cut.
    The cut is the height of rank ``n - n_clusters`` among the sorted heights,
    raised to ``threshold`` when a larger threshold is given.

    Parameters
    ----------
    dendrogram:
        Dendrogram.
    n_clusters :
        Number of clusters (optional).
        The number of clusters can be larger than n_clusters in case of equal heights in the dendrogram.
        With ``n_clusters=1``, every merge of the dendrogram is applied.
    threshold :
        Threshold on height (optional).
        If both n_clusters and threshold are ``None``, n_clusters is set to 2.
        If only threshold is given, n_clusters is set to the number of nodes.
    sort_clusters :
        If ``True``,  sorts clusters in decreasing order of size.
    return_dendrogram :
        If ``True``, returns the dendrogram formed by the clusters up to the root.

    Returns
    -------
    labels : np.ndarray
        Cluster of each node.
    dendrogram_aggregate : np.ndarray
        Dendrogram starting from clusters (leaves = clusters).

    Raises
    ------
    ValueError
        If ``return_dendrogram`` is ``True`` and the heights of the dendrogram are not non-decreasing.

    Example
    -------
    >>> import numpy as np
    >>> from sknetwork.hierarchy import cut_straight
    >>> dendrogram = np.array([[0, 1, 0, 2], [2, 3, 1, 3]])
    >>> cut_straight(dendrogram)
    array([0, 0, 1])
    """
    check_dendrogram(dendrogram)
    n = dendrogram.shape[0] + 1

    if return_dendrogram and not np.all(np.diff(dendrogram[:, 2]) >= 0):
        raise ValueError(
            "The third column of the dendrogram must be non-decreasing.")

    if n_clusters is None:
        n_clusters = 2 if threshold is None else n
    else:
        check_n_clusters(n_clusters, n, n_min=1)

    # There are only n - 1 heights, so rank n - n_clusters falls past the end
    # when a single cluster is requested. In that case every merge must be
    # applied, which an infinite cut achieves (the comparison below is strict).
    heights = np.sort(dendrogram[:, 2])
    rank = n - n_clusters
    cut = heights[rank] if rank < heights.shape[0] else np.inf
    if threshold is not None:
        cut = max(cut, threshold)

    # Each key is a node index; each value lists the leaves below that node.
    cluster = {i: [i] for i in range(n)}
    for t in range(n - 1):
        i = int(dendrogram[t][0])
        j = int(dendrogram[t][1])
        # A merge is skipped if one of its children was itself left unmerged.
        if dendrogram[t][2] < cut and i in cluster and j in cluster:
            cluster[n + t] = cluster.pop(i) + cluster.pop(j)

    return get_labels(dendrogram, cluster, sort_clusters, return_dendrogram)
Exemple #3
0
def cut_straight(
    dendrogram: np.ndarray,
    n_clusters: int = 2,
    sort_clusters: bool = True,
    return_dendrogram: bool = False
) -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    """Cut a dendrogram and return the corresponding clustering.

    Merges are applied when their height (third column) is strictly below the
    height of rank ``n - n_clusters`` among the sorted heights.

    Parameters
    ----------
    dendrogram:
        Dendrogram
    n_clusters :
        Number of clusters.
        With ``n_clusters=1``, every merge of the dendrogram is applied.
    sort_clusters :
        If ``True``,  sorts clusters in decreasing order of size.
    return_dendrogram :
        If ``True``, returns the dendrogram formed by the clusters up to the root.

    Returns
    -------
    labels : np.ndarray
        Cluster of each node.
    dendrogram_aggregate : np.ndarray
        Dendrogram starting from clusters (leaves = clusters).

    Raises
    ------
    ValueError
        If ``return_dendrogram`` is ``True`` and the heights of the dendrogram are not non-decreasing.

    Example
    -------
    >>> import numpy as np
    >>> from sknetwork.hierarchy import cut_straight
    >>> dendrogram = np.array([[0, 1, 0, 2], [2, 3, 1, 3]])
    >>> cut_straight(dendrogram)
    array([0, 0, 1])
    """
    check_dendrogram(dendrogram)
    n = dendrogram.shape[0] + 1
    check_n_clusters(n_clusters, n, n_min=1)

    if return_dendrogram and not np.all(np.diff(dendrogram[:, 2]) >= 0):
        raise ValueError(
            "The third column of the dendrogram must be non-decreasing.")

    # There are only n - 1 heights, so rank n - n_clusters falls past the end
    # when a single cluster is requested. In that case every merge must be
    # applied, which an infinite cut achieves (the comparison below is strict).
    heights = np.sort(dendrogram[:, 2])
    rank = n - n_clusters
    cut = heights[rank] if rank < heights.shape[0] else np.inf

    # Each key is a node index; each value lists the leaves below that node.
    cluster = {i: [i] for i in range(n)}
    for t in range(n - 1):
        i = int(dendrogram[t][0])
        j = int(dendrogram[t][1])
        # A merge is skipped if one of its children was itself left unmerged.
        if dendrogram[t][2] < cut and i in cluster and j in cluster:
            cluster[n + t] = cluster.pop(i) + cluster.pop(j)

    return get_labels(dendrogram, cluster, sort_clusters, return_dendrogram)
Exemple #4
0
def aggregate_dendrogram(dendrogram: np.ndarray, n_clusters: int = 2, return_counts: bool = False) \
                        -> Union[np.ndarray, Tuple[np.ndarray, np.ndarray]]:
    """Aggregate a dendrogram in order to get a certain number of leaves.
    The leaves in the output dendrogram correspond to subtrees in the input one.

    Only the last ``n_clusters - 1`` rows (merges) of the input are kept, and the
    nodes they reference are relabelled with consecutive integers in increasing
    order of their original index.

    Parameters
    ----------
    dendrogram:
        The input to aggregate.
    n_clusters:
        Number of clusters (or leaves) to keep.
    return_counts
        If ``True``, returns an array of counts corresponding to the sizes of the merged subtrees.
        The sum of the counts is equal to the number of samples in the input dendrogram.

    Returns
    -------
    new_dendrogram:
        Aggregated dendrogram. The nodes are reindexed from 0.
    counts:
        Size of the subtrees corresponding to each leaf in new_dendrogram.
        Only returned if ``return_counts`` is ``True``.
    """
    n_nodes: int = dendrogram.shape[0] + 1
    check_n_clusters(n_clusters, n_nodes, n_min=1)

    new_dendrogram = dendrogram[n_nodes - n_clusters:].copy()

    # Sorted distinct node indices referenced by the kept merges; the position
    # of a node in this array is its new index.
    node_indices = np.unique(new_dendrogram[:, :2])
    new_dendrogram[:, :2] = np.searchsorted(node_indices, new_dendrogram[:, :2])

    if not return_counts:
        return new_dendrogram

    # The new leaves are the referenced nodes with the smallest original indices.
    leaves = node_indices[:n_clusters].astype(int)

    # A leaf with index below n_nodes is an original sample (size 1). Any other
    # leaf was created by row (index - n_nodes), whose fourth column is read as
    # its size. Subtracting n_nodes from an original sample index would give a
    # negative row index and silently read the size of an unrelated merge.
    counts = np.ones(leaves.shape[0], dtype=int)
    is_merged = leaves >= n_nodes
    counts[is_merged] = dendrogram[leaves[is_merged] - n_nodes, 3]

    # NOTE(review): with n_clusters == 1 no merge is kept, so both outputs are
    # empty and the counts do not sum to the number of samples - confirm intent.
    return new_dendrogram, counts
    def fit(self, input_matrix: Union[sparse.csr_matrix, np.ndarray]) -> 'KMeans':
        """Embed the graph, then cluster the embedding with K-means.

        Parameters
        ----------
        input_matrix :
            Adjacency matrix or biadjacency matrix of the graph.

        Returns
        -------
        self: :class:`KMeans`
        """
        self._init_vars()

        # Validation: with co-clustering, the number of clusters is checked
        # against rows and columns counted together, otherwise against rows only.
        check_format(input_matrix)
        n_samples = np.sum(input_matrix.shape) if self.co_cluster else input_matrix.shape[0]
        check_n_clusters(self.n_clusters, n_samples)

        # Embedding of the nodes.
        embedding, self.bipartite = get_embedding(input_matrix, self.embedding_method, self.co_cluster)

        # K-means on the embedding.
        dense_kmeans = KMeansDense(self.n_clusters)
        dense_kmeans.fit(embedding)

        # Labels are optionally reindexed before being exposed on the instance.
        raw_labels = dense_kmeans.labels_
        self.labels_ = reindex_labels(raw_labels) if self.sort_clusters else raw_labels

        # Remaining outputs.
        if self.co_cluster:
            self._split_vars(input_matrix.shape)
        self._secondary_outputs(input_matrix)

        return self
Exemple #6
0
    def fit(self, biadjacency: Union[sparse.csr_matrix,
                                     np.ndarray]) -> 'BiKMeans':
        """Embed the bipartite graph, then cluster the embedding with K-means.

        Parameters
        ----------
        biadjacency:
            Biadjacency matrix of the graph.

        Returns
        -------
        self: :class:`BiKMeans`
        """
        n_row, _ = biadjacency.shape
        check_n_clusters(self.n_clusters, n_row)

        embedder = self.embedding_method
        embedder.fit(biadjacency)

        # With co-clustering, row and column embeddings are stacked (rows first)
        # and clustered together; otherwise the default embedding is used.
        if not self.co_cluster:
            points = embedder.embedding_
        else:
            points = np.vstack((embedder.embedding_row_, embedder.embedding_col_))

        dense_kmeans = KMeansDense(self.n_clusters)
        dense_kmeans.fit(points)

        raw_labels = dense_kmeans.labels_
        labels = reindex_labels(raw_labels) if self.sort_clusters else raw_labels

        self.labels_ = labels
        if not self.co_cluster:
            self.labels_row_ = labels
        else:
            self._split_vars(n_row)

        if self.return_membership:
            row_membership = membership_matrix(self.labels_row_, n_labels=self.n_clusters)
            if self.labels_col_ is None:
                # No column labels: row memberships go through the columns and back.
                two_hops = biadjacency.dot(biadjacency.T.dot(row_membership))
                self.membership_row_ = normalize(two_hops)
            else:
                col_membership = membership_matrix(self.labels_col_, n_labels=self.n_clusters)
                self.membership_row_ = normalize(biadjacency.dot(col_membership))
                self.membership_col_ = normalize(biadjacency.T.dot(row_membership))
            self.membership_ = self.membership_row_

        if self.return_aggregate:
            row_membership = membership_matrix(self.labels_row_, n_labels=self.n_clusters)
            aggregate = sparse.csr_matrix(row_membership.T.dot(biadjacency))
            if self.labels_col_ is not None:
                col_membership = membership_matrix(self.labels_col_, n_labels=self.n_clusters)
                aggregate = aggregate.dot(col_membership)
            self.biadjacency_ = aggregate

        return self