Ejemplo n.º 1
0
    def test_all_points_same_value(self):
        """
        A single cluster whose points share one coordinate must still
        yield exactly one hull with two columns (x and y).
        """
        degenerate_inputs = (
            # every point has the same y value
            np.array([[1, 1], [1, 1], [2, 1]]),
            # every point has the same x value
            np.array([[1, 2], [1, 1], [1, 1]]),
        )
        for points in degenerate_inputs:
            labels = np.array([0, 0, 0])
            hulls = compute_concave_hulls(points, labels, epsilon=0.5)
            self.assertEqual(1, len(hulls))
            # the hull is a sequence of (x, y) pairs
            self.assertEqual(2, hulls[0].shape[1])
Ejemplo n.º 2
0
    def test_compute_concave_hulls(self):
        """
        Three clusters of 50 iris rows each must produce three hulls with
        two columns, both with an explicit epsilon and with the default.
        """
        iris_2d = Table.from_file("iris")[:, 2:4]
        labels = np.repeat([0, 1, 2], 50)

        # first with an explicit epsilon, then relying on the default
        for extra_kwargs in ({"epsilon": 0.5}, {}):
            hulls = compute_concave_hulls(iris_2d.X, labels, **extra_kwargs)
            self.assertEqual(3, len(hulls))
            for cluster_id in range(3):
                # every hull is a sequence of (x, y) pairs
                self.assertEqual(2, hulls[cluster_id].shape[1])
Ejemplo n.º 3
0
    def test_compute_concave_hulls_3_or_less_points(self):
        """
        Concave hull must also work for three or fewer points - it is a
        special case.
        """
        points = np.array([[1, 1], [1, 2], [2, 1]])
        labels = np.array([0, 0, 0])

        # shrink the single cluster from three points down to one
        for n_points in (3, 2, 1):
            hulls = compute_concave_hulls(
                points[:n_points], labels[:n_points], epsilon=0.5
            )
            self.assertEqual(1, len(hulls))
            # the hull is a sequence of (x, y) pairs
            self.assertEqual(2, hulls[0].shape[1])
Ejemplo n.º 4
0
    def test_compute_concave_hulls_subsampling(self):
        """
        When more than 1000 points are passed they are sub-sampled in order
        to compute a concave hull; the result must still hold one
        two-column hull per cluster.
        """
        repeats = 10
        iris = Table.from_file("iris")
        # 150 rows repeated 10 times -> 1500 points, i.e. more than 1000
        points = np.repeat(iris.X[:, 2:4], repeats, axis=0)
        labels = np.repeat([0, 1, 2], 50 * repeats)

        hulls = compute_concave_hulls(points, labels, epsilon=0.5)

        self.assertEqual(3, len(hulls))
        for cluster_id in range(3):
            # every hull is a sequence of (x, y) pairs
            self.assertEqual(2, hulls[cluster_id].shape[1])
Ejemplo n.º 5
0
 def test_non_float_data(self):
     """
     Data stored in an object-dtype array must still yield one hull with
     two columns (x and y).
     """
     points = np.array([[1, 1], [1, 1], [2, 1]], dtype="object")
     labels = np.array([0, 0, 0])

     hulls = compute_concave_hulls(points, labels, epsilon=0.5)

     n_hulls = len(hulls)
     self.assertEqual(1, n_hulls)
     n_columns = hulls[0].shape[1]
     self.assertEqual(2, n_columns)  # hull has x and y
Ejemplo n.º 6
0
def annotate_documents(
    corpus: Corpus,
    embedding: np.ndarray,
    clustering_method: int,
    n_components: Optional[int] = None,
    epsilon: Optional[float] = None,
    cluster_labels: Optional[np.ndarray] = None,
    fdr_threshold: float = 0.05,
    n_words_in_cluster: int = 10,
    progress_callback: Optional[Callable] = None
) -> Tuple[np.ndarray, Dict[int, ClusterType], int, float, ScoresType]:
    """
    Annotate documents in corpus, by performing clustering on the corpus and
    assigning characteristic terms to each cluster using Hypergeometric
    distribution.

    Return annotated clusters - for each cluster return a list of keywords
    with scores, cluster center coordinates and concave_hulls coordinates.
    Also return optimal values for n_components/epsilon if calculated and
    scores data (p-values and counts for all keywords).

    The ``cluster_labels`` array passed by the caller is never modified;
    the function works on its own copy.

    Parameters
    ----------
    corpus : Corpus
        Corpus to be annotated.
    embedding : np.ndarray of size len(corpus) × 2
        Usually tSNE projection of BoW of corpus.
    clustering_method : int
        0 for DBSCAN
        1 for Gaussian mixture models
        2 for custom clustering where cluster_labels are used
    n_components: int, optional, default = None
        Number of clusters for Gaussian mixture models. If None, set to the
        number of clusters with maximal silhouette.
    epsilon : float, optional, default = None
        epsilon for DBSCAN. If None, optimal value is computed.
    cluster_labels : np.ndarray, optional
        Custom cluster labels. Usually included in corpus. Required when
        clustering_method is neither Gaussian mixture models nor DBSCAN;
        NaN entries are treated as "not in any cluster".
    fdr_threshold : float, optional, default = 0.05
        hypergeom_p_values threshold
    n_words_in_cluster : int, optional, default = 10
        Number of characteristic terms in each cluster.
    progress_callback : callable, optional
        Progress callback.

    Returns
    -------
    cluster_labels : np.ndarray of size len(corpus)
        An array of floats (i.e. 0, 1, np.nan) that represent cluster labels
        for all documents in the corpus.
    clusters : dict
        Dictionary of keywords with scores, centroids and concave hulls
        for each cluster.
    n_components : int
        Optimal number of clusters for Gaussian mixture models, if the
        n_components is None, and clustering_method is
        ClusterDocuments.GAUSSIAN_MIXTURE. n_components otherwise.
    epsilon : float
        Optimal value for epsilon for DBSCAN, if the epsilon is None, and
        clustering_method is ClusterDocuments.DBSCAN. epsilon otherwise.
    scores : tuple
        Tuple of all keywords with p-values and counts.

    Raises
    ------
    ValueError
        When there are no clusters in the embedding, or when custom
        clustering is requested but cluster_labels is not given.

    """
    if progress_callback is None:
        progress_callback = dummy_callback

    if clustering_method == ClusterDocuments.GAUSSIAN_MIXTURE:
        if n_components is None:
            n_components = ClusterDocuments.gmm_compute_n_components(
                embedding, wrap_callback(progress_callback, end=0.3))
        # there cannot be more mixture components than documents
        n_components = min([n_components, len(embedding)])
        cluster_labels = ClusterDocuments.gmm(embedding,
                                              n_components=n_components,
                                              threshold=0.6)

    elif clustering_method == ClusterDocuments.DBSCAN:
        if epsilon is None:
            epsilon = ClusterDocuments.dbscan_compute_epsilon(embedding)
        cluster_labels = ClusterDocuments.dbscan(embedding, eps=epsilon)

    else:
        # Explicit check instead of `assert`, which is stripped under -O.
        if cluster_labels is None:
            raise ValueError(
                "cluster_labels must be provided for custom clustering.")
        # Copy before replacing NaNs so the caller's array (which may be a
        # view into the corpus data) is left untouched.
        cluster_labels = np.array(cluster_labels)
        cluster_labels[np.isnan(cluster_labels)] = -1

    # -1 marks documents that do not belong to any cluster
    cluster_ids = set(cluster_labels) - {-1}
    if not cluster_ids:
        raise ValueError("There are no clusters using current settings.")

    keywords = _get_characteristic_terms(corpus,
                                         n_keywords=20,
                                         progress_callback=wrap_callback(
                                             progress_callback, start=0.5))
    clusters_keywords, all_keywords, scores, p_values = \
        _hypergeom_clusters(cluster_labels, keywords,
                            fdr_threshold, n_words_in_cluster)

    concave_hulls = compute_concave_hulls(embedding, cluster_labels, epsilon)

    # centroid of a cluster is the mean of its hull points
    centroids = {
        c: tuple(np.mean(concave_hulls[c], axis=0))
        for c in cluster_ids
    }

    clusters = {
        int(key): (clusters_keywords[key], centroids[key], concave_hulls[key])
        for key in clusters_keywords
    }

    # report unclustered documents as NaN rather than -1
    cluster_labels = cluster_labels.astype(float)
    cluster_labels[cluster_labels == -1] = np.nan

    scores = (all_keywords, scores, p_values)

    return cluster_labels, clusters, n_components, epsilon, scores
Ejemplo n.º 7
0
    pca = PCA(n_components=2)
    pca_model = pca(transformed_corpus)
    projection = pca_model(transformed_corpus)

    domain = Domain(
        transformed_corpus.domain.attributes,
        transformed_corpus.domain.class_vars,
        chain(transformed_corpus.domain.metas, projection.domain.attributes))
    corpus_ = corpus_.transform(domain)

    embedding_ = corpus_.metas[:, -2:]
    clusters_ = ClusterDocuments.gmm(embedding_, 3, 0.6)
    keywords_ = _get_characteristic_terms(corpus_, 4)
    clusters_keywords_, _, _, _ = \
        _hypergeom_clusters(clusters_, keywords_, 0.2, 5)
    concave_hulls_ = compute_concave_hulls(embedding_, clusters_)
    centroids_ = {
        c: tuple(np.mean(concave_hulls_[c], axis=0))
        for c in set(clusters_) - {-1}
    }

    palette = [
        "#377eb8", "#ff7f00", "#4daf4a", "#f781bf", "#a65628", "#984ea3",
        "#999999", "#e41a1c", "#dede00"
    ]
    for label_ in sorted(set(clusters_)):
        mask = label_ == clusters_
        color = palette[label_] if label_ != -1 else (0.5, 0.5, 0.5)
        plt.scatter(embedding_[mask, 0], embedding_[mask, 1], c=color)

        if label_ == -1: