Ejemplo n.º 1
0
def agglomerate(corpus, threshold=1.4, ignoreOutliers=True):
    """
    Group the questions of a corpus with bottom-up (agglomerative) hierarchical clustering.

    A representation matrix is built from the corpus, optionally stripped of
    outliers, and clustered with Ward linkage. Rows reported as outliers are
    given the label -1 before the cluster map is created and named.

    Parameters:
        corpus (list): Tagged questions corpus (questions and their ids)
        threshold (float): Linkage distance at which the dendrogram is cut;
            passed to AgglomerativeClustering as ``distance_threshold``
            (default is 1.4)
        ignoreOutliers (bool): When True, rows reported by getOutliers are
            left out of the fit and labelled -1 afterwards
            (default is True)

    Returns:
        corpus: The value returned by nameClusters for the cluster map; it
            exposes a ``clusters`` attribute
    """

    features = makeRepresentationMatrix(corpus)

    if ignoreOutliers:
        outliers, features = getOutliers(features, corpus=corpus)
    else:
        outliers = []

    # n_clusters=None lets the distance threshold alone decide the cluster count.
    model = AgglomerativeClustering(n_clusters=None,
                                    distance_threshold=threshold,
                                    linkage="ward")
    model.fit(features)

    # np.insert takes positions relative to the array *before* insertion, so
    # each outlier index is shifted back by the number of outliers preceding it.
    # NOTE(review): assumes `outliers` holds ascending row positions - confirm
    # against getOutliers.
    insertAt = [position - shift for shift, position in enumerate(outliers)]
    model.labels_ = np.insert(model.labels_, insertAt, -1)

    clusterMap = createClusterMap(corpus, model)
    print(clusterMap)  #JEFFLAG

    corpus = nameClusters(clusterMap)
    print("The corpus's clusters after naming")
    print(corpus.clusters)  #JEFFLAG
    return corpus
Ejemplo n.º 2
0
def update_clusters(clustering: AgglomerativeClustering,
                    new_distance_threshold: float):
    """
    Recompute the cluster labels of an already-fitted clustering for a new distance threshold.

    Intended for the case where the HAC algorithm has already been run (so the
    hierarchy is known) and only the cut-off that determines the number of
    clusters should change. The ``distance_threshold``, ``labels_`` and
    ``n_clusters_`` attributes of ``clustering`` are overwritten.
    :param AgglomerativeClustering clustering: the clustering algorithm with the distances
    :param float new_distance_threshold: the new distance threshold at which the number of clusters is to be determined.
    :return: None; ``clustering`` is modified in place
    """
    clustering.distance_threshold = new_distance_threshold

    # Start from a clean slate: every point is marked -1 before relabelling.
    blank_labels = np.full_like(clustering.labels_, -1, dtype=int)
    clustering.labels_ = blank_labels

    # Relabelling itself is delegated to the module-private helper.
    _update_clusters(clustering)

    # Invert the numbering (the highest label becomes 0) to follow natural order.
    raw_labels = clustering.labels_
    highest = np.max(raw_labels)
    clustering.labels_ = highest - raw_labels

    # Labels are zero-based, so the count is the largest label plus one.
    largest_label = np.max(clustering.labels_)
    clustering.n_clusters_ = int(largest_label + 1)