Exemple #1
0
def kmeansClustering(cluster_list, k, iterations, shuffle=True):
    """
    Compute the k-means clustering of a set of clusters (reads/kmers).

    The caller's cluster_list is never reordered: the seeding order
    (random or by size) is applied to a shallow copy of the list, and the
    seed clusters are duplicated with copy() before being used.

    Input:
        cluster_list: list of cluster objects to group.
        k: number of clusters to build; capped at the number of seed
           clusters available (i.e. at most len(cluster_list)).
        iterations: number of assign/re-compute rounds to run.
        shuffle: if True the k seeds are drawn from a random permutation
           of the input, otherwise the k largest clusters (by getSize())
           are used as seeds.
    Output: List of clusters (copies of the final k clusters). An empty
        list is returned when there is nothing to seed from.
    """
    # Shallow copy: shuffle/sort act in place and must not touch the
    # caller's list (see the contract in the docstring).
    ordered = list(cluster_list)
    if shuffle:
        random.shuffle(ordered)
    else:
        # largest clusters first
        ordered.sort(key=lambda cluster: cluster.getSize(), reverse=True)

    kclusters = []  # read-only snapshots the input clusters are compared against
    centroids = []  # average abundance vectors of those snapshots

    # the first k clusters define the initial centroids
    for seed in ordered[:k]:
        kclusters.append(seed.copy())
        centroids.append(seed.getAvgAbundance())

    if not kclusters:
        # empty input or k == 0: no centroid to assign anything to
        return kclusters

    # Fewer seeds than requested (k > len(cluster_list)): only that many
    # centroids exist, so indexing up to the requested k would fail.
    k = min(k, len(kclusters))

    for _ in range(iterations):
        # fresh, mutable clusters positioned at the current centroids
        clusters = []
        for idx in range(k):
            cluster = Cluster([])
            cluster.avg_abundance_vectors = list(centroids[idx])
            clusters.append(cluster)

        # assign every input cluster to its nearest centroid
        for member in ordered:
            best_dist = float('inf')
            # NOTE: stays -1 when no distance compares below infinity, in
            # which case the member is merged into the last cluster.
            best_idx = -1
            for idx in range(k):
                dist = member.distance(kclusters[idx])
                # strict '<' keeps the lowest index on ties
                if dist < best_dist:
                    best_dist = dist
                    best_idx = idx
            clusters[best_idx].mergeClusters(member)

        # snapshot the re-computed clusters and centroids for the next round
        for idx in range(k):
            kclusters[idx] = clusters[idx].copy()
            centroids[idx] = clusters[idx].getAvgAbundance()

    return kclusters