def kmeansClustering(cluster_list, k, iterations, shuffle = True):
    """
    Compute a k-means clustering of a list of cluster objects (reads/kmers).

    The caller's list is never reordered: seeding works on a shallow copy,
    so only the order of the private copy is changed by the shuffle/sort.
    The cluster objects themselves are shared with the caller.

    Input:
        cluster_list: list of cluster objects. Each must provide getSize(),
            copy(), getAvgAbundance() and distance(other).
        k: number of clusters to produce. If k exceeds the number of input
            clusters it is reduced to that number.
        iterations: number of assignment/update rounds to run.
        shuffle: if True, the k seeds are picked at random (random.shuffle);
            otherwise the k largest clusters by getSize() are used.
    Output:
        List of (at most) k clusters: copies of the seeds when iterations
        is 0, otherwise copies of the merged clusters of the last round.
    """
    # Work on a shallow copy so the ordering step below cannot disturb the
    # caller's list (random.shuffle and list.sort both operate in place).
    members = list(cluster_list)
    if shuffle:
        random.shuffle(members)
    else:
        # largest clusters first
        members.sort(key = lambda cluster: cluster.getSize(), reverse = True)

    # Never ask for more centroids than there are seeds to build them from;
    # otherwise the per-centroid loops below would index past the seed lists.
    k = min(k, len(members))

    # kclusters: frozen reference clusters that distances are measured against
    # centroids: the matching average abundance vectors
    kclusters = []
    centroids = []
    for seed in members[:k]:
        kclusters.append(seed.copy())
        centroids.append(seed.getAvgAbundance())

    for _ in range(iterations):
        # Fresh accumulator clusters, one per centroid, each starting from
        # a copy of the current centroid vector.
        clusters = []
        for centroid in centroids:
            accumulator = Cluster([])
            accumulator.avg_abundance_vectors = list(centroid)
            clusters.append(accumulator)

        # Assignment step: merge every member into the accumulator of its
        # nearest reference cluster.
        for member in members:
            best_dist, best_idx = float('inf'), -1
            for idx, reference in enumerate(kclusters):
                dist = member.distance(reference)
                # strict '<' keeps the first centroid on ties
                if dist < best_dist:
                    best_dist, best_idx = dist, idx
            # NOTE(review): if no distance compares below infinity (e.g. all
            # NaN), best_idx stays -1 and the member lands in the last
            # accumulator -- kept as in the original; confirm this is intended.
            # NOTE(review): whether mergeClusters modifies its argument is not
            # visible here -- verify against the Cluster class.
            clusters[best_idx].mergeClusters(member)

        # Update step: freeze the merged clusters as the next round's
        # references and record their new centroid vectors.
        for idx, merged in enumerate(clusters):
            kclusters[idx] = merged.copy()
            centroids[idx] = merged.getAvgAbundance()

    return kclusters