def execute(self, docs):
    """
    @see parent

    Bisecting clustering. Starts with a single cluster holding every
    document in self.bible.docs, then repeatedly picks one cluster with
    self.select_cluster and splits it in two with KMeans(2) until
    self.k clusters exist. Every split is attempted self.iter times and
    the attempt with the highest kmeans.similarity() is the one kept.
    The final list is stored in self.clusters; nothing is returned.

    Key arguments:
    docs -- accepted for interface compatibility; it is not read here,
            the documents are taken from self.bible.docs.

    Raises:
    ValueError -- if a split is required but self.iter is less than 1,
                  so no bisection attempt can be made.
    """
    clusters = []

    # First cluster contains all of the docs.
    root = dotdict()
    root.docs = []
    for doc in self.bible.docs:
        doc.cluster = root
        root.docs.append(doc)
    root.centroid = self.centroid(root.docs)

    # Append the first cluster to the cluster list.
    clusters.append(root)

    # "<" rather than "!=": identical for k >= 1, but cannot spin forever
    # when k is smaller than the number of clusters already present.
    while len(clusters) < self.k:
        # Use the abstract select cluster method.
        target = self.select_cluster(clusters)

        # Remove this cluster from the current set because it will be split.
        clusters.remove(target)

        best_sim = float("-inf")
        best_pair = None
        for _ in range(self.iter):
            # Free the docs from whatever cluster they are in.
            for doc in target.docs:
                doc.cluster = None

            kmeans = KMeans(2)
            kmeans.bible = self.bible
            kmeans.execute(target.docs)

            sim = kmeans.similarity()
            # Always accept the first trial so a result exists even when
            # the similarity is not comparable (e.g. NaN).
            if best_pair is None or sim > best_sim:
                best_sim = sim
                best_pair = kmeans.clusters

        if best_pair is None:
            raise ValueError(
                "self.iter must be at least 1 to bisect a cluster")

        # Re-assign the documents to their respective max bicluster. This
        # matters because the docs currently point at the clusters of the
        # last trial, which is not necessarily the best one.
        for half in best_pair:
            for doc in half.docs:
                doc.cluster = half

        # Add the new max bicluster to the current cluster set.
        clusters.extend(best_pair)

    self.clusters = clusters
def execute(self, docs):
    """
    Run UPGMA's execute first, then pass its clusters through K-Means.

    Key arguments:
    docs -- the docs to cluster.
    """
    # Parent pass; presumably this is what populates self.clusters,
    # which is read below.
    super(AgglomerativeUPGMAKMeans, self).execute(docs)

    # K-Means configured with the same k and the same bible as this object.
    refiner = KMeans(self.k)
    refiner.bible = self.bible

    # self.clusters is handed over and is expected to be changed in place
    # by this call.
    refiner.execute(docs, self.clusters)