def _compare_clusters(**datasets):
    """Cluster each named dataset with k-means and mean-shift and plot the results.

    Every keyword argument maps a dataset name to the data to cluster; the
    name is used as the prefix of the generated image file names.

    For each dataset:

    * a 2-component ``RandomizedPCA`` projection is computed; it is only
      handed to ``draw_cluster_2d``, the estimators are fitted on the
      original data;
    * every estimator returned by ``_kmeans()`` is fitted and drawn to
      ``<name>-kmeans-<k>.png``;
    * every estimator returned by ``_meanshift(dataset)`` is fitted;
    * the ``_get_distribution`` result of all fitted estimators is passed
      to ``compare_pies``, which is asked to write ``<name>-pie.png``.
    """
    for label, data in datasets.items():
        # Reduce to 2d for visualisation only.
        projector = RandomizedPCA(2)
        projector.fit(data)
        projected = projector.transform(data)

        kmeans_models = _kmeans()
        for model in kmeans_models:
            model.fit(data)
            picture = "%s-kmeans-%s.png" % (label, model.k)
            draw_cluster_2d(model, projected, filename=picture)

        meanshift_models = _meanshift(data)
        for model in meanshift_models:
            model.fit(data)

        # k-means entries come first, then mean-shift, in both lists so that
        # each distribution lines up with its title.
        distributions = (
            [_get_distribution(m) for m in kmeans_models]
            + [_get_distribution(m) for m in meanshift_models]
        )
        titles = (
            ["KMeans(%s)" % m.k for m in kmeans_models]
            + ["MeanShift(%s)" % round(m.bandwidth) for m in meanshift_models]
        )
        compare_pies(distributions, titles, filename="%s-pie.png" % label)
def compare_pca(self):
    """Compare the k-means clusters generated with different PCA dimensions.

    One ``TextProcessor`` is built per value of ``N`` (50, 100 and 200 --
    presumably the number of PCA dimensions, going by the method name) and
    run with only the "kmeans" algorithm.  For every user found in a
    processor's k-means clusters, the per-label counts of that clustering
    are collected, one entry per processor.  Finally ``compare_pies`` is
    called once per user with the collected counts, the file name
    ``compare_<user>.png`` and ``self.output_path``.
    """
    processors = tuple(
        TextProcessor(N=size, algorithms=["kmeans"]) for size in (50, 100, 200)
    )

    counts_by_user = defaultdict(list)
    for proc in processors:
        # Original author's note: don't use random centers for kmeans, to be
        # able to compare the runs.  How pinning a particular user achieves
        # that is decided inside TextProcessor (not visible here).
        proc._particular_user = "******"
        proc.run()

        kmeans_clusters = proc.clusters['kmeans']
        for user, cluster in kmeans_clusters.items():
            label_counts = np.bincount(cluster.labels_)
            counts_by_user[user].append(label_counts)

    for user, bincounts in counts_by_user.items():
        pie_name = "compare_%s.png" % user
        compare_pies(bincounts, pie_name, self.output_path)
def compare_pca(self):
    """Plot, per user, how k-means label counts vary with the PCA setting.

    Three ``TextProcessor`` instances are created with ``N`` set to 50, 100
    and 200 (``N`` is presumably the PCA dimensionality, judging by the
    method name) and restricted to the "kmeans" algorithm.  Each one is run
    in turn, and ``np.bincount`` of every user's cluster labels is appended
    to that user's list.  Each user's list is then passed to
    ``compare_pies`` together with ``compare_<user>.png`` and
    ``self.output_path``.
    """
    pca_sizes = (50, 100, 200)
    # All processors are created up front, before any of them is run.
    runners = [TextProcessor(N=n, algorithms=["kmeans"]) for n in pca_sizes]

    histograms = defaultdict(list)
    for runner in runners:
        # Note kept from the original code: don't use random centers for
        # kmeans so that the runs can be compared with each other.
        runner._particular_user = "******"
        runner.run()
        for user, cluster in runner.clusters['kmeans'].items():
            histograms[user].append(np.bincount(cluster.labels_))

    for user, bincounts in histograms.items():
        target = "compare_%s.png" % user
        compare_pies(bincounts, target, self.output_path)