Beispiel #1
0
 def _compare_clusters(**datasets):
     for name, dataset in datasets.items():
         pca = RandomizedPCA(2)
         pca.fit(dataset)
         X = pca.transform(dataset)
         instances = _kmeans()
         for instance in instances:
             instance.fit(dataset)
             # reduce to 2d for visualisation
             draw_cluster_2d(instance, X, 
                     filename="%s-kmeans-%s.png" % (name, instance.k))
         ms_instances = _meanshift(dataset)
         for instance in ms_instances:
             instance.fit(dataset)
         compare_pies(
                 [_get_distribution(i) for i in instances] + 
                     [_get_distribution(i) for i in ms_instances],
                 ["KMeans(%s)" % i.k for i in instances] + 
                     ["MeanShift(%s)" % round(i.bandwidth) for i in ms_instances],
                 filename="%s-pie.png" % name)
Beispiel #2
0
    def compare_pca(self):
        """Compare the k-means clusters obtained for several values of ``N``.

        Three ``TextProcessor`` instances (``N`` = 50, 100 and 200, k-means
        only) are run.  For every user found in a processor's k-means
        clusters, the per-label counts (``np.bincount`` of ``labels_``) are
        collected, and one ``compare_pies`` call per user is made with the
        counts from all runs, the name ``compare_<user>.png`` and
        ``self.output_path``.

        NOTE(review): ``N`` is presumably the number of PCA dimensions --
        confirm against ``TextProcessor``.
        """
        # Build every processor before running any of them; each one gets
        # its own ``["kmeans"]`` list.
        pca_sizes = (50, 100, 200)
        runs = tuple(TextProcessor(N=size, algorithms=["kmeans"])
                     for size in pca_sizes)

        histograms_by_user = defaultdict(list)
        for run in runs:
            # Original note: don't use random centers for kmeans, to be able
            # to compare them.  NOTE(review): how setting ``_particular_user``
            # achieves that is not visible here -- confirm in TextProcessor.
            run._particular_user = "******"
            run.run()

            kmeans_by_user = run.clusters['kmeans']
            for user, fitted in kmeans_by_user.items():
                label_counts = np.bincount(fitted.labels_)
                histograms_by_user[user].append(label_counts)

        for user, histograms in histograms_by_user.items():
            pie_name = "compare_%s.png" % user
            compare_pies(histograms, pie_name, self.output_path)
Beispiel #3
0
    def compare_pca(self):
        """Plot, per user, how k-means label counts vary with ``N``.

        Runs three k-means-only ``TextProcessor`` instances with ``N`` set
        to 50, 100 and 200, gathers ``np.bincount`` of each user's cluster
        ``labels_`` from every run, and hands each user's list of counts to
        ``compare_pies`` together with ``compare_<user>.png`` and
        ``self.output_path``.

        NOTE(review): ``N`` presumably is the PCA dimensionality -- verify
        against ``TextProcessor``.
        """
        small = TextProcessor(N=50, algorithms=["kmeans"])
        medium = TextProcessor(N=100, algorithms=["kmeans"])
        large = TextProcessor(N=200, algorithms=["kmeans"])

        counts_per_user = defaultdict(list)
        for text_processor in (small, medium, large):
            # Original note: don't use random centers for kmeans, to be able
            # to compare them.  NOTE(review): the link between this private
            # attribute and the k-means initialisation is not visible here.
            text_processor._particular_user = "******"
            text_processor.run()

            for user, model in text_processor.clusters['kmeans'].items():
                labels = model.labels_
                counts_per_user[user].append(np.bincount(labels))

        for user in counts_per_user:
            compare_pies(counts_per_user[user],
                         "compare_%s.png" % user,
                         self.output_path)
Beispiel #4
0
 def _compare_clusters(**datasets):
     for name, dataset in datasets.items():
         pca = RandomizedPCA(2)
         pca.fit(dataset)
         X = pca.transform(dataset)
         instances = _kmeans()
         for instance in instances:
             instance.fit(dataset)
             # reduce to 2d for visualisation
             draw_cluster_2d(instance,
                             X,
                             filename="%s-kmeans-%s.png" %
                             (name, instance.k))
         ms_instances = _meanshift(dataset)
         for instance in ms_instances:
             instance.fit(dataset)
         compare_pies(
             [_get_distribution(i) for i in instances] +
             [_get_distribution(i) for i in ms_instances],
             ["KMeans(%s)" % i.k for i in instances] +
             ["MeanShift(%s)" % round(i.bandwidth) for i in ms_instances],
             filename="%s-pie.png" % name)