Example #1
0
 def test_centroid(self):
     # Assert center of list of vectors.
     # "dog" is absent from the first vector; the expected result (0.5)
     # shows that a missing feature is averaged in as 0.
     v = vector.centroid([{
         "cat": 1
     }, {
         "cat": 0.5,
         "dog": 1
     }],
                         keys=["cat", "dog"])
     self.assertEqual(v, {"cat": 0.75, "dog": 0.5})
     # Call form of print: valid on Python 2 and Python 3 alike.
     print("pattern.vector.centroid()")
Example #2
0
    def _generate_clusters(self, k=10, p=0.8, maxlevel=10):
        """Cluster the corpus and record the members and representative of each cluster.

        Sets ``self.clusters`` (a list of lists of document names) and
        ``self.centroids`` (one document name per cluster), using the
        strategy selected by ``self.method``:

        * ``"kmeans"``: calls ``self.corpus.cluster`` with KMEANS, seeded by
          KMPP, for 10 iterations. The representative of a cluster is the
          member document whose vector is nearest to the cluster centroid.
          Empty clusters are dropped, since they have no member that could
          act as representative.
        * ``'covertree'``: inserts every document name of the corpus into a
          ``Covertree`` and takes both results from ``clustering_from_ct(k)``.

        Any other value of ``self.method`` leaves both attributes untouched.

        k is the number of clusters.
        p is to control the error of KMEANS, when p=1.0 is faster with small error.
        maxlevel is passed to ``Covertree`` and is only used by 'covertree'.
        """
        if self.method == "kmeans":

            from pattern.vector import KMEANS, KMPP
            clusters = self.corpus.cluster(method=KMEANS,
                                           k=k,
                                           seed=KMPP,
                                           p=p,
                                           iterations=10)
            # Keep only clusters that have members, so that self.centroids
            # and self.clusters stay index-aligned.
            clusters = [cluster for cluster in clusters if len(cluster) > 0]

            nearest_docs = []
            for cluster in clusters:
                center = centroid(cluster)
                # min() returns the first document with the smallest
                # distance; the distance is evaluated once per document.
                nearest_docs.append(
                    min(cluster,
                        key=lambda doc, c=center: distance(doc.vector, c)))

            self.centroids = [doc.name for doc in nearest_docs]
            self.clusters = [[doc.name for doc in cluster]
                             for cluster in clusters]

        elif self.method == 'covertree':

            def mydistance(doc_name1, doc_name2):
                # The tree stores document names; look the vectors up on demand.
                v1 = self.corpus.document(doc_name1).vector
                v2 = self.corpus.document(doc_name2).vector
                return distance(v1, v2)

            self.covertree = Covertree(mydistance, maxlevel)

            for doc in self.corpus:
                tree_node = myTree(doc.name)
                self.covertree.insert(tree_node, self.covertree.ct, 0)

            self.covertree.merge_levels()
            self.centroids, self.clusters = self.covertree.clustering_from_ct(
                k)
Example #3
0
    def _generate_clusters(self, k=10, p=0.8, maxlevel=10):
        """Cluster the corpus and record the members and representative of each cluster.

        Sets ``self.clusters`` (a list of lists of document names) and
        ``self.centroids`` (one document name per cluster), using the
        strategy selected by ``self.method``:

        * ``"kmeans"``: calls ``self.corpus.cluster`` with KMEANS, seeded by
          KMPP, for 10 iterations. The representative of a cluster is the
          member document whose vector is nearest to the cluster centroid.
          Empty clusters are dropped, since they have no member that could
          act as representative.
        * ``'covertree'``: inserts every document name of the corpus into a
          ``Covertree`` and takes both results from ``clustering_from_ct(k)``.

        Any other value of ``self.method`` leaves both attributes untouched.

        k is the number of clusters.
        p is to control the error of KMEANS, when p=1.0 is faster with small error.
        maxlevel is passed to ``Covertree`` and is only used by 'covertree'.
        """
        if self.method == "kmeans":

            from pattern.vector import KMEANS, KMPP
            clusters = self.corpus.cluster(method=KMEANS, k=k, seed=KMPP, p=p, iterations=10)
            # Keep only clusters that have members, so that self.centroids
            # and self.clusters stay index-aligned.
            clusters = [cluster for cluster in clusters if len(cluster) > 0]

            nearest_docs = []
            for cluster in clusters:
                center = centroid(cluster)
                # min() returns the first document with the smallest
                # distance; the distance is evaluated once per document.
                nearest = min(cluster, key=lambda doc, c=center: distance(doc.vector, c))
                nearest_docs.append(nearest)

            self.centroids = [doc.name for doc in nearest_docs]
            self.clusters = [[doc.name for doc in cluster] for cluster in clusters]

        elif self.method == 'covertree':

            def mydistance(doc_name1, doc_name2):
                # The tree stores document names; look the vectors up on demand.
                v1 = self.corpus.document(doc_name1).vector
                v2 = self.corpus.document(doc_name2).vector
                return distance(v1, v2)

            self.covertree = Covertree(mydistance, maxlevel)

            for doc in self.corpus:
                tree_node = myTree(doc.name)
                self.covertree.insert(tree_node, self.covertree.ct, 0)

            self.covertree.merge_levels()
            self.centroids, self.clusters = self.covertree.clustering_from_ct(k)
Example #4
0
 def test_centroid(self):
     # The centroid of a list of vectors is checked feature by feature.
     # "dog" is absent from the first vector; the expected 0.5 shows that
     # a missing feature is averaged in as 0.
     vectors = [{"cat":1}, {"cat":0.5, "dog":1}]
     expected = {"cat":0.75, "dog":0.5}
     center = vector.centroid(vectors, features=["cat", "dog"])
     self.assertEqual(center, expected)
     print("pattern.vector.centroid()")
Example #5
0
 def test_centroid(self):
     # The centroid of a Cluster that itself contains a Cluster.
     # Expected value is (1 + 2 + 4) / 3 ~= 2.33, i.e. the members of the
     # nested cluster each count as one vector.
     inner = vector.Cluster(({"a": 2}, {"a": 4}))
     outer = vector.Cluster(({"a": 1}, inner))
     center = vector.centroid(outer)
     self.assertAlmostEqual(center["a"], 2.33, places=2)
     print("pattern.vector.centroid()")
Example #6
0
 def test_centroid(self):
     # Assert centroid of recursive Cluster.
     # Expected value is (1 + 2 + 4) / 3 ~= 2.33, i.e. the members of the
     # nested cluster each count as one vector.
     v = vector.Cluster(({"a": 1}, vector.Cluster(({"a": 2}, {"a": 4}))))
     self.assertAlmostEqual(vector.centroid(v)["a"], 2.33, places=2)
     # Call form of print: valid on Python 2 and Python 3 alike.
     print("pattern.vector.centroid()")
 def variance(cluster):
     """Return avg() of the distances between the cluster centroid and each member.

     The centroid is computed once and reused for every member instead of
     being recomputed per member.
     """
     if not cluster:
         # No members: no centroid is needed; avg() decides the result
         # for an empty list, exactly as before.
         return avg([])
     center = centroid(cluster)
     return avg([distance(center, v) for v in cluster])