def cluster_vectorspace(self, vectors, trace=False):
    """Agglomeratively merge ``vectors`` down to ``self._num_clusters``.

    Every vector starts in its own cluster.  On each pass the pair of
    clusters reported by ``self.array_max`` for the matrix of pairwise dot
    products between the normalised cluster sums is merged, and the merge
    (with its similarity) is recorded in the dendrogram(s).

    :param vectors: non-empty sequence of vectors; assumed to be a list of
        equal-length numpy arrays -- TODO confirm against the caller.
    :param trace: when true, print the indices of every merged pair.
    :raises IndexError: if ``vectors`` is empty (``vectors[0]`` is read to
        size the matrix).
    """
    # create a cluster for each vector
    clusters = [[vector] for vector in vectors]

    # copy is the standard-library module.  A shallow copy is enough here:
    # entries are rebound or deleted below, never modified in place.
    vector_sum = copy.copy(vectors)
    norm_sum = [normalize(vsum) for vsum in vector_sum]

    # One column per cluster, holding that cluster's normalised sum.
    cluster_matrix = numpy.zeros([len(vectors[0]), len(vectors)])
    for column, normed in enumerate(norm_sum):
        cluster_matrix[:, column] = normed

    dot_store_matrix = numpy.dot(cluster_matrix.transpose(), cluster_matrix)

    while len(clusters) > max(self._num_clusters, 1):
        # find the two best candidate clusters to merge
        max_sim = self.array_max(dot_store_matrix)
        i = max_sim[0]
        j = max_sim[1]
        similarity = max_sim[2]
        if i == j:
            # A cluster cannot be merged with itself; give up.
            print("got stuck when at " + str(len(clusters)) + " clusters")
            break
        if trace:
            print('merging %d and %d' % (i, j))

        # Fold cluster j into cluster i.  In every parallel structure the
        # merged value is written at i *before* entry j is removed, so both
        # indices still refer to the pre-merge layout when they are used.
        clusters[i] = clusters[i] + clusters[j]
        del clusters[j]
        vector_sum[i] = vector_sum[i] + vector_sum[j]
        norm_sum[i] = normalize(vector_sum[i])
        cluster_matrix[:, i] = norm_sum[i]
        del vector_sum[j]
        del norm_sum[j]
        cluster_matrix = numpy.delete(cluster_matrix, j, 1)

        # Recompute all pairwise similarities for the reduced cluster set.
        dot_store_matrix = numpy.dot(cluster_matrix.transpose(), cluster_matrix)

        self._dendogram.merge(similarity, i, j)
        if self._names:
            self._name_dendogram.merge(similarity, i, j)

        # Progress report every 50 clusters.
        if len(clusters) % 50 == 0:
            print(len(clusters))
            if self.log_it is not None:
                self.log_it(str(len(clusters)))

    if self._iterative_reassign:
        # NOTE(review): the loop index is passed as the cluster count,
        # starting at 0 -- confirm this is the intended argument.
        for i in range(self._max_reassign):
            self.get_iteratively_reassigned_clusters(i, saveit=True)

    self.update_clusters(len(clusters))
Beispiel #2
0
    def cluster_vectorspace(self, vectors, trace=False):
        """Agglomeratively merge ``vectors`` down to ``self._num_clusters``.

        Every vector starts in its own cluster.  On each pass the pair of
        clusters reported by ``self.array_max`` for the matrix of pairwise
        dot products between the normalised cluster sums is merged and the
        merge is recorded in the dendrogram(s).  Progress is reported at
        most once every five seconds, through ``self.msg_handle.dm`` when a
        handle is set and through ``print`` otherwise.

        :param vectors: non-empty sequence of vectors; assumed to be a list
            of equal-length numpy arrays -- TODO confirm against the caller.
        :param trace: when true, print the indices of every merged pair.
        :raises IndexError: if ``vectors`` is empty (``vectors[0]`` is read
            to size the matrix).
        """
        # create a cluster for each vector
        clusters = [[vector] for vector in vectors]

        # copy is the standard-library module.  A shallow copy is enough:
        # entries are rebound or deleted below, never modified in place.
        vector_sum = copy.copy(vectors)
        norm_sum = [normalize(vsum) for vsum in vector_sum]

        # One column per cluster, holding that cluster's normalised sum.
        cluster_matrix = numpy.zeros([len(vectors[0]), len(vectors)])
        for column, normed in enumerate(norm_sum):
            cluster_matrix[:, column] = normed

        if self.msg_handle is not None:
            self.msg_handle.dm("initializing dot_store_matrix")
        dot_store_matrix = numpy.dot(cluster_matrix.transpose(), cluster_matrix)
        start = time.time()
        while len(clusters) > max(self._num_clusters, 1):
            # find the two best candidate clusters to merge
            max_sim = self.array_max(dot_store_matrix)
            i = max_sim[0]
            j = max_sim[1]
            if i == j:
                # A cluster cannot be merged with itself; give up.
                print("got stuck when at " + str(len(clusters)) + " clusters")
                break
            if trace:
                print('merging %d and %d' % (i, j))

            # Fold cluster j into cluster i.  The merged value is written at
            # i *before* entry j is removed, so both indices still refer to
            # the pre-merge layout when they are used.
            clusters[i] = clusters[i] + clusters[j]
            del clusters[j]
            vector_sum[i] = vector_sum[i] + vector_sum[j]
            norm_sum[i] = normalize(vector_sum[i])
            cluster_matrix[:, i] = norm_sum[i]
            del vector_sum[j]
            del norm_sum[j]
            cluster_matrix = numpy.delete(cluster_matrix, j, 1)

            # Recompute all pairwise similarities for the reduced cluster set.
            dot_store_matrix = numpy.dot(cluster_matrix.transpose(), cluster_matrix)

            self._dendogram.merge(i, j)
            if self._names:
                self._name_dendogram.merge(i, j)

            now = time.time()
            if now - start > 5:
                if self.msg_handle is not None:
                    self.msg_handle.dm(str(len(clusters)))
                else:
                    print(len(clusters))
                # Restart the timer on both paths; previously only the
                # msg_handle path did, so the print fallback reported on
                # every iteration once five seconds had elapsed.
                start = now
        self.update_clusters(len(clusters))
Beispiel #3
0
    def get_iteratively_reassigned_clusters(self, num_clusters, max_iterations=100, saveit=True):
        """Repeatedly reassign vectors to the centroid they are most similar to.

        If ``self._reassigned_clusters`` already holds an entry for
        ``num_clusters`` that entry is returned.  Otherwise the centroids
        are rebuilt for ``num_clusters`` via ``self.update_clusters`` and,
        for ``max_iterations`` rounds, every vector of the dendrogram groups
        is assigned to the centroid with the largest dot product, after
        which the centroids are recomputed from the new assignment.  The
        cluster sizes and the number of moved vectors are printed each round.

        NOTE(review): as written, nothing is ever stored in
        ``self._reassigned_clusters``, ``saveit`` is unused, the loop has no
        early exit and the non-cached path returns ``None``.  The tail of
        this function may be missing -- confirm against the full source.

        :param num_clusters: number of dendrogram groups to start from.
        :param max_iterations: number of reassignment rounds to run.
        :param saveit: currently unused.
        """
        if num_clusters in self._reassigned_clusters:
            return self._reassigned_clusters[num_clusters]
        self.update_clusters(num_clusters)
        new_centroids = self._centroids
        clusters = self._dendogram.groups(num_clusters)

        for _ in range(max_iterations):
            number_reassigned = 0
            new_clusters = [[] for _ in range(len(self._centroids))]
            # ``clusters`` is the original grouping on every round, so
            # "reassigned" is counted relative to that grouping, not to the
            # previous round.
            for cluster_number, cluster in enumerate(clusters):
                for vec in cluster:
                    dps = [numpy.dot(numpy.transpose(centroid), vec) for centroid in new_centroids]
                    # First centroid with the highest dot product wins.
                    new_cluster_index = dps.index(max(dps))
                    new_clusters[new_cluster_index].append(vec)
                    if new_cluster_index != cluster_number:
                        number_reassigned += 1

            new_centroids = []
            for cluster in new_clusters:
                # A centroid that attracted no vectors aborts the run here.
                assert len(cluster) > 0
                if self._should_normalise:
                    centroid = self._normalise(cluster[0])
                else:
                    # numpy.array copies, so the in-place additions below
                    # do not modify cluster[0].
                    centroid = numpy.array(cluster[0])
                for vector in cluster[1:]:
                    if self._should_normalise:
                        centroid += self._normalise(vector)
                    else:
                        centroid += vector
                # The sum is not divided by the cluster size; it is handed
                # to normalize() as-is.
                new_centroids.append(normalize(centroid))
            print([len(cluster) for cluster in new_clusters])
            print("Number reassigned = %i" % number_reassigned)
    def cluster_vectorspace(self, vectors, trace=False):
        """Agglomeratively merge ``vectors`` down to ``self._num_clusters``.

        Every vector starts in its own cluster.  On each pass the pair of
        clusters reported by ``self.array_max`` for the matrix of pairwise
        dot products between the normalised cluster sums is merged and the
        merge is recorded in the dendrogram(s).

        :param vectors: non-empty sequence of vectors; assumed to be a list
            of equal-length numpy arrays -- TODO confirm against the caller.
        :param trace: when true, print the indices of every merged pair.
        :raises IndexError: if ``vectors`` is empty (``vectors[0]`` is read
            to size the matrix).
        """
        # create a cluster for each vector
        clusters = [[vector] for vector in vectors]

        # copy is the standard-library module.  A shallow copy is enough:
        # entries are rebound or deleted below, never modified in place.
        vector_sum = copy.copy(vectors)
        norm_sum = [normalize(vsum) for vsum in vector_sum]

        # One column per cluster, holding that cluster's normalised sum.
        cluster_matrix = numpy.zeros([len(vectors[0]), len(vectors)])
        for column, normed in enumerate(norm_sum):
            cluster_matrix[:, column] = normed

        dot_store_matrix = numpy.dot(cluster_matrix.transpose(),
                                     cluster_matrix)

        while len(clusters) > max(self._num_clusters, 1):
            # find the two best candidate clusters to merge
            max_sim = self.array_max(dot_store_matrix)
            i = max_sim[0]
            j = max_sim[1]
            if i == j:
                # A cluster cannot be merged with itself; give up.
                print("got stuck when at " + str(len(clusters)) + " clusters")
                break
            if trace:
                print('merging %d and %d' % (i, j))

            # Fold cluster j into cluster i.  The merged value is written at
            # i *before* entry j is removed, so both indices still refer to
            # the pre-merge layout when they are used.
            clusters[i] = clusters[i] + clusters[j]
            del clusters[j]
            vector_sum[i] = vector_sum[i] + vector_sum[j]
            norm_sum[i] = normalize(vector_sum[i])
            cluster_matrix[:, i] = norm_sum[i]
            del vector_sum[j]
            del norm_sum[j]
            cluster_matrix = numpy.delete(cluster_matrix, j, 1)

            # Recompute all pairwise similarities for the reduced cluster set.
            dot_store_matrix = numpy.dot(cluster_matrix.transpose(),
                                         cluster_matrix)

            self._dendogram.merge(i, j)
            if self._names:
                self._name_dendogram.merge(i, j)
            # Progress report every 50 clusters.
            if len(clusters) % 50 == 0:
                print(len(clusters))
        self.update_clusters(len(clusters))
Beispiel #5
0
    def cluster_vectorspace(self, vectors, trace=False):
        """Agglomeratively merge ``vectors`` down to ``self._num_clusters``.

        Every vector starts in its own cluster.  On each pass all pairs of
        clusters are compared by the dot product of their normalised sums,
        the most similar pair is merged and the merge is recorded in the
        dendrogram(s).  Finally the centroids are rebuilt through
        ``self.update_clusters(self._num_clusters)``.

        :param vectors: sequence of vectors; assumed to be a list of numpy
            arrays -- TODO confirm against the caller.
        :param trace: when true, print the indices of every merged pair.
        """
        # create a cluster for each vector
        clusters = [[vector] for vector in vectors]

        # copy is the standard-library module.  A shallow copy is enough:
        # entries are rebound or deleted below, never modified in place.
        vector_sum = copy.copy(vectors)
        norm_sum = [normalize(vsum) for vsum in vector_sum]

        while len(clusters) > max(self._num_clusters, 1):
            # find the two best candidate clusters to merge; the loop
            # condition guarantees at least two clusters, so ``best`` is
            # always set by the search below.
            best = None
            for i in range(len(clusters)):
                for j in range(i + 1, len(clusters)):
                    sim = numpy.dot(norm_sum[i], norm_sum[j])
                    # Strict ">" keeps the first pair found among ties.
                    if best is None or sim > best[0]:
                        best = (sim, i, j)

            # merge them and replace in cluster list
            i, j = best[1:]
            if trace:
                print('merging %d and %d' % (i, j))

            # i < j always holds here, so deleting entry j never shifts i.
            clusters[i] = clusters[i] + clusters[j]
            del clusters[j]
            vector_sum[i] = vector_sum[i] + vector_sum[j]
            norm_sum[i] = normalize(vector_sum[i])
            del vector_sum[j]
            del norm_sum[j]
            self._dendogram.merge(i, j)
            if self._names:
                self._name_dendogram.merge(i, j)
            # Progress report every 50 clusters.
            if len(clusters) % 50 == 0:
                print(len(clusters))
        self.update_clusters(self._num_clusters)
    def cluster_vectorspace(self, vectors, trace=False):
        """Agglomeratively merge ``vectors`` down to ``self._num_clusters``.

        Every vector starts in its own cluster.  On each pass all pairs of
        clusters are compared by the dot product of their normalised sums,
        the most similar pair is merged and the merge is recorded in the
        dendrogram(s).  Finally the centroids are rebuilt through
        ``self.update_clusters(self._num_clusters)``.

        :param vectors: sequence of vectors; assumed to be a list of numpy
            arrays -- TODO confirm against the caller.
        :param trace: when true, print the indices of every merged pair.
        """
        # create a cluster for each vector
        clusters = [[vector] for vector in vectors]

        # copy is the standard-library module.  A shallow copy is enough:
        # entries are rebound or deleted below, never modified in place.
        vector_sum = copy.copy(vectors)
        norm_sum = [normalize(vsum) for vsum in vector_sum]

        while len(clusters) > max(self._num_clusters, 1):
            # find the two best candidate clusters to merge; the loop
            # condition guarantees at least two clusters, so ``best`` is
            # always set by the search below.
            best = None
            for i in range(len(clusters)):
                for j in range(i + 1, len(clusters)):
                    sim = numpy.dot(norm_sum[i], norm_sum[j])
                    # Strict ">" keeps the first pair found among ties.
                    if best is None or sim > best[0]:
                        best = (sim, i, j)

            # merge them and replace in cluster list
            i, j = best[1:]
            if trace:
                print('merging %d and %d' % (i, j))

            # i < j always holds here, so deleting entry j never shifts i.
            clusters[i] = clusters[i] + clusters[j]
            del clusters[j]
            vector_sum[i] = vector_sum[i] + vector_sum[j]
            norm_sum[i] = normalize(vector_sum[i])
            del vector_sum[j]
            del norm_sum[j]
            self._dendogram.merge(i, j)
            if self._names:
                self._name_dendogram.merge(i, j)
            # Progress report every 50 clusters.
            if len(clusters) % 50 == 0:
                print(len(clusters))
        self.update_clusters(self._num_clusters)
 def compute_centroids(self, clusters):
     """Return one centroid per cluster, in the order of ``clusters``.

     A centroid is the sum of the cluster's vectors (each passed through
     ``self._normalise`` first when ``self._should_normalise`` is set),
     handed to ``normalize()``.

     :param clusters: iterable of non-empty sequences of vectors.
     :return: list of centroids.
     :raises AssertionError: if a cluster is empty.
     """
     result = []
     for members in clusters:
         assert len(members) > 0
         head = members[0]
         if self._should_normalise:
             total = self._normalise(head)
         else:
             # numpy.array copies, so the in-place additions below do not
             # modify the cluster's first vector.
             total = numpy.array(head)
         for member in members[1:]:
             total += self._normalise(member) if self._should_normalise else member
         # The sum is not divided by the cluster size; it is handed to
         # normalize() as-is.
         result.append(normalize(total))
     return result
 def update_clusters(self, num_clusters):
     """Rebuild ``self._centroids`` from the dendrogram's groups.

     Asks ``self._dendogram`` for ``num_clusters`` groups and stores one
     centroid per group: the sum of the group's vectors (each passed
     through ``self._normalise`` first when ``self._should_normalise`` is
     set), handed to ``normalize()``.  ``self._num_clusters`` is then set
     to the number of centroids actually produced.

     :param num_clusters: number of groups requested from the dendrogram.
     :raises AssertionError: if a group is empty.
     """
     groups = self._dendogram.groups(num_clusters)
     self._centroids = []
     for members in groups:
         assert len(members) > 0
         head = members[0]
         if self._should_normalise:
             total = self._normalise(head)
         else:
             # numpy.array copies, so the in-place additions below do not
             # modify the group's first vector.
             total = numpy.array(head)
         for member in members[1:]:
             total += self._normalise(member) if self._should_normalise else member
         # The sum is not divided by the group size; it is handed to
         # normalize() as-is.
         self._centroids.append(normalize(total))
     self._num_clusters = len(self._centroids)
 def update_clusters(self, num_clusters):
     """Rebuild ``self._centroids`` from the dendrogram's groups.

     Prints the requested cluster count, asks ``self._dendogram`` for
     ``num_clusters`` groups and stores one centroid per group: the sum of
     the group's vectors (each passed through ``self._normalise`` first
     when ``self._should_normalise`` is set), handed to ``normalize()``.
     ``self._num_clusters`` is then set to the number of centroids
     actually produced.

     :param num_clusters: number of groups requested from the dendrogram.
     :raises AssertionError: if a group is empty.
     """
     print("entering update clusters with num_clusters = " + str(
         num_clusters))
     clusters = self._dendogram.groups(num_clusters)
     self._centroids = []
     for cluster in clusters:
         assert len(cluster) > 0
         if self._should_normalise:
             centroid = self._normalise(cluster[0])
         else:
             # numpy.array copies, so the in-place additions below do not
             # modify cluster[0].
             centroid = numpy.array(cluster[0])
         for vector in cluster[1:]:
             if self._should_normalise:
                 centroid += self._normalise(vector)
             else:
                 centroid += vector
         # The sum is not divided by the cluster size; it is handed to
         # normalize() as-is.
         self._centroids.append(normalize(centroid))
     self._num_clusters = len(self._centroids)