Ejemplo n.º 1
0
 def cluster(self, vectors, assign_clusters=False, trace=False):
     """
     Reset the merge-order dendogram(s), then run the base-class clustering.

     @param vectors: the vectors to cluster
     @param assign_clusters: forwarded unchanged to
         C{VectorSpaceClusterer.cluster}
     @param trace: forwarded unchanged to C{VectorSpaceClusterer.cluster}
     @return: whatever C{VectorSpaceClusterer.cluster} returns
     """
     # float64 copies of the inputs seed the dendogram that stores the
     # merge order.
     leaves = [numpy.array(vector, numpy.float64) for vector in vectors]
     self._dendogram = Dendogram(leaves)
     # A parallel dendogram over the names is only built when names exist.
     if self._names:
         self._name_dendogram = Dendogram(self._names)
     # Keep a reference to the raw input vectors.
     self._vectors_to_cluster = vectors
     return VectorSpaceClusterer.cluster(
         self, vectors, assign_clusters, trace)
 def cluster(self, vectors, assign_clusters=False, trace=False):
     """
     Build fresh dendogram(s) for this run and delegate to the base class.

     @param vectors: the vectors to cluster
     @param assign_clusters: passed through to
         C{VectorSpaceClusterer.cluster}
     @param trace: passed through to C{VectorSpaceClusterer.cluster}
     @return: the result of C{VectorSpaceClusterer.cluster}
     """
     # The dendogram (which stores the merge order) is built from float64
     # copies of the input vectors.
     float_vectors = []
     for vector in vectors:
         float_vectors.append(numpy.array(vector, numpy.float64))
     self._dendogram = Dendogram(float_vectors)
     if self._names:
         # Names get their own dendogram, built straight from self._names.
         self._name_dendogram = Dendogram(self._names)
     self._vectors_to_cluster = vectors
     result = VectorSpaceClusterer.cluster(
         self, vectors, assign_clusters, trace)
     return result
Ejemplo n.º 3
0
 def cluster(self, vectors, assign_clusters=False, trace=False):
     """
     Report the input size, reset the dendogram(s) and run the base-class
     clustering.

     @param vectors: the vectors to cluster
     @param assign_clusters: forwarded unchanged to
         C{VectorSpaceClusterer.cluster}
     @param trace: forwarded unchanged to C{VectorSpaceClusterer.cluster}
     @return: whatever C{VectorSpaceClusterer.cluster} returns
     """
     handle = self.msg_handle
     if handle is not None:
         # Send the number of vectors to the optional message handle, then
         # call its tile_yield() hook.
         handle.dm(str(len(vectors)))
         handle.tile_yield()
     # The dendogram stores the merge order; it is built from float64
     # copies of the input vectors.
     leaves = [numpy.array(vector, numpy.float64) for vector in vectors]
     self._dendogram = Dendogram(leaves)
     if self._names:
         self._name_dendogram = Dendogram(self._names)
     self._vectors_to_cluster = vectors
     return VectorSpaceClusterer.cluster(
         self, vectors, assign_clusters, trace)
Ejemplo n.º 4
0
class GAAClusterer(VectorSpaceClusterer):
    """
    The Group Average Agglomerative clusterer starts with each of the N
    vectors as a singleton cluster. It then iteratively merges the pair of
    clusters with the highest group-average similarity. This continues until
    C{num_clusters} clusters remain (never fewer than one). The order of
    merges gives rise to a dendogram: a tree with the earlier merges lower
    than later merges. The membership of a given number of clusters c, 1
    <= c <= N, can be found by cutting the dendogram at depth c.

    This clusterer uses the cosine similarity metric only, which allows for
    efficient speed-up in the clustering process.
    """

    def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
        """
        @param num_clusters: number of clusters at which merging stops
        @param normalise: forwarded to C{VectorSpaceClusterer.__init__}
        @param svd_dimensions: forwarded to
            C{VectorSpaceClusterer.__init__}
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_clusters = num_clusters
        self._dendogram = None
        self._groups_values = None

    def cluster(self, vectors, assign_clusters=False, trace=False):
        """
        Reset the merge-order dendogram and run the base-class clustering.

        @return: whatever C{VectorSpaceClusterer.cluster} returns
        """
        # The dendogram stores the merge order; it is built from float64
        # copies of the input vectors.
        self._dendogram = Dendogram(
            [numpy.array(vector, numpy.float64) for vector in vectors])
        return VectorSpaceClusterer.cluster(
            self, vectors, assign_clusters, trace)

    def cluster_vectorspace(self, vectors, trace=False):
        """
        Merge the most similar pair of clusters until the requested number
        of clusters is reached, recording every merge in the dendogram.

        @param vectors: the vectors to cluster
        @param trace: when true, print the indices of each merged pair
        """
        # create a cluster for each vector
        clusters = [[vector] for vector in vectors]

        # the sum vector of each cluster (shallow copy: the entries are
        # replaced below, never modified in place)
        vector_sum = copy.copy(vectors)

        target = max(self._num_clusters, 1)
        while len(clusters) > target:
            # find the two best candidate clusters to merge, based on their
            # S(union c_i, c_j)
            best = None
            for i in range(len(clusters)):
                for j in range(i + 1, len(clusters)):
                    sim = self._average_similarity(
                        vector_sum[i], len(clusters[i]),
                        vector_sum[j], len(clusters[j]))
                    if best is None or sim > best[0]:
                        best = (sim, i, j)

            # merge them and replace in cluster list
            _, i, j = best
            if trace:
                # Single-argument call form: same output on Python 2 and 3.
                print('merging %d and %d' % (i, j))

            clusters[i] = clusters[i] + clusters[j]
            del clusters[j]
            vector_sum[i] = vector_sum[i] + vector_sum[j]
            del vector_sum[j]

            self._dendogram.merge(i, j)

        self.update_clusters(self._num_clusters)

    def _member_sum(self, cluster):
        """
        Return a new array holding the sum of the members of C{cluster}
        (each passed through C{self._normalise} first when normalisation is
        enabled). The members themselves are not modified.
        """
        if self._should_normalise:
            members = [self._normalise(vector) for vector in cluster]
        else:
            members = cluster
        total = numpy.array(members[0])
        for member in members[1:]:
            total += member
        return total

    def update_clusters(self, num_clusters):
        """
        Cut the dendogram into C{num_clusters} groups and recompute
        C{self._centroids} (the mean of each group) and
        C{self._num_clusters}.
        """
        clusters = self._dendogram.groups(num_clusters)
        self._centroids = []
        for cluster in clusters:
            assert len(cluster) > 0
            centroid = self._member_sum(cluster)
            centroid /= float(len(cluster))
            self._centroids.append(centroid)
        self._num_clusters = len(self._centroids)

    def compute_rss(self, num_clusters):
        """
        Return the sum, over all vectors, of the Euclidean distance between
        the vector and the centroid of its group when the dendogram is cut
        into C{num_clusters} groups.

        NOTE(review): despite the name, distances are summed unsquared, and
        the members are compared un-normalised even when the centroid was
        built from normalised members - confirm this is intended.
        """
        rss = 0
        for cluster in self._dendogram.groups(num_clusters):
            centroid = self._member_sum(cluster) / float(len(cluster))
            for vector in cluster:
                diff = vector - centroid
                rss = rss + numpy.sqrt(numpy.vdot(diff, diff))
        return rss

    def classify_vectorspace(self, vector):
        """
        @return: the index of the centroid with the highest average
            similarity to C{vector}
        """
        best = None
        for index in range(self._num_clusters):
            sim = self._average_similarity(
                vector, 1, self._centroids[index], 1)
            if best is None or sim > best[0]:
                best = (sim, index)
        return best[1]

    def dendogram(self):
        """
        @return: The dendogram representing the current clustering
        @rtype:  Dendogram
        """
        return self._dendogram

    def num_clusters(self):
        """
        @return: the current number of clusters
        """
        return self._num_clusters

    def _average_similarity(self, v1, l1, v2, l2):
        """
        Group-average similarity of the union of two clusters, given their
        sum vectors C{v1}, C{v2} and sizes C{l1}, C{l2}.

        NOTE(review): subtracting C{length} presumably removes the
        self-similarity of unit-length members - confirm that inputs are
        always normalised.
        """
        asum = v1 + v2
        length = l1 + l2
        return (numpy.dot(asum, asum) - length) / (length * (length - 1))

    def __repr__(self):
        return '<GroupAverageAgglomerative Clusterer n=%d>' % self._num_clusters
Ejemplo n.º 5
0
class OptCentroidClusterer(VectorSpaceClusterer):
    """
    Agglomerative clusterer that repeatedly merges the two clusters whose
    C{normalize()}'d sum vectors have the largest dot product, locating the
    best pair through a matrix of all pairwise dot products. Merges are
    recorded in a dendogram (and in a parallel dendogram of names when
    C{vector_names} is given).
    """

    def __init__(self, vector_names=None, num_clusters=1, normalise=True,
                 svd_dimensions=None, msg_handle=None):
        """
        @param vector_names: optional names, one per vector, tracked in a
            second dendogram
        @param num_clusters: number of clusters at which merging stops
        @param normalise: forwarded to C{VectorSpaceClusterer.__init__}
        @param svd_dimensions: forwarded to
            C{VectorSpaceClusterer.__init__}
        @param msg_handle: optional object whose C{dm(str)} and
            C{tile_yield()} methods receive progress messages
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_clusters = num_clusters
        self._dendogram = None
        self._groups_values = None
        self._names = vector_names
        self._name_dendogram = None
        # Cache of reassignment results, keyed by requested cluster count.
        self._reassigned_clusters = {}
        self.msg_handle = msg_handle

    def array_max(self, ar):
        """
        Return C{[row, column]} of the largest off-diagonal entry of C{ar}.

        The diagonal of C{ar} is overwritten in place with -9e9 so that it
        can never be selected.
        """
        numpy.fill_diagonal(ar, -9e9)
        # divmod uses floor division, so the row index is right on both
        # Python 2 and 3; round(location / ncols) rounds up to the next row
        # under true division.
        row, col = divmod(int(ar.argmax()), ar.shape[1])
        return [row, col]

    def cluster(self, vectors, assign_clusters=False, trace=False):
        """
        Report the input size, reset the dendogram(s) and run the base-class
        clustering.

        @return: whatever C{VectorSpaceClusterer.cluster} returns
        """
        if self.msg_handle is not None:
            self.msg_handle.dm(str(len(vectors)))
            self.msg_handle.tile_yield()
        # The dendogram stores the merge order; it is built from float64
        # copies of the input vectors.
        self._dendogram = Dendogram(
            [numpy.array(vector, numpy.float64) for vector in vectors])
        if self._names:
            self._name_dendogram = Dendogram(self._names)
        self._vectors_to_cluster = vectors
        return VectorSpaceClusterer.cluster(
            self, vectors, assign_clusters, trace)

    def cluster_vectorspace(self, vectors, trace=False):
        """
        Merge the most similar pair of clusters until the requested number
        of clusters is reached, recording every merge in the dendogram(s).

        @param vectors: the vectors to cluster
        @param trace: when true, print the indices of each merged pair
        """
        # create a cluster for each vector
        clusters = [[vector] for vector in vectors]

        # Shallow copy (stdlib copy module): entries are replaced below,
        # never modified in place.
        vector_sum = copy.copy(vectors)
        norm_sum = [normalize(vsum) for vsum in vector_sum]

        # One column per cluster, holding that cluster's normalised sum.
        cluster_matrix = numpy.zeros([len(vectors[0]), len(vectors)])
        for column, unit in enumerate(norm_sum):
            cluster_matrix[:, column] = unit

        if self.msg_handle is not None:
            self.msg_handle.dm("initializing dot_store_matrix")
        # Entry (i, j) is the dot product of clusters i and j.
        dot_store_matrix = numpy.dot(cluster_matrix.transpose(),
                                     cluster_matrix)
        last_report = time.time()
        target = max(self._num_clusters, 1)
        while len(clusters) > target:
            # find the two best candidate clusters to merge
            i, j = self.array_max(dot_store_matrix)
            if i == j:
                print("got stuck when at " + str(len(clusters)) + " clusters")
                break
            if trace:
                print('merging %d and %d' % (i, j))

            clusters[i] = clusters[i] + clusters[j]
            del clusters[j]
            vector_sum[i] = vector_sum[i] + vector_sum[j]
            norm_sum[i] = normalize(vector_sum[i])
            cluster_matrix[:, i] = norm_sum[i]
            del vector_sum[j]
            del norm_sum[j]
            cluster_matrix = numpy.delete(cluster_matrix, j, 1)

            dot_store_matrix = numpy.dot(cluster_matrix.transpose(),
                                         cluster_matrix)

            self._dendogram.merge(i, j)
            if self._names:
                self._name_dendogram.merge(i, j)

            # Report progress at most once every five seconds. The timer is
            # reset on both paths; previously the print path never reset it
            # and so printed on every merge once five seconds had passed.
            now = time.time()
            if now - last_report > 5:
                if self.msg_handle is not None:
                    self.msg_handle.dm(str(len(clusters)))
                else:
                    print(len(clusters))
                last_report = now
        self.update_clusters(len(clusters))

    def _member_sum(self, cluster):
        """
        Return a new array holding the sum of the members of C{cluster}
        (each passed through C{self._normalise} first when normalisation is
        enabled). The members themselves are not modified.
        """
        if self._should_normalise:
            members = [self._normalise(vector) for vector in cluster]
        else:
            members = cluster
        total = numpy.array(members[0])
        for member in members[1:]:
            total += member
        return total

    def _unit_centroid(self, cluster):
        """
        Return C{normalize()} applied to the member sum of a non-empty
        cluster.
        """
        assert len(cluster) > 0
        return normalize(self._member_sum(cluster))

    def update_clusters(self, num_clusters):
        """
        Cut the dendogram into C{num_clusters} groups and recompute
        C{self._centroids} and C{self._num_clusters}.
        """
        print("entering update clusters with num_clusters = " + str(num_clusters))
        clusters = self._dendogram.groups(num_clusters)
        self._centroids = []
        for cluster in clusters:
            self._centroids.append(self._unit_centroid(cluster))
        self._num_clusters = len(self._centroids)

    def compute_rss(self, num_clusters):
        """
        Return the sum, over all vectors, of the Euclidean distance between
        the vector and the centroid of its group when the dendogram is cut
        into C{num_clusters} groups.

        NOTE(review): despite the name, distances are summed unsquared, and
        without normalisation the "centroid" is the raw member sum -
        confirm this is intended.
        """
        rss = 0
        for cluster in self._dendogram.groups(num_clusters):
            centroid = self._member_sum(cluster)
            if self._should_normalise:
                centroid = self._normalise(centroid)
            for vector in cluster:
                diff = vector - centroid
                rss = rss + numpy.sqrt(numpy.vdot(diff, diff))
        return rss

    def get_iteratively_reassigned_clusters(self, num_clusters, max_iterations=100, saveit=True):
        """
        Start from the dendogram cut at C{num_clusters} and repeatedly move
        every vector to the centroid with which it has the largest dot
        product, until no vector moves or C{max_iterations} is reached.

        @param num_clusters: number of groups to cut the dendogram into
        @param max_iterations: upper bound on reassignment passes
        @param saveit: when true, cache the result under C{num_clusters}
        @return: C{(centroids, cluster_sizes, clusters)}; the cached tuple
            is returned when one already exists for C{num_clusters}
        """
        if num_clusters in self._reassigned_clusters:
            return self._reassigned_clusters[num_clusters]
        self.update_clusters(num_clusters)
        centroids = self._centroids
        clusters = self._dendogram.groups(num_clusters)

        for iteration in range(max_iterations):
            number_reassigned = 0
            new_clusters = [[] for _ in centroids]
            for cluster_number, cluster in enumerate(clusters):
                for vec in cluster:
                    dps = [numpy.dot(numpy.transpose(centroid), vec)
                           for centroid in centroids]
                    new_cluster_index = dps.index(max(dps))
                    new_clusters[new_cluster_index].append(vec)
                    if new_cluster_index != cluster_number:
                        number_reassigned += 1
            # NOTE: a cluster that lost all of its members trips the
            # assertion in _unit_centroid, as it did before.
            centroids = [self._unit_centroid(cluster)
                         for cluster in new_clusters]
            clusters = new_clusters
            print([len(cluster) for cluster in clusters])
            print("Number reassigned = %i" % number_reassigned)
            if number_reassigned == 0:
                print("Stable after %i iterations" % iteration)
                break

        # Build the result once, after the loop, so it is stored and
        # returned even when the assignment is stable on the first pass.
        result = (centroids, [len(cluster) for cluster in clusters], clusters)
        if saveit:
            self._reassigned_clusters[num_clusters] = result
        return result
Ejemplo n.º 6
0
class CentroidClusterer(VectorSpaceClusterer):
    """
    Agglomerative clusterer that repeatedly merges the two clusters whose
    C{normalize()}'d sum vectors have the largest dot product. Merges are
    recorded in a dendogram (and in a parallel dendogram of names when
    C{vector_names} is given).
    """

    def __init__(self, vector_names=None, num_clusters=1, normalise=True, svd_dimensions=None):
        """
        @param vector_names: optional names, one per vector, tracked in a
            second dendogram
        @param num_clusters: number of clusters at which merging stops
        @param normalise: forwarded to C{VectorSpaceClusterer.__init__}
        @param svd_dimensions: forwarded to
            C{VectorSpaceClusterer.__init__}
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_clusters = num_clusters
        self._dendogram = None
        self._groups_values = None
        self._names = vector_names
        self._name_dendogram = None

    def cluster(self, vectors, assign_clusters=False, trace=False):
        """
        Reset the dendogram(s) and run the base-class clustering.

        @return: whatever C{VectorSpaceClusterer.cluster} returns
        """
        # The dendogram stores the merge order; it is built from float64
        # copies of the input vectors.
        self._dendogram = Dendogram(
            [numpy.array(vector, numpy.float64) for vector in vectors])
        if self._names:
            self._name_dendogram = Dendogram(self._names)
        self._vectors_to_cluster = vectors
        return VectorSpaceClusterer.cluster(
            self, vectors, assign_clusters, trace)

    def cluster_vectorspace(self, vectors, trace=False):
        """
        Merge the most similar pair of clusters until the requested number
        of clusters is reached, recording every merge in the dendogram(s).

        @param vectors: the vectors to cluster
        @param trace: when true, print the indices of each merged pair
        """
        # create a cluster for each vector
        clusters = [[vector] for vector in vectors]

        # Shallow copy (stdlib copy module): entries are replaced below,
        # never modified in place.
        vector_sum = copy.copy(vectors)
        norm_sum = [normalize(vsum) for vsum in vector_sum]

        target = max(self._num_clusters, 1)
        while len(clusters) > target:
            # find the two best candidate clusters to merge
            best = None
            for i in range(len(clusters)):
                for j in range(i + 1, len(clusters)):
                    sim = numpy.dot(norm_sum[i], norm_sum[j])
                    if best is None or sim > best[0]:
                        best = (sim, i, j)

            # merge them and replace in cluster list
            _, i, j = best
            if trace:
                # Single-argument call form: same output on Python 2 and 3.
                print('merging %d and %d' % (i, j))

            clusters[i] = clusters[i] + clusters[j]
            del clusters[j]
            vector_sum[i] = vector_sum[i] + vector_sum[j]
            norm_sum[i] = normalize(vector_sum[i])
            del vector_sum[j]
            del norm_sum[j]
            self._dendogram.merge(i, j)
            if self._names:
                self._name_dendogram.merge(i, j)
            # Progress output every time the count hits a multiple of 50.
            if len(clusters) % 50 == 0:
                print(len(clusters))
        self.update_clusters(self._num_clusters)

    def _member_sum(self, cluster):
        """
        Return a new array holding the sum of the members of C{cluster}
        (each passed through C{self._normalise} first when normalisation is
        enabled). The members themselves are not modified.
        """
        if self._should_normalise:
            members = [self._normalise(vector) for vector in cluster]
        else:
            members = cluster
        total = numpy.array(members[0])
        for member in members[1:]:
            total += member
        return total

    def update_clusters(self, num_clusters):
        """
        Cut the dendogram into C{num_clusters} groups and recompute
        C{self._centroids} and C{self._num_clusters}. Each centroid is the
        group mean, additionally passed through C{normalize()} when
        normalisation is enabled.
        """
        clusters = self._dendogram.groups(num_clusters)
        self._centroids = []
        for cluster in clusters:
            assert len(cluster) > 0
            centroid = self._member_sum(cluster)
            centroid /= float(len(cluster))
            if self._should_normalise:
                centroid = normalize(centroid)
            self._centroids.append(centroid)
        self._num_clusters = len(self._centroids)

    def compute_rss(self, num_clusters):
        """
        Return the sum, over all vectors, of the Euclidean distance between
        the vector and the mean of its group when the dendogram is cut into
        C{num_clusters} groups.

        NOTE(review): despite the name, distances are summed unsquared -
        confirm this is intended.
        """
        rss = 0
        for cluster in self._dendogram.groups(num_clusters):
            centroid = self._member_sum(cluster)
            centroid /= float(len(cluster))
            for vector in cluster:
                diff = vector - centroid
                rss = rss + numpy.sqrt(numpy.vdot(diff, diff))
        return rss

    def classify_vectorspace(self, vector):
        """
        @return: the index of the centroid with the largest dot product
            with C{vector}
        """
        best = None
        for index in range(self._num_clusters):
            sim = self._similarity(vector, self._centroids[index])
            if best is None or sim > best[0]:
                best = (sim, index)
        return best[1]

    def dendogram(self):
        """
        @return: The dendogram representing the current clustering
        @rtype:  Dendogram
        """
        return self._dendogram

    def name_dendogram(self):
        """
        @return: the dendogram of vector names, or None when no names were
            given or C{cluster} has not run yet
        """
        return self._name_dendogram

    def num_clusters(self):
        """
        @return: the current number of clusters
        """
        return self._num_clusters

    def _similarity(self, v1, v2):
        """Return the dot product of C{v1} and C{v2}."""
        return numpy.dot(v1, v2)

    def __repr__(self):
        return '<Centroid Clusterer n=%d>' % self._num_clusters
class GAAClusterer(VectorSpaceClusterer):
    """
    The Group Average Agglomerative clusterer starts with each of the N
    vectors as a singleton cluster. It then iteratively merges the pair of
    clusters with the highest group-average similarity. This continues until
    C{num_clusters} clusters remain (never fewer than one). The order of
    merges gives rise to a dendogram: a tree with the earlier merges lower
    than later merges. The membership of a given number of clusters c, 1
    <= c <= N, can be found by cutting the dendogram at depth c.

    This clusterer uses the cosine similarity metric only, which allows for
    efficient speed-up in the clustering process.
    """

    def __init__(self, num_clusters=1, normalise=True, svd_dimensions=None):
        """
        @param num_clusters: number of clusters at which merging stops
        @param normalise: forwarded to C{VectorSpaceClusterer.__init__}
        @param svd_dimensions: forwarded to
            C{VectorSpaceClusterer.__init__}
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_clusters = num_clusters
        self._dendogram = None
        self._groups_values = None

    def cluster(self, vectors, assign_clusters=False, trace=False):
        """
        Reset the merge-order dendogram and run the base-class clustering.

        @return: whatever C{VectorSpaceClusterer.cluster} returns
        """
        # The dendogram stores the merge order; it is built from float64
        # copies of the input vectors.
        leaves = [numpy.array(vector, numpy.float64) for vector in vectors]
        self._dendogram = Dendogram(leaves)
        return VectorSpaceClusterer.cluster(
            self, vectors, assign_clusters, trace)

    def cluster_vectorspace(self, vectors, trace=False):
        """
        Merge the most similar pair of clusters until the requested number
        of clusters is reached, recording every merge in the dendogram.

        @param vectors: the vectors to cluster
        @param trace: when true, print the indices of each merged pair
        """
        # create a cluster for each vector
        clusters = [[vector] for vector in vectors]

        # the sum vector of each cluster (shallow copy: the entries are
        # replaced below, never modified in place)
        vector_sum = copy.copy(vectors)

        target = max(self._num_clusters, 1)
        while len(clusters) > target:
            # find the two best candidate clusters to merge, based on their
            # S(union c_i, c_j)
            best = None
            for i in range(len(clusters)):
                for j in range(i + 1, len(clusters)):
                    sim = self._average_similarity(
                        vector_sum[i], len(clusters[i]),
                        vector_sum[j], len(clusters[j]))
                    if best is None or sim > best[0]:
                        best = (sim, i, j)

            # merge them and replace in cluster list
            _, i, j = best
            if trace:
                # Single-argument call form: same output on Python 2 and 3.
                print('merging %d and %d' % (i, j))

            clusters[i] = clusters[i] + clusters[j]
            del clusters[j]
            vector_sum[i] = vector_sum[i] + vector_sum[j]
            del vector_sum[j]

            self._dendogram.merge(i, j)

        self.update_clusters(self._num_clusters)

    def _centroid_of(self, cluster):
        """
        Return a new array holding the mean of the members of C{cluster}
        (each passed through C{self._normalise} first when normalisation is
        enabled). The members themselves are not modified.
        """
        if self._should_normalise:
            members = [self._normalise(vector) for vector in cluster]
        else:
            members = cluster
        centroid = numpy.array(members[0])
        for member in members[1:]:
            centroid += member
        centroid /= float(len(cluster))
        return centroid

    def update_clusters(self, num_clusters):
        """
        Cut the dendogram into C{num_clusters} groups and recompute
        C{self._centroids} (the mean of each group) and
        C{self._num_clusters}.
        """
        clusters = self._dendogram.groups(num_clusters)
        self._centroids = []
        for cluster in clusters:
            assert len(cluster) > 0
            self._centroids.append(self._centroid_of(cluster))
        self._num_clusters = len(self._centroids)

    def compute_rss(self, num_clusters):
        """
        Return the sum, over all vectors, of the Euclidean distance between
        the vector and the centroid of its group when the dendogram is cut
        into C{num_clusters} groups.

        The centroid is the group mean, exactly as in C{update_clusters};
        previously the un-averaged member sum was used here, which made the
        result grow with cluster size.

        NOTE(review): despite the name, distances are summed unsquared -
        confirm this is intended.
        """
        rss = 0
        for cluster in self._dendogram.groups(num_clusters):
            centroid = self._centroid_of(cluster)
            for vector in cluster:
                diff = vector - centroid
                rss = rss + numpy.sqrt(numpy.vdot(diff, diff))
        return rss

    def classify_vectorspace(self, vector):
        """
        @return: the index of the centroid with the highest average
            similarity to C{vector}
        """
        best = None
        for index in range(self._num_clusters):
            sim = self._average_similarity(
                vector, 1, self._centroids[index], 1)
            if best is None or sim > best[0]:
                best = (sim, index)
        return best[1]

    def dendogram(self):
        """
        @return: The dendogram representing the current clustering
        @rtype:  Dendogram
        """
        return self._dendogram

    def num_clusters(self):
        """
        @return: the current number of clusters
        """
        return self._num_clusters

    def _average_similarity(self, v1, l1, v2, l2):
        """
        Group-average similarity of the union of two clusters, given their
        sum vectors C{v1}, C{v2} and sizes C{l1}, C{l2}.

        NOTE(review): subtracting C{length} presumably removes the
        self-similarity of unit-length members - confirm that inputs are
        always normalised.
        """
        asum = v1 + v2
        length = l1 + l2
        return (numpy.dot(asum, asum) - length) / (length * (length - 1))

    def __repr__(self):
        return '<GroupAverageAgglomerative Clusterer n=%d>' % self._num_clusters
class OptCentroidClusterer(VectorSpaceClusterer):
    """
    Agglomerative clusterer that repeatedly merges the two clusters whose
    C{normalize()}'d sum vectors have the largest dot product, locating the
    best pair through a matrix of all pairwise dot products. Merges are
    recorded in a dendogram (and in a parallel dendogram of names when
    C{vector_names} is given).
    """

    def __init__(self,
                 vector_names=None,
                 num_clusters=1,
                 normalise=True,
                 svd_dimensions=None):
        """
        @param vector_names: optional names, one per vector, tracked in a
            second dendogram
        @param num_clusters: number of clusters at which merging stops
        @param normalise: forwarded to C{VectorSpaceClusterer.__init__}
        @param svd_dimensions: forwarded to
            C{VectorSpaceClusterer.__init__}
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_clusters = num_clusters
        self._dendogram = None
        self._groups_values = None
        self._names = vector_names
        self._name_dendogram = None

    def array_max(self, ar):
        """
        Return C{[row, column]} of the largest off-diagonal entry of C{ar}.

        The diagonal of C{ar} is overwritten in place with -9e9 so that it
        can never be selected.
        """
        numpy.fill_diagonal(ar, -9e9)
        # divmod uses floor division, so the row index is right on both
        # Python 2 and 3; round(location / ncols) rounds up to the next row
        # under true division.
        row, col = divmod(int(ar.argmax()), ar.shape[1])
        return [row, col]

    def cluster(self, vectors, assign_clusters=False, trace=False):
        """
        Reset the dendogram(s) and run the base-class clustering.

        @return: whatever C{VectorSpaceClusterer.cluster} returns
        """
        # The dendogram stores the merge order; it is built from float64
        # copies of the input vectors.
        leaves = [numpy.array(vector, numpy.float64) for vector in vectors]
        self._dendogram = Dendogram(leaves)
        if self._names:
            self._name_dendogram = Dendogram(self._names)
        self._vectors_to_cluster = vectors
        return VectorSpaceClusterer.cluster(
            self, vectors, assign_clusters, trace)

    def cluster_vectorspace(self, vectors, trace=False):
        """
        Merge the most similar pair of clusters until the requested number
        of clusters is reached, recording every merge in the dendogram(s).

        @param vectors: the vectors to cluster
        @param trace: when true, print the indices of each merged pair
        """
        # create a cluster for each vector
        clusters = [[vector] for vector in vectors]

        # Shallow copy (stdlib copy module): entries are replaced below,
        # never modified in place.
        vector_sum = copy.copy(vectors)
        norm_sum = [normalize(vsum) for vsum in vector_sum]

        # One column per cluster, holding that cluster's normalised sum.
        cluster_matrix = numpy.zeros([len(vectors[0]), len(vectors)])
        for column, unit in enumerate(norm_sum):
            cluster_matrix[:, column] = unit

        # Entry (i, j) is the dot product of clusters i and j.
        dot_store_matrix = numpy.dot(cluster_matrix.transpose(),
                                     cluster_matrix)

        target = max(self._num_clusters, 1)
        while len(clusters) > target:
            # find the two best candidate clusters to merge
            i, j = self.array_max(dot_store_matrix)
            if i == j:
                print("got stuck when at " + str(len(clusters)) + " clusters")
                break
            if trace:
                print('merging %d and %d' % (i, j))

            clusters[i] = clusters[i] + clusters[j]
            del clusters[j]
            vector_sum[i] = vector_sum[i] + vector_sum[j]
            norm_sum[i] = normalize(vector_sum[i])
            cluster_matrix[:, i] = norm_sum[i]
            del vector_sum[j]
            del norm_sum[j]
            cluster_matrix = numpy.delete(cluster_matrix, j, 1)

            dot_store_matrix = numpy.dot(cluster_matrix.transpose(),
                                         cluster_matrix)

            self._dendogram.merge(i, j)
            if self._names:
                self._name_dendogram.merge(i, j)
            # Progress output every time the count hits a multiple of 50.
            if len(clusters) % 50 == 0:
                print(len(clusters))
        self.update_clusters(len(clusters))

    def _member_sum(self, cluster):
        """
        Return a new array holding the sum of the members of C{cluster}
        (each passed through C{self._normalise} first when normalisation is
        enabled). The members themselves are not modified.
        """
        if self._should_normalise:
            members = [self._normalise(vector) for vector in cluster]
        else:
            members = cluster
        total = numpy.array(members[0])
        for member in members[1:]:
            total += member
        return total

    def update_clusters(self, num_clusters):
        """
        Cut the dendogram into C{num_clusters} groups and recompute
        C{self._centroids} (C{normalize()} of each group's member sum) and
        C{self._num_clusters}.
        """
        print("entering update clusters with num_clusters = " + str(
            num_clusters))
        clusters = self._dendogram.groups(num_clusters)
        self._centroids = []
        for cluster in clusters:
            assert len(cluster) > 0
            self._centroids.append(normalize(self._member_sum(cluster)))
        self._num_clusters = len(self._centroids)

    def compute_rss(self, num_clusters):
        """
        Return the sum, over all vectors, of the Euclidean distance between
        the vector and the centroid of its group when the dendogram is cut
        into C{num_clusters} groups.

        NOTE(review): despite the name, distances are summed unsquared, and
        without normalisation the "centroid" is the raw member sum -
        confirm this is intended.
        """
        rss = 0
        for cluster in self._dendogram.groups(num_clusters):
            centroid = self._member_sum(cluster)
            if self._should_normalise:
                centroid = self._normalise(centroid)
            for vector in cluster:
                diff = vector - centroid
                rss = rss + numpy.sqrt(numpy.vdot(diff, diff))
        return rss

    def classify_vectorspace(self, vector):
        """
        @return: the index of the centroid with the largest dot product
            with C{vector}
        """
        best = None
        for index in range(self._num_clusters):
            sim = self._similarity(vector, self._centroids[index])
            if best is None or sim > best[0]:
                best = (sim, index)
        return best[1]

    def dendogram(self):
        """
        @return: The dendogram representing the current clustering
        @rtype:  Dendogram
        """
        return self._dendogram

    def name_dendogram(self):
        """
        @return: the dendogram of vector names, or None when no names were
            given or C{cluster} has not run yet
        """
        return self._name_dendogram

    def num_clusters(self):
        """
        @return: the current number of clusters
        """
        return self._num_clusters

    def _similarity(self, v1, v2):
        """Return the dot product of C{v1} and C{v2}."""
        return numpy.dot(v1, v2)

    def __repr__(self):
        return '<Opt Centroid Clusterer n=%d>' % self._num_clusters
class CentroidClusterer(VectorSpaceClusterer):
    """
    Agglomerative clusterer that repeatedly merges the two clusters whose
    C{normalize()}'d sum vectors have the largest dot product. Merges are
    recorded in a dendogram (and in a parallel dendogram of names when
    C{vector_names} is given).
    """

    def __init__(self,
                 vector_names=None,
                 num_clusters=1,
                 normalise=True,
                 svd_dimensions=None):
        """
        @param vector_names: optional names, one per vector, tracked in a
            second dendogram
        @param num_clusters: number of clusters at which merging stops
        @param normalise: forwarded to C{VectorSpaceClusterer.__init__}
        @param svd_dimensions: forwarded to
            C{VectorSpaceClusterer.__init__}
        """
        VectorSpaceClusterer.__init__(self, normalise, svd_dimensions)
        self._num_clusters = num_clusters
        self._dendogram = None
        self._groups_values = None
        self._names = vector_names
        self._name_dendogram = None

    def cluster(self, vectors, assign_clusters=False, trace=False):
        """
        Reset the dendogram(s) and run the base-class clustering.

        @return: whatever C{VectorSpaceClusterer.cluster} returns
        """
        # The dendogram stores the merge order; it is built from float64
        # copies of the input vectors.
        leaves = [numpy.array(vector, numpy.float64) for vector in vectors]
        self._dendogram = Dendogram(leaves)
        if self._names:
            self._name_dendogram = Dendogram(self._names)
        self._vectors_to_cluster = vectors
        return VectorSpaceClusterer.cluster(
            self, vectors, assign_clusters, trace)

    def cluster_vectorspace(self, vectors, trace=False):
        """
        Merge the most similar pair of clusters until the requested number
        of clusters is reached, recording every merge in the dendogram(s).

        @param vectors: the vectors to cluster
        @param trace: when true, print the indices of each merged pair
        """
        # create a cluster for each vector
        clusters = [[vector] for vector in vectors]

        # Shallow copy (stdlib copy module): entries are replaced below,
        # never modified in place.
        vector_sum = copy.copy(vectors)
        norm_sum = [normalize(vsum) for vsum in vector_sum]

        target = max(self._num_clusters, 1)
        while len(clusters) > target:
            # find the two best candidate clusters to merge
            best = None
            for i in range(len(clusters)):
                for j in range(i + 1, len(clusters)):
                    sim = numpy.dot(norm_sum[i], norm_sum[j])
                    if best is None or sim > best[0]:
                        best = (sim, i, j)

            # merge them and replace in cluster list
            _, i, j = best
            if trace:
                # Single-argument call form: same output on Python 2 and 3.
                print('merging %d and %d' % (i, j))

            clusters[i] = clusters[i] + clusters[j]
            del clusters[j]
            vector_sum[i] = vector_sum[i] + vector_sum[j]
            norm_sum[i] = normalize(vector_sum[i])
            del vector_sum[j]
            del norm_sum[j]
            self._dendogram.merge(i, j)
            if self._names:
                self._name_dendogram.merge(i, j)
            # Progress output every time the count hits a multiple of 50.
            if len(clusters) % 50 == 0:
                print(len(clusters))
        self.update_clusters(self._num_clusters)

    def _member_sum(self, cluster):
        """
        Return a new array holding the sum of the members of C{cluster}
        (each passed through C{self._normalise} first when normalisation is
        enabled). The members themselves are not modified.
        """
        if self._should_normalise:
            members = [self._normalise(vector) for vector in cluster]
        else:
            members = cluster
        total = numpy.array(members[0])
        for member in members[1:]:
            total += member
        return total

    def update_clusters(self, num_clusters):
        """
        Cut the dendogram into C{num_clusters} groups and recompute
        C{self._centroids} and C{self._num_clusters}.

        Each stored centroid is the member sum of its group, passed through
        C{normalize()} when normalisation is enabled. The sum is
        deliberately not divided by the group size here.
        """
        clusters = self._dendogram.groups(num_clusters)
        self._centroids = []
        for cluster in clusters:
            assert len(cluster) > 0
            centroid = self._member_sum(cluster)
            if self._should_normalise:
                centroid = normalize(centroid)
            self._centroids.append(centroid)
        self._num_clusters = len(self._centroids)

    def compute_rss(self, num_clusters):
        """
        Return the sum, over all vectors, of the Euclidean distance between
        the vector and the mean of its group when the dendogram is cut into
        C{num_clusters} groups.

        The member sum is divided by the group size before distances are
        taken; previously the un-averaged sum was used, which made the
        result grow with cluster size.

        NOTE(review): despite the name, distances are summed unsquared -
        confirm this is intended.
        """
        rss = 0
        for cluster in self._dendogram.groups(num_clusters):
            centroid = self._member_sum(cluster)
            centroid /= float(len(cluster))
            for vector in cluster:
                diff = vector - centroid
                rss = rss + numpy.sqrt(numpy.vdot(diff, diff))
        return rss

    def classify_vectorspace(self, vector):
        """
        @return: the index of the centroid with the largest dot product
            with C{vector}
        """
        best = None
        for index in range(self._num_clusters):
            sim = self._similarity(vector, self._centroids[index])
            if best is None or sim > best[0]:
                best = (sim, index)
        return best[1]

    def dendogram(self):
        """
        @return: The dendogram representing the current clustering
        @rtype:  Dendogram
        """
        return self._dendogram

    def name_dendogram(self):
        """
        @return: the dendogram of vector names, or None when no names were
            given or C{cluster} has not run yet
        """
        return self._name_dendogram

    def num_clusters(self):
        """
        @return: the current number of clusters
        """
        return self._num_clusters

    def _similarity(self, v1, v2):
        """Return the dot product of C{v1} and C{v2}."""
        # The leftover debug print that fired on every comparison has been
        # removed.
        return numpy.dot(v1, v2)

    def __repr__(self):
        return '<Centroid Clusterer n=%d>' % self._num_clusters