Example #1
0
    def updatedist(self, c):
        """
        Rebuild the distance entries that involve cluster c.

        self.removedist(c) is called first (presumably to purge entries
        that still refer to c's previous state -- confirm against
        removedist). Afterwards one Dist record is pushed onto the
        self.dist heap for every cluster in self.clusters that does not
        compare equal to c.
        """
        self.removedist(c)

        for other in self.clusters:
            if not other == c:
                # Distance is measured between the two cluster centers.
                gap = kernel_dist(other.center, c.center)
                entry = Dist(other, c, gap)
                heapq.heappush(self.dist, entry)
Example #2
0
    def updatedist(self, c):
        """
        Recompute the pairwise distances between cluster c and the rest.

        Called after c has changed. self.removedist(c) runs first
        (presumably discarding entries recorded for c earlier -- confirm
        against removedist); then, for each cluster in self.clusters that
        is not equal to c, a Dist record holding the kernel distance
        between the two centers is pushed onto the self.dist heap.
        """
        self.removedist(c)

        for neighbour in self.clusters:
            if neighbour == c:
                # A cluster has no distance entry to itself.
                continue
            separation = kernel_dist(neighbour.center, c.center)
            record = Dist(neighbour, c, separation)
            heapq.heappush(self.dist, record)
Example #3
0
    def cluster(self, document):
        '''
        Performs clustering for a new document. It takes as input 
        a document object from the db and finds the closer cluster for it.
        '''
        doc_index = self.add_document(document)
        doc_id = str(document.id)
        doc_content = document.content

        self.construct_term_doc_matrix(index=doc_index, document=doc_content)
        
        print 'N', len(self.clusters)
        print 'clustering', doc_index
        if doc_index > 0: #ignore the first document
            #e = doc_index
            e = self.td_matrix
            newc=OnlineCluster(a=e, cluster_id=self.cluster_id_counter, doc_id=doc_id, doc_content=doc_content, term_vector=self.attributes) 
            
            #If the new term vector is larger then change all the cluster centers
            #However, if the new term vector is smaller then pad the new cluster's center
            if len(self.clusters) > 0:
                if len(newc.term_vector) > len(self.clusters[0].term_vector):
                    self.resize()
                else:
                    newc.resize(self.clusters[0].term_vector)
                    e = newc.center
                
            if len(self.clusters)>0: 
                # Compare the new document to each existing cluster
                c=[ ( i, kernel_dist(x.center, e) ) for i,x in enumerate(self.clusters)]
                closest_cluster = min( c , key=operator.itemgetter(1))
                if closest_cluster[1] < 1.0:
                    closest=self.clusters[closest_cluster[0]]
                    closest.add(e, doc_id, doc_content)
                    # invalidate dist-cache for this cluster
                    self.updatedist(closest)
                else:
                    # make a new cluster for this point
                    self.clusters.append(newc)
                    self.updatedist(newc)
    
            if len(self.clusters)>=self.N and len(self.clusters)>1:
                # merge closest two clusters. It doesn't matter which ones, Only the closest
                m=heapq.heappop(self.dist)
                m.x.merge(m.y)
                self.clusters.remove(m.y)
                self.removedist(m.y)
                self.updatedist(m.x)
                self.cluster_id_counter += 1
        else:
            newc=OnlineCluster(a=self.td_matrix, cluster_id=self.cluster_id_counter, doc_id=doc_id, doc_content=doc_content, term_vector=self.attributes) 
            self.clusters.append(newc)
            self.updatedist(newc)
Example #4
0
    def cluster(self, document):
        '''
        Performs clustering for a new document. It takes as input 
        a document object from the db and finds the closer cluster for it.
        '''
        doc_index = self.add_document(document)
        doc_id = str(document.id)
        doc_content = document.content

        self.construct_term_doc_matrix(index=doc_index, document=doc_content)

        print 'N', len(self.clusters)
        print 'clustering', doc_index
        if doc_index > 0:  #ignore the first document
            #e = doc_index
            e = self.td_matrix
            newc = OnlineCluster(a=e,
                                 cluster_id=self.cluster_id_counter,
                                 doc_id=doc_id,
                                 doc_content=doc_content,
                                 term_vector=self.attributes)

            #If the new term vector is larger then change all the cluster centers
            #However, if the new term vector is smaller then pad the new cluster's center
            if len(self.clusters) > 0:
                if len(newc.term_vector) > len(self.clusters[0].term_vector):
                    self.resize()
                else:
                    newc.resize(self.clusters[0].term_vector)
                    e = newc.center

            if len(self.clusters) > 0:
                # Compare the new document to each existing cluster
                c = [(i, kernel_dist(x.center, e))
                     for i, x in enumerate(self.clusters)]
                closest_cluster = min(c, key=operator.itemgetter(1))
                if closest_cluster[1] < 1.0:
                    closest = self.clusters[closest_cluster[0]]
                    closest.add(e, doc_id, doc_content)
                    # invalidate dist-cache for this cluster
                    self.updatedist(closest)
                else:
                    # make a new cluster for this point
                    self.clusters.append(newc)
                    self.updatedist(newc)

            if len(self.clusters) >= self.N and len(self.clusters) > 1:
                # merge closest two clusters. It doesn't matter which ones, Only the closest
                m = heapq.heappop(self.dist)
                m.x.merge(m.y)
                self.clusters.remove(m.y)
                self.removedist(m.y)
                self.updatedist(m.x)
                self.cluster_id_counter += 1
        else:
            newc = OnlineCluster(a=self.td_matrix,
                                 cluster_id=self.cluster_id_counter,
                                 doc_id=doc_id,
                                 doc_content=doc_content,
                                 term_vector=self.attributes)
            self.clusters.append(newc)
            self.updatedist(newc)