def updatedist(self, c):
    """
    Rebuild the cached distance entries that involve cluster ``c``.

    Meant to be called after ``c`` has changed.  ``self.removedist(c)`` is
    invoked first (presumably to discard the stale entries for ``c``;
    that helper is not shown here), then one fresh ``Dist`` record per
    other cluster is pushed onto the ``self.dist`` heap.
    """
    self.removedist(c)

    # Lazily walk every cluster except ``c`` itself.
    others = (candidate for candidate in self.clusters if not candidate == c)
    for other in others:
        separation = kernel_dist(other.center, c.center)
        heapq.heappush(self.dist, Dist(other, c, separation))
def updatedist(self, c):
    """
    Refresh the inter-cluster distances after cluster ``c`` has changed.

    Calls ``self.removedist(c)`` and then pushes a new ``Dist`` entry onto
    the ``self.dist`` heap for each cluster other than ``c``, using the
    kernel distance between the two cluster centers.
    """
    self.removedist(c)

    for neighbour in self.clusters:
        if not neighbour == c:
            # Distance from every remaining cluster to the changed one.
            gap = kernel_dist(neighbour.center, c.center)
            entry = Dist(neighbour, c, gap)
            heapq.heappush(self.dist, entry)
def cluster(self, document, threshold=1.0):
    '''
    Performs clustering for a new document.

    The document is registered with ``self.add_document`` and folded into
    the term/document matrix.  After that:

    * the first document (index not greater than 0) simply seeds a
      cluster of its own;
    * any later document is compared against every existing cluster
      center; it joins the nearest cluster when the kernel distance is
      below ``threshold``, otherwise a new cluster is opened for it;
    * once the number of clusters reaches ``self.N`` (and there is more
      than one), the pair popped from the ``self.dist`` heap is merged.

    Args:
        document: object from the db exposing ``id`` and ``content``.
        threshold: maximum kernel distance at which the document is
            added to an existing cluster.  Defaults to 1.0, the value
            that used to be hard-coded.

    Returns:
        None.  ``self.clusters``, ``self.dist`` and
        ``self.cluster_id_counter`` are updated in place.
    '''
    doc_index = self.add_document(document)
    doc_id = str(document.id)
    doc_content = document.content
    self.construct_term_doc_matrix(index=doc_index, document=doc_content)

    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('N %s' % len(self.clusters))
    print('clustering %s' % (doc_index,))

    # Both paths start from a candidate cluster holding just this document.
    newc = OnlineCluster(
        a=self.td_matrix,
        cluster_id=self.cluster_id_counter,
        doc_id=doc_id,
        doc_content=doc_content,
        term_vector=self.attributes,
    )

    if doc_index <= 0:
        # First document: nothing to compare against, it seeds a cluster.
        self.clusters.append(newc)
        self.updatedist(newc)
        return

    e = self.td_matrix

    # If the new term vector is larger then change all the cluster centers.
    # However, if the new term vector is smaller then pad the new cluster's
    # center and use the padded center for the comparison below.
    if self.clusters:
        if len(newc.term_vector) > len(self.clusters[0].term_vector):
            self.resize()
        else:
            newc.resize(self.clusters[0].term_vector)
            e = newc.center

    if self.clusters:
        # Compare the new document to each existing cluster.
        distances = [(i, kernel_dist(x.center, e))
                     for i, x in enumerate(self.clusters)]
        closest_index, closest_distance = min(
            distances, key=operator.itemgetter(1))
        if closest_distance < threshold:
            closest = self.clusters[closest_index]
            closest.add(e, doc_id, doc_content)
            # Invalidate the dist-cache for this cluster.
            self.updatedist(closest)
        else:
            # Too far from everything: make a new cluster for this point.
            self.clusters.append(newc)
            self.updatedist(newc)

    if len(self.clusters) >= self.N and len(self.clusters) > 1:
        # Merge the closest two clusters: the heap top is the closest pair.
        m = heapq.heappop(self.dist)
        m.x.merge(m.y)
        self.clusters.remove(m.y)
        self.removedist(m.y)
        self.updatedist(m.x)

    # NOTE(review): the counter is advanced once per non-first document,
    # whether or not ``newc`` was kept; this nesting was reconstructed
    # from a whitespace-flattened source -- confirm against the original.
    self.cluster_id_counter += 1
def cluster(self, document, threshold=1.0):
    '''
    Performs clustering for a new document.

    Takes a document object from the db, registers it through
    ``self.add_document`` / ``self.construct_term_doc_matrix`` and then
    finds the closest cluster for it:

    * the first document (index not greater than 0) seeds a cluster by
      itself;
    * a later document is added to the nearest existing cluster when its
      kernel distance to that cluster's center is below ``threshold``;
      otherwise it becomes a new cluster;
    * when the number of clusters reaches ``self.N`` (and exceeds one),
      the pair popped from the ``self.dist`` heap is merged.

    Args:
        document: object from the db exposing ``id`` and ``content``.
        threshold: maximum kernel distance for joining an existing
            cluster.  Defaults to 1.0, the previously hard-coded value.

    Returns:
        None.  ``self.clusters``, ``self.dist`` and
        ``self.cluster_id_counter`` are modified in place.
    '''
    doc_index = self.add_document(document)
    doc_id = str(document.id)
    doc_content = document.content
    self.construct_term_doc_matrix(index=doc_index, document=doc_content)

    # Single-argument print(...) emits the same text on Python 2 and 3.
    print('N %s' % len(self.clusters))
    print('clustering %s' % (doc_index,))

    # The candidate cluster is built the same way on every path.
    newc = OnlineCluster(
        a=self.td_matrix,
        cluster_id=self.cluster_id_counter,
        doc_id=doc_id,
        doc_content=doc_content,
        term_vector=self.attributes,
    )

    if doc_index <= 0:
        # First document: it becomes the initial cluster as-is.
        self.clusters.append(newc)
        self.updatedist(newc)
        return

    e = self.td_matrix

    # A longer new term vector means every existing center must grow;
    # a shorter (or equal) one means the new cluster's center is padded
    # and the padded center is what gets compared below.
    if self.clusters:
        reference_vector = self.clusters[0].term_vector
        if len(newc.term_vector) > len(reference_vector):
            self.resize()
        else:
            newc.resize(reference_vector)
            e = newc.center

    if self.clusters:
        # Compare the new document to each existing cluster.
        scored = [(i, kernel_dist(x.center, e))
                  for i, x in enumerate(self.clusters)]
        best_index, best_distance = min(scored, key=operator.itemgetter(1))
        if best_distance < threshold:
            closest = self.clusters[best_index]
            closest.add(e, doc_id, doc_content)
            # Invalidate the dist-cache for this cluster.
            self.updatedist(closest)
        else:
            # Nothing is close enough: make a new cluster for this point.
            self.clusters.append(newc)
            self.updatedist(newc)

    if len(self.clusters) >= self.N and len(self.clusters) > 1:
        # Merge the closest two clusters: the heap top is the closest pair.
        m = heapq.heappop(self.dist)
        m.x.merge(m.y)
        self.clusters.remove(m.y)
        self.removedist(m.y)
        self.updatedist(m.x)

    # NOTE(review): the counter is advanced once per non-first document,
    # whether or not ``newc`` was kept; this nesting was reconstructed
    # from a whitespace-flattened source -- confirm against the original.
    self.cluster_id_counter += 1