def add(self, docid):
    """Insert document ``docid`` into this cluster and refresh the clustroid.

    The new document's vector is compared (via ``cosdist``) against every
    current member. ``self.maxdist[k]`` tracks the largest distance seen
    between member ``k`` and any other member, so each existing entry is
    raised where the new document is farther away, and a new entry is
    appended for the new document itself. The clustroid then becomes the
    member with the smallest such maximum distance.

    Args:
        docid: Identifier passed to ``self.reader.read_docId`` to load the
            document's vector.
    """
    new_vec = self.reader.read_docId(docid)

    # Largest distance from the new document to any existing member;
    # stays 0 when the cluster has no members yet.
    farthest = 0
    for pos, member_id in enumerate(self.vectorsid):
        gap = cosdist(new_vec, self.reader.read_docId(member_id))
        if gap > self.maxdist[pos]:
            self.maxdist[pos] = gap
        if gap > farthest:
            farthest = gap

    self.vectorsid.append(docid)
    self.maxdist.append(farthest)

    # The clustroid is the member whose worst-case distance is smallest.
    best_id = self.vectorsid[np.argmin(self.maxdist)]
    if best_id == self.clustroidid:
        return
    self.clustroidid = best_id
    self.clustroid = self.reader.read_docId(best_id)
def merge(self, cluster):
    """Absorb every document of ``cluster`` into this cluster.

    The distance between each member of ``self`` and each member of
    ``cluster`` is computed with ``cosdist`` and used to raise the
    per-member maximum distances on both sides. The absorbed members and
    their updated maxima are then appended to ``self.vectorsid`` and
    ``self.maxdist``, and the clustroid is re-selected as the member with
    the smallest maximum distance.

    Args:
        cluster: The cluster to absorb. Its vectors are loaded through
            ``cluster.reader``; its ``vectorsid`` and ``maxdist`` lists are
            read but not modified.
    """
    selfvectors = [self.reader.read_docId(i) for i in self.vectorsid]
    clustervectors = [cluster.reader.read_docId(i) for i in cluster.vectorsid]

    # Copy rather than alias: the updates below must not rewrite the
    # absorbed cluster's own maxdist list in place.
    clustermaxdist = list(cluster.maxdist)

    for i, selfvec in enumerate(selfvectors):
        for j, clustervec in enumerate(clustervectors):
            dist = cosdist(selfvec, clustervec)
            if dist > self.maxdist[i]:
                self.maxdist[i] = dist
            if dist > clustermaxdist[j]:
                clustermaxdist[j] = dist

    self.vectorsid.extend(cluster.vectorsid)
    self.maxdist.extend(clustermaxdist)

    # The clustroid is the member whose worst-case distance is smallest.
    newclustroidid = self.vectorsid[np.argmin(self.maxdist)]
    if newclustroidid != self.clustroidid:
        self.clustroidid = newclustroidid
        self.clustroid = self.reader.read_docId(newclustroidid)
#clusters = [Cluster(tfidf_list[c_idx]) for c_idx in centroids_idx] # Inititalize centroids by hierarchical clustering on a sample #nsample = math.ceil(0.05*npoint) nsample = 5*ncluster idxsample = rnd.sample(range(npoint), nsample) docIdsample = [reader.read_idx(i)[0] for i in idxsample] clusters_s = [Cluster(docIdsample[i], reader) for i in range(nsample)] distmat = np.zeros((nsample, nsample)) print('Calculating initial clustroids') for i in range(nsample) : distmat[i,i] = None for j in range(i+1,nsample) : dist = cosdist(clusters_s[i].clustroid, clusters_s[j].clustroid) distmat[i,j] = dist distmat[j,i] = dist nmerged = nsample while nmerged != ncluster : flatidx = np.nanargmin(distmat) i = flatidx//nsample j = flatidx%nsample clusters_s[i].merge(clusters_s[j]) clusters_s[j] = None distmat[j,:] = None distmat[:,j] = None for k in range(nsample) : if clusters_s[k] != None and k != i : dist = cosdist(clusters_s[k].clustroid, clusters_s[i].clustroid)