def do_stemmed():
    """Run the clustering workflow on the stemmed blog data set.

    Steps, in order:

    1. Call ``generate_blogfile_stem()`` and load
       ``datafiles/blogtop500_stemmed.txt`` with ``clusters.readfile``.
    2. Pass the data to ``clusters.hcluster``; write the result as text to
       ``datafiles/blogtop500stemmed_asciideno.txt`` and as an image to
       ``datafiles/blogtop500stemmed_deno.jpg``.
    3. For k in 5, 10 and 20, call ``clusters.kcluster_toFile`` and log the
       blog names of every returned group to stdout and to
       ``datafiles/kmeans_blogtop500stemmed.txt``.
    4. Call ``clusters.scaledown_logiter`` (logging to
       ``datafiles/dimensionReductionStemmed.txt``) and draw the result to
       ``datafiles/blogtop500stemmed_clust2d.jpg``.

    Returns nothing; all results are written to files or printed.
    """
    generate_blogfile_stem()
    blognames, _words, data = clusters.readfile('datafiles/blogtop500_stemmed.txt')

    # Hierarchical clustering result: text rendering first, then the image.
    tree = clusters.hcluster(data)
    with open("datafiles/blogtop500stemmed_asciideno.txt", "w+") as tree_file:
        clusters.printclust2file(tree, tree_file, labels=blognames)
    clusters.drawdendrogram(tree, blognames, jpeg='datafiles/blogtop500stemmed_deno.jpg')

    # One report file shared by all three k values.
    with open("datafiles/kmeans_blogtop500stemmed.txt", "w+") as report:
        for k in (5, 10, 20):
            print("For k=%d" % k)
            report.write("K=%d\n" % k)
            report.write("Iterations\n")
            # kcluster_toFile receives the report handle as ``out``.
            groups = clusters.kcluster_toFile(data, k=k, out=report)
            report.write("Centroid Values\n-------------------------\n")
            for number, members in enumerate(groups, 1):
                print("Centroid #%d" % number)
                report.write("Centroid #%d\n" % number)
                # Each member is an index into blognames.
                names = []
                for row in members:
                    name = blognames[row]
                    print(name)
                    names.append(name)
                report.write("%s\n" % ', '.join(names))
            report.write("=================================\n")
            print("-------")

    # Dimensionality reduction, then the 2-D plot of the result.
    with open("datafiles/dimensionReductionStemmed.txt", "w+") as scale_log:
        coords = clusters.scaledown_logiter(data, out=scale_log)
    clusters.draw2d(coords, blognames, jpg='datafiles/blogtop500stemmed_clust2d.jpg')
def do_non_stem():
    """Run the clustering workflow on the non-stemmed blog data set.

    Steps, in order:

    1. Call ``generate_blogfile()`` and load ``datafiles/blogtop500.txt``
       with ``clusters.readfile``.
    2. Pass the data to ``clusters.hcluster``; write the result as text to
       ``datafiles/blogtop500_asciideno.txt`` and as an image to
       ``datafiles/blogtop500_deno.jpg``.
    3. For k in 5, 10 and 20, call ``clusters.kcluster_toFile`` and log the
       blog names of every returned group to stdout and to
       ``datafiles/kmeans_blogtop500.txt``.
    4. Call ``clusters.scaledown_logiter`` (logging to
       ``datafiles/dimensionReductionNonStemmed.txt``) and draw the result
       to ``datafiles/blogtop500_clust2d.jpg``.

    Returns nothing; all results are written to files or printed.
    """
    # Build the input file, then load it.
    generate_blogfile()
    blognames, _words, data = clusters.readfile('datafiles/blogtop500.txt')

    # Hierarchical clustering result: text rendering first, then the image.
    tree = clusters.hcluster(data)
    with open("datafiles/blogtop500_asciideno.txt", "w+") as tree_file:
        clusters.printclust2file(tree, tree_file, labels=blognames)
    clusters.drawdendrogram(tree, blognames, jpeg='datafiles/blogtop500_deno.jpg')

    # One report file shared by all three k values.
    with open("datafiles/kmeans_blogtop500.txt", "w+") as report:
        for k in (5, 10, 20):
            print("For k=%d" % k)
            report.write("K=%d\n" % k)
            report.write("Iterations\n")
            # kcluster_toFile receives the report handle as ``out``.
            groups = clusters.kcluster_toFile(data, k=k, out=report)
            report.write("Centroid Values\n-------------------------\n")
            for number, members in enumerate(groups, 1):
                print("Centroid #%d" % number)
                report.write("Centroid #%d\n" % number)
                # Each member is an index into blognames.
                names = []
                for row in members:
                    name = blognames[row]
                    print(name)
                    names.append(name)
                report.write("%s\n" % ', '.join(names))
            report.write("=================================\n")
            print("-------")

    # Dimensionality reduction, then the 2-D plot of the result.
    with open("datafiles/dimensionReductionNonStemmed.txt", "w+") as scale_log:
        coords = clusters.scaledown_logiter(data, out=scale_log)
    clusters.draw2d(coords, blognames, jpg='datafiles/blogtop500_clust2d.jpg')