def do_stemmed():
    """Cluster the stemmed blog data: hierarchical, k-means, and MDS outputs."""
    generate_blogfile_stem()
    blognames, words, data = clusters.readfile('datafiles/blogtop500_stemmed.txt')

    # Hierarchical clustering -> ASCII dump plus JPEG dendrogram.
    clust = clusters.hcluster(data)
    with open("datafiles/blogtop500stemmed_asciideno.txt", "w+") as out:
        clusters.printclust2file(clust, out, labels=blognames)
    clusters.drawdendrogram(clust, blognames, jpeg='datafiles/blogtop500stemmed_deno.jpg')

    # K-means at several k, logging iteration traces and centroid membership.
    with open("datafiles/kmeans_blogtop500stemmed.txt", "w+") as kout:
        for k in [5, 10, 20]:
            print("For k=%d" % k)
            kout.write("K=%d\n" % k)
            kout.write("Iterations\n")
            centroids = clusters.kcluster_toFile(data, k=k, out=kout)
            kout.write("Centroid Values\n-------------------------\n")
            for num, members in enumerate(centroids, 1):
                print("Centroid #%d" % num)
                kout.write("Centroid #%d\n" % num)
                names = [blognames[i] for i in members]
                for name in names:
                    print(name)
                kout.write("%s\n" % ', '.join(names))
                kout.write("=================================\n")
            print("-------")

    # Multidimensional scaling -> 2-D scatter plot of the blogs.
    with open("datafiles/dimensionReductionStemmed.txt", "w+") as dout:
        scaled = clusters.scaledown_logiter(data, out=dout)
    clusters.draw2d(scaled, blognames, jpg='datafiles/blogtop500stemmed_clust2d.jpg')
def do_non_stem():
    """Cluster the non-stemmed blog data: hierarchical, k-means, and MDS outputs."""
    # Generate the blog term matrix, then load it back in.
    generate_blogfile()
    blognames, words, data = clusters.readfile('datafiles/blogtop500.txt')

    # Hierarchical clustering -> ASCII dump plus JPEG dendrogram.
    clust = clusters.hcluster(data)
    with open("datafiles/blogtop500_asciideno.txt", "w+") as out:
        clusters.printclust2file(clust, out, labels=blognames)
    clusters.drawdendrogram(clust, blognames, jpeg='datafiles/blogtop500_deno.jpg')

    # K-means at several k, logging iteration traces and centroid membership.
    with open("datafiles/kmeans_blogtop500.txt", "w+") as kout:
        for k in [5, 10, 20]:
            print("For k=%d" % k)
            kout.write("K=%d\n" % k)
            kout.write("Iterations\n")
            centroids = clusters.kcluster_toFile(data, k=k, out=kout)
            kout.write("Centroid Values\n-------------------------\n")
            for num, members in enumerate(centroids, 1):
                print("Centroid #%d" % num)
                kout.write("Centroid #%d\n" % num)
                names = [blognames[i] for i in members]
                for name in names:
                    print(name)
                kout.write("%s\n" % ', '.join(names))
                kout.write("=================================\n")
            print("-------")

    # Dimensionality reduction -> 2-D scatter plot of the blogs.
    with open("datafiles/dimensionReductionNonStemmed.txt", "w+") as dout:
        scaled = clusters.scaledown_logiter(data, out=dout)
    clusters.draw2d(scaled, blognames, jpg='datafiles/blogtop500_clust2d.jpg')
# NOTE(review): fragment -- the header of the enclosing distance function
# (presumably "def ...(v1, v2):" with "d = 0" initialised) lies outside this
# view; the first three statements below are its tail.
    for i in range(len(v1)):
        d += (v1[i] - v2[i])**2
    return math.sqrt(d)

#getBlogs()
#main()

# Load the similar-blog matrix and dump it for inspection.
blognames, words, data = clusters.readfile('similarblogdata.txt')
print(blognames)
print(words)
print(data)

# Sanity check: report adjacent rows whose vectors differ in length.
for i in range(len(data[1:])):
    if len(data[i + 1]) != len(data[i]):
        print(blognames[i + 1])
        print(len(data[i + 1]))
        print(blognames[i])
        print(len(data[i]))

# Hierarchical clustering: text dump plus dendrogram image.
clust = clusters.hcluster(data)
clusters.printclust(clust, labels=blognames)
clusters.drawdendrogram(clust, blognames, jpeg='sblogclust.jpg')

# K-means at three cluster counts; printkclustValues is defined elsewhere.
kclust = clusters.kcluster(data, k=5)
printkclustValues(kclust)
kclust = clusters.kcluster(data, k=10)
printkclustValues(kclust)
kclust = clusters.kcluster(data, k=20)
printkclustValues(kclust)

# Multidimensional scaling to 2-D.
coords = clusters.scaledown(data)
clusters.draw2d(coords, blognames, jpeg='sblogs2d.jpg')
def prefer2d():
    """Reload the clusters module, then MDS-project blogdata.txt to a 2-D plot."""
    reload(clusters)
    names, terms, matrix = clusters.readfile('blogdata.txt')
    positions = clusters.scaledown(matrix)
    clusters.draw2d(positions, names, jpeg='blogs2d.jpg')
#!/usr/local/bin/python
# all code here stolen shamelessly from
# "Programming Collective Intelligence, Chapter 3"
import sys
import argparse

# Make the shared clusters library importable before importing it.
sys.path.insert(0, '../libs')
import clusters

# Load the blog/term matrix, scale it to 2-D, and render the scatter plot.
blognames, words, data = clusters.readfile('../producedFiles/blogtermmatrix.txt')
points = clusters.scaledown(data)
clusters.draw2d(points, blognames, jpeg='../producedFiles/2dBlogSpace.jpg')
def multidim():
    """Scale the job/project matrix down to 2-D and plot it."""
    jobnames, projects, matrix = clusters.readfile('job_projects')
    positions = clusters.scaledown(matrix)
    clusters.draw2d(positions, jobnames, jpeg='job_multidim.jpg')
def main():
    """MDS-project blogdata.txt to 2-D and save the scatter plot."""
    names, terms, matrix = clusters.readfile('blogdata.txt')
    positions = clusters.scaledown(matrix)
    clusters.draw2d(positions, names, jpeg='blogs2d.jpg')
def mds():
    """Run multidimensional scaling on blogdata.txt and report the iteration count."""
    names, terms, matrix = clusters.readfile('blogdata.txt')
    points, iters = clusters.scaledown(matrix)
    clusters.draw2d(points, labels=names, jpeg='mds.jpg')
    print ('Iteration count: %d' % iters)
#!/usr/bin/python import clusters blognames, words, data = clusters.readfile('blogdata1.txt') clust = clusters.hcluster(data) #Question 2 clusters.printclust(clust, labels=blognames) clusters.drawdendrogram(clust, blognames, jpeg='dengrogram.jpg') #Question 3 print "K = 5" kclust5 = clusters.kcluster(data, k=5) print "\nK = 10" kclust10 = clusters.kcluster(data, k=10) print "\nK = 20" kclust20 = clusters.kcluster(data, k=20) #Question 4 coords = clusters.scaledown(data) clusters.draw2d(coords, blognames, jpeg='MDS.jpg')
# NOTE(review): fragment -- the "try:" and the loop computing `frac` per word
# begin before this view; the indentation of the first lines is reconstructed.
        if frac>0.1 and frac<0.5:
            wordlist.append(w)
except:
    # Best-effort feed parsing: a bad feed is reported and skipped.
    print 'Failed to parse feed %s' % feedurl

# Create a textfile containing matrix of all wordcounts from all blogs
out = file('blogdata.txt', 'w')
out.write('Blog')
for word in wordlist:
    out.write('\t%s' % word)
out.write('\n')
for blog, wc in wordcounts.items():
    print blog
    out.write(blog.encode('utf8'))
    # One tab-separated count per vocabulary word; 0 when absent from this blog.
    for word in wordlist:
        if word in wc:
            out.write('\t%d' % wc[word])
        else:
            out.write('\t0')
    out.write('\n')

# NOTE(review): the trailing note below was an unterminated string in the
# original (it ended with a lone '"'); terminated here as a proper docstring.
"""
Using a smaller feedlist, the original generated blogdata was 32M.

************************
Printing out the cluster
************************
import clusters as clusters
blogentries, words, data = clusters.readfile('blogdata5.txt')
coords = clusters.scaledown(data)
clusters.draw2d(coords, blogentries, jpeg='blog_entries.jpg')
"""
#!/usr/local/bin/python
import clusters

# Read the blog matrix, project it to 2-D with MDS, and save the plot.
blog, words, data = clusters.readfile('blogdata.txt')
points = clusters.scaledown(data)
clusters.draw2d(points, blog, jpeg='blogs.jpg')
import clusters

# MDS projection of the blog matrix to 2-D.
blog, words, data = clusters.readfile('blogdata.txt')
points = clusters.scaledown(data)
clusters.draw2d(points, blog, jpeg='blogsMDS.jpg')
#!/usr/local/bin/python
import clusters

# Read the blog matrix and project it down to two dimensions.
blognames, words, data = clusters.readfile('blogdata.txt')
scaled = clusters.scaledown(data)
clusters.draw2d(scaled, blognames, jpeg='MDS.jpg')
#!/usr/bin/env python import clusters datafile = "../data/word_data.tsv" blognames, words, data = clusters.readfile(datafile) iterations, coords = clusters.scaledown(data) clusters.draw2d(coords, blognames, jpeg="../question4.jpg") print "iterations: {}".format(iterations)
import clusters

# Read the raw CSV lines; note that clusters.readfile here is handed the list
# of lines rather than a filename.
with open('1000_terms.csv') as fh:
    lines = fh.readlines()

users, words, data = clusters.readfile(lines)
points = clusters.scaledown(data)
clusters.draw2d(points, users, jpeg='q5_data/twitter_user_MDS.jpg')
import clusters

# MDS projection of the blog matrix copy to a 2-D scatter plot.
name, word, data = clusters.readfile('blogdata1 (copy).txt')
scaled = clusters.scaledown(data)
clusters.draw2d(scaled, name, jpeg='mds.jpg')
import clusters import Image moviename, words, data = clusters.readfile('res/blogdata2.txt') print 'Processing......' clust = clusters.hcluster( data) print 'Output image is generating...' clusters.drawdendrogram(clust, moviename, jpeg = 'output/finaloutput.jpg') print "Scaling down..." coords = clusters.scaledown(data) clusters.draw2d(coords, moviename, jpeg = 'output/finaloutput2d.jpg') image = Image.open('output/finaloutput.jpg') image.show() x = input("Press any key to quit....")
## Main driver added if __name__ == "__main__": import clusters # hierarchical clustering blognames,words,data=clusters.readfile('blogdata.txt') clust=clusters.hcluster(data) # ASCII dendrodram out=file('C:/Python27/myFiles/Assignment 9/ASCII-Dendrogram.txt','w') # redirect standard output to our file orig_stdout = sys.stdout sys.stdout = out clusters.printclust(clust,labels=blognames) out.close() sys.stdout = orig_stdout # JPEG dendrogram clusters.drawdendrogram(clust,blognames,jpeg='blogclust.jpg') print "Dendrodrams complete." # K-Means Clustering print "K=5" kclust=clusters.kcluster(data,k=5) print "\n" print "K=10" kclust=clusters.kcluster(data,k=10) print "\n" print "K=20" kclust=clusters.kcluster(data,k=20) # Multidimensional scaling coords=clusters.scaledown(data) clusters.draw2d(coords,blognames,jpeg='blogs2d.jpg') \end{verbatim}
import clusters

blognames, words, data = clusters.readfile("blogdata.txt")

# hierarchy clustering
# clust=clusters.hcluster(data)
##clusters.printclust(clust, labels=blognames)
# clusters.drawdendrogram(clust, blognames, jpeg='blogclust.jpg')

# column clustering
# rdata = clusters.rotatematrix(data)
# clust=clusters.hcluster(rdata)
# clusters.drawdendrogram(clust, words, jpeg='wordclust.jpg')

# k-means clustering
# kclust=clusters.kcluster(data, k=10)
# print [blognames[r] for r in kclust[0]]

# zebo.txt
# wants, people, data=clusters.readfile('zebo.txt')
# clust = clusters.hcluster(data, distance = clusters.tanimoto)
# clusters.drawdendrogram(clust, wants, jpeg='zebo_wants_clust.jpg')

# mds
wants, people, data = clusters.readfile("zebo.txt")
# NOTE(review): every sibling script calls scaledown(data) with one argument;
# passing `wants` (a label list) as the second positional argument likely
# binds it to scaledown's distance parameter -- confirm against the local
# clusters module.
loc = clusters.scaledown(data, wants)
clusters.draw2d(loc, wants)
print "hello world"
import clusters

# Multidimensional scaling of the blog matrix, rendered as a 2-D scatter plot.
blognames, words, data = clusters.readfile('blogdata.txt')
positions = clusters.scaledown(data)
clusters.draw2d(positions, blognames, jpeg='blogs2d.jpg')  # for mds
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import clusters

# Read the feed matrix, scale it down to two dimensions, and save the plot.
blognames, words, data = clusters.readfile('./../data/feed_list.csv')
points = clusters.scaledown(data)
clusters.draw2d(points, blognames, jpeg="2d.jpg")
# NOTE(review): fragment -- `cl`, `data` and `blognames` are bound outside
# this view; only the last two statements are live code. The triple-quoted
# block below is earlier experiments kept as an inert string.
'''
clust = cl.hcluster(data)
cl.printclust(clust,labels=blognames)
cl.drawdendrogram(clust,blognames,jpeg='blogclust.jpg')

rdata = cl.rotatematrix(data)
wordclust = cl.hcluster(rdata)
cl.printclust(wordclust,labels=words)
cl.drawdendrogram(wordclust,words,jpeg='wordclust.jpg')

k = 4
kclust = cl.kcluster(data,k=k)
l = [[blognames[r] for r in kclust[i]] for i in range(k)]
for ll in l:
    print len(ll),ll

kclust = cl.kcluster_np(data,k=k)
l = [[blognames[r] for r in kclust[i]] for i in range(k)]
for ll in l:
    print len(ll),ll

wants,people,data = cl.readfile('zebo')
clust = cl.hcluster(data,distance=cl.tanimoto)
cl.drawdendrogram(clust,wants)
'''
# Multidimensional scaling of the blog matrix to a 2-D plot.
coords = cl.scaledown(data)
cl.draw2d(coords, blognames, jpeg='blogs2d.jpg')
def createMDS():
    """MDS-project blogdata.txt into two dimensions and save the plot."""
    names, terms, matrix = clusters.readfile('blogdata.txt')
    positions = clusters.scaledown(matrix)
    clusters.draw2d(positions, names, jpeg='blogs2d.jpg')
# -*- coding: utf-8 -*-
import clusters

if __name__ == '__main__':
    # Load the blog/term matrix and project it to 2-D with MDS.
    blognames, terms, data = clusters.readfile('blog_term_matrix.csv')
    points = clusters.scaledown(data)
    clusters.draw2d(points, blognames, jpeg='mds_blog_2d.jpg')
#!/usr/bin/env python import clusters datafile = '../data/word_data_tfidf.tsv'; blognames,words,data=clusters.readfile(datafile) iterations, coords=clusters.scaledown(data) clusters.draw2d(coords,blognames,jpeg='../question5c.jpg') print "iterations: {}".format(iterations)
import clusters

blogname, words, data = clusters.readfile('blogdata2.txt')

# 2-D MDS projection of the blogs.
points = clusters.scaledown(data)
clusters.draw2d(points, blogname, jpeg='blog2d.jpg')

# Cluster the columns (words) by rotating the matrix first.
rotated = clusters.rotatematrix(data)
wordclust = clusters.hcluster(rotated)
clusters.drawdendrogram(wordclust, labels=words, jpeg='wordclust.jpg')
import clusters

# MDS on the blog matrix; scaledown is unpacked as (coords, iteration count).
blognames, words, data = clusters.readfile('blogdata.txt')
positions, niter = clusters.scaledown(data)
clusters.draw2d(positions, labels=blognames, jpeg='mds.jpg')
print ('Iteration count: %d' % niter)
def createMDS(): blognames,words,data=clusters.readfile('blogVector.txt') coords,iterationCount=clusters.scaledown(data) clusters.draw2d(coords,blognames,jpeg='blogs2d.jpg') print 'iterationCount', iterationCount
#!/usr/local/bin/python
import clusters

# Project the blog matrix into two dimensions and render the result.
blog, words, data = clusters.readfile('blogdata.txt')
points = clusters.scaledown(data)
clusters.draw2d(points, blog, jpeg='blogs.jpg')
#!/usr/bin/python import clusters blognames, words, data = clusters.readfile('blogdata1.txt') clust=clusters.hcluster(data) #Question 2 clusters.printclust(clust, labels=blognames) clusters.drawdendrogram(clust, blognames, jpeg='dengrogram.jpg') #Question 3 print "K = 5" kclust5 = clusters.kcluster(data, k=5) print "\nK = 10" kclust10 = clusters.kcluster(data, k=10) print "\nK = 20" kclust20 = clusters.kcluster(data, k=20) #Question 4 coords=clusters.scaledown(data) clusters.draw2d(coords, blognames, jpeg='MDS.jpg')
#!/usr/local/bin/python
# all code here stolen shamelessly from
# "Programming Collective Intelligence, Chapter 3"
import sys

# Make the shared clusters library importable before importing it.
sys.path.insert(0, '../libs')
import clusters

# Load the 500-blog matrix, scale it to 2-D, and render the scatter plot.
blognames, words, data = clusters.readfile('../q1/blogdata500.txt')
points = clusters.scaledown(data)
clusters.draw2d(points, blognames, jpeg='blogs2d.jpg')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import clusters

# Read the feed matrix, scale it down to two dimensions, and save the plot.
blognames, words, data = clusters.readfile('./../data/feed_list.csv')
points = clusters.scaledown(data)
clusters.draw2d(points, blognames, jpeg="2d.jpg")