def PrintClust(self):
    """Cluster the course data with the user-selected distance metric and
    render the ASCII cluster tree on a scrollable canvas.

    Reads the radio-button variable ``self.var``: "1" -> Pearson, "2" ->
    Tanimoto; anything else pops an error dialog.
    """
    self.DataMatrix()
    # Canvas plus attached scrollbars for displaying the cluster text.
    self.maincanvas = Canvas(self.root, bg='#FFFFFF', width=100, height=320)
    self.maincanvas.grid(row=8, rowspan=8, column=1, columnspan=16, pady=10, sticky="WE")
    self.xscrollbar = Scrollbar(self.root, orient=HORIZONTAL, command=self.maincanvas.xview)
    self.xscrollbar.grid(row=15, column=1, columnspan=16, sticky="WSE")
    self.yscrollbar = Scrollbar(self.root, orient=VERTICAL, command=self.maincanvas.yview)
    self.yscrollbar.grid(column=17, row=8, rowspan=8, sticky="WNS")
    # FIX: the scroll commands must be the scrollbars' .set methods, not the
    # scrollbar widgets themselves (the original passed the widget objects).
    self.maincanvas.config(xscrollcommand=self.xscrollbar.set,
                           yscrollcommand=self.yscrollbar.set)
    courses, words, data = clusters.readfile("data.txt")
    # Map the radio selection to a distance function; the two original
    # branches were identical except for the metric, so they are merged.
    choice = str(self.var.get())
    if choice == "1":
        metric = clusters.pearson
    elif choice == "2":
        metric = clusters.tanimoto
    else:
        tkMessageBox.showerror("ERROR", "Please select distance method.")
        self.DataMatrix()
        return
    clust = clusters.hcluster(data, distance=metric)
    self.printclust = clusters.clust2str(clust, labels=courses)
    # FIX: font family typo "Helvetiva" -> "Helvetica".
    self.maincanvas.create_text(350, 10, font="Helvetica 10",
                                text=self.printclust, anchor="nw")
def clustering(self, canvas):
    """Hierarchically cluster the district/party matrix and show the
    resulting dendrogram image on *canvas*."""
    labels, row_names, matrix = clusters.readfile('data.txt')
    tree = clusters.hcluster(matrix, distance=clusters.sim_distance)
    clusters.drawdendrogram(tree, labels, jpeg='cl.jpg')
    photo = ImageTk.PhotoImage(Image.open("cl.jpg"))
    canvas.create_image(20, 20, anchor=NW, image=photo)
    # Keep a reference on the widget so Tk does not garbage-collect the image.
    canvas.image = photo
def do_stemmed():
    """Full pipeline on the stemmed blog data: hierarchical clustering
    (ASCII + JPEG dendrograms), k-means at several k values logged to a
    file, and a 2-D MDS projection."""
    generate_blogfile_stem()
    blognames, words, data = clusters.readfile('datafiles/blogtop500_stemmed.txt')
    tree = clusters.hcluster(data)
    # ASCII dendrogram to file.
    with open("datafiles/blogtop500stemmed_asciideno.txt", "w+") as ascii_out:
        clusters.printclust2file(tree, ascii_out, labels=blognames)
    clusters.drawdendrogram(tree, blognames, jpeg='datafiles/blogtop500stemmed_deno.jpg')
    # K-means for each k, with iterations and centroid members logged.
    with open("datafiles/kmeans_blogtop500stemmed.txt", "w+") as kout:
        for k in [5, 10, 20]:
            print("For k=%d" % k)
            kout.write("K=%d\n" % k)
            kout.write("Iterations\n")
            found = clusters.kcluster_toFile(data, k=k, out=kout)
            kout.write("Centroid Values\n-------------------------\n")
            for num, members in enumerate(found, 1):
                print("Centroid #%d" % num)
                kout.write("Centroid #%d\n" % num)
                names = []
                for idx in members:
                    print(blognames[idx])
                    names.append(blognames[idx])
                kout.write("%s\n" % ', '.join(names))
                kout.write("=================================\n")
            print("-------")
    # Dimensionality reduction, iterations logged, then the 2-D plot.
    with open("datafiles/dimensionReductionStemmed.txt", "w+") as dout:
        scaled = clusters.scaledown_logiter(data, out=dout)
    clusters.draw2d(scaled, blognames, jpg='datafiles/blogtop500stemmed_clust2d.jpg')
def kmeans(x):
    """K-means the job/project matrix into *x* clusters and return, for each
    cluster, the list of job names it contains."""
    jobnames, projects, data = clusters.readfile('job_projects')
    cl, matches = clusters.kcluster(data, k=x)
    return [[jobnames[r] for r in matches[i]] for i in range(x)]
def main(): blognames, words, data = clusters.readfile('blogdata.txt') print "K value is 5" kclust = clusters.kcluster(data, k=5) print "K value is 10" kclust = clusters.kcluster(data, k=10) print "K value is 20" kclust = clusters.kcluster(data, k=20)
def main(): blognames,words,data=clusters.readfile('blogdata.txt') print "K value is 5" kclust=clusters.kcluster(data,k=5) print "K value is 10" kclust=clusters.kcluster(data,k=10) print "K value is 20" kclust=clusters.kcluster(data,k=20)
def createDendrogram():
    """Cluster the blog matrix, save a JPEG dendrogram, and write the ASCII
    dendrogram to ASCII.txt.

    FIXES: the original reassigned sys.stdout and never restored it, and then
    called sys.stderr.close(), closing the process's stderr stream. stdout is
    now restored in a finally block and stderr is left alone.
    """
    blogs, colnames, data = clusters.readfile('blogdata.txt')
    cluster = clusters.hcluster(data)
    clusters.drawdendrogram(cluster, blogs, jpeg='Dendrogram.jpg')
    # printclust writes to stdout, so temporarily point stdout at the file.
    old_stdout = sys.stdout
    with open("ASCII.txt", 'w') as f:
        sys.stdout = f
        try:
            clusters.printclust(cluster, labels=blogs)
        finally:
            sys.stdout = old_stdout
def cluster_parties(self):
    """Hierarchically cluster the parties and display the dendrogram."""
    # The user asked for party-level clustering, so record that state.
    self.state = "party"
    self.analysis_frame.pack(side=TOP, fill=BOTH)
    # Wipe any previous drawing from the canvas.
    # https://stackoverflow.com/questions/15839491/how-to-clear-tkinter-canvas
    self.canvas.delete("all")
    self.party_list, self.district_list, self.data = clusters.readfile("matrix.txt")
    tree = clusters.hcluster(self.data, distance=clusters.sim_distance)
    clusters.drawdendrogram(tree, self.party_list, jpeg='parties.jpg')
    # Show the freshly rendered dendrogram on the canvas.
    self.insert_image("parties.jpg")
def main():
    """Read the blog term matrix, build a hierarchical clustering, and print
    an ASCII approximation of the dendrogram."""
    # readfile returns (row labels, column words, frequency rows).
    blognames, words, data = clusters.readfile('blogdata.txt')
    # hcluster returns a binary tree of nodes with .id/.left/.right.
    tree = clusters.hcluster(data)
    # Walk the tree and print the ASCII dendrogram.
    clusters.printclust(tree, labels=blognames)
def createJPegDendogram():
    """Cluster the TF-IDF blog vectors and save a JPEG dendrogram."""
    # An earlier variant used the raw-count vectors:
    #   blognames, words, data = clusters.readfile('blogVector.txt')
    #   clust = clusters.hcluster(data)
    #   clusters.drawdendrogram(clust, blognames, jpeg='blogclust.jpg')
    blognames, words, data = clusters.readfile('blogVectorTFIDFVersion.txt')
    tree = clusters.hcluster(data)
    clusters.drawdendrogram(tree, blognames, jpeg='blogclustTFIDFVersion.jpg')
def createKMeansClusters(kValue): if( kValue>0 ): blognames,words,data=clusters.readfile('blogVector.txt') kclust=clusters.kcluster(data,k=kValue) count = 0 for cluster in kclust: if( len(cluster) > 0 ): print 'cluster', count for instance in cluster: print '...',blognames[instance] count += 1
def cluster_district(self):
    """Hierarchically cluster the districts and display the dendrogram."""
    # The user asked for district-level clustering, so record that state.
    self.state = "district"
    self.analysis_frame.pack(side=TOP, fill=BOTH)
    # Wipe any previous drawing from the canvas.
    # https://stackoverflow.com/questions/15839491/how-to-clear-tkinter-canvas
    self.canvas.delete("all")
    self.party_list, self.district_list, self.data = clusters.readfile("matrix.txt")
    # Districts are columns of the matrix, so rotate before clustering.
    rotated = clusters.rotatematrix(self.data)
    tree = clusters.hcluster(rotated, distance=clusters.sim_distance)
    clusters.drawdendrogram(tree, self.district_list, jpeg='districts.jpg')
    # Show the freshly rendered dendrogram on the canvas.
    self.insert_image("districts.jpg")
def kmeans():
    """K-means the blog matrix for k in {5, 10, 20}, writing each run's
    iteration count and cluster membership to Outputs/kclust_<k>.txt.

    FIX: the output file was opened without ever being closed; a context
    manager now guarantees it is flushed and closed.
    """
    karr = [5, 10, 20]
    blogs, colnames, data = clusters.readfile('Outputs/blogdata.txt')
    for i in karr:
        kclust, itercount = clusters.kcluster(data, k=i)
        print(kclust)
        with open("Outputs/kclust_%d.txt" % i, 'w') as f:
            f.write("Iteration count: %d \n" % itercount)
            print(len(kclust))
            for cluster in kclust:
                f.write("****************************\n")
                f.write("[")
                for blogid in cluster:
                    f.write(blogs[blogid] + ", ")
                f.write("]\n")
def get_clusture(self, param):
    """
    param - str -> Parameter specified in self.writefiles:
        "Country"   -> cluster the country rows
        "Criterias" -> cluster the data columns (matrix rotated first)

    Draws the dendrogram to self.jpg_names and shows it.
    Raises ValueError for any other param value (the original fell through
    to a NameError on the unbound `clust`/`label`).
    """
    country_names, records, records_data = clusters.readfile(self.writed_names)
    if param == "Country":
        clust = clusters.hcluster(records_data)
        label = country_names
    elif param == "Criterias":
        rotated = clusters.rotatematrix(records_data)
        clust = clusters.hcluster(rotated)
        label = records
    else:
        # FIX: explicit, diagnosable failure instead of NameError below.
        raise ValueError("param must be 'Country' or 'Criterias', got %r" % (param,))
    self.jpg_names = 'clustured2.jpg'
    clusters.drawdendrogram(clust, labels=label, jpeg=self.jpg_names)
    self.show_image()
def getKmeans(): blognames, words, data = clusters.readfile("blogdata.txt") print "K value is 5" kclust = clusters.kcluster(data, k=5) print "\t\t" + str([blognames[r] for r in kclust[0]]) # print blognames in 1st centroid print "\t\t" + str([blognames[r] for r in kclust[1]]) # print blognames in 2nd centroid print "\t\t" + str([blognames[r] for r in kclust[2]]) # print blognames in 3rd centroid print "\t\t" + str([blognames[r] for r in kclust[3]]) # print blognames in 4th centroid print "\t\t" + str([blognames[r] for r in kclust[4]]) # print blognames in 5th centroid print "K value is 10" kclust = clusters.kcluster(data, k=10) print "\t\t" + str([blognames[r] for r in kclust[0]]) # print blognames in 1st centroid print "\t\t" + str([blognames[r] for r in kclust[1]]) # print blognames in 2nd centroid print "\t\t" + str([blognames[r] for r in kclust[2]]) # print blognames in 3rd centroid print "\t\t" + str([blognames[r] for r in kclust[3]]) # print blognames in 4th centroid print "\t\t" + str([blognames[r] for r in kclust[4]]) # print blognames in 5th centroid print "\t\t" + str([blognames[r] for r in kclust[5]]) # print blognames in 6th centroid print "\t\t" + str([blognames[r] for r in kclust[6]]) # print blognames in 7th centroid print "\t\t" + str([blognames[r] for r in kclust[7]]) # print blognames in 8th centroid print "\t\t" + str([blognames[r] for r in kclust[8]]) # print blognames in 9th centroid print "\t\t" + str([blognames[r] for r in kclust[9]]) # print blognames in 10th centroid print "K value is 20" kclust = clusters.kcluster(data, k=20) print "\t\t" + str([blognames[r] for r in kclust[0]]) # print blognames in 1st centroid print "\t\t" + str([blognames[r] for r in kclust[1]]) # print blognames in 2nd centroid print "\t\t" + str([blognames[r] for r in kclust[2]]) # print blognames in 3rd centroid print "\t\t" + str([blognames[r] for r in kclust[3]]) # print blognames in 4th centroid print "\t\t" + str([blognames[r] for r in kclust[4]]) # print 
blognames in 5th centroid print "\t\t" + str([blognames[r] for r in kclust[5]]) # print blognames in 6th centroid print "\t\t" + str([blognames[r] for r in kclust[6]]) # print blognames in 7th centroid print "\t\t" + str([blognames[r] for r in kclust[7]]) # print blognames in 8th centroid print "\t\t" + str([blognames[r] for r in kclust[8]]) # print blognames in 9th centroid print "\t\t" + str([blognames[r] for r in kclust[9]]) # print blognames in 10th centroid print "\t\t" + str([blognames[r] for r in kclust[10]]) # print blognames in 11th centroid print "\t\t" + str([blognames[r] for r in kclust[11]]) # print blognames in 12th centroid print "\t\t" + str([blognames[r] for r in kclust[12]]) # print blognames in 13th centroid print "\t\t" + str([blognames[r] for r in kclust[13]]) # print blognames in 14th centroid print "\t\t" + str([blognames[r] for r in kclust[14]]) # print blognames in 15th centroid print "\t\t" + str([blognames[r] for r in kclust[15]]) # print blognames in 16th centroid print "\t\t" + str([blognames[r] for r in kclust[16]]) # print blognames in 17th centroid print "\t\t" + str([blognames[r] for r in kclust[17]]) # print blognames in 18th centroid print "\t\t" + str([blognames[r] for r in kclust[18]]) # print blognames in 19th centroid print "\t\t" + str([blognames[r] for r in kclust[19]]) # print blognames in 20th centroid
def getKmeans(): blognames,words,data=clusters.readfile('blogdata.txt') print "K value is 5" kclust=clusters.kcluster(data,k=5) print "\t\t"+str([blognames[r] for r in kclust[0]]) print "\t\t"+str([blognames[r] for r in kclust[1]]) print "\t\t"+str([blognames[r] for r in kclust[2]]) print "\t\t"+str([blognames[r] for r in kclust[3]]) print "\t\t"+str([blognames[r] for r in kclust[4]]) print "K value is 10" kclust=clusters.kcluster(data,k=10) print "\t\t"+str([blognames[r] for r in kclust[0]]) print "\t\t"+str([blognames[r] for r in kclust[1]]) print "\t\t"+str([blognames[r] for r in kclust[2]]) print "\t\t"+str([blognames[r] for r in kclust[3]]) print "\t\t"+str([blognames[r] for r in kclust[4]]) print "\t\t"+str([blognames[r] for r in kclust[5]]) print "\t\t"+str([blognames[r] for r in kclust[6]]) print "\t\t"+str([blognames[r] for r in kclust[7]]) print "\t\t"+str([blognames[r] for r in kclust[8]]) print "\t\t"+str([blognames[r] for r in kclust[9]]) print "K value is 20" kclust=clusters.kcluster(data,k=20) print "\t\t"+str([blognames[r] for r in kclust[0]]) print "\t\t"+str([blognames[r] for r in kclust[1]]) print "\t\t"+str([blognames[r] for r in kclust[2]]) print "\t\t"+str([blognames[r] for r in kclust[3]]) print "\t\t"+str([blognames[r] for r in kclust[4]]) print "\t\t"+str([blognames[r] for r in kclust[5]]) print "\t\t"+str([blognames[r] for r in kclust[6]]) print "\t\t"+str([blognames[r] for r in kclust[7]]) print "\t\t"+str([blognames[r] for r in kclust[8]]) print "\t\t"+str([blognames[r] for r in kclust[9]]) print "\t\t"+str([blognames[r] for r in kclust[10]]) print "\t\t"+str([blognames[r] for r in kclust[11]]) print "\t\t"+str([blognames[r] for r in kclust[12]]) print "\t\t"+str([blognames[r] for r in kclust[13]]) print "\t\t"+str([blognames[r] for r in kclust[14]]) print "\t\t"+str([blognames[r] for r in kclust[15]]) print "\t\t"+str([blognames[r] for r in kclust[16]]) print "\t\t"+str([blognames[r] for r in kclust[17]]) print "\t\t"+str([blognames[r] 
for r in kclust[18]]) print "\t\t"+str([blognames[r] for r in kclust[19]])
def kMean():
    """K-means the blog matrix for k in {5, 10, 20}; write each run's
    iteration count and numbered cluster membership to kclust_<k>.txt.

    FIX: the report file was never closed; a context manager now closes it.
    The hand-rolled counters are replaced by enumerate (same output).
    """
    kMeanValues = [5, 10, 20]
    blogs, colnames, data = clusters.readfile('blogdata.txt')
    for k in kMeanValues:
        kclust, itercount = clusters.kcluster(data, k=k)
        print(kclust)
        with open("kclust_%d.txt" % k, 'w') as f:
            f.write("Total Number Of Iterations: %d \n" % itercount)
            print(len(kclust))
            for clusterCount, cluster in enumerate(kclust, 1):
                f.write("---\n")
                f.write("Cluster %d \n" % clusterCount)
                for rank, blogid in enumerate(cluster, 1):
                    f.write(str(rank) + ".\t" + blogs[blogid] + "\n")
                f.write("\n")
def Clustering(self):
    """Cluster the journalist/title matrix and list the result in the GUI.

    Radio value 0 -> hierarchical clustering shown as an ASCII tree;
    radio value 1 -> k-means with k taken from the entry widget.
    """
    try:
        dic = self.fetcher_Journalist_with_Titles()
        # Nothing fetched yet: show the error dialog and bail out.
        if len(data_dict1) == 0:
            self.Error_Message()
            return
        Matrix = self.Make_Matrix()
        Journlist, word, freq = clusters.readfile('Matrix')
        if self.Radio_Values3.get() == 0:
            H_clustering = hcluster(freq)
            self.All_Results.delete(0, END)
            # Insert the ASCII tree line by line into the listbox.
            for i in range(
                    len(
                        clust2str(H_clustering, labels=Journlist).split('\n'))):
                self.All_Results.insert(
                    END,
                    clust2str(H_clustering, labels=Journlist).split('\n')[i])
        elif self.Radio_Values3.get() == 1:
            K_Vlaue = self.Valueof_k.get()
            Cluster_Value = kcluster(freq, k=int(K_Vlaue))
            Journalists = dic.keys()
            # NOTE(review): `list` shadows the builtin of the same name.
            # Sort clusters by size, largest first.
            list = [(len(i), i) for i in Cluster_Value]
            list.sort(reverse=True)
            counter = 0
            self.All_Results.delete(0, END)
            for i, j in list:
                # NOTE(review): this indexes Journalists by position 0..len(j)-1
                # rather than by the member indices in j — looks like a bug;
                # confirm whether [Journalists[k] for k in j] was intended.
                list1 = [Journalists[k] for k in range(len(j))]
                new_str = ""
                for i in list1:
                    new_str += str(i) + " "
                self.All_Results.insert(
                    END,
                    "Cluster %d:{" % (counter + 1) + new_str + "}" + "\n")
                counter += 1
    except:
        # NOTE(review): bare except silently swallows every failure,
        # including typos/NameErrors — consider narrowing and logging.
        pass
def main():
    """Read the 1000-term matrix, k-means it at k = 5/10/20, and print and
    persist the user membership of each clustering."""
    # NOTE(review): `file` shadows the builtin of the same name.
    file = ''
    with open('1000_terms.csv') as f:
        file = f.readlines()
    # NOTE(review): everywhere else clusters.readfile is called with a
    # filename; here it receives a list of lines — confirm readfile
    # actually accepts pre-read lines, otherwise pass the filename.
    users, words, data = clusters.readfile(file)
    kclust5, kclust10, kclust20 = get_clusts(data)
    five = get_users_clust(kclust5, users)
    ten = get_users_clust(kclust10, users)
    twenty = get_users_clust(kclust20, users)
    print(five)
    print(ten)
    print(twenty)
    output_clusters(five, 5)
    output_clusters(ten, 10)
    output_clusters(twenty, 20)
    return 0
def clustering_button(self):
    """Cluster the professor data and list the result in the GUI.

    The radio selection (via values_of_clustering) picks either
    "Hierarcial" (ASCII cluster tree) or "K-Means" (k from the entry).
    """
    if len(database) == 0:
        self.Error_Message_Function()
        return
    prof_names, words, data = clusters.readfile("Will_be_Cluestered.txt")
    # Determining the type of clustering with the dictionary.
    type_of_clustering = values_of_clustering[int(self.Radio_Values3.get())]
    if type_of_clustering == "Hierarcial":
        clust = clusters.hcluster(data)
        self.All_Results_Part.delete(0, END)
        # split method is used for proper showing of cluster; the last
        # line of the split is an empty string, so it is neglected.
        for i in range(
                len(clusters.clust2str(clust, labels=prof_names).split('\n'))
                - 1):
            self.All_Results_Part.insert(
                END,
                clusters.clust2str(clust, labels=prof_names).split('\n')[i])
    elif type_of_clustering == "K-Means":
        # k is taken from the entry widget.
        clust = clusters.kcluster(data, k=int(self.Value_of_k.get()))
        prof_names = database.keys()
        # Sort clusters by size, largest first.
        new_list_with_length_of_elements = [(len(i), i) for i in clust]
        new_list_with_length_of_elements.sort(reverse=True)
        counter = 0
        self.All_Results_Part.delete(0, END)
        for i, j in new_list_with_length_of_elements:
            # NOTE(review): this indexes prof_names by position 0..len(j)-1
            # rather than by the member indices in j — looks like a bug;
            # confirm whether [prof_names[k] for k in j] was intended.
            new_proper_list = [prof_names[k] for k in range(len(j))]
            new_str = ""
            for i in new_proper_list:
                new_str += str(i) + " "
            self.All_Results_Part.insert(
                END,
                "Cluster %d:{" % (counter + 1) + new_str + "}" + "\n")
            counter += 1
def do_non_stem():
    """Full pipeline on the raw (non-stemmed) blog data: hierarchical
    clustering (ASCII + JPEG dendrograms), k-means at several k values
    logged to a file, and a 2-D MDS projection."""
    generate_blogfile()
    blognames, words, data = clusters.readfile('datafiles/blogtop500.txt')
    tree = clusters.hcluster(data)
    # ASCII dendrogram to file.
    with open("datafiles/blogtop500_asciideno.txt", "w+") as ascii_out:
        clusters.printclust2file(tree, ascii_out, labels=blognames)
    clusters.drawdendrogram(tree, blognames, jpeg='datafiles/blogtop500_deno.jpg')
    # K-means for each k, with iterations and centroid members logged.
    with open("datafiles/kmeans_blogtop500.txt", "w+") as kout:
        for k in [5, 10, 20]:
            print("For k=%d" % k)
            kout.write("K=%d\n" % k)
            kout.write("Iterations\n")
            found = clusters.kcluster_toFile(data, k=k, out=kout)
            kout.write("Centroid Values\n-------------------------\n")
            for num, members in enumerate(found, 1):
                print("Centroid #%d" % num)
                kout.write("Centroid #%d\n" % num)
                names = []
                for idx in members:
                    print(blognames[idx])
                    names.append(blognames[idx])
                kout.write("%s\n" % ', '.join(names))
                kout.write("=================================\n")
            print("-------")
    # Dimensionality reduction, iterations logged, then the 2-D plot.
    with open("datafiles/dimensionReductionNonStemmed.txt", "w+") as dout:
        scaled = clusters.scaledown_logiter(data, out=dout)
    clusters.draw2d(scaled, blognames, jpg='datafiles/blogtop500_clust2d.jpg')
#!/usr/bin/env python import clusters import question2b_conference_truth import question2b_policy_truth import question2b_race_truth import question2b_review_truth import question2b_story_truth from svm import * import svmutil datafile = '../data/blog_entries_word_data.tsv'; training = 50 blognames,words,data=clusters.readfile(datafile) def calculate_conference(): correct = 0 answers = [] input = [] count = 0 for d in data: answers.append(question2b_conference_truth.truth[count]) input.append(d) if count == 49: break count += 1 prob = svmutil.svm_problem(answers, input) param = svmutil.svm_parameter('-t 2 -c 4') param.cross_validation = 1
#Shawn Jones
#!/usr/local/bin/python
# all code here stolen shamelessly from
# "Programming Collective Intelligence, Chapter 3"
import sys

sys.path.insert(0, '../libs')
import clusters

blognames, words, data = clusters.readfile('blogdata1V2.txt')
tree = clusters.hcluster(data)
# ASCII dendrogram on stdout.
clusters.printclust(tree, labels=blognames)
# JPEG dendrogram on disk.
clusters.drawdendrogram(tree, blognames, jpeg='blogclust.jpg')
# Take the average of the top k results for i in range(k): idx=dlist[i][1] avg+=data[idx]['result'] avg=avg/k return avg def exclude(data, idx): new_data = [] for i in range(len(data)): if i!=idx: new_data.append(data[i]) return new_data if __name__ == '__main__': blognames, terms, data = clusters.readfile('blog_term_matrix.csv') rows = [] for i in range(len(data)): int_d = [int(c) for c in data[i]] rows.append({ 'input' : tuple(data[i]), 'result' : i }) for k in [1,2,5,10,20]: print('Using k=%s' % k) print('=' * len('Using k=%s' % k)) # Nearest blog for 99th blog : F-Measure # Exclude 99th row from rows
####54页调用generatefeedvector生成blogdata文件失败。是因为feedlist里面的网址无法打开吗? ###downloadzebodata生成zebo.txt也失败。sigh import clusters blognames,words,data = clusters.readfile('blogdatadown.txt')#1 #clust = clusters.hcluster(data) #print (clust)#果然函数中这个值输出也都不一样呢。 #print(blognames) #clusters.printclust(clust, labels = blognames)#2 #clusters.drawdendrogram(clust, blognames, jpeg = 'blogclust.jpg')#3 rdata = clusters.rotatematrix(data)#4 wordclust = clusters.hcluster(rdata) clusters.drawdendrogram(wordclust, labels = words, jpeg = 'wordclust.jpg') ''' kclust = clusters.kcluster(data, k = 4)#5 print ([blognames[r] for r in kclust[0]]) print ([blognames[r] for r in kclust[1]]) import urllib.request#6 from bs4 import BeautifulSoup c = urllib.request.urlopen('https://en.wikipedia.org/wiki/Jon_Snow') soup = BeautifulSoup(c.read(),"lxml")#这里非常有趣! 感觉有空需要看下这个源代码库呀。 links = soup('a')#所以我还是不懂beautiful soup 的用法呀。 print(links[10]) print(links[10]['href']) #这一段是教BS的。
def countword():
    """Read the blog matrix and build a hierarchical clustering.

    The resulting tree is currently unused by any caller of this function.
    """
    names, vocab, matrix = clusters.readfile('blogdata1.txt')
    tree = clusters.hcluster(matrix)
def prefer():
    """Cluster the Zebo wish data with the Tanimoto metric and draw the
    dendrogram (note: the metric is spelled `tanamoto` in this clusters
    module)."""
    reload(clusters)
    wants, people, matrix = clusters.readfile('zebo.txt')
    tree = clusters.hcluster(matrix, distance=clusters.tanamoto)
    clusters.drawdendrogram(tree, wants)
def ColumnClustering():
    """Cluster the words (matrix columns) of the blog data and save the
    dendrogram as wordclust.jpg."""
    reload(clusters)
    names, vocab, matrix = clusters.readfile('blogdata1.txt')
    # Rotate so the words become rows before clustering.
    transposed = clusters.rotatematrix(matrix)
    tree = clusters.hcluster(transposed)
    clusters.drawdendrogram(tree, labels=vocab, jpeg='wordclust.jpg')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Project the feed matrix into two dimensions with MDS and render it.
import clusters

names, vocab, matrix = clusters.readfile('./../data/feed_list.csv')
points = clusters.scaledown(matrix)
clusters.draw2d(points, names, jpeg="2d.jpg")
import clusters

# 2-D MDS projection of the blog vectors (hierarchical clustering is built
# but only the projection is rendered).
blognames, words, data = clusters.readfile('blogVectorResult.txt')
tree = clusters.hcluster(data)
# clusters.printclust(tree, labels=blognames)
reload(clusters)
# clusters.drawdendrogram(tree, blognames, jpeg='blogclust.jpg')
# kclust = clusters.kcluster(data, k=20)
points = clusters.scaledown(data)
clusters.draw2d(points, blognames, jpeg='blogs2d.jpg')
import clusters

names, vocab, matrix = clusters.readfile('blogList500-matrix.txt')
tree = clusters.hcluster(matrix)
# ASCII dendrogram on stdout.
clusters.printclust(tree, labels=names)
# JPEG dendrogram on disk.
clusters.drawdendrogram(tree, names, jpeg='blogclust-q5.jpg')
def multidim():
    """Project jobs into 2-D (MDS over project participation) and save the
    scatter plot as job_multidim.jpg."""
    jobnames, projects, matrix = clusters.readfile('job_projects')
    points = clusters.scaledown(matrix)
    clusters.draw2d(points, jobnames, jpeg='job_multidim.jpg')
def createMDS():
    """Project the blogs into 2-D with MDS and save blogs2d.jpg."""
    names, vocab, matrix = clusters.readfile('blogdata.txt')
    points = clusters.scaledown(matrix)
    clusters.draw2d(points, names, jpeg='blogs2d.jpg')
#!/usr/local/bin/python
# all code here stolen shamelessly from
# "Programming Collective Intelligence, Chapter 3"
import sys

sys.path.insert(0, '../libs')
import clusters

# 2-D MDS projection of the 500-blog matrix.
blognames, words, data = clusters.readfile('../q1/blogdata500.txt')
points = clusters.scaledown(data)
clusters.draw2d(points, blognames, jpeg='blogs2d.jpg')
def draw_dendogram():
    """Hierarchically cluster the jobs and save the dendrogram as
    jobclust.jpg."""
    jobnames, projects, matrix = clusters.readfile('job_projects')
    tree = clusters.hcluster(matrix)
    # clusters.printclust(tree, labels=jobnames)
    clusters.drawdendrogram(tree, jobnames, jpeg='jobclust.jpg')
import clusters
import numpredict

def findNearestNeighbour(i, data, k):
    """Print the names of the k nearest blogs to row *i* of *data*
    (relies on the module-level `blogs` list for the names)."""
    testing = data[i]
    for match in numpredict.knnestimate(data, testing, k):
        print(blogs[match[1]])

blogs, text, data = clusters.readfile("blogDataForknn.txt")
# Report the neighbours of two chosen blogs for several values of k.
for name in "F-Measure", "Web Science and Digital Libraries Research Group":
    for k in 1, 2, 5, 10, 20:
        print("Blog Name", name)
        print("For K", k)
        findNearestNeighbour(blogs.index(name), data, k)
        print("\n\n")
def kmean():
    """Two-means the blog matrix.

    The member-name comprehensions at the end are evaluated but not used —
    they were evidently meant for an interactive session.
    """
    reload(clusters)
    rownames, words, matrix = clusters.readfile('blogdata.txt')
    kclust = clusters.kcluster(matrix, k=2)
    [rownames[r] for r in kclust[0]]
    [rownames[r] for r in kclust[1]]
import codecs
import clusters, random
import shelve

# Cache the parsed matrix in a shelve DB so later runs can reuse it.
sh = shelve.open("melone_data")
standnames, words, data = clusters.readfile("jojodata.txt")
datasize = len(data)
sh["standnames"] = standnames
sh["words"] = words
sh["data"] = data
tmpvec = []

def geneticoptimize(costf, popsize=50, mutprob=0.3, elite=0.3, maxiter=100):
    # Genetic search; candidate solutions appear to be lists of distinct row
    # indices into `data` — TODO confirm against the (unseen) main loop.
    def mutate(vec):
        # Replace a random-length prefix with indices not already in vec.
        j = random.randint(1, len(vec) - 1)
        except_vec = [
            y for y in filter(lambda x: x not in vec,
                              [x for x in range(datasize)])
        ]
        new_dna = random.sample(except_vec, j)
        return new_dna + vec[j:]

    def crossover(r1, r2):
        # Single-point crossover; retried until the child holds exactly
        # 5 distinct genes.
        i = random.randint(1, datasize - 2)
        result = r1[0:i] + r2[i:]
        while len(set(result)) != 5:
            result = r1[0:i] + r2[i:]
        return result
    # (chunk ends here — the body of geneticoptimize continues outside this view)
def prefer2d():
    """Project the blogs into 2-D with MDS and save blogs2d.jpg."""
    reload(clusters)
    names, vocab, matrix = clusters.readfile('blogdata.txt')
    points = clusters.scaledown(matrix)
    clusters.draw2d(points, names, jpeg='blogs2d.jpg')
def printwords(list, data, words): vecsum = zeros(len(data[0])) for l in list: vecsum = add(data[l],vecsum) topwrds = sorted(range(len(vecsum)), key=lambda x: vecsum[x])[-5:] for r in topwrds: print vecsum[r] print "The top words of this cluster are: \n" for r in topwrds: print words[r] moviename, words, data = clusters.readfile('res/blogdata2.txt') print 'Processing......' kclust = clusters.kcluster( data, k = 5) print "\t\t******* CLUSTER 1 *******" printwords( kclust[0], data, words) print '\n' print [moviename[r] for r in kclust[0]] print '\n\n\n' print "\t\t******* CLUSTER 2 *******" printwords( kclust[1], data, words) print '\n' print [moviename[r] for r in kclust[1]] print '\n\n\n'
# Valentina Neblitt-Jones
# CS 595 Introduction to Web Science
# Fall 2013
# Assignment 9 Question 2
import sys

sys.path.insert(0, '/Users/vneblitt/Documents/cs595-f13/assignment09/library')
import clusters

# Build the hierarchical clustering of the blog matrix.
blognames, words, data = clusters.readfile('/Users/vneblitt/Documents/cs595-f13/assignment09/q01/blogdata1.txt')
tree = clusters.hcluster(data)
# ASCII dendrogram on stdout.
clusters.printclust(tree, labels=blognames)
# Nicer dendrogram rendered with PIL.
clusters.drawdendrogram(tree, blognames, jpeg='blogclust.jpg')
import clusters

# 2-D MDS projection of the blog vectors (the hierarchical tree is built
# but only the projection is rendered).
blognames, words, data = clusters.readfile('blogVectorResult.txt')
tree = clusters.hcluster(data)
# clusters.printclust(tree, labels=blognames)
reload(clusters)
# clusters.drawdendrogram(tree, blognames, jpeg='blogclust.jpg')
# kclust = clusters.kcluster(data, k=20)
points = clusters.scaledown(data)
clusters.draw2d(points, blognames, jpeg='blogs2d.jpg')
import clusters

docs, words, data = clusters.readfile('titles_vectors.txt')
distance_func = clusters.euclidean

# Header row: blank corner cell followed by the padded document names.
similarity_matrix = [[" "] + ["{:<5}".format(d) for d in docs]]
# One row per document, label first, pre-filled with 5-space blanks.
for i in range(1, len(data) + 1):
    similarity_matrix.append([docs[i - 1]] + [' ' * 5] * len(data))

print("Euclidean distance - for documents")
# Fill the upper triangle with 1/(1+dist) similarity scores.
for i in range(len(data) - 1):
    for j in range(i + 1, len(data)):
        dist = distance_func(data[i], data[j])
        similarity_matrix[i + 1][j + 1] = "{:<5}".format(
            "{:.2f}".format(1.0 / (1.0 + dist)))
clusters.print_2d_array(similarity_matrix)
# (fragment) tail of a try/for loop whose opening lines are outside this view:
        rows.append(row)
    except:
        # A malformed record: count it so the effective size shrinks below.
        skip -= 1

size = size + skip
print(skip, size)
#print(rows)
# Write the matrix: header line is the word list, then one '|||'-joined
# row per title (missing words written as 0).
f = open("newsdata.txt", "w")
f.write("|||".join(word_list) + "\n")
for i in range(size):
    row = rows[i]
    last_row = []
    last_row.append(row["TITLE"])
    for word in word_list:
        if word in row:
            last_row.append(str(row[word]))
        else:
            last_row.append(str(0))
    f.write("|||".join(last_row) + "\n")
f.close()

# what's data :data.append([float(x) for x in p[1:]]) p:row
# Re-read the freshly written matrix and print the ASCII dendrogram.
blognames, words, data = clusters.readfile("newsdata.txt")
clust = clusters.hcluster(data)
clusters.printclust(clust, labels=blognames)
#!/usr/local/bin/python
# 2-D MDS projection of the blog matrix.
import clusters

names, vocab, matrix = clusters.readfile('blogdata.txt')
points = clusters.scaledown(matrix)
clusters.draw2d(points, names, jpeg='blogs.jpg')
#!/usr/local/bin/python
# all code here stolen shamelessly from
# "Programming Collective Intelligence, Chapter 3"
import sys
import argparse

sys.path.insert(0, '../libs')
import clusters

# 2-D MDS projection of the blog term matrix.
blognames, words, data = clusters.readfile('../producedFiles/blogtermmatrix.txt')
points = clusters.scaledown(data)
clusters.draw2d(points, blognames, jpeg='../producedFiles/2dBlogSpace.jpg')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import clusters

blognames, words, data = clusters.readfile('./../data/banpaku_utf8.csv')
tree = clusters.hcluster(data)
# Text output on the console (disabled):
# clusters.printclust(tree, labels=blognames)
# Image output:
reload(clusters)
clusters.drawdendrogram(tree, blognames, jpeg="banpaku_reg.jpg")
import clusters

# K-means the blog matrix into 20 clusters and print each cluster's blog
# names. The original spelled out 20 identical print statements; since
# kcluster returns exactly k clusters, this loop prints the same output.
blognames, words, data = clusters.readfile('blogsdata.txt')
kclust = clusters.kcluster(data, k=20)
for centroid in kclust:
    print([blognames[r] for r in centroid])
# (fragment) tail of a distance-list helper whose definition starts outside
# this view: returns the sorted list of (distance, index) pairs.
    distancelist.sort()
    return distancelist

def euclidean(v1, v2):  #NEED TO REIMPLEMENT.
    # Plain Euclidean distance between two equal-length numeric vectors.
    d = 0.0
    for i in range(len(v1)):
        d += (v1[i] - v2[i])**2
    return math.sqrt(d)

#getBlogs()
#main()
blognames, words, data = clusters.readfile('similarblogdata.txt')
print(blognames)
print(words)
print(data)
# Sanity check: report adjacent rows whose vector lengths differ.
for i in range(len(data[1:])):
    if len(data[i + 1]) != len(data[i]):
        print(blognames[i + 1])
        print(len(data[i + 1]))
        print(blognames[i])
        print(len(data[i]))
clust = clusters.hcluster(data)
clusters.printclust(clust, labels=blognames)
clusters.drawdendrogram(clust, blognames, jpeg='sblogclust.jpg')
kclust = clusters.kcluster(data, k=5)
printkclustValues(kclust)
def drawingtheDendrogram():
    """Cluster the blog matrix and save the dendrogram as blogclust.jpg."""
    names, vocab, matrix = clusters.readfile('blogdata1.txt')
    tree = clusters.hcluster(matrix)
    reload(clusters)
    clusters.drawdendrogram(tree, names, jpeg='blogclust.jpg')
import clusters

# Hierarchically cluster the blog matrix and save the dendrogram image.
names, vocab, matrix = clusters.readfile('blogdataascii.txt')
tree = clusters.hcluster(matrix)
clusters.drawdendrogram(tree, names, jpeg='blogcluster.jpg')
import clusters

def printCentroid(name, kcluster, n):
    """Echo each of the first *n* centroids to stdout and append the same
    report to kcluster<n>.txt.

    name     -- list of row labels (e.g. blog names)
    kcluster -- list of clusters, each a list of row indices into `name`
    n        -- number of centroids to report (also part of the filename)

    FIX: the original opened the report file twice per centroid with 'a+'
    and never closed any handle; a single context-managed open now covers
    the whole report. The written/printed text is unchanged.
    """
    with open("kcluster" + str(n) + ".txt", 'a+') as report:
        for x in range(n):
            members = [name[r] for r in kcluster[x]]
            print("Centroid ", str(x + 1), ":")
            print(members)
            print("Centroid ", str(x + 1), ":", file=report)
            print(members, file=report)

name, word, data = clusters.readfile('blogdata1 (copy).txt')
kcluster = clusters.kcluster(data, k=5)
printCentroid(name, kcluster, 5)
kcluster = clusters.kcluster(data, k=10)
printCentroid(name, kcluster, 10)
kcluster = clusters.kcluster(data, k=20)
printCentroid(name, kcluster, 20)