def PrintClust(self):
    """Render the hierarchical cluster tree as text on a scrollable canvas.

    Reads vectors from data.txt and clusters them with the distance method
    selected by the radio variable: "1" -> Pearson, "2" -> Tanimoto.  Any
    other value pops an error dialog and redraws the data matrix.
    """
    self.DataMatrix()
    # Canvas that will hold the textual dendrogram.
    self.maincanvas = Canvas(self.root, bg='#FFFFFF', width=100, height=320)
    self.maincanvas.grid(row=8, rowspan=8, column=1, columnspan=16,
                         pady=10, sticky="WE")
    self.xscrollbar = Scrollbar(self.root, orient=HORIZONTAL,
                                command=self.maincanvas.xview)
    self.xscrollbar.grid(row=15, column=1, columnspan=16, sticky="WSE")
    self.yscrollbar = Scrollbar(self.root, orient=VERTICAL,
                                command=self.maincanvas.yview)
    self.yscrollbar.grid(column=17, row=8, rowspan=8, sticky="WNS")
    # BUG FIX: the canvas must be wired to the scrollbars' .set methods;
    # the original passed the Scrollbar widgets themselves, so scrollbar
    # thumbs never tracked the view.
    self.maincanvas.config(xscrollcommand=self.xscrollbar.set,
                           yscrollcommand=self.yscrollbar.set)
    courses, words, data = clusters.readfile("data.txt")
    # Distance method selection (the two branches only differed in the
    # metric, so the common tail is shared).
    choice = str(self.var.get())
    if choice == "1":
        clust = clusters.hcluster(data, distance=clusters.pearson)
    elif choice == "2":
        clust = clusters.hcluster(data, distance=clusters.tanimoto)
    else:
        tkMessageBox.showerror("ERROR", "Please select distance method.")
        self.DataMatrix()
        return
    self.printclust = clusters.clust2str(clust, labels=courses)
    # BUG FIX: font name was misspelled "Helvetiva", making Tk silently
    # fall back to a default font.
    self.maincanvas.create_text(350, 10, font="Helvetica 10",
                                text=self.printclust, anchor="nw")
    self.DataMatrix()
def clustering(self, canvas):
    """Cluster districts or parties and display the dendrogram on *canvas*."""
    labels, _rows, matrix = clusters.readfile('data.txt')
    tree = clusters.hcluster(matrix, distance=clusters.sim_distance)
    clusters.drawdendrogram(tree, labels, jpeg='cl.jpg')
    photo = ImageTk.PhotoImage(Image.open("cl.jpg"))
    canvas.create_image(20, 20, anchor=NW, image=photo)
    # Keep a reference on the canvas so the image is not garbage-collected.
    canvas.image = photo
def main(nameof_file):
    """Hierarchically cluster country vectors from *nameof_file*.

    Draws a dendrogram, then prints the members and the SSE of seven
    sub-trees picked by hand from the drawn dendrogram.
    """
    (countries, vectors) = kmeans.readfile(nameof_file)
    clusters = utils.hcluster(vectors, distance=choice_dist)
    utils.drawdendrogram(
        clusters, list(map(lambda x: x[1], countries)),
        jpeg="C:/Users/akars/clustering_lab/processedhierarchical.jpg")
    # Sub-trees selected manually from the rendered dendrogram.
    opt_clust = [
        clusters.left,
        clusters.right.left,
        clusters.right.right.left.left,
        clusters.right.right.left.right.left,
        clusters.right.right.left.right.right,
        clusters.right.right.right.right,
        clusters.right.right.right.left,
    ]
    cluster_level = []
    for cluster in opt_clust:
        country_ids = []
        get(cluster, country_ids)
        cluster_level.append(country_ids)
    # BUG FIX: iterate over the clusters actually collected rather than a
    # global `clusternum`, which could disagree with len(cluster_level)
    # and raise IndexError (or silently skip clusters).
    for i, ids in enumerate(cluster_level):
        print('cluster {}:'.format(i + 1))
        print([countries[r] for r in ids])
    print("SSE: " + str(kmeans.sse(cluster_level, vectors)))


if __name__ == "__main__":
    main("C:/Users/akars/clustering_lab/processed/preprocessed.csv")
def main(input_f):
    """Hierarchically cluster country vectors from *input_f*.

    Draws a dendrogram, then prints the members and the SSE of seven
    self-picked clusters from the graph which I considered good.
    """
    (countries, vectors) = k_means.read_file(input_f)
    clusters = utils.hcluster(vectors, distance=distance_function)
    utils.drawdendrogram(clusters,
                         list(map(lambda x: x[1], countries)),
                         jpeg='data/hierarchical.jpg')
    # Self-picked clusters from the graph which I considered good.
    good_clusters = [
        clusters.left,
        clusters.right.left,
        clusters.right.right.left.left,
        clusters.right.right.left.right.left,
        clusters.right.right.left.right.right,
        clusters.right.right.right.right,
        clusters.right.right.right.left,
    ]
    cluster_level = []
    for cluster in good_clusters:
        country_ids = []
        get_all(cluster, country_ids)
        cluster_level.append(country_ids)
    # BUG FIX: iterate over the clusters actually collected rather than a
    # global `num_clusters`, which could disagree with len(cluster_level).
    for i, ids in enumerate(cluster_level):
        print('cluster {}:'.format(i + 1))
        print([countries[r] for r in ids])
    print("SSE: " + str(k_means.sse(cluster_level, vectors)))
def __main__():
    """Cluster blog data with Euclidean distance and save a dendrogram."""
    blognames, words, data = clusters.read_file("blogdata.txt")
    tree = clusters.hcluster(data, distance=euclidean_distance)
    clusters.draw_dendogram(tree, blognames, jpeg="ex3dendrogram.jpg")
    # I think this weights against groupings that have similar word use
    # rates but different word use counts.
def do_stemmed():
    """Full analysis pipeline for the stemmed blog data set.

    Builds the stemmed blog matrix, writes an ASCII and a JPEG dendrogram,
    runs k-means for k in {5, 10, 20} with logging, and finally produces a
    2-D multidimensional-scaling plot.
    """
    generate_blogfile_stem()
    blognames, words, data = clusters.readfile(
        'datafiles/blogtop500_stemmed.txt')
    tree = clusters.hcluster(data)

    # ASCII dendrogram.
    with open("datafiles/blogtop500stemmed_asciideno.txt", "w+") as out:
        clusters.printclust2file(tree, out, labels=blognames)
    # JPEG dendrogram.
    clusters.drawdendrogram(tree, blognames,
                            jpeg='datafiles/blogtop500stemmed_deno.jpg')

    # k-means runs, logged to file.
    with open("datafiles/kmeans_blogtop500stemmed.txt", "w+") as kout:
        for k in [5, 10, 20]:
            print("For k=%d" % k)
            kout.write("K=%d\n" % k)
            kout.write("Iterations\n")
            centroids = clusters.kcluster_toFile(data, k=k, out=kout)
            kout.write("Centroid Values\n-------------------------\n")
            for count, centroid in enumerate(centroids, 1):
                print("Centroid #%d" % count)
                kout.write("Centroid #%d\n" % count)
                names = []
                for idx in centroid:
                    print(blognames[idx])
                    names.append(blognames[idx])
                kout.write("%s\n" % ', '.join(names))
                kout.write("=================================\n")
            print("-------")

    # Dimensionality reduction and the 2-D similarity plot.
    with open("datafiles/dimensionReductionStemmed.txt", "w+") as dout:
        scaled = clusters.scaledown_logiter(data, out=dout)
        clusters.draw2d(scaled, blognames,
                        jpg='datafiles/blogtop500stemmed_clust2d.jpg')
def createDendrogram():
    """Cluster blog data; save a JPEG dendrogram and an ASCII version.

    The ASCII dendrogram is captured into ASCII.txt by temporarily
    redirecting stdout around clusters.printclust.
    """
    import contextlib

    blogs, colnames, data = clusters.readfile('blogdata.txt')
    cluster = clusters.hcluster(data)
    clusters.drawdendrogram(cluster, blogs, jpeg='Dendrogram.jpg')
    # BUG FIX: the original assigned sys.stdout to the file, closed the
    # file (leaving sys.stdout closed for the rest of the program) and
    # then closed sys.stderr as well.  A scoped redirect restores stdout
    # automatically and leaves stderr alone.
    with open("ASCII.txt", 'w') as f:
        with contextlib.redirect_stdout(f):
            clusters.printclust(cluster, labels=blogs)
def hierarchical(self):
    """Hierarchically cluster the loaded data and show the tree in t3.

    Sets self.clst = 1 to record the active clustering mode; does nothing
    if no data has been loaded.
    """
    self.clst = 1
    if len(self.data) != 0:
        clust = clusters.hcluster(self.data)
        output = clusters.clust2str(clust, self.authors)
        # BUG FIX: removed a no-op self.t3.get("1.0", END) whose return
        # value was discarded; clearing and refilling the widget is all
        # that is needed.
        self.t3.delete("1.0", END)
        self.t3.insert(END, output)
def get_clusture(self, param):
    """Draw a dendrogram of either countries or criteria.

    param - str -> parameter specified in self.writefiles:
        "Country"   -> cluster the rows (countries)
        "Criterias" -> cluster the columns (criteria / data fields)

    Raises:
        ValueError: for any other *param* value (the original fell
        through and crashed with a NameError instead).
    """
    country_names, records, records_data = clusters.readfile(
        self.writed_names)
    if param == "Country":
        clust = clusters.hcluster(records_data)
        label = country_names
    elif param == "Criterias":
        # Rotate so that columns become rows before clustering.
        rotated = clusters.rotatematrix(records_data)
        clust = clusters.hcluster(rotated)
        label = records
    else:
        raise ValueError(
            "param must be 'Country' or 'Criterias', got %r" % (param,))
    self.jpg_names = 'clustured2.jpg'
    clusters.drawdendrogram(clust, labels=label, jpeg=self.jpg_names)
    self.show_image()
def cluster_parties(self):
    """Cluster the parties and display the dendrogram on the canvas."""
    # Clicking "cluster parties" switches the analysis state to party.
    self.state = "party"
    self.analysis_frame.pack(side=TOP, fill=BOTH)
    # Clear the previous drawing.
    # https://stackoverflow.com/questions/15839491/how-to-clear-tkinter-canvas
    self.canvas.delete("all")
    self.party_list, self.district_list, self.data = clusters.readfile(
        "matrix.txt")
    tree = clusters.hcluster(self.data, distance=clusters.sim_distance)
    clusters.drawdendrogram(tree, self.party_list, jpeg='parties.jpg')
    # Put the freshly clustered image onto the canvas.
    self.insert_image("parties.jpg")
def main():
    """Print an ASCII dendrogram of the hierarchically clustered blogs."""
    # readfile returns blog titles, the words kept (10%-50% frequency
    # boundaries) and the per-blog frequency vectors.
    blognames, words, data = clusters.readfile('blogdata.txt')
    # hcluster returns a binary tree of nodes (id / left / right);
    # the default distance measure is Pearson's r.
    tree = clusters.hcluster(data)
    # Walk the tree and print an ASCII approximation of the dendrogram.
    clusters.printclust(tree, labels=blognames)
def createJPegDendogram():
    """Cluster the TF-IDF blog vectors and save a JPEG dendrogram.

    An earlier variant read 'blogVector.txt' and wrote 'blogclust.jpg';
    this version works on the TF-IDF weighted matrix instead.
    """
    blognames, words, data = clusters.readfile('blogVectorTFIDFVersion.txt')
    tree = clusters.hcluster(data)
    clusters.drawdendrogram(tree, blognames,
                            jpeg='blogclustTFIDFVersion.jpg')
def cluster_district(self):
    """Cluster the districts and display the dendrogram on the canvas."""
    # Clicking "cluster districts" switches the analysis state to district.
    self.state = "district"
    self.analysis_frame.pack(side=TOP, fill=BOTH)
    # Clear the previous drawing.
    # https://stackoverflow.com/questions/15839491/how-to-clear-tkinter-canvas
    self.canvas.delete("all")
    self.party_list, self.district_list, self.data = clusters.readfile(
        "matrix.txt")
    # Districts are the columns, so the matrix must be rotated first.
    rotated = clusters.rotatematrix(self.data)
    tree = clusters.hcluster(rotated, distance=clusters.sim_distance)
    clusters.drawdendrogram(tree, self.district_list, jpeg='districts.jpg')
    # Put the freshly clustered image onto the canvas.
    self.insert_image("districts.jpg")
def testNormal(self):
    """hcluster on four rows must produce the expected three-merge tree."""
    rows = [[6, 4, 2], [2, 4, 6], [1, 2, 3], [3, 2, 1.01]]
    clust = [clusters.bicluster(rows[i], id=i) for i in range(len(rows))]
    # rows[1] and rows[2] are perfectly correlated, so they merge first
    # with distance 0.
    c0 = clusters.bicluster(clusters.mergevecs(rows[1], rows[2]),
                            left=clust[1], right=clust[2],
                            id=-1, distance=0.0)
    c1 = clusters.bicluster(clusters.mergevecs(rows[0], rows[3]),
                            left=clust[0], right=clust[3], id=-2,
                            distance=clusters.pearson_dist(rows[0], rows[3]))
    c2 = clusters.bicluster(clusters.mergevecs(c0.vec, c1.vec),
                            left=c0, right=c1, id=-3,
                            distance=clusters.pearson_dist(c0.vec, c1.vec))
    # FIX: assertEquals is a long-deprecated alias removed in Python 3.12.
    self.assertEqual(c2, clusters.hcluster(rows))
def cluster_poli(self, event):
    """Cluster the parties and display the resulting dendrogram."""
    # On the very first clustering run, build the second part of the GUI.
    if self.run == 0:
        self.create_rest_of_gui()
        self.run += 1
    self.update_idletasks()
    # Remember the mode for the refined-analysis step.
    self.var.set("party")
    # Rotate the matrix and delegate the clustering to clusters.py.
    tree = clusters.hcluster(
        clusters.rotatematrix(self.create_matrix()),
        distance=sim_distance)
    # Draw the dendrogram (written to clusters.jpg by clusters.py).
    clusters.drawdendrogram(tree, self.data_center.list_of_parties)
    # Recreate the second GUI part so everything is reset.
    self.create_rest_of_gui()
    self.img = ImageTk.PhotoImage(Image.open("clusters.jpg"))
    # Insert the dendrogram into the canvas.
    self.canvas.create_image(0, 0, anchor=NW, image=self.img)
def testNormal(self):
    """hcluster on four rows must produce the expected three-merge tree."""
    rows = [[6, 4, 2], [2, 4, 6], [1, 2, 3], [3, 2, 1.01]]
    clust = [clusters.bicluster(rows[i], id=i) for i in range(len(rows))]
    # rows[1] and rows[2] are perfectly correlated, so they merge first
    # with distance 0.
    c0 = clusters.bicluster(clusters.mergevecs(rows[1], rows[2]),
                            left=clust[1], right=clust[2],
                            id=-1, distance=0.0)
    c1 = clusters.bicluster(clusters.mergevecs(rows[0], rows[3]),
                            left=clust[0], right=clust[3], id=-2,
                            distance=clusters.pearson_dist(rows[0], rows[3]))
    c2 = clusters.bicluster(clusters.mergevecs(c0.vec, c1.vec),
                            left=c0, right=c1, id=-3,
                            distance=clusters.pearson_dist(c0.vec, c1.vec))
    # FIX: assertEquals is a long-deprecated alias removed in Python 3.12.
    self.assertEqual(c2, clusters.hcluster(rows))
def clustering_button(self):
    """Run the clustering selected by the radio buttons and show the
    result in the results listbox.

    Supports hierarchical clustering (one listbox row per dendrogram
    line) and k-means (one row per cluster, members by name).
    """
    if len(database) == 0:
        self.Error_Message_Function()
        return
    prof_names, words, data = clusters.readfile("Will_be_Cluestered.txt")
    # Map the radio value onto a clustering type via the dictionary.
    type_of_clustering = values_of_clustering[int(self.Radio_Values3.get())]
    if type_of_clustering == "Hierarcial":
        clust = clusters.hcluster(data)
        self.All_Results_Part.delete(0, END)
        # split('\n') yields one row per dendrogram line; the trailing
        # empty string after the final newline is skipped.  (Hoisted out
        # of the loop: the original recomputed clust2str per iteration.)
        lines = clusters.clust2str(clust, labels=prof_names).split('\n')
        for line in lines[:-1]:
            self.All_Results_Part.insert(END, line)
    elif type_of_clustering == "K-Means":
        # k is read from the entry widget.
        clust = clusters.kcluster(data, k=int(self.Value_of_k.get()))
        # BUG FIX: dict.keys() is not indexable in Python 3.
        prof_names = list(database.keys())
        sized = [(len(members), members) for members in clust]
        sized.sort(reverse=True)  # biggest clusters first
        self.All_Results_Part.delete(0, END)
        for counter, (_size, members) in enumerate(sized, 1):
            # BUG FIX: the original indexed prof_names with
            # range(len(members)) — always the first N names instead of
            # the cluster's actual members — and reused the loop
            # variable `i` for two nested loops.
            names = ""
            for member in members:
                names += str(prof_names[member]) + " "
            self.All_Results_Part.insert(
                END, "Cluster %d:{" % counter + names + "}" + "\n")
def do_non_stem():
    """Full analysis pipeline for the non-stemmed blog data set.

    Builds the blog matrix, writes an ASCII and a JPEG dendrogram, runs
    k-means for k in {5, 10, 20} with logging, and finally produces a
    2-D multidimensional-scaling plot.
    """
    generate_blogfile()
    blognames, words, data = clusters.readfile('datafiles/blogtop500.txt')
    tree = clusters.hcluster(data)

    # ASCII dendrogram.
    with open("datafiles/blogtop500_asciideno.txt", "w+") as out:
        clusters.printclust2file(tree, out, labels=blognames)
    # JPEG dendrogram.
    clusters.drawdendrogram(tree, blognames,
                            jpeg='datafiles/blogtop500_deno.jpg')

    # k-means runs, logged to file.
    with open("datafiles/kmeans_blogtop500.txt", "w+") as kout:
        for k in [5, 10, 20]:
            print("For k=%d" % k)
            kout.write("K=%d\n" % k)
            kout.write("Iterations\n")
            centroids = clusters.kcluster_toFile(data, k=k, out=kout)
            kout.write("Centroid Values\n-------------------------\n")
            for count, centroid in enumerate(centroids, 1):
                print("Centroid #%d" % count)
                kout.write("Centroid #%d\n" % count)
                names = []
                for idx in centroid:
                    print(blognames[idx])
                    names.append(blognames[idx])
                kout.write("%s\n" % ', '.join(names))
                kout.write("=================================\n")
            print("-------")

    # Dimensionality reduction and the 2-D similarity plot.
    with open("datafiles/dimensionReductionNonStemmed.txt", "w+") as dout:
        scaled = clusters.scaledown_logiter(data, out=dout)
        clusters.draw2d(scaled, blognames,
                        jpg='datafiles/blogtop500_clust2d.jpg')
import clusters

# Cluster the TF-IDF weighted blog matrix.
blog, words, data = clusters.readfile('tfidf.txt')
variable = clusters.hcluster(data)

# ASCII dendrogram on stdout.
clusters.printclust(variable, labels=blog)

# JPEG dendrogram on disk.
clusters.drawdendrogram(variable, blog, jpeg='clusterblogtfidf.jpg')
import clusters

docs, words, data = clusters.readfile('titles_vectors.txt')
# Rotate so words become rows; we cluster the vocabulary, not the titles.
rdata = clusters.rotatematrix(data)

# Run the same pipeline with three different distance metrics.
for metric, title, outfile in (
        (clusters.pearson, 'clusters by pearson correlation',
         'wordsclustpearson.jpg'),
        (clusters.tanimoto, 'clusters by tanimoto coefficient',
         'wordsclusttanimoto.jpg'),
        (clusters.euclidean, 'clusters by euclidean distance',
         'wordsclusteuclidean.jpg')):
    clust = clusters.hcluster(rdata, distance=metric)
    print(title)
    clusters.printhclust(clust, labels=words)
    clusters.drawdendrogram(clust, words, jpeg=outfile)
def __main__():
    """Cluster the entry data and save the dendrogram to ex2dend.jpg."""
    entries, words, data = clusters.read_file('entrydata.txt')
    tree = clusters.hcluster(data)
    clusters.draw_dendogram(tree, entries, jpeg="ex2dend.jpg")
def drawingtheDendrogram():
    """Cluster blog data and write the dendrogram to blogclust.jpg."""
    blognames, words, data = clusters.readfile('blogdata1.txt')
    clust = clusters.hcluster(data)
    # BUG FIX: removed the stray reload(clusters) that sat between
    # clustering and drawing — reload is not a builtin on Python 3, and
    # reloading the module after building `clust` makes the tree's
    # classes stale relative to the freshly loaded module.
    clusters.drawdendrogram(clust, blognames, jpeg='blogclust.jpg')
import clusters

row_names, column_names, data = clusters.readfile('dataset_vectors.txt')

# Default closeness (centroid-style merge).
tree = clusters.hcluster(data)
print('clusters by euclidean distance')
clusters.printhclust(tree, labels=row_names)
clusters.drawdendrogram(tree, row_names,
                        jpeg='hcluster_euclidean_centroid.jpg')
print()

# Single linkage: merge on the minimum pairwise distance.
tree = clusters.hcluster(data, clusters.find_by_min)
print('clusters by euclidean distance')
clusters.printhclust(tree, labels=row_names)
clusters.drawdendrogram(tree, row_names, jpeg='hcluster_euclidean_min.jpg')
print()

# Complete linkage: merge on the maximum pairwise distance.
tree = clusters.hcluster(data, clusters.find_by_max)
print('clusters by euclidean distance')
clusters.printhclust(tree, labels=row_names)
clusters.drawdendrogram(tree, row_names, jpeg='hcluster_euclidean_max.jpg')
import kmcluster
import clusters
import word_cloud

# Hierarchically cluster the processed data with cosine distance,
# using max as the inter-cluster distance (complete linkage).
country, data = kmcluster.read_file("processed_data.csv")
clust = clusters.hcluster(data, distance=clusters.cosine, inter_dis=max)
print(clust)
print('cosine similarity')
def getHCluster(inputFile):
    """Do Hierarchical Clustering.

    Returns (blognames, words, cluster_tree) for *inputFile*.
    """
    names, vocab, matrix = clusters.readfile(inputFile)
    tree = clusters.hcluster(matrix)
    return names, vocab, tree
####54页调用generatefeedvector生成blogdata文件失败。是因为feedlist里面的网址无法打开吗? ###downloadzebodata生成zebo.txt也失败。sigh import clusters blognames,words,data = clusters.readfile('blogdatadown.txt')#1 #clust = clusters.hcluster(data) #print (clust)#果然函数中这个值输出也都不一样呢。 #print(blognames) #clusters.printclust(clust, labels = blognames)#2 #clusters.drawdendrogram(clust, blognames, jpeg = 'blogclust.jpg')#3 rdata = clusters.rotatematrix(data)#4 wordclust = clusters.hcluster(rdata) clusters.drawdendrogram(wordclust, labels = words, jpeg = 'wordclust.jpg') ''' kclust = clusters.kcluster(data, k = 4)#5 print ([blognames[r] for r in kclust[0]]) print ([blognames[r] for r in kclust[1]]) import urllib.request#6 from bs4 import BeautifulSoup c = urllib.request.urlopen('https://en.wikipedia.org/wiki/Jon_Snow') soup = BeautifulSoup(c.read(),"lxml")#这里非常有趣! 感觉有空需要看下这个源代码库呀。 links = soup('a')#所以我还是不懂beautiful soup 的用法呀。 print(links[10]) print(links[10]['href']) #这一段是教BS的。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import clusters

# Cluster the banpaku CSV data.
blognames, words, data = clusters.readfile('./../data/banpaku_utf8.csv')
clust = clusters.hcluster(data)

# Show the result on the console (disabled).
#clusters.printclust( clust, labels=blognames)

# Show the result as an image.
reload(clusters)
clusters.drawdendrogram(clust, blognames, jpeg="banpaku_reg.jpg")
def createAsciiDendogram():
    """Print an ASCII dendrogram of the clustered blog vectors."""
    blognames, words, data = clusters.readfile('blogVector.txt')
    tree = clusters.hcluster(data)
    clusters.printclust(tree, labels=blognames)
def drawDendogram():
    """Cluster the blog data and save the dendrogram as blogclust.jpg."""
    blognames, words, data = clusters.readfile('blogdata.txt')
    tree = clusters.hcluster(data)
    clusters.drawdendrogram(tree, blognames, jpeg='blogclust.jpg')
import clusters

blogname, words, data = clusters.readfile('blogdata2.txt')

# 2-D multidimensional scaling of the blogs.
coords = clusters.scaledown(data)
clusters.draw2d(coords, blogname, jpeg='blog2d.jpg')

# Rotate the matrix so words become rows, then cluster the words.
rdata = clusters.rotatematrix(data)
wordclust = clusters.hcluster(rdata)
clusters.drawdendrogram(wordclust, labels=words, jpeg='wordclust.jpg')
def draw_dendogram():
    """Cluster jobs by their project vectors and save jobclust.jpg."""
    jobnames, projects, data = clusters.readfile('job_projects')
    tree = clusters.hcluster(data)
    clusters.drawdendrogram(tree, jobnames, jpeg='jobclust.jpg')
def draw_dendogram():
    """Cluster the generated blog data and save blogclust.jpg."""
    blognames, words, data = clusters.readfile('Outputs/blogdata.txt')
    tree = clusters.hcluster(data)
    clusters.drawdendrogram(tree, blognames, jpeg='blogclust.jpg')
    # NOTE(review): this except belongs to a try/for that starts before the
    # visible portion of the file; indentation reconstructed accordingly.
    except IndexError:
        outfile.close()
        return
    outfile.write('\n')


if __name__ == "__main__":
    # Build the article/word matrix.
    allw, articlew, artt = getarticlewords()
    wordmatrix, wordvec = makematrix(allw, articlew)
    # print wordvec[0:10]
    # print artt[1]
    # print wordmatrix[1][0:10]

    # hierarchical clustering
    import clusters
    clust = clusters.hcluster(wordmatrix)
    clusters.drawdendrogram(clust, artt, jpeg='news.jpg')

    # non-negative matrix factorization
    import nmf
    # m1 = np.matrix([[1, 2, 3], [4, 5, 6]])
    # m2 = np.matrix([[1, 2], [3, 4], [5, 6]])
    # w, h = nmf.factorize(m1 * m2, pc = 3, iter = 100)
    # print w * h
    v = np.matrix(wordmatrix)
    weights, feats = nmf.factorize(v, pc=20, iter=50)
    topp, pn = showfeatures(weights, feats, artt, wordvec)
    showarticles(artt, topp, pn)
def __main__():
    """Cluster delicious URLs by shared tags and save the dendrogram."""
    tag_list = build_tag_list("programming")
    tags, urls, data = build_tag_matrix(tag_list)
    tree = hcluster(data)
    draw_dendogram(tree, urls, jpeg="delicious.jpg")
import clusters
import json

if __name__ == "__main__":
    docs, words, data = clusters.readfile('data/grocery_vectors.txt')

    # Tanimoto-based hierarchical clustering of the grocery vectors.
    clust = clusters.hcluster(data, distance=clusters.tanimoto)
    print('clusters by tanimoto coefficient')
    clusters.drawdendrogram(clust, docs, jpeg='img/groceries_tanimoto.jpg')

    # Serialize the tree for later consumption.
    json_obj = {}
    clusters.jsonify(clust, json_obj)
    with open("json/tanimoto.json", "w") as output:
        json.dump(json_obj, output)

    # The pearson / cosine / euclidean variants were explored too and can
    # be re-enabled with the same three-line pattern.
def prefer():
    """Cluster the zebo 'wants' items and draw their dendrogram."""
    reload(clusters)
    wants, people, data = clusters.readfile('zebo.txt')
    tree = clusters.hcluster(data, distance=clusters.tanamoto)
    clusters.drawdendrogram(tree, wants)
def clustering(): print '## Clustering' import clusters allw,artw,artt,wordmatrix,wordvec=readpickle() clust=clusters.hcluster(wordmatrix) clusters.drawdendrogram(clust,artt,jpeg='cluster.jpg')
def ColumnClustering():
    """Cluster the columns (words) of the blog matrix."""
    reload(clusters)
    blognames, words, data = clusters.readfile('blogdata1.txt')
    # Rotate so each word becomes a row vector.
    rotated = clusters.rotatematrix(data)
    word_tree = clusters.hcluster(rotated)
    clusters.drawdendrogram(word_tree, labels=words, jpeg='wordclust.jpg')
import clusters

# Cluster the TF-IDF weighted blog matrix.
blog, words, data = clusters.readfile('tfidf.txt')
variable = clusters.hcluster(data)

# ASCII dendrogram on stdout.
clusters.printclust(variable, labels=blog)

# JPEG dendrogram on disk.
clusters.drawdendrogram(variable, blog, jpeg='clusterblogtfidf.jpg')
def countword():
    """Read the blog matrix and build its hierarchical cluster tree.

    NOTE(review): the tree is built but neither returned nor displayed —
    the function appears to exist for its side-effect-free timing or as
    a stub; confirm intent before extending.
    """
    blognames, words, data = clusters.readfile('blogdata1.txt')
    tree = clusters.hcluster(data)
def __main__():
    """Cluster zebo wants with Manhattan distance; save ex4dend.jpg."""
    wants, people, data = clusters.read_file('zebo.txt')
    tree = clusters.hcluster(data, distance=manhattan_distance)
    clusters.draw_dendogram(tree, wants, jpeg="ex4dend.jpg")
    # NOTE(review): this except belongs to a try/for that starts before the
    # visible portion of the file; indentation reconstructed accordingly.
    except IndexError:
        outfile.close()
        return
    outfile.write('\n')


if __name__ == "__main__":
    # Build the article/word matrix.
    allw, articlew, artt = getarticlewords()
    wordmatrix, wordvec = makematrix(allw, articlew)
    # print wordvec[0:10]
    # print artt[1]
    # print wordmatrix[1][0:10]

    # hierarchical clustering
    import clusters
    clust = clusters.hcluster(wordmatrix)
    clusters.drawdendrogram(clust, artt, jpeg = 'news.jpg')

    # non-negative matrix factorization
    import nmf
    # m1 = np.matrix([[1, 2, 3], [4, 5, 6]])
    # m2 = np.matrix([[1, 2], [3, 4], [5, 6]])
    # w, h = nmf.factorize(m1 * m2, pc = 3, iter = 100)
    # print w * h
    v = np.matrix(wordmatrix)
    weights, feats = nmf.factorize(v, pc = 20, iter = 50)
    topp, pn = showfeatures(weights, feats, artt, wordvec)
    showarticles(artt, topp, pn)
def generateAscii():
    """Print an ASCII dendrogram of the blog clusters to stdout."""
    blognames, words, data = clusters.readfile('blogdata.txt')
    tree = clusters.hcluster(data)
    clusters.printclust(tree, labels=blognames)
__author__ = 'feng'

import clusters

# Cluster user sessions with a custom session dissimilarity measure and
# save the resulting dendrogram.
sessionIds, data = clusters.readSessionFile('session.csv')
clust = clusters.hcluster(data, distance=clusters.session_dissimilarity)
clusters.drawdendrogram(clust, sessionIds, jpeg='sessionclust.jpg')
import clusters

# Cluster the ASCII blog data and save its dendrogram.
blognames, words, data = clusters.readfile('blogdataascii.txt')
clust = clusters.hcluster(data)
clusters.drawdendrogram(clust, blognames, jpeg='blogcluster.jpg')
import clusters

# Hierarchical clustering of the blogs.
blognames, words, data = clusters.readfile('blogdata.txt')
clust = clusters.hcluster(data)

# k-means, two variants (the second also returns cluster positions).
kclust1 = clusters.kcluster(data, k=10)
kclust2, clusters_pos = clusters.kcluster_exercise(data, k=10)

# Clusters on preference data (zebo wants), drawn as a dendrogram.
wants, people, data = clusters.readfile('zebo.txt')
clust = clusters.hcluster(data, distance=clusters.tanamoto)
clusters.drawdendrogram(clust, wants)