def clustering(self, canvas):
    """Cluster the matrix in ``data.txt`` and show its dendrogram on *canvas*.

    The dendrogram is written to ``cl.jpg`` and then placed on the canvas
    at (20, 20), anchored at its top-left corner.
    """
    # Clusters District or Parties
    labels, _rows, matrix = clusters.readfile('data.txt')
    tree = clusters.hcluster(matrix, distance=clusters.sim_distance)
    clusters.drawdendrogram(tree, labels, jpeg='cl.jpg')

    photo = ImageTk.PhotoImage(Image.open("cl.jpg"))
    canvas.create_image(20, 20, anchor=NW, image=photo)
    # Hold a reference on the canvas object; presumably this keeps the
    # image alive after this method returns.
    canvas.image = photo
def do_stemmed():
    """Run the clustering pipeline on the stemmed blog data.

    Generates the stemmed blog file, then writes (all under ``datafiles/``)
    an ASCII dendrogram, a JPEG dendrogram, a k-means log for k = 5, 10, 20
    and a 2-D scaled plot.
    """
    generate_blogfile_stem()
    blognames, _words, data = clusters.readfile(
        'datafiles/blogtop500_stemmed.txt')

    tree = clusters.hcluster(data)
    with open("datafiles/blogtop500stemmed_asciideno.txt", "w+") as ascii_out:
        clusters.printclust2file(tree, ascii_out, labels=blognames)
    clusters.drawdendrogram(
        tree, blognames, jpeg='datafiles/blogtop500stemmed_deno.jpg')

    with open("datafiles/kmeans_blogtop500stemmed.txt", "w+") as kout:
        for k in (5, 10, 20):
            print("For k=%d" % k)
            kout.write("K=%d\n" % k)
            kout.write("Iterations\n")
            groups = clusters.kcluster_toFile(data, k=k, out=kout)
            kout.write("Centroid Values\n-------------------------\n")
            for number, members in enumerate(groups, 1):
                print("Centroid #%d" % number)
                kout.write("Centroid #%d\n" % number)
                names = []
                for idx in members:
                    name = blognames[idx]
                    print(name)
                    names.append(name)
                kout.write("%s\n" % ', '.join(names))
            # One separator per value of k.
            kout.write("=================================\n")
            print("-------")

    with open("datafiles/dimensionReductionStemmed.txt", "w+") as dout:
        scaled = clusters.scaledown_logiter(data, out=dout)
    clusters.draw2d(
        scaled, blognames, jpg='datafiles/blogtop500stemmed_clust2d.jpg')
def main(input_f):
    """Cluster the vectors in *input_f* hierarchically and report the result.

    Draws the dendrogram to ``data/hierarchical.jpg``, prints the members of
    the first ``num_clusters`` hand-picked subtrees and then the SSE of all
    hand-picked subtrees.
    """
    countries, vectors = k_means.read_file(input_f)
    clusters = utils.hcluster(vectors, distance=distance_function)
    labels = [country[1] for country in countries]
    utils.drawdendrogram(clusters, labels, jpeg='data/hierarchical.jpg')

    # Self-picked clusters from the graph which I considered good
    deep = clusters.right.right
    good_clusters = [
        clusters.left,
        clusters.right.left,
        deep.left.left,
        deep.left.right.left,
        deep.left.right.right,
        deep.right.right,
        deep.right.left,
    ]

    cluster_level = []
    for node in good_clusters:
        members = []
        get_all(node, members)  # fills `members` in place
        cluster_level.append(members)

    for index in range(num_clusters):
        print('cluster {}:'.format(index + 1))
        print([countries[row] for row in cluster_level[index]])
    print("SSE: " + str(k_means.sse(cluster_level, vectors)))
def main(nameof_file):
    """Cluster the vectors in *nameof_file* hierarchically and report the result.

    Draws the dendrogram to a JPEG, prints the members of the first
    ``clusternum`` hand-picked subtrees and then the SSE of all of them.
    """
    countries, vectors = kmeans.readfile(nameof_file)
    clusters = utils.hcluster(vectors, distance=choice_dist)
    labels = [country[1] for country in countries]
    utils.drawdendrogram(
        clusters,
        labels,
        jpeg="C:/Users/akars/clustering_lab/processedhierarchical.jpg")

    # Hand-picked subtrees of the dendrogram.
    deep = clusters.right.right
    opt_clust = [
        clusters.left,
        clusters.right.left,
        deep.left.left,
        deep.left.right.left,
        deep.left.right.right,
        deep.right.right,
        deep.right.left,
    ]

    cluster_level = []
    for node in opt_clust:
        members = []
        get(node, members)  # fills `members` in place
        cluster_level.append(members)

    for index in range(clusternum):
        print('cluster {}:'.format(index + 1))
        print([countries[row] for row in cluster_level[index]])
    print("SSE: " + str(kmeans.sse(cluster_level, vectors)))


if __name__ == "__main__":
    main("C:/Users/akars/clustering_lab/processed/preprocessed.csv")
def createDendrogram():
    """Cluster ``blogdata.txt`` and write both dendrogram renderings.

    Writes the JPEG dendrogram to ``Dendrogram.jpg`` and the ASCII dendrogram
    to ``ASCII.txt``.  ``sys.stdout`` is redirected only while the ASCII
    dendrogram is printed and is always restored afterwards, so later output
    does not hit a closed file; ``sys.stderr`` is left untouched.
    """
    blogs, colnames, data = clusters.readfile('blogdata.txt')
    cluster = clusters.hcluster(data)
    clusters.drawdendrogram(cluster, blogs, jpeg='Dendrogram.jpg')

    previous_stdout = sys.stdout
    with open("ASCII.txt", 'w') as ascii_file:
        # Same capture mechanism as before: printclust's output goes to
        # whatever sys.stdout currently is.
        sys.stdout = ascii_file
        try:
            clusters.printclust(cluster, labels=blogs)
        finally:
            sys.stdout = previous_stdout
def cluster_parties(self):
    """Switch to party mode and show the party dendrogram on the canvas."""
    # If the user clicks "cluster parties", the state changes to party.
    self.state = "party"
    self.analysis_frame.pack(side=TOP, fill=BOTH)
    # Clear the canvas, see
    # https://stackoverflow.com/questions/15839491/how-to-clear-tkinter-canvas
    self.canvas.delete("all")

    parsed = clusters.readfile("matrix.txt")
    self.party_list, self.district_list, self.data = parsed

    tree = clusters.hcluster(self.data, distance=clusters.sim_distance)
    clusters.drawdendrogram(tree, self.party_list, jpeg='parties.jpg')
    # Insert the clustered image into the canvas.
    self.insert_image("parties.jpg")
def main():
    """Label and prune the stored cluster tree for METRIC, then draw it.

    Loads the given hierarchy from ``data/itemHierarchy.csv`` and the computed
    tree from ``json/<METRIC>.json``, and writes the dendrogram to
    ``img/<METRIC>_experiment.jpg``.
    """
    hierarchy = get_clusters("data/itemHierarchy.csv")
    given_clusters, item_hash, cluster_names = hierarchy

    json_path = "json/{}.json".format(METRIC)
    with open(json_path) as handle:
        tree_json = json.load(handle)
    created_clusters = clusters.read_json(tree_json)

    labels = make_vectors.get_list("data/itemIndex.txt")
    label_nodes(created_clusters, labels, item_hash, cluster_names)
    prune(created_clusters, given_clusters, MIN_ACCURACY, 0)

    image_path = 'img/{}_experiment.jpg'.format(METRIC)
    clusters.drawdendrogram(created_clusters, labels, jpeg=image_path)
def createJPegDendogram():
    """Draw the dendrogram of the TF-IDF blog vectors to a JPEG file.

    Reads ``blogVectorTFIDFVersion.txt`` and writes
    ``blogclustTFIDFVersion.jpg``.
    """
    # Earlier, non-TF-IDF variant (disabled):
    #   blognames,words,data=clusters.readfile('blogVector.txt')
    #   clust=clusters.hcluster(data)
    #   clusters.drawdendrogram(clust,blognames,jpeg='blogclust.jpg')
    parsed = clusters.readfile('blogVectorTFIDFVersion.txt')
    blognames, _words, matrix = parsed
    tree = clusters.hcluster(matrix)
    clusters.drawdendrogram(tree, blognames, jpeg='blogclustTFIDFVersion.jpg')
def cluster_district(self):
    """Switch to district mode and show the district dendrogram on the canvas."""
    # If the user clicks "cluster districts", the state changes to district.
    self.state = "district"
    self.analysis_frame.pack(side=TOP, fill=BOTH)
    # Clear the canvas, see
    # https://stackoverflow.com/questions/15839491/how-to-clear-tkinter-canvas
    self.canvas.delete("all")

    parsed = clusters.readfile("matrix.txt")
    self.party_list, self.district_list, self.data = parsed

    # The matrix has to be rotated to cluster districts.
    by_district = clusters.rotatematrix(self.data)
    tree = clusters.hcluster(by_district, distance=clusters.sim_distance)
    clusters.drawdendrogram(tree, self.district_list, jpeg='districts.jpg')
    # Insert the clustered image into the canvas.
    self.insert_image("districts.jpg")
def get_clusture(self, param):
    """Cluster the written data file and display the dendrogram.

    param - str -> Parameter will be specified in self.writefiles
        if param is "Country" it will show Country clusters
        if param is "Criterias" it will show data clusters

    The dendrogram is written to ``clustured2.jpg`` (stored in
    ``self.jpg_names``) and shown via ``self.show_image()``.

    Raises:
        ValueError: if *param* is neither "Country" nor "Criterias".
            Previously this fell through to an unbound local variable.
    """
    # Validate first so an unsupported mode fails clearly and before any
    # file is read.
    if param not in ("Country", "Criterias"):
        raise ValueError(
            "param must be 'Country' or 'Criterias', got %r" % (param,))

    country_names, records, records_data = clusters.readfile(
        self.writed_names)

    if param == "Country":
        clust = clusters.hcluster(records_data)
        label = country_names
    else:
        # "Criterias": cluster the columns by rotating the matrix first.
        rotated = clusters.rotatematrix(records_data)
        clust = clusters.hcluster(rotated)
        label = records

    self.jpg_names = 'clustured2.jpg'
    clusters.drawdendrogram(clust, labels=label, jpeg=self.jpg_names)
    self.show_image()
def cluster_poli(self, event):
    """Event handler: cluster according to parties and show the dendrogram."""
    # On the very first clustering run the rest of the GUI has to be built.
    first_run = self.run == 0
    if first_run:
        self.create_rest_of_gui()
        self.run += 1
    self.update_idletasks()

    # Sets the variable for usage in refined analysis.
    self.var.set("party")

    # Cluster the rotated matrix and draw the dendrogram (both functions
    # come from clusters.py).
    rotated = clusters.rotatematrix(self.create_matrix())
    clust = clusters.hcluster(rotated, distance=sim_distance)
    clusters.drawdendrogram(clust, self.data_center.list_of_parties)

    # Recreate the second GUI part so everything is reset.
    self.create_rest_of_gui()

    # NOTE(review): "clusters.jpg" is presumably drawdendrogram's default
    # output file -- confirm against clusters.py.
    self.img = ImageTk.PhotoImage(Image.open("clusters.jpg"))
    # Insert the dendrogram into the canvas.
    self.canvas.create_image(0, 0, anchor=NW, image=self.img)
def do_non_stem():
    """Run the clustering pipeline on the non-stemmed blog data.

    Generates the blog file, then writes (all under ``datafiles/``) an ASCII
    dendrogram, a JPEG dendrogram, a k-means log for k = 5, 10, 20 and a
    2-D scaled plot.
    """
    # Generate the blog file and read the data in.
    generate_blogfile()
    blognames, _words, data = clusters.readfile('datafiles/blogtop500.txt')

    # Hierarchical clustering, written out as ASCII and as JPEG.
    tree = clusters.hcluster(data)
    with open("datafiles/blogtop500_asciideno.txt", "w+") as ascii_out:
        clusters.printclust2file(tree, ascii_out, labels=blognames)
    clusters.drawdendrogram(
        tree, blognames, jpeg='datafiles/blogtop500_deno.jpg')

    # K-means for each k, logged to file.
    with open("datafiles/kmeans_blogtop500.txt", "w+") as kout:
        for k in (5, 10, 20):
            print("For k=%d" % k)
            kout.write("K=%d\n" % k)
            kout.write("Iterations\n")
            groups = clusters.kcluster_toFile(data, k=k, out=kout)
            kout.write("Centroid Values\n-------------------------\n")
            # Log the blogs assigned to each centroid.
            for number, members in enumerate(groups, 1):
                print("Centroid #%d" % number)
                kout.write("Centroid #%d\n" % number)
                names = []
                for idx in members:
                    name = blognames[idx]
                    print(name)
                    names.append(name)
                kout.write("%s\n" % ', '.join(names))
            # One separator per value of k.
            kout.write("=================================\n")
            print("-------")

    # Dimensionality reduction, then the 2-D "similar blogs" picture.
    with open("datafiles/dimensionReductionNonStemmed.txt", "w+") as dout:
        scaled = clusters.scaledown_logiter(data, out=dout)
    clusters.draw2d(scaled, blognames, jpg='datafiles/blogtop500_clust2d.jpg')
import clusters

# Read the term matrix as a list of lines; this project's readfile is handed
# the lines rather than a path.
with open('1000_terms.csv') as f:
    lines = f.readlines()

# Parse once and reuse the result for both the unpacking and the debug print
# (the input used to be parsed twice, and the builtin name `file` shadowed).
parsed = clusters.readfile(lines)
screen_names, words, data = parsed
print(parsed)

clust = clusters.hcluster(data)
# #clusters.printclust(clust, labels=screen_names)
clusters.drawdendrogram(clust, screen_names, jpeg='q3_data/userclust.jpg')
except IndexError: outfile.close() return outfile.write('\n') if __name__ == "__main__": allw, articlew, artt = getarticlewords() wordmatrix, wordvec = makematrix(allw, articlew) # print wordvec[0:10] # print artt[1] # print wordmatrix[1][0:10] # hierarchical clustering import clusters clust = clusters.hcluster(wordmatrix) clusters.drawdendrogram(clust, artt, jpeg='news.jpg') # non-negative matrix factorization import nmf # m1 = np.matrix([[1, 2, 3], [4, 5, 6]]) # m2 = np.matrix([[1, 2], [3, 4], [5, 6]]) # w, h = nmf.factorize(m1 * m2, pc = 3, iter = 100) # print w * h v = np.matrix(wordmatrix) weights, feats = nmf.factorize(v, pc=20, iter=50) topp, pn = showfeatures(weights, feats, artt, wordvec) showarticles(artt, topp, pn)
def main(args):
    """Run the sctoolbox command described by *args*.

    *args* is indexed like ``sys.argv``: ``args[1]`` selects the command and
    the remaining entries are its operands.  Any command line that matches
    none of the implemented forms prints the usage text to stderr.

    Fixes relative to the previous version:
      * ``suggest [user] following_tournament_playlimit [n] [playlimit]`` was
        compared against a literal padded with spaces and could never match.
      * The ``searchUser`` output line was garbled; it is restored to print
        the user name.  NOTE(review): ``item.username`` is assumed from the
        SoundCloud user resource -- confirm.
    """

    def usage():
        """Write the list of supported command lines to stderr."""
        for line in (
            "Usage:",
            "sctoolbox correlates common_tracks [user1] [user2]",
            "sctoolbox correlates pearson_tastes [user1] [user2]",
            "sctoolbox suggest [user] bestlikes [n]",
            "sctoolbox suggest [user] following_tournament [n]",
            "sctoolbox suggest [user] following_tournament_short [n]",
            "sctoolbox suggest [user] following_tournament [n] --nomix",
            "sctoolbox suggest [user] following_tournament_short [n] --nomix",
            "sctoolbox suggest [user] following_tournament_playlimit [n] --nomix [playlimit]",
            "sctoolbox suggest [user] following_tournament_playlimit [n] [playlimit]",
            "sctoolbox searchUser [username]",
            "sctoolbox searchTrack [trackname]",
            "sctoolbox getTrackScore [trackname]",
            "sctoolbox similar [trackname]",
            "sctoolbox draw_style_galaxy [user] [jpg_path]",
        ):
            sys.stderr.write(line + "\n")

    def show_suggestions(announcement, build_profile, heading,
                         progress=None, **options):
        """Build a profile for args[2] and print the top args[4] suggestions."""
        print(announcement)
        user = SCDB.searchForUser(client, args[2])
        profile = build_profile(client, user)
        if progress is not None:
            print(progress)
        suggestions = SCDB.getSuggestionsFromProfile(
            client, profile, int(args[4]), **options)
        print(args[2] + heading)
        for item in suggestions:
            print(item)

    client = SCDB.register()
    argc = len(args)
    should_like = " should like these tracks:"
    long_tournament = ('Launching tournament between tracks from followings, '
                       'might take a while...')
    short_tournament = (
        'Launching short tournament between tracks from followings...')
    custom_tournament = (
        'Launching custom tournament between tracks from followings...')

    if (argc == 5 and args[1] == 'correlates'
            and args[2] in ('pearson_tastes', 'common_tracks')):
        user1 = SCDB.searchForUser(client, args[3])
        user2 = SCDB.searchForUser(client, args[4])
        puser1 = SCDB.extractProfile(client, user1)
        puser2 = SCDB.extractProfile(client, user2)
        if args[2] == 'pearson_tastes':
            r = SCDB.comparePearson(puser1, puser2)
            print('Correlation score between users (pearson): %s' % (r,))
        else:
            r = SCDB.compareCommonTracks(puser1, puser2)
            print('Correlation score between users (common tracks): %s' % (r,))

    elif argc == 5 and args[1] == 'suggest' and args[3] == 'following_tournament':
        show_suggestions(long_tournament, SCDB.profileFollowings, should_like)

    elif argc == 5 and args[1] == 'suggest' and args[3] == 'bestlikes':
        show_suggestions(
            'Rating tracks user liked, reposted, or commented, and '
            'playlisted, might take a while...',
            SCDB.sortProfileFromFollowings,
            " best likes are:")

    elif (argc == 5 and args[1] == 'suggest'
            and args[3] == 'following_tournament_short'):
        show_suggestions(
            short_tournament, SCDB.profileFollowingsShort, should_like)

    elif (argc == 6 and args[1] == 'suggest'
            and args[3] == 'following_tournament' and args[5] == '--nomix'):
        show_suggestions(
            long_tournament, SCDB.profileFollowings, should_like, no_mix=True)

    elif (argc == 6 and args[1] == 'suggest'
            and args[3] == 'following_tournament_short'
            and args[5] == '--nomix'):
        show_suggestions(
            short_tournament, SCDB.profileFollowingsShort, should_like,
            progress='Generating big profile...', no_mix=True)

    elif (argc == 7 and args[1] == 'suggest'
            and args[3] == 'following_tournament_playlimit'
            and args[5] == '--nomix'):
        show_suggestions(
            custom_tournament, SCDB.profileFollowings, should_like,
            no_mix=True, played_limit=int(args[6]))

    elif (argc == 6 and args[1] == 'suggest'
            and args[3] == 'following_tournament_playlimit'
            and args[5] != '--nomix'):
        # '--nomix' without a play limit is not a valid form; it falls
        # through to usage() instead of failing in int().
        show_suggestions(
            custom_tournament, SCDB.profileFollowings, should_like,
            no_mix=False, played_limit=int(args[5]))

    elif argc == 3 and args[1] == 'searchUser':
        container = client.get('/users', q=args[2])
        for n, item in enumerate(container, 1):
            print('############################')
            print('#' + str(n))
            print('username:' + item.username)
            print('permalink:' + item.permalink)
            print('############################')

    elif argc == 4 and args[1] == 'draw_style_galaxy':
        print('Identifying user...')
        user = SCDB.searchForUser(client, args[2])
        print('Downloading followers list...')
        followers_list = SCDB.getFollowerList(client, user)
        followers_list = SCDB.exctractsample(followers_list)
        row, col, data = SCDB.getCommentsData(client, followers_list)
        print('Generating clusters...')
        rotdata = clusters.rotatematrix(data)
        tagclust = clusters.hcluster(rotdata)
        print("Generationg dendrogram drawing...")
        clusters.drawdendrogram(tagclust, col, jpeg=args[3])

    else:
        usage()
parser.add_argument('-f', action='store', dest='matrixFile', nargs=1,
                    help='Name of the file containing the blog matrix.')
parser.add_argument('-ascii', action='store_true', dest='asciiDendrogram',
                    help='Prints a dendrogram to the standard output.')
parser.add_argument('-jpeg', action='store', dest='jpegDraw', nargs=1,
                    help='Print the dendrogram to a jepeg file.')
args = parser.parse_args()

# Bind these up front: the output steps below test `clust`, which used to be
# undefined (NameError) when -f was omitted or the file yielded no data.
clust = None
blognames = None

if args.matrixFile:
    fileName = str(args.matrixFile[0])
    blognames, words, data = clusters.readfile(fileName)
    if data is not None:
        sys.stderr.write('Performing hcluster for {0}\n'.format(fileName))
        clust = clusters.hcluster(data)
        sys.stderr.write('...Finished hcluster for {0}\n'.format(fileName))

if args.asciiDendrogram and clust is not None:
    clusters.printclust(clust, labels=blognames)

if args.jpegDraw and clust is not None:
    jpegFileName = str(args.jpegDraw[0])
    # Append the extension when the given name does not carry one.
    if '.jpg' not in jpegFileName:
        jpegFileName = jpegFileName + '.jpg'
    sys.stderr.write('Writing dendrogram to {0}\n'.format(jpegFileName))
    clusters.drawdendrogram(clust, blognames, jpeg=jpegFileName)
    sys.stderr.write('...Dendrogram written to {0}\n'.format(jpegFileName))
import clusters
import sys

blognames, words, data = clusters.readfile('blogmatrix.txt')
clust = clusters.hcluster(data)

# Print the ASCII dendrogram into ascii.txt.  The redirect used to be
# installed only after the dendrogram had been printed, which left ascii.txt
# empty and stdout pointing at a file that was never closed.
console = sys.stdout
with open('ascii.txt', 'w') as ascii_file:
    sys.stdout = ascii_file
    try:
        clusters.printclust(clust, labels=blognames)
    finally:
        sys.stdout = console

# save JPEG dendrogram
clusters.drawdendrogram(clust, blognames, jpeg='dendogram.jpg')
#!/usr/bin/env python
import clusters

datafile = '../data/word_data_tfidf.tsv'
blognames, words, data = clusters.readfile(datafile)
clust = clusters.hcluster(data)

# ASCII dendrogram; the with-block closes the file even if printing fails,
# and the handle no longer shadows the Python 2 builtin `file`.
with open('../question5b.txt', 'w') as outfile:
    clusters.printclustFile(outfile, clust, labels=blognames)

# JPEG dendrogram.
clusters.drawdendrogram(clust, blognames, jpeg='../question5b.jpg')
#!/usr/local/bin/python
# all code here stolen shamelessly from
# "Programming Collective Intelligence, Chapter 3"
import sys

# Make the shared library directory importable before loading clusters.
sys.path.insert(0, "../libs")

import clusters

MATRIX_FILE = "data.txt"
IMAGE_FILE = "blogclust.jpg"

blognames, words, data = clusters.readfile(MATRIX_FILE)
clust = clusters.hcluster(data)

# ASCII dendrogram on stdout, then the JPEG version on disk.
clusters.printclust(clust, labels=blognames)
clusters.drawdendrogram(clust, blognames, jpeg=IMAGE_FILE)
import clusters

row_names, column_names, data = clusters.readfile('dataset_vectors.txt')


def _report(tree, image_name):
    """Print the tree as text and draw it to *image_name*."""
    print('clusters by euclidean distance')
    clusters.printhclust(tree, labels=row_names)
    clusters.drawdendrogram(tree, row_names, jpeg=image_name)


# Default hcluster settings.
clust = clusters.hcluster(data)
_report(clust, 'hcluster_euclidean_centroid.jpg')
print()

# Second positional argument: clusters.find_by_min.
clust = clusters.hcluster(data, clusters.find_by_min)
_report(clust, 'hcluster_euclidean_min.jpg')
print()

# Second positional argument: clusters.find_by_max.
clust = clusters.hcluster(data, clusters.find_by_max)
_report(clust, 'hcluster_euclidean_max.jpg')
import clusters

MATRIX_FILE = 'tfidf.txt'
IMAGE_FILE = 'clusterblogtfidf.jpg'

blog, words, data = clusters.readfile(MATRIX_FILE)
variable = clusters.hcluster(data)

# ASCII dendrogram on stdout, then the JPEG version on disk.
clusters.printclust(variable, labels=blog)
clusters.drawdendrogram(variable, blog, jpeg=IMAGE_FILE)
def drawDendogram():
    """Cluster ``blogdata.txt`` and draw the dendrogram to ``blogclust.jpg``."""
    parsed = clusters.readfile('blogdata.txt')
    blognames, _words, matrix = parsed
    tree = clusters.hcluster(matrix)
    clusters.drawdendrogram(tree, blognames, jpeg='blogclust.jpg')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 01/03/2017 11:57 AM
# @Author : Shuqi.qin
# @File : test.py
# @Software: PyCharm Community Edition
import clusters

MATRIX_FILE = 'blogdata.txt'
IMAGE_FILE = 'wordcluster.jpg'

blognames, words, data = clusters.readfile(MATRIX_FILE)
# NOTE(review): confirm the local clusters module really exposes
# `hclusters`; the function is commonly named `hcluster`.
cluster = clusters.hclusters(data)
# The output image name is passed positionally, as the third argument.
clusters.drawdendrogram(cluster, blognames, IMAGE_FILE)
def draw_dendogram():
    """Cluster ``Outputs/blogdata.txt`` and draw it to ``blogclust.jpg``."""
    parsed = clusters.readfile('Outputs/blogdata.txt')
    blognames, _words, matrix = parsed
    tree = clusters.hcluster(matrix)
    clusters.drawdendrogram(tree, blognames, jpeg='blogclust.jpg')
import clusters
import json

if __name__ == "__main__":
    parsed = clusters.readfile('data/grocery_vectors.txt')
    docs, words, data = parsed

    # Tanimoto clustering: draw the dendrogram and export the tree as JSON.
    clust = clusters.hcluster(data, distance=clusters.tanimoto)
    print('clusters by tanimoto coefficient')
    clusters.drawdendrogram(clust, docs, jpeg='img/groceries_tanimoto.jpg')

    json_obj = {}
    clusters.jsonify(clust, json_obj)  # fills json_obj in place
    with open("json/tanimoto.json", "w") as output:
        json.dump(json_obj, output)

    # Alternative distance measures (disabled):
    #clust=clusters.hcluster(data,distance=clusters.pearson)
    #print('clusters by pearson correlation')
    #clusters.drawdendrogram(clust,docs,jpeg='groceries_pearson.jpg')

    #clust = clusters.hcluster(data, distance=clusters.cosine)
    #print('clusters by cosine similarity')
    #clusters.drawdendrogram(clust,docs,jpeg='groceries_cosine.jpg')

    #clust=clusters.hcluster(data,distance=clusters.euclidean)
    #print('clusters by euclidean distance')
    #clusters.drawdendrogram(clust,docs,jpeg='groceries_euclidean.jpg')
import clusters

MATRIX_FILE = 'blogdata2.txt'

blogname, words, data = clusters.readfile(MATRIX_FILE)

# 2-D scaled picture of the blogs.
coords = clusters.scaledown(data)
clusters.draw2d(coords, blogname, jpeg='blog2d.jpg')

# Rotate the matrix and cluster it, labelling the dendrogram with the words.
rdata = clusters.rotatematrix(data)
wordclust = clusters.hcluster(rdata)
clusters.drawdendrogram(wordclust, labels=words, jpeg='wordclust.jpg')
#!/usr/bin/python
import clusters

MATRIX_FILE = 'blogdata1.txt'

blognames, words, data = clusters.readfile(MATRIX_FILE)
clust = clusters.hcluster(data)

# Question 2: ASCII and JPEG dendrograms.
clusters.printclust(clust, labels=blognames)
clusters.drawdendrogram(clust, blognames, jpeg='dengrogram.jpg')

# Question 3: k-means for three values of k.
print("K = 5")
kclust5 = clusters.kcluster(data, k=5)
print("\nK = 10")
kclust10 = clusters.kcluster(data, k=10)
print("\nK = 20")
kclust20 = clusters.kcluster(data, k=20)

# Question 4: 2-D scaled picture.
coords = clusters.scaledown(data)
clusters.draw2d(coords, blognames, jpeg='MDS.jpg')
def clustering():
    """Cluster the pickled word matrix and draw it to ``cluster.jpg``."""
    print('## Clustering')
    import clusters

    _allw, _artw, titles, matrix, _wordvec = readpickle()
    tree = clusters.hcluster(matrix)
    clusters.drawdendrogram(tree, titles, jpeg='cluster.jpg')
except IndexError: outfile.close() return outfile.write('\n') if __name__ == "__main__": allw, articlew, artt = getarticlewords() wordmatrix, wordvec = makematrix(allw, articlew) # print wordvec[0:10] # print artt[1] # print wordmatrix[1][0:10] # hierarchical clustering import clusters clust = clusters.hcluster(wordmatrix) clusters.drawdendrogram(clust, artt, jpeg = 'news.jpg') # non-negative matrix factorization import nmf # m1 = np.matrix([[1, 2, 3], [4, 5, 6]]) # m2 = np.matrix([[1, 2], [3, 4], [5, 6]]) # w, h = nmf.factorize(m1 * m2, pc = 3, iter = 100) # print w * h v = np.matrix(wordmatrix) weights, feats = nmf.factorize(v, pc = 20, iter = 50) topp, pn = showfeatures(weights, feats, artt, wordvec) showarticles(artt, topp, pn)
def ColumnClustering():
    """Cluster the rotated ``blogdata1.txt`` matrix into ``wordclust.jpg``."""
    reload(clusters)
    parsed = clusters.readfile('blogdata1.txt')
    _blognames, words, matrix = parsed
    rotated = clusters.rotatematrix(matrix)
    tree = clusters.hcluster(rotated)
    clusters.drawdendrogram(tree, labels=words, jpeg='wordclust.jpg')
import clusters

MATRIX_FILE = 'tfidf.txt'
IMAGE_FILE = 'clusterblogtfidf.jpg'

blog, words, data = clusters.readfile(MATRIX_FILE)
variable = clusters.hcluster(data)

# ASCII dendrogram on stdout, then the JPEG version on disk.
clusters.printclust(variable, labels=blog)
clusters.drawdendrogram(variable, blog, jpeg=IMAGE_FILE)
import clusters

docs, words, data = clusters.readfile('titles_vectors.txt')


def _cluster_and_report(distance, description, image_name):
    """Cluster `data` with *distance*, print the tree and draw it.

    Returns the cluster tree so the caller can keep it.
    """
    tree = clusters.hcluster(data, distance=distance)
    print(description)
    clusters.printhclust(tree, labels=docs)
    clusters.drawdendrogram(tree, docs, jpeg=image_name)
    return tree


clust = _cluster_and_report(
    clusters.pearson, 'clusters by pearson correlation',
    'docsclust_pearson.jpg')
clust = _cluster_and_report(
    clusters.tanimoto, 'clusters by tanimoto coefficient',
    'docsclust_tanimoto.jpg')
clust = _cluster_and_report(
    clusters.euclidean, 'clusters by euclidean distance',
    'docsclust_euclidean.jpg')
# This run used to be announced as "euclidean distance" (copy-paste slip).
clust = _cluster_and_report(
    clusters.cosine, 'clusters by cosine similarity',
    'docsclust_cosine.jpg')
import clusters

MATRIX_FILE = 'blogdata1 (copy).txt'
IMAGE_FILE = 'BlogCluster.jpg'

name, word, data = clusters.readfile(MATRIX_FILE)
cluster = clusters.hcluster(data)

# ASCII dendrogram on stdout, then the JPEG version on disk.
clusters.printclust(cluster, labels=name)
clusters.drawdendrogram(cluster, name, jpeg=IMAGE_FILE)
####54页调用generatefeedvector生成blogdata文件失败。是因为feedlist里面的网址无法打开吗? ###downloadzebodata生成zebo.txt也失败。sigh import clusters blognames,words,data = clusters.readfile('blogdatadown.txt')#1 #clust = clusters.hcluster(data) #print (clust)#果然函数中这个值输出也都不一样呢。 #print(blognames) #clusters.printclust(clust, labels = blognames)#2 #clusters.drawdendrogram(clust, blognames, jpeg = 'blogclust.jpg')#3 rdata = clusters.rotatematrix(data)#4 wordclust = clusters.hcluster(rdata) clusters.drawdendrogram(wordclust, labels = words, jpeg = 'wordclust.jpg') ''' kclust = clusters.kcluster(data, k = 4)#5 print ([blognames[r] for r in kclust[0]]) print ([blognames[r] for r in kclust[1]]) import urllib.request#6 from bs4 import BeautifulSoup c = urllib.request.urlopen('https://en.wikipedia.org/wiki/Jon_Snow') soup = BeautifulSoup(c.read(),"lxml")#这里非常有趣! 感觉有空需要看下这个源代码库呀。 links = soup('a')#所以我还是不懂beautiful soup 的用法呀。 print(links[10]) print(links[10]['href']) #这一段是教BS的。 wants, people, data = clusters.readfile('zebodown.txt')#7
def draw_dendogram():
    """Cluster the ``job_projects`` matrix and draw it to ``jobclust.jpg``."""
    parsed = clusters.readfile('job_projects')
    jobnames, _projects, matrix = parsed
    tree = clusters.hcluster(matrix)
    #clusters.printclust(tree,labels=jobnames)
    clusters.drawdendrogram(tree, jobnames, jpeg='jobclust.jpg')
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import clusters

MATRIX_FILE = './../data/banpaku_utf8.csv'
IMAGE_FILE = "banpaku_reg.jpg"

blognames, words, data = clusters.readfile(MATRIX_FILE)
clust = clusters.hcluster(data)

# Show the result on the console (disabled)
#clusters.printclust( clust, labels=blognames)

# Show the result as an image; the module is reloaded before drawing.
reload(clusters)
clusters.drawdendrogram(clust, blognames, jpeg=IMAGE_FILE)
__author__ = 'feng'

import clusters

SESSION_FILE = 'session.csv'
IMAGE_FILE = 'sessionclust.jpg'

sessionIds, data = clusters.readSessionFile(SESSION_FILE)
# Cluster with the module's session_dissimilarity as the distance function.
clust = clusters.hcluster(data, distance=clusters.session_dissimilarity)
#clusters.printclust(clust, labels=sessionIds)
clusters.drawdendrogram(clust, sessionIds, jpeg=IMAGE_FILE)
dataList = [] for i in words: wordneed.append(i) for r in allInfo: print r clone = words theList = [] for i in allInfo[r]: if i in words: clone[i] = allInfo[r][i] for n in clone: theList.append(clone[n]) dataList.append(theList) return dataList,times,wordneed data,time,word = makeDate(allData) print 'done' # print len(data) clust = clusters.hcluster(data) print 'done' clusters.drawdendrogram(clust,time,jpeg = './myWeibo.jpg')
def drawingtheDendrogram():
    """Cluster ``blogdata1.txt`` and draw the dendrogram to ``blogclust.jpg``."""
    parsed = clusters.readfile('blogdata1.txt')
    blognames, _words, matrix = parsed
    tree = clusters.hcluster(matrix)
    # The module is reloaded between clustering and drawing.
    reload(clusters)
    clusters.drawdendrogram(tree, blognames, jpeg='blogclust.jpg')
import clusters

MATRIX_FILE = 'blogdata.txt'
IMAGE_FILE = 'blogcluster.jpg'

blognames, words, data = clusters.readfile(MATRIX_FILE)
cl = clusters.hcluster(data)

# ASCII diagram on stdout.
clusters.printclust(cl, labels=blognames)
# Dendrogram image on disk.
clusters.drawdendrogram(cl, blognames, jpeg=IMAGE_FILE)
def prefer():
    """Cluster ``zebo.txt`` with ``clusters.tanamoto`` and draw the dendrogram.

    No output name is passed, so drawdendrogram's default is used.
    """
    reload(clusters)
    parsed = clusters.readfile('zebo.txt')
    wants, _people, matrix = parsed
    tree = clusters.hcluster(matrix, distance=clusters.tanamoto)
    clusters.drawdendrogram(tree, wants)
import clusters
import Image

MATRIX_FILE = 'res/blogdata2.txt'
DENDROGRAM_FILE = 'output/finaloutput.jpg'

moviename, words, data = clusters.readfile(MATRIX_FILE)

print('Processing......')
clust = clusters.hcluster(data)

print('Output image is generating...')
clusters.drawdendrogram(clust, moviename, jpeg=DENDROGRAM_FILE)

print("Scaling down...")
coords = clusters.scaledown(data)
clusters.draw2d(coords, moviename, jpeg='output/finaloutput2d.jpg')

# Open the dendrogram in the default image viewer, then wait for the user.
image = Image.open(DENDROGRAM_FILE)
image.show()
x = input("Press any key to quit....")
import clusters

docs, words, data = clusters.readfile('titles_vectors.txt')
# Rotate the matrix; the dendrograms below are labelled with the words.
rdata = clusters.rotatematrix(data)


def _cluster_and_report(distance, description, image_name):
    """Cluster `rdata` with *distance*, print the tree and draw it.

    Returns the cluster tree so the caller can keep it.
    """
    tree = clusters.hcluster(rdata, distance=distance)
    print(description)
    clusters.printhclust(tree, labels=words)
    clusters.drawdendrogram(tree, words, jpeg=image_name)
    return tree


clust = _cluster_and_report(
    clusters.pearson, 'clusters by pearson correlation',
    'wordsclustpearson.jpg')
clust = _cluster_and_report(
    clusters.tanimoto, 'clusters by tanimoto coefficient',
    'wordsclusttanimoto.jpg')
clust = _cluster_and_report(
    clusters.euclidean, 'clusters by euclidean distance',
    'wordsclusteuclidean.jpg')
#print clust
#clusters.printclust(clust,labels=blognames)
#clusters.drawdendrogram(clust,blognames,jpeg='blogclust.jpg')

# Transpose the matrix and cluster the words instead
#rdata=clusters.rotatematrix(data)
#wordclust=clusters.hcluster(rdata)
#clusters.drawdendrogram(wordclust,labels=words,jpeg='blogclust3.jpg')

#kclust=clusters.kcluster(data,k=3)
#
#for r in kclust[0]:
#    print blognames[r]

#BeautifulSoup
#import urllib2
#from bs4 import BeautifulSoup
#
#c=urllib2.urlopen('http://www.baidu.com')
#soup=BeautifulSoup(c.read(),'lxml')
#links=soup('a')
##print soup
#print links

# Cluster zebo.txt with clusters.tanamoto as the distance function; no output
# name is passed, so drawdendrogram's default is used.
ZEBO_FILE = 'zebo.txt'
wants, people, data = clusters.readfile(ZEBO_FILE)
clust = clusters.hcluster(data, distance=clusters.tanamoto)
clusters.drawdendrogram(clust, wants)
#Shawn Jones
#!/usr/local/bin/python
# all code here stolen shamelessly from
# "Programming Collective Intelligence, Chapter 3"
import sys

# Make the shared library directory importable before loading clusters.
sys.path.insert(0, '../libs')

import clusters

MATRIX_FILE = 'blogdata1V2.txt'
IMAGE_FILE = 'blogclust.jpg'

blognames, words, data = clusters.readfile(MATRIX_FILE)
clust = clusters.hcluster(data)

# ASCII dendrogram on stdout, then the JPEG version on disk.
clusters.printclust(clust, labels=blognames)
clusters.drawdendrogram(clust, blognames, jpeg=IMAGE_FILE)
feed = feedparser.parse('http://spchicagosp.wordpress.com/feed')
feed.entries

# ================
# (The separator above used to be a bare line of '=' characters, which is a
# syntax error in a Python file.)
import clusters

blognames, words, data = clusters.readfile('blogdata.txt')
clust = clusters.hcluster(data)

# ASCII dendrogram on stdout, then the image version on disk.
clusters.printclust(clust, labels=blognames)
# NOTE(review): the keyword here is `img`; confirm the local clusters module
# accepts it.
clusters.drawdendrogram(clust, blognames, img='blogclust.png')
import clusters

MATRIX_FILE = 'blogdataascii.txt'
IMAGE_FILE = 'blogcluster.jpg'

# Read the matrix, cluster it and draw the dendrogram.
blognames, words, data = clusters.readfile(MATRIX_FILE)
clust = clusters.hcluster(data)
clusters.drawdendrogram(clust, blognames, jpeg=IMAGE_FILE)
import clusters, data_processing '''Import Dataset''' data = data_processing.open_csv_file('dataset.csv') '''Create a list of countries in the order of the similarity matrix''' countries_list = data_processing.get_country_names(data) '''Create numerical attributes matrix''' attr_matrix = data_processing.create_attribute_matrix(data) data_processing.str_to_float(attr_matrix) '''hierachical clustering: euclidean distance''' num_cluster = 3 resulting_clusters = clusters.hcluster(attr_matrix, distance=clusters.euclidean) print('clusters by euclidean distance') clusters.printhclust(resulting_clusters, labels=countries_list) clusters.drawdendrogram(resulting_clusters, countries_list, jpeg='Euclidean Cluster.jpg') '''hierachical clustering: tanimoto coefficient''' resulting_clusters = clusters.hcluster(attr_matrix, distance=clusters.tanimoto) print('clusters by tanimoto coefficient') clusters.printhclust(resulting_clusters, labels=countries_list) clusters.drawdendrogram(resulting_clusters, countries_list, jpeg='Tanimoto Cluster.jpg') print() '''hierachical clustering: pearson similarity''' resulting_clusters = clusters.hcluster(attr_matrix, distance=clusters.pearson) print('clusters by pearson correlation') clusters.printhclust(resulting_clusters, labels=countries_list) clusters.drawdendrogram(resulting_clusters, countries_list,