def clustering(self, canvas):  # Clusters District or Parties
    """Cluster the data in ``data.txt`` and show its dendrogram on *canvas*.

    The matrix returned by ``clusters.readfile`` is clustered with
    ``clusters.sim_distance``, the dendrogram is written to ``cl.jpg`` and
    that image is placed on the canvas with its top-left corner at (20, 20).
    """
    labels, _rows, matrix = clusters.readfile('data.txt')
    tree = clusters.hcluster(matrix, distance=clusters.sim_distance)
    clusters.drawdendrogram(tree, labels, jpeg='cl.jpg')
    photo = ImageTk.PhotoImage(Image.open("cl.jpg"))
    canvas.create_image(20, 20, anchor=NW, image=photo)
    # Store the image on the canvas so a reference to it outlives this call.
    canvas.image = photo
Example #2
0
def do_stemmed():
    """Run every clustering report on the stemmed top-500 blog data.

    In order: regenerate the stemmed blog file, build the hierarchical
    clustering (ASCII and JPEG dendrograms), run ``kcluster_toFile`` for
    k = 5, 10 and 20 while logging to a text file, and finally write the
    scaled-down 2-D picture.
    """
    generate_blogfile_stem()
    blog_titles, _words, matrix = clusters.readfile('datafiles/blogtop500_stemmed.txt')
    tree = clusters.hcluster(matrix)
    with open("datafiles/blogtop500stemmed_asciideno.txt", "w+") as ascii_out:
        clusters.printclust2file(tree, ascii_out, labels=blog_titles)
    clusters.drawdendrogram(tree, blog_titles, jpeg='datafiles/blogtop500stemmed_deno.jpg')

    with open("datafiles/kmeans_blogtop500stemmed.txt", "w+") as log:
        for cluster_count in (5, 10, 20):
            print("For k=%d" % cluster_count)
            log.write("K=%d\n" % cluster_count)
            log.write("Iterations\n")
            groups = clusters.kcluster_toFile(matrix, k=cluster_count, out=log)
            log.write("Centroid Values\n-------------------------\n")
            for number, members in enumerate(groups, 1):
                print("Centroid #%d" % number)
                log.write("Centroid #%d\n" % number)
                # Each group holds row indices; map them back to blog names.
                member_names = []
                for row_index in members:
                    title = blog_titles[row_index]
                    print(title)
                    member_names.append(title)
                log.write("%s\n" % ', '.join(member_names))
            log.write("=================================\n")
            print("-------")
    with open("datafiles/dimensionReductionStemmed.txt", "w+") as reduction_log:
        coords = clusters.scaledown_logiter(matrix, out=reduction_log)
    clusters.draw2d(coords, blog_titles, jpg='datafiles/blogtop500stemmed_clust2d.jpg')
Example #3
0
def main(input_f):
    """Hierarchically cluster the vectors in *input_f* and report the result.

    Draws the dendrogram to ``data/hierarchical.jpg`` (labelled with the
    second field of every country entry), flattens seven hand-picked
    subtrees into lists of ids with ``get_all``, prints the first
    ``num_clusters`` of them and finally prints the SSE of that partition.
    """
    countries, vectors = k_means.read_file(input_f)

    root = utils.hcluster(vectors, distance=distance_function)
    labels = [entry[1] for entry in countries]
    utils.drawdendrogram(root, labels, jpeg='data/hierarchical.jpg')

    # Self-picked clusters from the graph which I considered good
    right = root.right
    right_right = right.right
    inner = right_right.left
    picked_subtrees = [
        root.left,
        right.left,
        inner.left,
        inner.right.left,
        inner.right.right,
        right_right.right.right,
        right_right.right.left,
    ]

    cluster_level = []
    for subtree in picked_subtrees:
        member_ids = []
        get_all(subtree, member_ids)  # fills member_ids in place
        cluster_level.append(member_ids)

    for number in range(1, num_clusters + 1):
        print('cluster {}:'.format(number))
        print([countries[r] for r in cluster_level[number - 1]])

    print("SSE: " + str(k_means.sse(cluster_level, vectors)))
Example #4
0
def main(nameof_file):
    """Hierarchically cluster the vectors in *nameof_file* and report the SSE.

    The dendrogram is drawn to a JPEG labelled with the second field of each
    country entry.  Seven hand-picked subtrees of the dendrogram are then
    flattened into lists of country ids with ``get``; the first
    ``clusternum`` of them are printed, followed by the SSE of the partition.
    """
    (countries, vectors) = kmeans.readfile(nameof_file)
    clusters = utils.hcluster(vectors, distance=choice_dist)
    utils.drawdendrogram(
        clusters,
        list(map(lambda x: x[1], countries)),
        jpeg="C:/Users/akars/clustering_lab/processedhierarchical.jpg")

    opt_clust = [
        clusters.left, clusters.right.left, clusters.right.right.left.left,
        clusters.right.right.left.right.left,
        clusters.right.right.left.right.right,
        clusters.right.right.right.right, clusters.right.right.right.left
    ]

    cluster_level = []
    for cluster in opt_clust:
        # Collect the ids of EVERY chosen subtree.  These two calls used to
        # sit outside the loop, so only the last subtree was ever recorded.
        country_ids = []
        get(cluster, country_ids)
        cluster_level.append(country_ids)

    for i in range(clusternum):
        print('cluster {}:'.format(i + 1))
        print([countries[r] for r in cluster_level[i]])
    print("SSE: " + str(kmeans.sse(cluster_level, vectors)))


# The entry-point guard belongs at module level; nested inside main() it
# could never start the script and would recurse endlessly once reached.
if __name__ == "__main__":
    main("C:/Users/akars/clustering_lab/processed/preprocessed.csv")
def createDendrogram():
    """Cluster ``blogdata.txt`` and write both dendrogram renderings.

    The JPEG dendrogram goes to ``Dendrogram.jpg``.  The ASCII dendrogram
    that ``clusters.printclust`` prints is captured into ``ASCII.txt`` by
    temporarily pointing ``sys.stdout`` at that file.
    """
    blogs, colnames, data = clusters.readfile('blogdata.txt')
    cluster = clusters.hcluster(data)
    clusters.drawdendrogram(cluster, blogs, jpeg='Dendrogram.jpg')
    previous_stdout = sys.stdout
    with open("ASCII.txt", 'w') as f:
        sys.stdout = f
        try:
            clusters.printclust(cluster, labels=blogs)
        finally:
            # Always restore stdout: leaving it bound to the closed file (and
            # closing stderr, as was done before) breaks all later output.
            sys.stdout = previous_stdout
 def cluster_parties(self):
     """Switch to "party" mode and display the party dendrogram.

     Reads ``matrix.txt``, clusters ``self.data`` with
     ``clusters.sim_distance``, renders the tree to ``parties.jpg`` and hands
     that file to ``self.insert_image``.
     """
     # Remember that the user asked for the party clustering.
     self.state = "party"
     self.analysis_frame.pack(side=TOP, fill=BOTH)
     # Wipe everything currently drawn on the canvas, see
     # https://stackoverflow.com/questions/15839491/how-to-clear-tkinter-canvas
     self.canvas.delete("all")
     matrix_contents = clusters.readfile("matrix.txt")
     self.party_list, self.district_list, self.data = matrix_contents
     tree = clusters.hcluster(self.data, distance=clusters.sim_distance)
     output_jpeg = 'parties.jpg'
     clusters.drawdendrogram(tree, self.party_list, jpeg=output_jpeg)
     # Put the freshly rendered dendrogram on the canvas.
     self.insert_image(output_jpeg)
Example #7
0
def main():
    """Label, prune and draw the cluster tree stored for ``METRIC``.

    The reference hierarchy comes from ``data/itemHierarchy.csv``, the
    computed tree from ``json/<METRIC>.json`` and the item labels from
    ``data/itemIndex.txt``.  The processed tree is drawn to
    ``img/<METRIC>_experiment.jpg``.
    """
    hierarchy = get_clusters("data/itemHierarchy.csv")
    given_clusters, item_hash, cluster_names = hierarchy
    with open("json/{}.json".format(METRIC)) as source:
        created_clusters = clusters.read_json(json.load(source))
    labels = make_vectors.get_list("data/itemIndex.txt")

    label_nodes(created_clusters, labels, item_hash, cluster_names)
    prune(created_clusters, given_clusters, MIN_ACCURACY, 0)
    output_path = 'img/{}_experiment.jpg'.format(METRIC)
    clusters.drawdendrogram(created_clusters, labels, jpeg=output_path)
Example #8
0
def createJPegDendogram():
	"""Draw the dendrogram of the TF-IDF blog vectors to a JPEG file.

	An earlier variant of this function clustered 'blogVector.txt' into
	'blogclust.jpg'; only the TF-IDF data set is processed now.
	"""
	source_file = 'blogVectorTFIDFVersion.txt'
	target_jpeg = 'blogclustTFIDFVersion.jpg'
	names, _words, matrix = clusters.readfile(source_file)
	tree = clusters.hcluster(matrix)
	clusters.drawdendrogram(tree, names, jpeg=target_jpeg)
 def cluster_district(self):
     """Switch to "district" mode and display the district dendrogram.

     Reads ``matrix.txt``, rotates the matrix so that districts are the
     items being clustered, renders the tree to ``districts.jpg`` and hands
     that file to ``self.insert_image``.
     """
     # Remember that the user asked for the district clustering.
     self.state = "district"
     self.analysis_frame.pack(side=TOP, fill=BOTH)
     # Wipe everything currently drawn on the canvas, see
     # https://stackoverflow.com/questions/15839491/how-to-clear-tkinter-canvas
     self.canvas.delete("all")
     matrix_contents = clusters.readfile("matrix.txt")
     self.party_list, self.district_list, self.data = matrix_contents
     # The matrix has to be rotated before districts can be clustered.
     rotated = clusters.rotatematrix(self.data)
     tree = clusters.hcluster(rotated, distance=clusters.sim_distance)
     output_jpeg = 'districts.jpg'
     clusters.drawdendrogram(tree, self.district_list, jpeg=output_jpeg)
     # Put the freshly rendered dendrogram on the canvas.
     self.insert_image(output_jpeg)
Example #10
0
 def get_clusture(self, param):
     """
     Cluster the data in ``self.writed_names`` and show the dendrogram.

     param - str -> selects what is clustered:
     "Country" clusters the data as read and labels it with the country names;
     "Criterias" clusters the rotated data and labels it with the records.

     The dendrogram is written to 'clustured2.jpg' (stored in
     ``self.jpg_names``) and displayed through ``self.show_image``.

     Raises ValueError for any other ``param``.  Previously such a value
     left ``clust`` and ``label`` unassigned and failed later with an
     UnboundLocalError, after the data file had already been read.
     """
     if param not in ("Country", "Criterias"):
         raise ValueError(
             "param must be 'Country' or 'Criterias', got %r" % (param,))
     country_names, records, records_data = clusters.readfile(
         self.writed_names)
     if param == "Country":
         clust = clusters.hcluster(records_data)
         label = country_names
     else:  # "Criterias"
         rotated = clusters.rotatematrix(records_data)
         clust = clusters.hcluster(rotated)
         label = records
     self.jpg_names = 'clustured2.jpg'
     clusters.drawdendrogram(clust, labels=label, jpeg=self.jpg_names)
     self.show_image()
 def cluster_poli(self, event):  # function to cluster according to parties
     """Cluster the parties and draw the dendrogram on the canvas.

     *event* is accepted but not used.  On the very first run the second
     part of the GUI is created before clustering starts.
     """
     first_run = self.run == 0
     if first_run:
         self.create_rest_of_gui()
         self.run += 1
     self.update_idletasks()
     # Record the mode so the refined analysis knows what was clustered.
     self.var.set("party")
     rotated = clusters.rotatematrix(self.create_matrix())
     tree = clusters.hcluster(rotated, distance=sim_distance)
     # No jpeg name is passed; "clusters.jpg", opened below, is presumably
     # the default output of drawdendrogram -- confirm in clusters.py.
     clusters.drawdendrogram(tree, self.data_center.list_of_parties)
     # Rebuild the second GUI part so everything is reset.
     self.create_rest_of_gui()
     self.img = ImageTk.PhotoImage(Image.open("clusters.jpg"))
     # Insert the dendrogram at the top-left corner of the canvas.
     self.canvas.create_image(0, 0, anchor=NW, image=self.img)
Example #12
0
def do_non_stem():
    """Run every clustering report on the non-stemmed top-500 blog data.

    Produces the ASCII and JPEG dendrograms, a k-means log for k = 5, 10 and
    20, the dimension-reduction log and the 2-D similarity picture, all
    under ``datafiles/``.
    """
    # Build the blog file, then load it.
    generate_blogfile()
    names, _words, vectors = clusters.readfile('datafiles/blogtop500.txt')

    # Hierarchical clustering: ASCII dendrogram first, JPEG version second.
    hierarchy = clusters.hcluster(vectors)
    with open("datafiles/blogtop500_asciideno.txt", "w+") as ascii_file:
        clusters.printclust2file(hierarchy, ascii_file, labels=names)
    clusters.drawdendrogram(hierarchy, names, jpeg='datafiles/blogtop500_deno.jpg')

    # k-means for each k, logged to a file and echoed to the console.
    with open("datafiles/kmeans_blogtop500.txt", "w+") as kmeans_log:
        for k in (5, 10, 20):
            print("For k=%d" % k)
            kmeans_log.write("K=%d\n" % k)
            kmeans_log.write("Iterations\n")
            centroids = clusters.kcluster_toFile(vectors, k=k, out=kmeans_log)
            kmeans_log.write("Centroid Values\n-------------------------\n")
            position = 0
            for centroid in centroids:
                position += 1
                print("Centroid #%d" % position)
                kmeans_log.write("Centroid #%d\n" % position)
                # A centroid is a collection of row indices into `names`.
                listed = []
                for row in centroid:
                    blog = names[row]
                    print(blog)
                    listed.append(blog)
                kmeans_log.write("%s\n" % ', '.join(listed))
            kmeans_log.write("=================================\n")
            print("-------")

    # Dimensionality reduction, then the picture of similar blogs.
    with open("datafiles/dimensionReductionNonStemmed.txt", "w+") as scale_log:
        positions = clusters.scaledown_logiter(vectors, out=scale_log)
    clusters.draw2d(positions, names, jpg='datafiles/blogtop500_clust2d.jpg')
Example #13
0
import clusters

# Load the term matrix as a list of raw lines; readfile is handed those
# lines rather than a path.
with open('1000_terms.csv') as handle:
    lines = handle.readlines()

screen_names, words, data = clusters.readfile(lines)

# Dump the parsed result for inspection (this parses the lines a second time).
print(clusters.readfile(lines))
tree = clusters.hcluster(data)
# An ASCII dendrogram can be printed with:
#clusters.printclust(tree, labels=screen_names)
clusters.drawdendrogram(tree, screen_names, jpeg='q3_data/userclust.jpg')
            except IndexError:
                outfile.close()
                return
        outfile.write('\n')


# Script entry point: build the article/word matrix, cluster the articles
# hierarchically, then extract features with non-negative matrix factorization.
if __name__ == "__main__":
    allw, articlew, artt = getarticlewords()
    wordmatrix, wordvec = makematrix(allw, articlew)

    # Debug output, kept for reference:
    # print wordvec[0:10]
    # print artt[1]
    # print wordmatrix[1][0:10]

    # hierarchical clustering, drawn to news.jpg and labelled with artt
    import clusters
    clust = clusters.hcluster(wordmatrix)
    clusters.drawdendrogram(clust, artt, jpeg='news.jpg')

    # non-negative matrix factorization
    import nmf
    # Small sanity check of nmf.factorize, kept for reference:
    # m1 = np.matrix([[1, 2, 3], [4, 5, 6]])
    # m2 = np.matrix([[1, 2], [3, 4], [5, 6]])
    # w, h = nmf.factorize(m1 * m2, pc = 3, iter = 100)
    # print w * h

    # Factorize the word matrix with pc=20 and iter=50, then report the
    # resulting features and the articles they apply to.
    v = np.matrix(wordmatrix)
    weights, feats = nmf.factorize(v, pc=20, iter=50)
    topp, pn = showfeatures(weights, feats, artt, wordvec)
    showarticles(artt, topp, pn)
Example #15
0
def main(args):
    def usage():
        print >> sys.stderr, "Usage:"
        print >> sys.stderr, "sctoolbox correlates common_tracks [user1] [user2]"
        print >> sys.stderr, "sctoolbox correlates pearson_tastes [user1] [user2]"
        print >> sys.stderr, "sctoolbox suggest [user] bestlikes [n]"
        print >> sys.stderr, "sctoolbox suggest [user] following_tournament [n]"
        print >> sys.stderr, "sctoolbox suggest [user] following_tournament_short [n]"
        print >> sys.stderr, "sctoolbox suggest [user] following_tournament [n] --nomix"
        print >> sys.stderr, "sctoolbox suggest [user] following_tournament_short [n] --nomix"
        print >> sys.stderr, "sctoolbox suggest [user] following_tournament_playlimit [n] --nomix [playlimit]"
        print >> sys.stderr, "sctoolbox suggest [user] following_tournament_playlimit [n] [playlimit]"
        print >> sys.stderr, "sctoolbox searchUser [username]"
        print >> sys.stderr, "sctoolbox searchTrack [trackname]"
        print >> sys.stderr, "sctoolbox getTrackScore [trackname]"
        print >> sys.stderr, "sctoolbox similar [trackname]"
        print >> sys.stderr, "sctoolbox draw_style_galaxy [user] [jpg_path]"

    paths = []

    client = SCDB.register()

    ##############################################################################
    if len(args
           ) == 5 and args[1] == 'correlates' and args[2] == 'pearson_tastes':
        user1 = SCDB.searchForUser(client, args[3])
        user2 = SCDB.searchForUser(client, args[4])

        puser1 = SCDB.extractProfile(client, user1)
        puser2 = SCDB.extractProfile(client, user2)

        r = SCDB.comparePearson(puser1, puser2)

        print 'Correlation score between users (pearson):', r
    ##############################################################################

    ##############################################################################
    elif len(args
             ) == 5 and args[1] == 'correlates' and args[2] == 'common_tracks':
        user1 = SCDB.searchForUser(client, args[3])
        user2 = SCDB.searchForUser(client, args[4])

        puser1 = SCDB.extractProfile(client, user1)
        puser2 = SCDB.extractProfile(client, user2)

        r = SCDB.compareCommonTracks(puser1, puser2)

        print 'Correlation score between users (common tracks):', r
    ##############################################################################

    ##############################################################################
    elif len(args) == 5 and args[1] == 'suggest' and args[
            3] == 'following_tournament':
        print(
            'Launching tournament between tracks from followings, might take a while...'
        )
        user = SCDB.searchForUser(client, args[2])
        profile = SCDB.profileFollowings(client, user)
        suggestions = SCDB.getSuggestionsFromProfile(client, profile,
                                                     int(args[4]))
        print(args[2] + " should like these tracks:")
        for item in suggestions:
            print item
    ##############################################################################

    ##############################################################################
    elif len(args) == 5 and args[1] == 'suggest' and args[3] == 'bestlikes':
        print(
            'Rating tracks user liked, reposted, or commented, and playlisted, might take a while...'
        )
        user = SCDB.searchForUser(client, args[2])
        profile = SCDB.sortProfileFromFollowings(client, user)
        suggestions = SCDB.getSuggestionsFromProfile(client, profile,
                                                     int(args[4]))
        print(args[2] + " best likes are:")
        for item in suggestions:
            print item
    ##############################################################################

    ##############################################################################
    elif len(args) == 5 and args[1] == 'suggest' and args[
            3] == 'following_tournament_short':
        print('Launching short tournament between tracks from followings...')
        user = SCDB.searchForUser(client, args[2])
        profile = SCDB.profileFollowingsShort(client, user)
        suggestions = SCDB.getSuggestionsFromProfile(client, profile,
                                                     int(args[4]))
        print(args[2] + " should like these tracks:")
        for item in suggestions:
            print item
    ##############################################################################

    ##############################################################################
    elif len(args) == 6 and args[1] == 'suggest' and args[
            3] == 'following_tournament' and args[5] == '--nomix':
        print(
            'Launching tournament between tracks from followings, might take a while...'
        )
        user = SCDB.searchForUser(client, args[2])
        profile = SCDB.profileFollowings(client, user)
        suggestions = SCDB.getSuggestionsFromProfile(client,
                                                     profile,
                                                     int(args[4]),
                                                     no_mix=True)
        print(args[2] + " should like these tracks:")
        for item in suggestions:
            print item
    ##############################################################################

    ##############################################################################
    elif len(args) == 6 and args[1] == 'suggest' and args[
            3] == 'following_tournament_short' and args[5] == '--nomix':
        print('Launching short tournament between tracks from followings...')
        user = SCDB.searchForUser(client, args[2])
        profile = SCDB.profileFollowingsShort(client, user)
        print('Generating big profile...')
        suggestions = SCDB.getSuggestionsFromProfile(client,
                                                     profile,
                                                     int(args[4]),
                                                     no_mix=True)
        print(args[2] + " should like these tracks:")
        for item in suggestions:
            print item
    ##############################################################################

    ##############################################################################
    elif len(args) == 7 and args[1] == 'suggest' and args[
            3] == 'following_tournament_playlimit' and args[5] == '--nomix':
        print('Launching custom tournament between tracks from followings...')
        user = SCDB.searchForUser(client, args[2])
        profile = SCDB.profileFollowings(client, user)
        suggestions = SCDB.getSuggestionsFromProfile(client,
                                                     profile,
                                                     int(args[4]),
                                                     no_mix=True,
                                                     played_limit=int(args[6]))
        print(args[2] + " should like these tracks:")
        for item in suggestions:
            print item
    ##############################################################################

    ##############################################################################
    elif len(args) == 6 and args[1] == 'suggest' and args[
            3] == ' following_tournament_playlimit ':
        print('Launching custom tournament between tracks from followings...')
        user = SCDB.searchForUser(client, args[2])
        profile = SCDB.profileFollowings(client, user)
        suggestions = SCDB.getSuggestionsFromProfile(client,
                                                     profile,
                                                     int(args[4]),
                                                     no_mix=False,
                                                     played_limit=int(args[5]))
        print(args[2] + " should like these tracks:")
        for item in suggestions:
            print item
    ##############################################################################

    ##############################################################################
    elif len(args) == 3 and args[1] == 'searchUser':
        container = client.get('/users', q=args[2])
        n = 1
        for item in container:
            print('############################')
            print('#' + str(n))
            n += 1
            print('username:'******'permalink:' + item.permalink)
        print('############################')
    ##############################################################################

    ##############################################################################
    elif len(args) == 4 and args[1] == 'draw_style_galaxy':
        print('Identifying user...')
        user = SCDB.searchForUser(client, args[2])
        print('Downloading followers list...')
        followers_list = SCDB.getFollowerList(client, user)
        followers_list = SCDB.exctractsample(followers_list)
        row, col, data = SCDB.getCommentsData(client, followers_list)
        print('Generating clusters...')
        rotdata = clusters.rotatematrix(data)
        tagclust = clusters.hcluster(rotdata)
        print("Generationg dendrogram drawing...")
        clusters.drawdendrogram(tagclust, col, jpeg=args[3])
    ##############################################################################

    ##############################################################################
    ##############################################################################

    ##############################################################################
    ##############################################################################

    ##############################################################################
    ##############################################################################

    ##############################################################################
    ##############################################################################

    else:
        usage()
Example #16
0
	parser.add_argument('-f',action='store',dest='matrixFile',nargs=1,help='Name of the file containing the blog matrix.')
	parser.add_argument('-ascii',action='store_true',dest='asciiDendrogram',help='Prints a dendrogram to the standard output.')
	parser.add_argument('-jpeg',action='store',dest='jpegDraw',nargs=1,help='Print the dendrogram to a jepeg file.')

	args = parser.parse_args()

	if args.matrixFile:
		fileName = str(args.matrixFile[0])
		blognames,words,data=clusters.readfile(fileName)

	if data is not None:
		sys.stderr.write('Performing hcluster for {0}\n'.format(fileName))
		clust = clusters.hcluster(data)
		sys.stderr.write('...Finished hcluster for {0}\n'.format(fileName))

	if args.asciiDendrogram:
		if clust is not None:
			clusters.printclust(clust, labels=blognames)

	if args.jpegDraw:
		if clust is not None:
			jpegFileName = str(args.jpegDraw[0])
			if '.jpg' not in jpegFileName:
				jpegFileName = jpegFileName + '.jpg'

			
			sys.stderr.write('Writing dendrogram to {0}\n'.format(jpegFileName))

			clusters.drawdendrogram(clust, blognames, jpeg=jpegFileName)

			sys.stderr.write('...Dendrogram written to {0}\n'.format(jpegFileName))
Example #17
0
import clusters
import sys

blognames, words, data = clusters.readfile('blogmatrix.txt')
clust = clusters.hcluster(data)

# Write the ASCII dendrogram to ascii.txt.  The redirection has to be active
# BEFORE printclust runs (it used to be installed afterwards, leaving the
# file empty), and stdout is restored and the file closed when done.
_console = sys.stdout
with open('ascii.txt', 'w') as ascii_file:
    sys.stdout = ascii_file
    try:
        clusters.printclust(clust, labels=blognames)
    finally:
        sys.stdout = _console

# save JPEG dendrogram
clusters.drawdendrogram(clust, blognames, jpeg='dendogram.jpg')
Example #18
0
#!/usr/bin/env python

import clusters

# Input matrix for the clustering.
datafile = '../data/word_data_tfidf.tsv'

blognames, words, data = clusters.readfile(datafile)

clust = clusters.hcluster(data)

# Text rendering of the tree.  The with-block closes the file even when
# printclustFile raises; the handle used to be closed by hand and was named
# `file`, hiding the Python 2 builtin.
with open('../question5b.txt', 'w') as out:
    clusters.printclustFile(out, clust, labels=blognames)

# Picture of the same tree.
clusters.drawdendrogram(clust, blognames, jpeg='../question5b.jpg')


Example #19
0
#!/usr/local/bin/python

# all code here stolen shamelessly from
# "Programming Collective Intelligence, Chapter 3"

import sys

sys.path.insert(0, "../libs")

import clusters

# Cluster data.txt and render the tree twice: as text on stdout and as a
# picture in blogclust.jpg.
names, _words, matrix = clusters.readfile("data.txt")
tree = clusters.hcluster(matrix)

# ASCII dendrogram on the console
clusters.printclust(tree, labels=names)

# JPEG dendrogram on disk
output_jpeg = "blogclust.jpg"
clusters.drawdendrogram(tree, names, jpeg=output_jpeg)
Example #20
0
import clusters

row_labels, _column_labels, vectors = clusters.readfile('dataset_vectors.txt')

# One entry per run: the extra positional arguments handed to hcluster after
# the data, and the JPEG the resulting dendrogram is written to.
runs = (
    ((), 'hcluster_euclidean_centroid.jpg'),
    ((clusters.find_by_min,), 'hcluster_euclidean_min.jpg'),
    ((clusters.find_by_max,), 'hcluster_euclidean_max.jpg'),
)

for position, (extra_args, jpeg_name) in enumerate(runs):
    if position:
        # blank line between consecutive reports
        print()
    clust = clusters.hcluster(vectors, *extra_args)
    print('clusters by euclidean distance')
    clusters.printhclust(clust, labels=row_labels)
    clusters.drawdendrogram(clust, row_labels, jpeg=jpeg_name)
Example #21
0
import clusters
# Cluster the TF-IDF matrix and output the tree as text and as a picture.
blog_names, _terms, tfidf_matrix = clusters.readfile('tfidf.txt')
tree = clusters.hcluster(tfidf_matrix)

# ASCII dendrogram on stdout
clusters.printclust(tree, labels=blog_names)

# JPEG dendrogram on disk
target_jpeg = 'clusterblogtfidf.jpg'
clusters.drawdendrogram(tree, blog_names, jpeg=target_jpeg)
def drawDendogram():
	"""Cluster 'blogdata.txt' and draw the dendrogram to 'blogclust.jpg'."""
	names, _words, matrix = clusters.readfile('blogdata.txt')
	tree = clusters.hcluster(matrix)
	target_jpeg = 'blogclust.jpg'
	clusters.drawdendrogram(tree, names, jpeg=target_jpeg)
Example #23
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 01/03/2017 11:57 AM
# @Author : Shuqi.qin
# @File : test.py
# @Software: PyCharm Community Edition

import clusters

# Cluster blogdata.txt and draw the dendrogram to wordcluster.jpg.
names, _words, matrix = clusters.readfile('blogdata.txt')

# NOTE(review): the function is spelled `hclusters` here; the usual name in
# this kind of module is `hcluster` -- confirm it exists in clusters.py.
tree = clusters.hclusters(matrix)

output_jpeg = 'wordcluster.jpg'
clusters.drawdendrogram(tree, names, output_jpeg)
Example #24
0
def draw_dendogram():
    """Cluster 'Outputs/blogdata.txt' and draw the tree to 'blogclust.jpg'."""
    source_file = 'Outputs/blogdata.txt'
    names, _words, matrix = clusters.readfile(source_file)
    tree = clusters.hcluster(matrix)
    clusters.drawdendrogram(tree, names, jpeg='blogclust.jpg')
Example #25
0
import clusters
import json

if __name__ == "__main__":
    doc_names, _words, vectors = clusters.readfile('data/grocery_vectors.txt')

    # Hierarchical clustering under the tanimoto distance.
    tree = clusters.hcluster(vectors, distance=clusters.tanimoto)
    print('clusters by tanimoto coefficient')
    clusters.drawdendrogram(tree, doc_names, jpeg='img/groceries_tanimoto.jpg')

    # Persist the tree as JSON; the dict is handed to jsonify and then dumped
    # (presumably jsonify fills it in place -- confirm in clusters.py).
    tree_as_json = {}
    clusters.jsonify(tree, tree_as_json)
    with open("json/tanimoto.json", "w") as sink:
        json.dump(tree_as_json, sink)

    # Other distances tried earlier, kept disabled for reference:
    #clust=clusters.hcluster(data,distance=clusters.pearson)
    #print('clusters by pearson correlation')
    #clusters.drawdendrogram(clust,docs,jpeg='groceries_pearson.jpg')

    #clust = clusters.hcluster(data, distance=clusters.cosine)
    #print('clusters by cosine similarity')
    #clusters.drawdendrogram(clust,docs,jpeg='groceries_cosine.jpg')

    #clust=clusters.hcluster(data,distance=clusters.euclidean)
    #print('clusters by euclidean distance')
    #clusters.drawdendrogram(clust,docs,jpeg='groceries_euclidean.jpg')
Example #26
0
import clusters
# Two views of blogdata2.txt: a 2-D layout of the blogs and a dendrogram of
# the words.
blog_labels, word_labels, matrix = clusters.readfile('blogdata2.txt')

# 2-D layout of the blogs
layout = clusters.scaledown(matrix)
clusters.draw2d(layout, blog_labels, jpeg='blog2d.jpg')

# Rotate the matrix so that the words are the items being clustered.
rotated = clusters.rotatematrix(matrix)
word_tree = clusters.hcluster(rotated)
clusters.drawdendrogram(word_tree, labels=word_labels, jpeg='wordclust.jpg')
Example #27
0
#!/usr/bin/python

import clusters
blognames, words, data = clusters.readfile('blogdata1.txt')
clust = clusters.hcluster(data)

#Question 2
clusters.printclust(clust, labels=blognames)
clusters.drawdendrogram(clust, blognames, jpeg='dengrogram.jpg')

#Question 3
print "K = 5"
kclust5 = clusters.kcluster(data, k=5)
print "\nK = 10"
kclust10 = clusters.kcluster(data, k=10)
print "\nK = 20"
kclust20 = clusters.kcluster(data, k=20)

#Question 4

coords = clusters.scaledown(data)
clusters.draw2d(coords, blognames, jpeg='MDS.jpg')
Example #28
0
File: run.py Project: wz125/courses
def clustering():
  print '## Clustering'
  import clusters
  allw,artw,artt,wordmatrix,wordvec=readpickle()
  clust=clusters.hcluster(wordmatrix)
  clusters.drawdendrogram(clust,artt,jpeg='cluster.jpg')
            except IndexError:
                outfile.close()
                return
        outfile.write('\n')


# Script entry point: build the article/word matrix, cluster the articles
# hierarchically, then extract features with non-negative matrix factorization.
if __name__ == "__main__":
    allw, articlew, artt = getarticlewords()
    wordmatrix, wordvec = makematrix(allw, articlew)

    # Debug output, kept for reference:
    # print wordvec[0:10]
    # print artt[1]
    # print wordmatrix[1][0:10]

    # hierarchical clustering, drawn to news.jpg and labelled with artt
    import clusters
    clust = clusters.hcluster(wordmatrix)
    clusters.drawdendrogram(clust, artt, jpeg = 'news.jpg')

    # non-negative matrix factorization
    import nmf
    # Small sanity check of nmf.factorize, kept for reference:
    # m1 = np.matrix([[1, 2, 3], [4, 5, 6]])
    # m2 = np.matrix([[1, 2], [3, 4], [5, 6]])
    # w, h = nmf.factorize(m1 * m2, pc = 3, iter = 100)
    # print w * h

    # Factorize the word matrix with pc = 20 and iter = 50, then report the
    # resulting features and the articles they apply to.
    v = np.matrix(wordmatrix)
    weights, feats = nmf.factorize(v, pc = 20, iter = 50)
    topp, pn = showfeatures(weights, feats, artt, wordvec)
    showarticles(artt, topp, pn)
Example #30
0
File: run.py Project: wz125/courses
def ColumnClustering():
  """Cluster the columns (words) of 'blogdata1.txt' into 'wordclust.jpg'."""
  reload(clusters)
  _blognames, word_labels, matrix = clusters.readfile('blogdata1.txt')
  # Rotating the matrix makes the words the items that get clustered.
  rotated = clusters.rotatematrix(matrix)
  word_tree = clusters.hcluster(rotated)
  clusters.drawdendrogram(word_tree, labels=word_labels, jpeg='wordclust.jpg')
Example #31
0
import clusters
# Hierarchical clustering of tfidf.txt, shown as text and saved as a picture.
labels, _terms, matrix = clusters.readfile('tfidf.txt')
hierarchy = clusters.hcluster(matrix)

# text dendrogram on stdout
clusters.printclust(hierarchy, labels=labels)

# picture of the dendrogram
jpeg_path = 'clusterblogtfidf.jpg'
clusters.drawdendrogram(hierarchy, labels, jpeg=jpeg_path)
Example #32
0
import clusters

docs, words, data = clusters.readfile('titles_vectors.txt')

# One run per distance function: (function, announcement, output JPEG).
# The cosine run used to be announced as 'clusters by euclidean distance',
# a copy-paste slip that mislabelled its output.
runs = (
    (clusters.pearson, 'clusters by pearson correlation',
     'docsclust_pearson.jpg'),
    (clusters.tanimoto, 'clusters by tanimoto coefficient',
     'docsclust_tanimoto.jpg'),
    (clusters.euclidean, 'clusters by euclidean distance',
     'docsclust_euclidean.jpg'),
    (clusters.cosine, 'clusters by cosine similarity',
     'docsclust_cosine.jpg'),
)

for distance, announcement, jpeg_name in runs:
    clust = clusters.hcluster(data, distance=distance)
    print(announcement)
    clusters.printhclust(clust, labels=docs)
    clusters.drawdendrogram(clust, docs, jpeg=jpeg_name)
Example #33
0
import clusters

# Cluster the copied blog data; print the tree and save it as a picture.
blog_names, _words, matrix = clusters.readfile('blogdata1 (copy).txt')
tree = clusters.hcluster(matrix)

# text dendrogram on stdout
clusters.printclust(tree, labels=blog_names)

# picture of the dendrogram
clusters.drawdendrogram(tree, blog_names, jpeg='BlogCluster.jpg')
Example #34
0
#### Page 54: calling generatefeedvector to produce the blogdata file failed. Is it because the URLs in feedlist cannot be opened?
### Generating zebo.txt with downloadzebodata failed as well. sigh
import clusters

blognames,words,data = clusters.readfile('blogdatadown.txt')#1
#clust = clusters.hcluster(data)
#print (clust)# sure enough, the value printed here inside the function is different as well.
#print(blognames)

#clusters.printclust(clust, labels = blognames)#2

#clusters.drawdendrogram(clust, blognames, jpeg = 'blogclust.jpg')#3

# Step 4: rotate the matrix and cluster it, labelling the tree with the words.
rdata = clusters.rotatematrix(data)#4
wordclust = clusters.hcluster(rdata)
clusters.drawdendrogram(wordclust, labels = words, jpeg = 'wordclust.jpg')
'''
kclust = clusters.kcluster(data, k = 4)#5
print ([blognames[r] for r in kclust[0]])
print ([blognames[r] for r in kclust[1]])

import urllib.request#6
from bs4 import BeautifulSoup
c = urllib.request.urlopen('https://en.wikipedia.org/wiki/Jon_Snow')
soup =  BeautifulSoup(c.read(),"lxml")#这里非常有趣! 感觉有空需要看下这个源代码库呀。
links = soup('a')#所以我还是不懂beautiful soup 的用法呀。
print(links[10])
print(links[10]['href'])
#这一段是教BS的。

wants, people, data = clusters.readfile('zebodown.txt')#7
def draw_dendogram():
    """Cluster the 'job_projects' data and draw the tree to 'jobclust.jpg'."""
    job_labels, _projects, matrix = clusters.readfile('job_projects')
    tree = clusters.hcluster(matrix)
    # A text rendering is available through:
    #clusters.printclust(tree,labels=job_labels)
    output_jpeg = 'jobclust.jpg'
    clusters.drawdendrogram(tree, job_labels, jpeg=output_jpeg)
Example #36
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-


import clusters

names, _words, matrix = clusters.readfile('./../data/banpaku_utf8.csv')
tree = clusters.hcluster(matrix)

# Show the result on the console (disabled)
#clusters.printclust( tree, labels=names)

# Show the result as an image; the module is reloaded right before drawing
reload(clusters)
clusters.drawdendrogram(tree, names, jpeg="banpaku_reg.jpg")
__author__ = 'feng'

import clusters

# Cluster the sessions in session.csv with the session dissimilarity measure
# and draw the resulting tree to sessionclust.jpg.
session_ids, sessions = clusters.readSessionFile('session.csv')
tree = clusters.hcluster(sessions, distance=clusters.session_dissimilarity)

# A text rendering is available through:
#clusters.printclust(tree, labels=session_ids)
clusters.drawdendrogram(tree, session_ids, jpeg='sessionclust.jpg')
	dataList = []

	for i in words:
		wordneed.append(i)

	for r in allInfo:
		print r
		clone = words
		theList = []
		for i in allInfo[r]:
			if i in words:
				clone[i] = allInfo[r][i]
		for n in clone:
			theList.append(clone[n])
		dataList.append(theList)

	return dataList,times,wordneed






# makeDate and allData are expected to be defined earlier in this script.
# The third returned value is not used here.
rows, times, _words = makeDate(allData)
print('done')  # progress marker: input rows are ready
# print(len(rows))
tree = clusters.hcluster(rows)
print('done')  # progress marker: hcluster has returned
clusters.drawdendrogram(tree, times, jpeg='./myWeibo.jpg')
Example #39
0
File: run.py Project: wz125/courses
def drawingtheDendrogram():
  """Cluster the data in 'blogdata1.txt' and draw it to 'blogclust.jpg'.

  The clusters module is reloaded after clustering and before drawing,
  so the drawing call runs against the freshly reloaded module.
  Returns None.
  """
  labels, _words, matrix = clusters.readfile('blogdata1.txt')
  tree = clusters.hcluster(matrix)
  reload(clusters)
  clusters.drawdendrogram(tree, labels, jpeg='blogclust.jpg')
Example #40
0
import clusters

# Only the first and third values returned by readfile are used.
labels, _words, matrix = clusters.readfile('blogdata.txt')
tree = clusters.hcluster(matrix)

# ASCII diagram first, then the dendrogram image.
clusters.printclust(tree, labels=labels)
clusters.drawdendrogram(tree, labels, jpeg='blogcluster.jpg')
Example #41
0
File: run.py Project: wz125/courses
def prefer():
  """Reload clusters, then cluster 'zebo.txt' and draw the dendrogram.

  clusters.tanamoto is passed to hcluster as the distance function. No
  jpeg argument is given to drawdendrogram, so the output location is
  whatever that function defaults to. Returns None.
  """
  reload(clusters)
  wish_labels, _people, matrix = clusters.readfile('zebo.txt')
  tree = clusters.hcluster(matrix, distance=clusters.tanamoto)
  clusters.drawdendrogram(tree, wish_labels)
Example #42
0
import clusters
import Image
# Python 2's input() evaluates whatever the user types, so simply pressing
# Enter at the final prompt raised SyntaxError. Use raw_input when it exists
# and fall back to input() on Python 3, where it already returns plain text.
try:
    wait_for_user = raw_input
except NameError:
    wait_for_user = input

# Only the first and third values returned by readfile are used below.
moviename, words, data = clusters.readfile('res/blogdata2.txt')

# Single-argument print(...) emits the same text on Python 2 and Python 3.
print('Processing......')
clust = clusters.hcluster(data)

print('Output image is generating...')
clusters.drawdendrogram(clust, moviename, jpeg='output/finaloutput.jpg')

print("Scaling down...")
coords = clusters.scaledown(data)
clusters.draw2d(coords, moviename, jpeg='output/finaloutput2d.jpg')

# Open the dendrogram that was just written.
image = Image.open('output/finaloutput.jpg')
image.show()

# Keep the process alive until the user responds; the typed text is ignored.
wait_for_user("Press any key to quit....")
import clusters

docs, words, data = clusters.readfile('titles_vectors.txt')
rdata = clusters.rotatematrix(data)

# One row per run: (name of the distance function on `clusters`,
# message printed before the text output, dendrogram file to write).
_RUNS = (
    ('pearson', 'clusters by pearson correlation', 'wordsclustpearson.jpg'),
    ('tanimoto', 'clusters by tanimoto coefficient', 'wordsclusttanimoto.jpg'),
    ('euclidean', 'clusters by euclidean distance', 'wordsclusteuclidean.jpg'),
)

for _metric_name, _banner, _image_path in _RUNS:
    # Look the distance function up inside the loop so that each run only
    # touches its own attribute, exactly when that run starts.
    clust = clusters.hcluster(rdata, distance=getattr(clusters, _metric_name))
    print(_banner)
    clusters.printhclust(clust, labels=words)
    clusters.drawdendrogram(clust, words, jpeg=_image_path)
Example #44
0
#print clust

#clusters.printclust(clust,labels=blognames)

#clusters.drawdendrogram(clust,blognames,jpeg='blogclust.jpg')

#Transpose the matrix to cluster the words
#rdata=clusters.rotatematrix(data)
#wordclust=clusters.hcluster(rdata)
#clusters.drawdendrogram(wordclust,labels=words,jpeg='blogclust3.jpg')

#kclust=clusters.kcluster(data,k=3)
#
#for r in kclust[0]:
#    print blognames[r]

#BeautifulSoup
#import urllib2
#from bs4 import BeautifulSoup
#
#c=urllib2.urlopen('http://www.baidu.com')
#soup=BeautifulSoup(c.read(),'lxml')
#links=soup('a')
##print soup
#print links

# Cluster 'zebo.txt' with clusters.tanamoto as the distance function.
# No jpeg argument is passed, so drawdendrogram uses its own default output.
wish_labels, _people, matrix = clusters.readfile('zebo.txt')
tree = clusters.hcluster(matrix, distance=clusters.tanamoto)
clusters.drawdendrogram(tree, wish_labels)
Example #45
0
#Shawn Jones

#!/usr/local/bin/python

# all code here stolen shamelessly from 

# "Programming Collective Intelligence, Chapter 3"

import sys

sys.path.insert(0, '../libs')

import clusters

# Only the first and third values returned by readfile are used.
labels, _words, matrix = clusters.readfile('blogdata1V2.txt')
tree = clusters.hcluster(matrix)

# Print the ASCII dendrogram, then save the JPEG dendrogram.
clusters.printclust(tree, labels=labels)
clusters.drawdendrogram(tree, labels, jpeg='blogclust.jpg')

Example #46
0
# Parse the feed at the given URL.
feed = feedparser.parse('http://spchicagosp.wordpress.com/feed')
# Bare expression: its value is discarded when run as a script. Presumably
# pasted from an interactive session to inspect the entries -- confirm.
feed.entries




================
import clusters
# Only the first and third values returned by readfile are used.
labels, _words, matrix = clusters.readfile('blogdata.txt')
tree = clusters.hcluster(matrix)

# Text output first, then the image. Unlike the other calls in this file,
# this one passes the output path through the `img` keyword.
clusters.printclust(tree, labels=labels)
clusters.drawdendrogram(tree, labels, img='blogclust.png')

import clusters
# Read 'blogdataascii.txt', cluster it, and write the dendrogram image.
labels, _words, matrix = clusters.readfile('blogdataascii.txt')
clusters.drawdendrogram(
    clusters.hcluster(matrix),
    labels,
    jpeg='blogcluster.jpg',
)
import clusters, data_processing
# Import the dataset.
data = data_processing.open_csv_file('dataset.csv')

# List of countries, in the order of the similarity matrix.
countries_list = data_processing.get_country_names(data)

# Numerical attributes matrix. The return value of str_to_float is ignored,
# so it presumably converts the matrix in place -- confirm.
attr_matrix = data_processing.create_attribute_matrix(data)
data_processing.str_to_float(attr_matrix)

num_cluster = 3  # not referenced by the statements in this section

# Hierarchical clustering: euclidean distance.
resulting_clusters = clusters.hcluster(attr_matrix, distance=clusters.euclidean)
print('clusters by euclidean distance')
clusters.printhclust(resulting_clusters, labels=countries_list)
clusters.drawdendrogram(
    resulting_clusters, countries_list, jpeg='Euclidean Cluster.jpg'
)

# Hierarchical clustering: tanimoto coefficient.
resulting_clusters = clusters.hcluster(attr_matrix, distance=clusters.tanimoto)
print('clusters by tanimoto coefficient')
clusters.printhclust(resulting_clusters, labels=countries_list)
clusters.drawdendrogram(
    resulting_clusters, countries_list, jpeg='Tanimoto Cluster.jpg'
)
print()
'''hierachical clustering: pearson similarity'''
resulting_clusters = clusters.hcluster(attr_matrix, distance=clusters.pearson)
print('clusters by pearson correlation')
clusters.printhclust(resulting_clusters, labels=countries_list)
clusters.drawdendrogram(resulting_clusters,
                        countries_list,