def PrintClust(self):
    """Cluster the rows of "data.txt" and show the tree as text on a canvas.

    A fresh scrollable canvas is created on ``self.root`` on every call.
    The distance function is chosen from ``self.var``: "1" selects
    ``clusters.pearson`` and "2" selects ``clusters.tanimoto``.  Any other
    value shows an error dialog, calls ``self.DataMatrix()`` again and
    returns without reading the data file.

    The rendered tree is also stored on ``self.printclust``.
    """
    self.DataMatrix()

    # Canvas that hosts the textual cluster tree.
    self.maincanvas = Canvas(self.root, bg='#FFFFFF', width=100, height=320)
    self.maincanvas.grid(row=8, rowspan=8, column=1, columnspan=16,
                         pady=10, sticky="WE")

    self.xscrollbar = Scrollbar(self.root, orient=HORIZONTAL,
                                command=self.maincanvas.xview)
    self.xscrollbar.grid(row=15, column=1, columnspan=16, sticky="WSE")

    self.yscrollbar = Scrollbar(self.root, orient=VERTICAL,
                                command=self.maincanvas.yview)
    self.yscrollbar.grid(column=17, row=8, rowspan=8, sticky="WNS")

    # The canvas must call the scrollbars' ``set`` methods; handing over the
    # widgets themselves leaves the scrollbars out of sync with the view.
    self.maincanvas.config(xscrollcommand=self.xscrollbar.set,
                           yscrollcommand=self.yscrollbar.set)

    # Map the radio-button value to the distance function it stands for.
    distances = {"1": clusters.pearson, "2": clusters.tanimoto}
    distance = distances.get(str(self.var.get()))
    if distance is None:
        tkMessageBox.showerror("ERROR", "Please select distance method.")
        self.DataMatrix()
        return

    courses, words, data = clusters.readfile("data.txt")
    clust = clusters.hcluster(data, distance=distance)
    self.printclust = clusters.clust2str(clust, labels=courses)
    # "Helvetica" was misspelled as "Helvetiva" in the original font spec.
    self.maincanvas.create_text(350, 10, font="Helvetica 10",
                                text=self.printclust, anchor="nw")
    # Let the scrollbars cover everything that was just drawn.
    self.maincanvas.config(scrollregion=self.maincanvas.bbox("all"))
 def clustering(self, canvas):
     """Draw the dendrogram for 'data.txt' and place it on ``canvas``.

     Original author's note: clusters districts or parties.  The tree is
     rendered to 'cl.jpg' and that file is then loaded as a Tk image.
     """
     labels, _rows, matrix = clusters.readfile('data.txt')
     tree = clusters.hcluster(matrix, distance=clusters.sim_distance)
     clusters.drawdendrogram(tree, labels, jpeg='cl.jpg')

     photo = ImageTk.PhotoImage(Image.open("cl.jpg"))
     canvas.create_image(20, 20, anchor=NW, image=photo)
     # Stash the PhotoImage on the canvas so a reference outlives this call.
     canvas.image = photo
Ejemplo n.º 3
0
def main(nameof_file):
    """Hierarchically cluster the countries in ``nameof_file`` and report on them.

    The file is read with ``kmeans.readfile``, clustered with
    ``utils.hcluster`` using the module-level ``choice_dist``, and the
    dendrogram is written as a JPEG.  Seven hand-picked sub-trees are then
    flattened into lists of row ids with ``get``; the first ``clusternum``
    of them are printed, followed by the SSE from ``kmeans.sse``.
    """
    (countries, vectors) = kmeans.readfile(nameof_file)
    clusters = utils.hcluster(vectors, distance=choice_dist)
    utils.drawdendrogram(
        clusters,
        [country[1] for country in countries],
        jpeg="C:/Users/akars/clustering_lab/processedhierarchical.jpg")

    # Sub-trees chosen by hand from the drawn dendrogram.
    opt_clust = [
        clusters.left, clusters.right.left, clusters.right.right.left.left,
        clusters.right.right.left.right.left,
        clusters.right.right.left.right.right,
        clusters.right.right.right.right, clusters.right.right.right.left
    ]

    cluster_level = []
    for cluster in opt_clust:
        country_ids = []
        # These two calls belong inside the loop: previously they ran once,
        # after the loop, so only the last sub-tree was ever collected.
        get(cluster, country_ids)
        cluster_level.append(country_ids)

    for i in range(clusternum):
        print('cluster {}:'.format(i + 1))
        print([countries[r] for r in cluster_level[i]])
    print("SSE: " + str(kmeans.sse(cluster_level, vectors)))


# The guard lives at module level; nested inside main() it could never start
# the script and would have recursed endlessly had main() been called.
if __name__ == "__main__":
    main("C:/Users/akars/clustering_lab/processed/preprocessed.csv")
Ejemplo n.º 4
0
def main(input_f):
    """Cluster the countries in ``input_f`` hierarchically and print a summary.

    Writes the dendrogram to 'data/hierarchical.jpg', flattens seven
    hand-picked sub-trees into id lists via ``get_all``, prints the first
    ``num_clusters`` of them and finally the SSE from ``k_means.sse``.
    """
    countries, vectors = k_means.read_file(input_f)

    tree = utils.hcluster(vectors, distance=distance_function)
    names = [entry[1] for entry in countries]
    utils.drawdendrogram(tree, names, jpeg='data/hierarchical.jpg')

    # Sub-trees picked by eye from the dendrogram (original author's choice).
    right = tree.right
    deeper = right.right
    picked = [
        tree.left,
        right.left,
        deeper.left.left,
        deeper.left.right.left,
        deeper.left.right.right,
        deeper.right.right,
        deeper.right.left,
    ]

    cluster_level = []
    for node in picked:
        members = []
        get_all(node, members)
        cluster_level.append(members)

    for index in range(num_clusters):
        print('cluster {}:'.format(index + 1))
        member_ids = cluster_level[index]
        print([countries[member] for member in member_ids])

    print("SSE: " + str(k_means.sse(cluster_level, vectors)))
Ejemplo n.º 5
0
def __main__():
    """Cluster "blogdata.txt" with ``euclidean_distance`` and draw the tree.

    The dendrogram is requested under the name "ex3dendrogram.jpg".
    """
    labels, _words, matrix = clusters.read_file("blogdata.txt")
    tree = clusters.hcluster(matrix, distance=euclidean_distance)
    clusters.draw_dendogram(tree, labels, jpeg="ex3dendrogram.jpg")


# I think this weights against groupings that have similar word use rates but different word use counts.
Ejemplo n.º 6
0
def do_stemmed():
    """Run the whole clustering pipeline on the stemmed blog data.

    In order: regenerate the stemmed blog file, build and save the
    hierarchical tree (text file and JPEG), run ``kcluster_toFile`` for
    k = 5, 10 and 20 while logging each grouping, and finally produce the
    2-D scaled plot.  All outputs go under 'datafiles/'.
    """
    generate_blogfile_stem()
    blognames, _words, data = clusters.readfile('datafiles/blogtop500_stemmed.txt')

    # Hierarchical clustering: text rendering first, then the JPEG.
    tree = clusters.hcluster(data)
    with open("datafiles/blogtop500stemmed_asciideno.txt", "w+") as ascii_out:
        clusters.printclust2file(tree, ascii_out, labels=blognames)
    clusters.drawdendrogram(tree, blognames, jpeg='datafiles/blogtop500stemmed_deno.jpg')

    # K-means for several k, echoed to the console and logged to one file.
    with open("datafiles/kmeans_blogtop500stemmed.txt", "w+") as kout:
        for k in (5, 10, 20):
            print("For k=%d" % k)
            kout.write("K=%d\nIterations\n" % k)
            groups = clusters.kcluster_toFile(data, k=k, out=kout)
            kout.write("Centroid Values\n-------------------------\n")
            for number, members in enumerate(groups, 1):
                print("Centroid #%d" % number)
                kout.write("Centroid #%d\n" % number)
                names = []
                for row in members:
                    name = blognames[row]
                    print(name)
                    names.append(name)
                kout.write("%s\n" % ', '.join(names))
            kout.write("=================================\n")
            print("-------")

    # Dimensionality reduction, logged to its own file, then the 2-D plot.
    with open("datafiles/dimensionReductionStemmed.txt", "w+") as dout:
        scaled = clusters.scaledown_logiter(data, out=dout)
    clusters.draw2d(scaled, blognames, jpg='datafiles/blogtop500stemmed_clust2d.jpg')
def createDendrogram():
    """Cluster 'blogdata.txt', save the dendrogram and an ASCII rendering.

    The JPEG goes to 'Dendrogram.jpg'.  The output of
    ``clusters.printclust`` is captured into "ASCII.txt" by temporarily
    pointing ``sys.stdout`` at that file.
    """
    blogs, colnames, data = clusters.readfile('blogdata.txt')
    cluster = clusters.hcluster(data)
    clusters.drawdendrogram(cluster, blogs, jpeg='Dendrogram.jpg')

    saved_stdout = sys.stdout
    with open("ASCII.txt", 'w') as f:
        sys.stdout = f
        try:
            clusters.printclust(cluster, labels=blogs)
        finally:
            # Always restore stdout; previously it was left pointing at the
            # closed file and stderr was closed as well, which silenced all
            # later output of the process.
            sys.stdout = saved_stdout
 def hierarchical(self):
     """Cluster ``self.data`` hierarchically and show the tree in ``self.t3``.

     Sets ``self.clst`` to 1 in every case.  When ``self.data`` is empty
     nothing else happens; otherwise the text widget is cleared and filled
     with the output of ``clusters.clust2str``, labelled with
     ``self.authors``.
     """
     self.clst = 1
     if len(self.data) == 0:
         return

     tree = clusters.hcluster(self.data)
     rendered = clusters.clust2str(tree, self.authors)
     # The value returned by this read is discarded (kept from the original).
     self.t3.get("1.0", END)
     self.t3.delete("1.0", END)
     self.t3.insert(END, rendered)
Ejemplo n.º 9
0
 def get_clusture(self, param):
     """
     Draw a dendrogram for the requested view of the data and display it.

     param - str -> which view to cluster:
     if param is "Country" the rows are clustered and labelled with the
     first value returned by clusters.readfile (country names);
     if param is "Criterias" the matrix is rotated first and the result is
     labelled with the second value returned by clusters.readfile.

     The data is read from self.writed_names, the image is written to
     'clustured2.jpg' (stored on self.jpg_names) and self.show_image() is
     called afterwards.

     Raises ValueError for any other param.
     """
     # Reject unknown views up front; previously they fell through to an
     # UnboundLocalError on ``clust`` after the file had already been read.
     if param not in ("Country", "Criterias"):
         raise ValueError(
             "param must be 'Country' or 'Criterias', got %r" % (param,))

     country_names, records, records_data = clusters.readfile(
         self.writed_names)
     if param == "Country":
         clust = clusters.hcluster(records_data)
         label = country_names
     else:
         rotated = clusters.rotatematrix(records_data)
         clust = clusters.hcluster(rotated)
         label = records
     self.jpg_names = 'clustured2.jpg'
     clusters.drawdendrogram(clust, labels=label, jpeg=self.jpg_names)
     self.show_image()
 def cluster_parties(self):
     """Cluster the parties found in "matrix.txt" and show the dendrogram.

     Switches ``self.state`` to "party", packs the analysis frame, clears
     the canvas, reloads the matrix into ``self.party_list``,
     ``self.district_list`` and ``self.data``, writes the tree to
     'parties.jpg' and hands that file to ``self.insert_image``.
     """
     self.state = "party"  # remember that the party view is active
     self.analysis_frame.pack(side=TOP, fill=BOTH)
     # Wipe whatever was drawn before
     # (https://stackoverflow.com/questions/15839491/how-to-clear-tkinter-canvas).
     self.canvas.delete("all")

     loaded = clusters.readfile("matrix.txt")
     self.party_list, self.district_list, self.data = loaded

     tree = clusters.hcluster(self.data, distance=clusters.sim_distance)
     clusters.drawdendrogram(tree, self.party_list, jpeg='parties.jpg')
     self.insert_image("parties.jpg")
Ejemplo n.º 11
0
def main():
    """Print an ASCII rendering of the cluster tree for 'blogdata.txt'.

    Original author's notes: ``readfile`` returns the blog titles, the
    words (10%-50% boundaries) and the frequency rows; ``hcluster`` returns
    a tree of nodes with ``id``/``left``/``right``; the distance measure is
    Pearson's r.  None of that is verifiable from this function alone.
    """
    titles, _words, rows = clusters.readfile('blogdata.txt')
    tree = clusters.hcluster(rows)
    clusters.printclust(tree, labels=titles)
Ejemplo n.º 12
0
def createJPegDendogram():
    """Cluster 'blogVectorTFIDFVersion.txt' and save the dendrogram as a JPEG.

    The image is written to 'blogclustTFIDFVersion.jpg'.  An earlier,
    now disabled variant of this routine read 'blogVector.txt' and wrote
    'blogclust.jpg' instead.
    """
    labels, _words, matrix = clusters.readfile('blogVectorTFIDFVersion.txt')
    tree = clusters.hcluster(matrix)
    clusters.drawdendrogram(tree, labels, jpeg='blogclustTFIDFVersion.jpg')
 def cluster_district(self):
     """Cluster the districts found in "matrix.txt" and show the dendrogram.

     Switches ``self.state`` to "district", packs the analysis frame,
     clears the canvas, reloads the matrix into ``self.party_list``,
     ``self.district_list`` and ``self.data``, clusters the rotated matrix,
     writes the tree to 'districts.jpg' and passes that file to
     ``self.insert_image``.
     """
     self.state = "district"  # remember that the district view is active
     self.analysis_frame.pack(side=TOP, fill=BOTH)
     # Wipe whatever was drawn before
     # (https://stackoverflow.com/questions/15839491/how-to-clear-tkinter-canvas).
     self.canvas.delete("all")

     loaded = clusters.readfile("matrix.txt")
     self.party_list, self.district_list, self.data = loaded

     # Original note: the matrix has to be rotated to cluster districts.
     by_district = clusters.rotatematrix(self.data)
     tree = clusters.hcluster(by_district, distance=clusters.sim_distance)
     clusters.drawdendrogram(tree, self.district_list, jpeg='districts.jpg')
     self.insert_image("districts.jpg")
  def testNormal(self):
    """hcluster must reproduce a hand-built tree for four sample rows.

    The expected tree merges rows 1 and 2 first (distance 0.0), then rows
    0 and 3, and finally joins those two merged nodes.
    """
    rows = [[6, 4, 2],
            [2, 4, 6],
            [1, 2, 3],
            [3, 2, 1.01]]

    # One leaf node per input row, ids 0..3.
    clust = [clusters.bicluster(rows[i], id=i) for i in range(len(rows))]

    c0 = clusters.bicluster(clusters.mergevecs(rows[1], rows[2]),
      left=clust[1], right=clust[2], id=-1, distance=0.0)
    c1 = clusters.bicluster(clusters.mergevecs(rows[0], rows[3]),
      left=clust[0], right=clust[3], id=-2,
      distance=clusters.pearson_dist(rows[0], rows[3]))
    c2 = clusters.bicluster(clusters.mergevecs(c0.vec, c1.vec),
      left=c0, right=c1, id=-3,
      distance=clusters.pearson_dist(c0.vec, c1.vec))

    # assertEquals is a deprecated alias that was removed in Python 3.12.
    self.assertEqual(c2, clusters.hcluster(rows))
 def cluster_poli(self, event):
     """Cluster by party and show the resulting dendrogram on the canvas.

     ``event`` is accepted but not used.  On the first run (``self.run``
     equal to 0) the rest of the GUI is built and the counter incremented.
     ``self.var`` is set to "party" for the refined analysis, the rotated
     matrix from ``self.create_matrix()`` is clustered with
     ``sim_distance``, the second GUI part is rebuilt and "clusters.jpg" is
     loaded onto ``self.canvas``.
     """
     if self.run == 0:  # first clustering run: the rest of the GUI is missing
         self.create_rest_of_gui()
         self.run += 1
     self.update_idletasks()
     self.var.set("party")

     rotated = clusters.rotatematrix(self.create_matrix())
     tree = clusters.hcluster(rotated, distance=sim_distance)
     # No jpeg name is passed here; "clusters.jpg" opened below is presumably
     # drawdendrogram's default output -- confirm against clusters.py.
     clusters.drawdendrogram(tree, self.data_center.list_of_parties)

     self.create_rest_of_gui()  # rebuild the second GUI part so it is reset
     self.img = ImageTk.PhotoImage(Image.open("clusters.jpg"))
     self.canvas.create_image(0, 0, anchor=NW, image=self.img)
Ejemplo n.º 16
0
    def testNormal(self):
        """hcluster must reproduce a hand-built tree for four sample rows.

        The expected tree merges rows 1 and 2 first (distance 0.0), then
        rows 0 and 3, and finally joins those two merged nodes.
        """
        rows = [[6, 4, 2], [2, 4, 6], [1, 2, 3], [3, 2, 1.01]]

        # One leaf node per input row, ids 0..3.
        clust = [clusters.bicluster(rows[i], id=i) for i in range(len(rows))]

        c0 = clusters.bicluster(clusters.mergevecs(rows[1], rows[2]),
                                left=clust[1],
                                right=clust[2],
                                id=-1,
                                distance=0.0)
        c1 = clusters.bicluster(clusters.mergevecs(rows[0], rows[3]),
                                left=clust[0],
                                right=clust[3],
                                id=-2,
                                distance=clusters.pearson_dist(
                                    rows[0], rows[3]))
        c2 = clusters.bicluster(clusters.mergevecs(c0.vec, c1.vec),
                                left=c0,
                                right=c1,
                                id=-3,
                                distance=clusters.pearson_dist(c0.vec, c1.vec))

        # assertEquals is a deprecated alias that was removed in Python 3.12.
        self.assertEqual(c2, clusters.hcluster(rows))
Ejemplo n.º 17
0
 def clustering_button(self):
     """Cluster "Will_be_Cluestered.txt" and list the result in the GUI.

     Shows the error message and returns when ``database`` is empty.  The
     clustering type is looked up in ``values_of_clustering`` from the
     selected radio value:

     * "Hierarcial" - the text tree from ``clusters.clust2str`` is inserted
       line by line into ``self.All_Results_Part``.
     * "K-Means"    - ``clusters.kcluster`` runs with k taken from
       ``self.Value_of_k``; clusters are listed largest first, each as
       "Cluster N:{name  name  ...}".

     Any other type leaves the list untouched.
     """
     if len(database) == 0:
         self.Error_Message_Function()
         return
     prof_names, words, data = clusters.readfile("Will_be_Cluestered.txt")
     type_of_clustering = values_of_clustering[int(
         self.Radio_Values3.get())]

     if type_of_clustering == "Hierarcial":
         clust = clusters.hcluster(data)
         # Render the tree once instead of once (or twice) per output line.
         lines = clusters.clust2str(clust, labels=prof_names).split('\n')
         self.All_Results_Part.delete(0, END)
         # The last element after the split is an empty string; skip it.
         for line in lines[:-1]:
             self.All_Results_Part.insert(END, line)
     elif type_of_clustering == "K-Means":
         clust = clusters.kcluster(data, k=int(self.Value_of_k.get()))
         # list(...) keeps the names indexable on Python 3 as well.
         # NOTE(review): this assumes the key order of ``database`` matches
         # the row order of the clustered file -- confirm against the writer.
         prof_names = list(database.keys())
         by_size = sorted(((len(members), members) for members in clust),
                          reverse=True)
         self.All_Results_Part.delete(0, END)
         for counter, (_size, members) in enumerate(by_size, 1):
             # Index by the cluster's member ids; the old range(len(...))
             # always listed the first N names regardless of membership.
             names = "".join(str(prof_names[k]) + "  " for k in members)
             self.All_Results_Part.insert(
                 END, "Cluster %d:{" % counter + names + "}" + "\n")
Ejemplo n.º 18
0
def do_non_stem():
    """Run the whole clustering pipeline on the non-stemmed blog data.

    In order: regenerate the blog file, build and save the hierarchical
    tree (text file and JPEG), run ``kcluster_toFile`` for k = 5, 10 and 20
    while logging each grouping, and finally produce the 2-D scaled plot.
    All outputs go under 'datafiles/'.
    """
    generate_blogfile()
    blognames, _words, data = clusters.readfile('datafiles/blogtop500.txt')

    # Hierarchical clustering: text rendering, then the JPEG of the same tree.
    hier = clusters.hcluster(data)
    with open("datafiles/blogtop500_asciideno.txt", "w+") as ascii_file:
        clusters.printclust2file(hier, ascii_file, labels=blognames)
    clusters.drawdendrogram(hier, blognames, jpeg='datafiles/blogtop500_deno.jpg')

    # K-means for each k, echoed to the console and logged to one file.
    with open("datafiles/kmeans_blogtop500.txt", "w+") as kout:
        for k in (5, 10, 20):
            print("For k=%d" % k)
            kout.write("K=%d\nIterations\n" % k)
            groupings = clusters.kcluster_toFile(data, k=k, out=kout)
            kout.write("Centroid Values\n-------------------------\n")
            position = 0
            for member_rows in groupings:
                position += 1
                print("Centroid #%d" % position)
                kout.write("Centroid #%d\n" % position)
                member_names = []
                for row in member_rows:
                    blog = blognames[row]
                    print(blog)
                    member_names.append(blog)
                kout.write("%s\n" % ', '.join(member_names))
            kout.write("=================================\n")
            print("-------")

    # Dimensionality reduction (logged), then the 2-D similarity picture.
    with open("datafiles/dimensionReductionNonStemmed.txt", "w+") as dout:
        scaled = clusters.scaledown_logiter(data, out=dout)
    clusters.draw2d(scaled, blognames, jpg='datafiles/blogtop500_clust2d.jpg')
Ejemplo n.º 19
0
import clusters
# Script: hierarchically cluster the rows of 'tfidf.txt', print the tree and
# save it as a JPEG.  readfile's three results are bound to blog/words/data;
# 'words' is not used afterwards.
blog, words, data = clusters.readfile('tfidf.txt')
variable = clusters.hcluster(data)

# print ASCII dendrogram (labelled with the first value from readfile)
clusters.printclust(variable, labels=blog)

# save JPEG dendrogram
clusters.drawdendrogram(variable, blog, jpeg='clusterblogtfidf.jpg')
def __main__():
    """Build the "blogdata.txt" cluster tree using ``euclidean_distance``.

    The resulting dendrogram is drawn under the name "ex3dendrogram.jpg".
    """
    names, _words, rows = clusters.read_file("blogdata.txt")
    hierarchy = clusters.hcluster(rows, distance=euclidean_distance)
    clusters.draw_dendogram(hierarchy, names, jpeg="ex3dendrogram.jpg")

# I think this weights against groupings that have similar word use rates but different word use counts.
Ejemplo n.º 21
0
import clusters

# Script: cluster the columns (words) of 'titles_vectors.txt' three times,
# once per distance function, printing and saving each tree.
docs, words, data = clusters.readfile('titles_vectors.txt')
rdata = clusters.rotatematrix(data)

# One (distance function, banner, output file) triple per run, in run order.
_RUNS = (
    (clusters.pearson, 'clusters by pearson correlation',
     'wordsclustpearson.jpg'),
    (clusters.tanimoto, 'clusters by tanimoto coefficient',
     'wordsclusttanimoto.jpg'),
    (clusters.euclidean, 'clusters by euclidean distance',
     'wordsclusteuclidean.jpg'),
)

for _distance, _banner, _jpeg in _RUNS:
    clust = clusters.hcluster(rdata, distance=_distance)
    print(_banner)
    clusters.printhclust(clust, labels=words)
    clusters.drawdendrogram(clust, words, jpeg=_jpeg)
Ejemplo n.º 22
0
def __main__():
    """Cluster 'entrydata.txt' and draw its dendrogram as "ex2dend.jpg"."""
    labels, _words, matrix = clusters.read_file('entrydata.txt')
    tree = clusters.hcluster(matrix)
    clusters.draw_dendogram(tree, labels, jpeg="ex2dend.jpg")
Ejemplo n.º 23
0
Archivo: run.py Proyecto: wz125/courses
def drawingtheDendrogram():
    """Cluster 'blogdata1.txt' and save the dendrogram as 'blogclust.jpg'.

    The ``clusters`` module is reloaded between clustering and drawing,
    exactly as in the original sequence.
    """
    labels, _words, matrix = clusters.readfile('blogdata1.txt')
    tree = clusters.hcluster(matrix)
    reload(clusters)
    clusters.drawdendrogram(tree, labels, jpeg='blogclust.jpg')
Ejemplo n.º 24
0
import clusters

# Script: cluster the rows of 'dataset_vectors.txt' three times and save one
# dendrogram per run.  'column_names' is not used afterwards.
row_names, column_names, data = clusters.readfile('dataset_vectors.txt')

# Run 1: hcluster with its default arguments.
clust = clusters.hcluster(data)
print('clusters by euclidean distance')
clusters.printhclust(clust, labels=row_names)
clusters.drawdendrogram(clust,
                        row_names,
                        jpeg='hcluster_euclidean_centroid.jpg')

# Run 2: clusters.find_by_min passed as the second positional argument.
# NOTE(review): the banner is the same for all three runs although runs 2 and
# 3 pass find_by_min / find_by_max -- confirm whether it should name the variant.
print()
clust = clusters.hcluster(data, clusters.find_by_min)
print('clusters by euclidean distance')
clusters.printhclust(clust, labels=row_names)
clusters.drawdendrogram(clust, row_names, jpeg='hcluster_euclidean_min.jpg')

# Run 3: clusters.find_by_max passed as the second positional argument.
print()
clust = clusters.hcluster(data, clusters.find_by_max)
print('clusters by euclidean distance')
clusters.printhclust(clust, labels=row_names)
clusters.drawdendrogram(clust, row_names, jpeg='hcluster_euclidean_max.jpg')
Ejemplo n.º 25
0
import kmcluster
import clusters
import word_cloud

# Script: read "processed_data.csv" via kmcluster and cluster it with
# clusters.cosine as the distance and the builtin max as inter_dis.
country, data = kmcluster.read_file("processed_data.csv")

# Disabled loop header left by the original author:
#for i in ['min', 'max']:
clust = clusters.hcluster(data, distance=clusters.cosine, inter_dis=max)
# Prints whatever hcluster returned, followed by a label for the run.
print(clust)
print('cosine similarity')
Ejemplo n.º 26
0
def getHCluster(inputFile):
    """Read ``inputFile`` and hierarchically cluster its data.

    Returns a 3-tuple: the first two values produced by
    ``clusters.readfile`` followed by the tree from ``clusters.hcluster``.
    """
    labels, columns, matrix = clusters.readfile(inputFile)
    tree = clusters.hcluster(matrix)
    return labels, columns, tree
Ejemplo n.º 27
0
#### Calling generatefeedvector (page 54) to generate the blogdata file failed. Is it because the URLs in feedlist cannot be opened?
### Generating zebo.txt with downloadzebodata failed as well. sigh
import clusters

# Step 1: load the data; steps 2 and 3 (row clustering) are disabled below.
blognames,words,data = clusters.readfile('blogdatadown.txt')#1
#clust = clusters.hcluster(data)
#print (clust)# Sure enough, the value printed inside the function differs every time as well.
#print(blognames)

#clusters.printclust(clust, labels = blognames)#2

#clusters.drawdendrogram(clust, blognames, jpeg = 'blogclust.jpg')#3

# Step 4: rotate the matrix, cluster it and save the tree labelled with 'words'.
rdata = clusters.rotatematrix(data)#4
wordclust = clusters.hcluster(rdata)
clusters.drawdendrogram(wordclust, labels = words, jpeg = 'wordclust.jpg')
'''
kclust = clusters.kcluster(data, k = 4)#5
print ([blognames[r] for r in kclust[0]])
print ([blognames[r] for r in kclust[1]])

import urllib.request#6
from bs4 import BeautifulSoup
c = urllib.request.urlopen('https://en.wikipedia.org/wiki/Jon_Snow')
soup =  BeautifulSoup(c.read(),"lxml")#这里非常有趣! 感觉有空需要看下这个源代码库呀。
links = soup('a')#所以我还是不懂beautiful soup 的用法呀。
print(links[10])
print(links[10]['href'])
#这一段是教BS的。
Ejemplo n.º 28
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-


import clusters

# Script: cluster './../data/banpaku_utf8.csv' and save the dendrogram.
blognames,words,data=clusters.readfile( './../data/banpaku_utf8.csv' )
clust=clusters.hcluster(data)

# Show the result on the console (CUI)
#clusters.printclust( clust, labels=blognames)

# Show the result as an image
reload(clusters)
clusters.drawdendrogram(clust, blognames, jpeg="banpaku_reg.jpg")
Ejemplo n.º 29
0
def createAsciiDendogram():
    """Cluster 'blogVector.txt' and print the tree via ``clusters.printclust``."""
    labels, _words, matrix = clusters.readfile('blogVector.txt')
    tree = clusters.hcluster(matrix)
    clusters.printclust(tree, labels=labels)
Ejemplo n.º 30
0
def drawDendogram():
    """Cluster 'blogdata.txt' and save the dendrogram as 'blogclust.jpg'."""
    labels, _words, matrix = clusters.readfile('blogdata.txt')
    tree = clusters.hcluster(matrix)
    clusters.drawdendrogram(tree, labels, jpeg='blogclust.jpg')
Ejemplo n.º 31
0
import clusters
# Script: produce a 2-D plot of 'blogdata2.txt' and a dendrogram of its
# rotated matrix.
blogname, words, data = clusters.readfile('blogdata2.txt')
# Scale the data down and draw it, labelled with 'blogname', to 'blog2d.jpg'.
coords = clusters.scaledown(data)
clusters.draw2d(coords, blogname, jpeg='blog2d.jpg')
# Rotate the matrix, cluster it and save the tree labelled with 'words'.
rdata = clusters.rotatematrix(data)
wordclust = clusters.hcluster(rdata)
clusters.drawdendrogram(wordclust, labels=words, jpeg='wordclust.jpg')
Ejemplo n.º 32
0
def draw_dendogram():
    """Cluster 'job_projects' and save the dendrogram as 'jobclust.jpg'.

    The original also carried a disabled call:
    ``clusters.printclust(clust, labels=jobnames)``.
    """
    job_labels, _projects, matrix = clusters.readfile('job_projects')
    tree = clusters.hcluster(matrix)
    clusters.drawdendrogram(tree, job_labels, jpeg='jobclust.jpg')
Ejemplo n.º 33
0
def draw_dendogram():
    """Cluster 'Outputs/blogdata.txt' and save the tree as 'blogclust.jpg'."""
    blog_labels, _words, matrix = clusters.readfile('Outputs/blogdata.txt')
    tree = clusters.hcluster(matrix)
    clusters.drawdendrogram(tree, blog_labels, jpeg='blogclust.jpg')
            except IndexError:
                outfile.close()
                return
        outfile.write('\n')


if __name__ == "__main__":
    # Entry point: build the article/word matrix, cluster it hierarchically,
    # then factorize it with nmf and print the resulting features/articles.
    allw, articlew, artt = getarticlewords()
    wordmatrix, wordvec = makematrix(allw, articlew)

    # print wordvec[0:10]
    # print artt[1]
    # print wordmatrix[1][0:10]

    # hierarchical clustering (dendrogram labelled with artt, saved as news.jpg)
    import clusters
    clust = clusters.hcluster(wordmatrix)
    clusters.drawdendrogram(clust, artt, jpeg='news.jpg')

    # non-negative matrix factorization
    import nmf
    # m1 = np.matrix([[1, 2, 3], [4, 5, 6]])
    # m2 = np.matrix([[1, 2], [3, 4], [5, 6]])
    # w, h = nmf.factorize(m1 * m2, pc = 3, iter = 100)
    # print w * h

    # Wrap the word matrix in np.matrix and factorize it with pc=20, iter=50;
    # the two results feed showfeatures, whose output feeds showarticles.
    v = np.matrix(wordmatrix)
    weights, feats = nmf.factorize(v, pc=20, iter=50)
    topp, pn = showfeatures(weights, feats, artt, wordvec)
    showarticles(artt, topp, pn)
def __main__():
    """Cluster the tag matrix built for "programming" and draw it.

    The tag list comes from ``build_tag_list``, the matrix from
    ``build_tag_matrix``; the dendrogram is labelled with the second value
    returned by ``build_tag_matrix`` and requested as "delicious.jpg".
    """
    programming_tags = build_tag_list("programming")
    _tags, urls, matrix = build_tag_matrix(programming_tags)
    tree = hcluster(matrix)
    draw_dendogram(tree, urls, jpeg="delicious.jpg")
Ejemplo n.º 36
0
import clusters
import json

if __name__ == "__main__":
    # Entry point: cluster 'data/grocery_vectors.txt' with the tanimoto
    # distance, save the dendrogram and dump the tree as JSON.
    docs, words, data = clusters.readfile('data/grocery_vectors.txt')

    clust = clusters.hcluster(data, distance=clusters.tanimoto)
    print('clusters by tanimoto coefficient')
    clusters.drawdendrogram(clust, docs, jpeg='img/groceries_tanimoto.jpg')

    # clusters.jsonify receives the tree and an empty dict; the dict is then
    # serialized (presumably jsonify fills it in place -- confirm).
    json_obj = {}
    clusters.jsonify(clust, json_obj)
    with open("json/tanimoto.json", "w") as output:
        json.dump(json_obj, output)

    # Disabled alternative runs with other distance functions follow.
    #clust=clusters.hcluster(data,distance=clusters.pearson)
    #print('clusters by pearson correlation')
    #clusters.drawdendrogram(clust,docs,jpeg='groceries_pearson.jpg')

    #clust = clusters.hcluster(data, distance=clusters.cosine)
    #print('clusters by cosine similarity')
    #clusters.drawdendrogram(clust,docs,jpeg='groceries_cosine.jpg')

    #clust=clusters.hcluster(data,distance=clusters.euclidean)
    #print('clusters by euclidean distance')
    #clusters.drawdendrogram(clust,docs,jpeg='groceries_euclidean.jpg')
Ejemplo n.º 37
0
Archivo: run.py Proyecto: wz125/courses
def prefer():
    """Reload ``clusters``, then cluster 'zebo.txt' with ``clusters.tanamoto``.

    The dendrogram is drawn with the first value returned by ``readfile``
    as labels and no explicit output file name.
    """
    reload(clusters)
    want_labels, _people, matrix = clusters.readfile('zebo.txt')
    tree = clusters.hcluster(matrix, distance=clusters.tanamoto)
    clusters.drawdendrogram(tree, want_labels)
Ejemplo n.º 38
0
Archivo: run.py Proyecto: wz125/courses
def clustering():
    """Cluster the pickled word matrix and save the tree as 'cluster.jpg'.

    ``readpickle`` supplies five values; only the article titles (``artt``)
    and the word matrix are used here.
    """
    # Parenthesised form prints the same text on Python 2 and is valid
    # syntax on Python 3, unlike the bare print statement.
    print('## Clustering')
    import clusters
    allw, artw, artt, wordmatrix, wordvec = readpickle()
    clust = clusters.hcluster(wordmatrix)
    clusters.drawdendrogram(clust, artt, jpeg='cluster.jpg')
Ejemplo n.º 39
0
def getHCluster(inputFile):
    """Load ``inputFile`` and run hierarchical clustering on its data.

    The result is ``(names, words, tree)``: the first two values from
    ``clusters.readfile`` plus the tree built by ``clusters.hcluster``.
    """
    loaded = clusters.readfile(inputFile)
    names, vocabulary, rows = loaded
    return names, vocabulary, clusters.hcluster(rows)
Ejemplo n.º 40
0
Archivo: run.py Proyecto: wz125/courses
def ColumnClustering():
    """Cluster the rotated 'blogdata1.txt' matrix and save 'wordclust.jpg'.

    ``clusters`` is reloaded first; the tree is labelled with the second
    value returned by ``readfile``.
    """
    reload(clusters)
    _blognames, word_labels, matrix = clusters.readfile('blogdata1.txt')
    rotated = clusters.rotatematrix(matrix)
    tree = clusters.hcluster(rotated)
    clusters.drawdendrogram(tree, labels=word_labels, jpeg='wordclust.jpg')
Ejemplo n.º 41
0
import clusters
# Script: hierarchically cluster the rows of 'tfidf.txt', print the tree and
# save it as a JPEG.  'words' is not used afterwards.
blog,words,data=clusters.readfile('tfidf.txt')
variable = clusters.hcluster(data)

# print ASCII dendrogram (labelled with the first value from readfile)
clusters.printclust(variable, labels=blog)

# save JPEG dendrogram
clusters.drawdendrogram(variable, blog, jpeg='clusterblogtfidf.jpg')
Ejemplo n.º 42
0
Archivo: run.py Proyecto: wz125/courses
def countword():
    """Read 'blogdata1.txt' and run ``clusters.hcluster`` on its data.

    The resulting tree is not returned or used.
    """
    _blognames, _words, matrix = clusters.readfile('blogdata1.txt')
    clusters.hcluster(matrix)
Ejemplo n.º 43
0
def __main__():
    """Build the "programming" tag matrix, cluster it and draw the tree.

    The dendrogram is labelled with the second value returned by
    ``build_tag_matrix`` and requested under the name "delicious.jpg".
    """
    tag_matrix = build_tag_matrix(build_tag_list("programming"))
    _tags, urls, rows = tag_matrix
    hierarchy = hcluster(rows)
    draw_dendogram(hierarchy, urls, jpeg="delicious.jpg")
def __main__():
    """Cluster 'zebo.txt' with ``manhattan_distance`` and draw "ex4dend.jpg".

    The tree is labelled with the first value returned by ``read_file``.
    """
    want_labels, _people, matrix = clusters.read_file('zebo.txt')
    tree = clusters.hcluster(matrix, distance=manhattan_distance)
    clusters.draw_dendogram(tree, want_labels, jpeg="ex4dend.jpg")
Ejemplo n.º 45
0
def __main__():
    """Draw the ``manhattan_distance`` cluster tree of 'zebo.txt'.

    Labels are the first value returned by ``clusters.read_file``; the
    dendrogram is requested under the name "ex4dend.jpg".
    """
    loaded = clusters.read_file('zebo.txt')
    item_names, _people, rows = loaded
    hierarchy = clusters.hcluster(rows, distance=manhattan_distance)
    clusters.draw_dendogram(hierarchy, item_names, jpeg="ex4dend.jpg")
            except IndexError:
                outfile.close()
                return
        outfile.write('\n')


if __name__ == "__main__":
    # Entry point: build the article/word matrix, cluster it hierarchically,
    # then factorize it with nmf and print the resulting features/articles.
    allw, articlew, artt = getarticlewords()
    wordmatrix, wordvec = makematrix(allw, articlew)

    # print wordvec[0:10]
    # print artt[1]
    # print wordmatrix[1][0:10]

    # hierarchical clustering (dendrogram labelled with artt, saved as news.jpg)
    import clusters
    clust = clusters.hcluster(wordmatrix)
    clusters.drawdendrogram(clust, artt, jpeg = 'news.jpg')

    # non-negative matrix factorization
    import nmf
    # m1 = np.matrix([[1, 2, 3], [4, 5, 6]])
    # m2 = np.matrix([[1, 2], [3, 4], [5, 6]])
    # w, h = nmf.factorize(m1 * m2, pc = 3, iter = 100)
    # print w * h

    # Wrap the word matrix in np.matrix and factorize it with pc=20, iter=50;
    # the two results feed showfeatures, whose output feeds showarticles.
    v = np.matrix(wordmatrix)
    weights, feats = nmf.factorize(v, pc = 20, iter = 50)
    topp, pn = showfeatures(weights, feats, artt, wordvec)
    showarticles(artt, topp, pn)
Ejemplo n.º 47
0
def generateAscii():
    """Cluster 'blogdata.txt' and print the tree via ``clusters.printclust``."""
    blog_labels, _words, matrix = clusters.readfile('blogdata.txt')
    tree = clusters.hcluster(matrix)
    clusters.printclust(tree, labels=blog_labels)
def __main__():
    """Draw the cluster tree of 'entrydata.txt' under the name "ex2dend.jpg".

    Labels are the first value returned by ``clusters.read_file``.
    """
    loaded = clusters.read_file('entrydata.txt')
    entry_names, _words, rows = loaded
    hierarchy = clusters.hcluster(rows)
    clusters.draw_dendogram(hierarchy, entry_names, jpeg="ex2dend.jpg")
__author__ = 'feng'

import clusters

# Script: cluster the sessions read from 'session.csv' using
# clusters.session_dissimilarity and save the dendrogram.
sessionIds, data = clusters.readSessionFile('session.csv')
clust = clusters.hcluster(data, distance=clusters.session_dissimilarity)

# Disabled text rendering of the same tree:
#clusters.printclust(clust, labels=sessionIds)
clusters.drawdendrogram(clust, sessionIds, jpeg='sessionclust.jpg')
Ejemplo n.º 50
0
import clusters
# Script: cluster 'blogdataascii.txt' and save the tree as 'blogcluster.jpg'.
# 'words' is not used afterwards.
blognames, words, data = clusters.readfile('blogdataascii.txt')
clust = clusters.hcluster(data)
clusters.drawdendrogram(clust, blognames, jpeg='blogcluster.jpg')
import clusters

# Script, part 1: hierarchical and k-means clustering of 'blogdata.txt'.
# pprint(clusters.readfile('blogdata.txt'))
blognames, words, data = clusters.readfile('blogdata.txt')
# This tree is not used before 'clust' is rebound further down.
clust = clusters.hcluster(data)
# clusters.printclust(clust, labels=blognames)

# k-means (both variants run with k=10; the results are only bound to names)
kclust1 = clusters.kcluster(data, k=10)
# pprint([[blognames[i] for i in kclust[j]] for j in range(10)])
kclust2, clusters_pos = clusters.kcluster_exercise(data, k=10)
# pprint(clusters_pos)

# clusters on preferences
# Part 2: 'data' and 'clust' are rebound to the 'zebo.txt' data set, which is
# clustered with clusters.tanamoto and drawn without an explicit file name.
wants, people, data = clusters.readfile('zebo.txt')
clust = clusters.hcluster(data, distance=clusters.tanamoto)
clusters.drawdendrogram(clust, wants)