def main():
    """Run k-means on the blog data for k = 5, 10 and 20.

    Loads ``blogdata.txt`` with ``clusters.readfile`` and, for each k, prints
    a "K value is <k>" banner and calls ``clusters.kcluster``.  The returned
    clusterings are not used here.
    """
    blognames, words, data = clusters.readfile('blogdata.txt')
    for k in (5, 10, 20):
        # A single-argument print() call emits the same text under both
        # Python 2 and Python 3.
        print("K value is %d" % k)
        clusters.kcluster(data, k=k)
def main():
    """Cluster ``blogdata.txt`` with k-means three times (k = 5, 10, 20).

    A "K value is <k>" line is printed before each ``clusters.kcluster``
    call; the clusterings themselves are discarded.
    """
    blognames, words, data = clusters.readfile('blogdata.txt')
    for k_value in (5, 10, 20):
        # print() with one argument behaves the same on Python 2 and 3.
        print("K value is " + str(k_value))
        clusters.kcluster(data, k=k_value)
def main():
    """Repeat k-means 1001 times and save the clustering with the lowest SSE.

    Each run clusters the rows returned by ``read_file`` into ``best_k``
    groups, drops empty clusters and scores the remainder with ``sse``.
    The best-scoring run is written to ``cluster_results.json`` as one
    ``['<country>', <cluster index>],`` line per country and is then handed
    to ``word_cloud.cloud``.
    """
    data, countries = read_file()

    final_clusters = []
    final_country_clusters = []
    # A run is only kept when its SSE is below this starting value; if no
    # run achieves that, both result lists stay empty.
    best_sse = 5
    for run in range(1001):
        raw_clusters = clus.kcluster(data, distance=func, k=best_k)
        clusters = []
        country_clusters = []
        for i in range(best_k):
            members = raw_clusters[i]
            if len(members) == 0:
                continue
            clusters.append(members)
            # Use a name distinct from the run counter: the original reused
            # `j`, which on Python 2 leaks out of the comprehension and
            # corrupts the progress message below.
            country_clusters.append([countries[idx][1] for idx in members])
        temp_sse = sse(clusters, data)
        print('process: ' + str(run))
        if temp_sse < best_sse:
            best_sse = temp_sse
            final_clusters = clusters
            final_country_clusters = country_clusters

    print('best sse: ' + str(best_sse))
    # The context manager closes the file even if a write fails.
    with open('cluster_results.json', "w") as out:
        for i, names in enumerate(final_country_clusters):
            for country in names:
                out.write("['" + country + "', " + str(i) + "],\n")
    word_cloud.cloud(final_clusters, data)
def kcluster_bisect(clusters, vectors, distance=utils.euclidean, k=4):
    """Split the highest-SSE cluster in two until there are exactly k clusters.

    Args:
        clusters: list of clusters, each a list of indexes into ``vectors``.
            The list passed in loses the first cluster that gets split.
        vectors: the data rows the cluster indexes refer to.
        distance: distance function used for scoring and for the 2-means
            split.
        k: number of clusters at which to stop.

    Returns:
        The list of clusters once its length equals ``k``.
    """
    while len(clusters) != k:
        # Locate the cluster with the largest sum of squared distances to
        # its centroid (the first one wins on ties).
        worst_pos = None
        worst_sse = None
        for pos, members in enumerate(clusters):
            centre = k_means.get_centroid(members, vectors)
            total = 0
            for member in members:
                total += pow(distance(vectors[member], centre), 2)
            if worst_sse is None or total > worst_sse:
                worst_sse, worst_pos = total, pos

        # Remember the chosen cluster's indexes into `vectors`; the 2-means
        # call below only sees positions within the extracted subset.
        parent = list(clusters[worst_pos])
        halves = utils.kcluster(
            [vectors[member] for member in clusters.pop(worst_pos)],
            distance=distance, k=2)
        for half in halves:
            # Translate subset positions back to indexes into `vectors`.
            half[:] = [parent[local] for local in half]
        clusters = clusters + halves
    return clusters
def bisekt(clusters, vectors, distance=utils.euclidean, k=4):
    """Bisecting k-means: split the worst cluster until k clusters exist.

    Args:
        clusters: list of clusters, each a list of indexes into ``vectors``.
            The chosen cluster is popped from this list.
        vectors: the data rows the cluster indexes refer to.
        distance: distance function used for scoring and splitting.
        k: number of clusters at which to stop.

    Returns:
        The list of clusters once its length equals ``k``.
    """
    if len(clusters) == k:
        return clusters

    # Pick the cluster with the highest sum of squared distances to its
    # centroid (the first one wins on ties).
    my_sse = None
    indi_clus = None
    for i, cluster in enumerate(clusters):
        centroid = kmeans.get_centroid(cluster, vectors)
        score = 0
        for country in cluster:
            score += pow(distance(vectors[country], centroid), 2)
        if my_sse is None or score > my_sse:
            my_sse = score
            indi_clus = i

    # Keep the chosen cluster's indexes into `vectors`; the split below
    # returns positions within the extracted subset only.
    indi_orig = list(clusters[indi_clus])
    newclus = utils.kcluster(
        [vectors[index] for index in clusters.pop(indi_clus)],
        distance=distance, k=2)
    for cluster in newclus:
        for i in range(len(cluster)):
            cluster[i] = indi_orig[cluster[i]]

    # Recurse with the caller's own k.  The original passed the global
    # `clusternum` here, silently ignoring the `k` argument after the
    # first split.
    return bisekt(clusters + newclus, vectors, distance=distance, k=k)
def kmeans(x):
    """Cluster the rows of ``job_projects`` into x groups and name the members.

    Args:
        x: number of clusters requested from ``clusters.kcluster``.

    Returns:
        A list of x lists; the i-th list holds the job names whose row
        indexes appear in the i-th entry of kcluster's second return value.
    """
    jobnames, projects, data = clusters.readfile('job_projects')
    # kcluster's result is unpacked as a pair; only its second item is used.
    _, members_by_cluster = clusters.kcluster(data, k=x)
    return [[jobnames[row] for row in members_by_cluster[i]]
            for i in range(x)]
def getKmeans():
    """Print the blog names in every k-means cluster for k = 5, 10 and 20.

    For each k a "K value is <k>" banner is printed, followed by one
    tab-indented line per centroid listing the names of the blogs assigned
    to it.
    """
    blognames, words, data = clusters.readfile("blogdata.txt")
    for k in (5, 10, 20):
        # Single-argument print() is identical on Python 2 and Python 3.
        print("K value is %d" % k)
        kclust = clusters.kcluster(data, k=k)
        for i in range(k):
            # Blog names assigned to the (i + 1)-th centroid.
            print("\t\t" + str([blognames[r] for r in kclust[i]]))
def getKmeans():
    """Cluster ``blogdata.txt`` for k = 5, 10, 20 and print each cluster.

    Every k gets a "K value is <k>" header and then one tab-indented line
    per cluster with the names of its member blogs.
    """
    blognames, words, data = clusters.readfile('blogdata.txt')
    for k_value in (5, 10, 20):
        # One-argument print() calls produce the same output on Python 2
        # and Python 3.
        print("K value is " + str(k_value))
        kclust = clusters.kcluster(data, k=k_value)
        for cluster_no in range(k_value):
            names = [blognames[r] for r in kclust[cluster_no]]
            print("\t\t" + str(names))
def k5():
    """Run k-means with k=5 on the module-level ``data`` and print the result.

    ``clusters.kcluster`` is unpacked as ``(clusters, t)``; ``t`` is
    reported as the iteration count and each member of each cluster is
    printed with its centroid number.
    """
    (kcluster, t) = clusters.kcluster(data, k=5)
    print("Iteration for k=5 is: " + str(t))
    for k in range(5):
        for r in kcluster[k]:
            print("Centroid " + str(k) + ": " + blognames[r])
    # NOTE(review): the flattened source does not show whether this blank
    # output belongs after every centroid or once at the end -- confirm.
    print("\n")
def k5():
    """Cluster the module-level ``data`` into 5 groups and list the members.

    Prints the second value returned by ``clusters.kcluster`` as the
    iteration count, then one "Centroid <n>: <blog name>" line per member.
    """
    (kcluster, t) = clusters.kcluster(data, k=5)
    print("Iteration for k=5 is: " + str(t))
    for centroid_no in range(5):
        for row in kcluster[centroid_no]:
            print("Centroid " + str(centroid_no) + ": " + blognames[row])
    # NOTE(review): original nesting of this trailing print is not
    # recoverable from the flattened source (per centroid vs. once) --
    # confirm against the intended output.
    print("\n")
def compute(k):
    """Run k-means on the module-level ``data`` and print every cluster.

    Args:
        k: number of clusters passed to ``clusters.kcluster``.

    The call's result is unpacked as ``(iterations, clusters)``.  Clusters
    are numbered from 1 and their member blog names are tab-indented.
    """
    interations, kclust = clusters.kcluster(data, k=k)
    print("k:{}".format(k))
    print("interations:{}".format(interations))
    for count, cluster in enumerate(kclust, 1):
        print("cluster:{}".format(count))
        for c in cluster:
            print("\t" + blognames[c])
    # NOTE(review): placement of this trailing separator (per cluster vs.
    # once per call) cannot be recovered from the flattened source.
    print("\n\n")
def initialize_cluster():
    """Cluster the stored ratings and persist the movie names and clusters.

    Loads preferences via ``get_data_from_db``, flips them with
    ``transformPrefs``, converts them to arrays with
    ``dic_to_arr_cluster``, runs ``clusters.kcluster`` with its default
    settings and stores both the movie-name array and the clustering.
    """
    prefs = get_data_from_db()
    # Take the user ids before transforming, as the original did.
    user_ids = prefs.keys()
    flipped = transformPrefs(prefs)
    movie_names, user_ids, ratings = dic_to_arr_cluster(flipped, user_ids)
    clustering = clusters.kcluster(ratings)
    store_name_arr_for_cluster(movie_names)
    store_cluster_result(clustering)
def bisect(data, number):
    """Bisecting k-means: repeatedly split the cluster with the largest SSE.

    Args:
        data: list of numeric row vectors.
        number: desired number of clusters.  The first split always yields
            two clusters, so a request for 1 also returns two; 0 or a
            negative value returns two empty lists.

    Returns:
        ``(cluster_list, centers_list)``: the clusters as lists of row
        indexes into ``data`` and the centroid recorded for each cluster.
    """
    cluster_list = []
    centers_list = []
    count = 0
    # "<" instead of "!=": the first split moves count from 0 straight to
    # 2, so with "!=" a request for 1 (or a negative number) looped forever.
    while count < number:
        if count == 0:
            cluster, centroid = clusters.kcluster(
                data, distance=clusters.euclidean, k=2)
            cluster_list.extend([cluster[0], cluster[1]])
            centers_list.extend([centroid[0], centroid[1]])
            count += 2
            continue

        # Find the cluster with the largest sum of squared errors.
        max_error = 0
        index = 0
        for f, (members, center) in enumerate(zip(cluster_list, centers_list)):
            error = 0
            for row in members:
                for m in range(len(center)):
                    error += (data[row][m] - center[m]) ** 2
            if max_error < error:
                max_error = error
                index = f

        # Remove it and split its rows in two.
        parent = cluster_list.pop(index)
        centers_list.pop(index)
        new_data = [data[row] for row in parent]
        cluster, centroid = clusters.kcluster(
            new_data, distance=clusters.euclidean, k=2)
        for part in cluster:
            # kcluster indexes into new_data; map back to rows of `data`.
            cluster_list.append([parent[local] for local in part])
        centers_list.append(centroid[0])
        centers_list.append(centroid[1])
        count += 1
    return cluster_list, centers_list
def view_clusters(request, template_name="babymaker/clusters.html"):
    """Render the k-means clusters of the requesting user's bookmarks.

    When the query string carries a truthy ``k``, the user's bookmark word
    vectors are clustered into ``k`` groups and converted for display with
    ``render_kclusters``; otherwise the template receives the empty
    defaults set below.
    """
    user = request.user
    kclusters = []
    k_value = 0
    cluster_list = None
    if request.GET.get('k', None):
        k_value = int(request.GET.get('k'))
        bookmarks, word_vectors = Bookmark.get_word_vectors_for_user(user)
        kclusters = clusters.kcluster(word_vectors, k=k_value)
        cluster_list = render_kclusters(kclusters, bookmarks)
    # locals() is passed as the template context, so every local name in
    # this function (including the parameters) is part of what the
    # template can see -- do not rename or remove locals casually.
    return render(request, template_name, locals())
def initialize_cluster():
    """Build the rating clusters from the database and store the results.

    Steps: read preferences (``get_data_from_db``), transform them
    (``transformPrefs``), turn them into arrays (``dic_to_arr_cluster``),
    cluster the rating arrays with ``clusters.kcluster`` defaults, then
    store the movie names followed by the clustering.
    """
    prefs = get_data_from_db()
    # The key view/list is captured before the transform, as before.
    id_arr = prefs.keys()
    reversed_prefs = transformPrefs(prefs)
    names_arr, id_arr, ratings_arr = dic_to_arr_cluster(reversed_prefs, id_arr)
    result = clusters.kcluster(ratings_arr)
    store_name_arr_for_cluster(names_arr)
    store_cluster_result(result)
def test_metrics(data):
    """Plot the SSE obtained with each distance metric at ``best_k`` clusters.

    For every metric name the matching function is looked up on ``clus``,
    the data is clustered, the result is passed through ``eliminate`` and
    scored with ``sse``.  The chart is saved as ``metrics.png`` and shown.
    """
    metrics = ['manhattan', 'euclidean', 'cosine', 'pearson', 'tanimoto']
    sse_metrics = []
    for name in metrics:
        # Each metric name is also the attribute name on `clus`.
        distance = getattr(clus, name)
        kept = eliminate(clus.kcluster(data, distance=distance, k=best_k))
        sse_metrics.append(sse(kept, data))

    fig, ax = plt.subplots()
    ax.plot(metrics, sse_metrics)
    ax.set(xlabel='metrics', ylabel='sse', title='measure distance metrics')
    fig.savefig("metrics.png")
    plt.show()
def createKMeansClusters(kValue):
    """Print the non-empty k-means clusters of ``blogVector.txt``.

    Args:
        kValue: number of clusters; nothing happens unless it is positive.

    Each non-empty cluster is announced as ``cluster <n>`` followed by one
    ``... <blog name>`` line per member.
    """
    if kValue > 0:
        blognames, words, data = clusters.readfile('blogVector.txt')
        kclust = clusters.kcluster(data, k=kValue)
        # NOTE(review): the flattened source does not show whether the
        # counter advanced for every cluster or only for non-empty ones;
        # this numbers every cluster -- confirm.
        for count, cluster in enumerate(kclust):
            if len(cluster) > 0:
                # String concatenation keeps the Python 2 output
                # ("cluster 0") while also running on Python 3.
                print('cluster ' + str(count))
                for instance in cluster:
                    print('... ' + str(blognames[instance]))
def main():
    """Cluster blogs on their TF-IDF vectors and write the assignment.

    Loads the term-document matrix from ``out/tdf.txt``, computes IDF and
    the TF-IDF matrix, runs k-means with 15 clusters using
    ``compute_similarity_normalized`` as the distance, and writes one
    ``<cluster index>\\t<row name>`` line per blog to ``out/blog_clus.txt``.
    """
    print("Loading term document matrix")
    rownames, colnames, counts = load_tdf("out/tdf.txt")
    print("Computing IDF")
    idf = compute_idf(counts)
    print("Computing TF-IDF matrix")
    tfidf = compute_tf_idf_matrix(counts, idf)
    n_clus = 15
    print("Computing " + str(n_clus) + " clusters")
    blog_clus = clusters.kcluster(
        tfidf, distance=compute_similarity_normalized, k=n_clus)
    # The context manager closes the file even when a write raises.
    with open("out/blog_clus.txt", 'w') as f:
        for i, members in enumerate(blog_clus):
            for row in members:
                f.write(str(i) + "\t" + rownames[row] + "\n")
def kmeans():
    """Run k-means for k = 5, 10, 20 and write each result to its own file.

    For every k the clustering and its length are printed, and
    ``Outputs/kclust_<k>.txt`` receives the iteration count followed by
    one bracketed, comma-separated list of blog names per cluster.
    """
    karr = [5, 10, 20]
    blogs, colnames, data = clusters.readfile('Outputs/blogdata.txt')
    for i in karr:
        kclust, itercount = clusters.kcluster(data, k=i)
        print(kclust)
        # The original never closed these files; `with` guarantees the
        # data is flushed and the handle released for every k.
        with open("Outputs/kclust_%d.txt" % i, 'w') as f:
            f.write("Iteration count: %d \n" % itercount)
            print(len(kclust))
            for cluster in kclust:
                f.write("****************************\n")
                f.write("[")
                for blogid in cluster:
                    f.write(blogs[blogid] + ", ")
                f.write("]\n")
def elbow_method(data):
    """Plot SSE against k for k = 1..24 to locate the elbow.

    Each k is clustered with ``clus.kcluster`` using the module-level
    ``func`` distance, passed through ``eliminate`` and scored with
    ``sse``.  The chart is saved as ``test.png`` and shown.
    """
    # Data for plotting
    x_k = list(range(1, 25))
    y_sse = []
    for k in x_k:
        kept = eliminate(clus.kcluster(data, distance=func, k=k))
        y_sse.append(sse(kept, data))

    fig, ax = plt.subplots()
    ax.plot(x_k, y_sse)
    ax.set(xlabel='k', ylabel='sse', title='elbow chart')
    fig.savefig("test.png")
    plt.show()
def kclusterer(self):
    """Run k-means with the K typed into ``t2`` and list clusters in ``t3``.

    Sets ``self.clst`` to 2.  Nothing else happens when ``self.data`` is
    empty.  Otherwise ``t3`` is cleared and, if ``t2`` holds text, one
    ``{name,name,...}`` line per cluster is inserted; an empty ``t2``
    raises the "Error 202" dialog instead.
    """
    self.clst = 2
    if len(self.data) == 0:
        return

    self.t3.get("1.0", END)
    self.t3.delete("1.0", END)
    names = self.authors
    val = self.t2.get("1.0", "end-1c")
    if len(val) == 0:
        showerror("Error 202", "Please input a value for K")
        return

    val = int(val)
    kclust = clusters.kcluster(self.data, k=val)
    for i in range(val):
        members = [names[r] for r in kclust[i]]
        # Every name, including the last, is followed by a comma.
        fin = "".join(name + "," for name in members)
        outpt = "{%s}" % fin + "\n"
        self.t3.insert(END, outpt)
def kMean():
    """Run k-means for k = 5, 10, 20 and write a numbered report per k.

    For every k the clustering and its length are printed, and
    ``kclust_<k>.txt`` receives the iteration count followed by one
    section per cluster with its members numbered from 1.
    """
    kMeanValues = [5, 10, 20]
    blogs, colnames, data = clusters.readfile('blogdata.txt')
    for k in kMeanValues:
        kclust, itercount = clusters.kcluster(data, k=k)
        print(kclust)
        # The original left every report file open; `with` closes each one.
        with open("kclust_%d.txt" % k, 'w') as f:
            f.write("Total Number Of Iterations: %d \n" % itercount)
            print(len(kclust))
            for clusterCount, cluster in enumerate(kclust, 1):
                f.write("---\n")
                f.write("Cluster %d \n" % clusterCount)
                # A separate name for the member counter: the original
                # reused the outer loop variable `i` for it.
                for position, blogid in enumerate(cluster, 1):
                    f.write(str(position) + ".\t" + blogs[blogid] + "\n")
                f.write("\n")
def main(nameof_file, output_f):
    """Cluster the countries in ``nameof_file`` and report the result.

    Args:
        nameof_file: input path handed to ``readfile``.
        output_f: output target handed to ``postdo``.

    Prints the parsed input, each non-empty cluster and the SSE of the
    non-empty clusters, then calls ``postdo`` with the clusters expressed
    through ``countries[r][1]`` and ``make_clust`` with all clusters.
    """
    countries, vectors = readfile(nameof_file)
    print(countries)
    print(vectors)
    clusters = utils.kcluster(vectors, distance=choice_dist, k=clusternum)

    chcked_clustr = []
    final_clustr = []
    for position in range(clusternum):
        members = clusters[position]
        if len(members) != 0:
            chcked_clustr.append(members)
            print('cluster {}:'.format(position + 1))
            print([countries[r] for r in members])
            final_clustr.append([countries[r][1] for r in members])

    print("SSE: " + str(sse(chcked_clustr, vectors)))
    postdo(final_clustr, output_f)
    make_clust(clusters, vectors)
def main(input_f, output_f):
    """Run k-means on the countries in ``input_f`` and write the clusters.

    Args:
        input_f: input path handed to ``read_file``.
        output_f: output target handed to ``write_output``.

    Empty clusters are skipped for printing, for the SSE and for the
    written output; ``make_word_clouds`` still receives every cluster.
    """
    countries, vectors = read_file(input_f)
    print(countries)
    print(vectors)
    clusters = utils.kcluster(vectors, distance=distance_function,
                              k=num_clusters)

    non_empty = []   # clusters with at least one member
    by_name = []     # same clusters, as countries[r][1] instead of indexes
    for position in range(num_clusters):
        members = clusters[position]
        if len(members) != 0:
            non_empty.append(members)
            print('cluster {}:'.format(position + 1))
            print([countries[r] for r in members])
            by_name.append([countries[r][1] for r in members])

    print("SSE: " + str(sse(non_empty, vectors)))
    write_output(by_name, output_f)
    make_word_clouds(clusters, vectors)
def bisect(clusters, values, k=7):
    """Bisecting k-means with cosine distance.

    Repeatedly splits the cluster with the largest sum of squared cosine
    distances to its centroid until at least ``k`` clusters exist.

    Args:
        clusters: list of clusters, each a list of indexes into ``values``.
            The chosen cluster is popped from this list.
        values: the data rows the cluster indexes refer to.
        k: target number of clusters (defaults to 7, the value that was
            previously hard-coded).

    Returns:
        The list of clusters once it holds ``k`` or more entries.
    """
    # ">=" rather than "==": input that already holds more than k clusters
    # is returned as-is instead of recursing without end.
    if len(clusters) >= k:
        return clusters

    sse = None
    cluster_index = None
    for i, cluster in enumerate(clusters):
        centroid = get_centroid(cluster, values)
        current_sse = 0
        for country in cluster:
            current_sse += pow(c.cosine(values[country], centroid), 2)
        if sse is None or current_sse > sse:
            sse = current_sse
            cluster_index = i

    # Keep the chosen cluster's indexes into `values`; the split below
    # returns positions within the extracted subset only.
    initial = list(clusters[cluster_index])
    new_clusters = c.kcluster(
        [values[index] for index in clusters.pop(cluster_index)],
        distance=c.cosine, k=2)
    for cluster in new_clusters:
        for i in range(len(cluster)):
            cluster[i] = initial[cluster[i]]
    return bisect(clusters + new_clusters, values, k=k)
def clustering_button(self):
    """Cluster ``Will_be_Cluestered.txt`` and list the result in the UI.

    Shows the error message and returns when ``database`` is empty.
    Otherwise the radio-button value selects the clustering type:

    * "Hierarcial": the text from ``clusters.clust2str`` is inserted line
      by line into ``All_Results_Part``.
    * "K-Means": clusters are sorted by size, largest first, and each is
      inserted as ``Cluster <n>:{<names> }``.
    """
    if len(database) == 0:
        self.Error_Message_Function()
        return

    prof_names, words, data = clusters.readfile("Will_be_Cluestered.txt")
    # Determine the type of clustering from the radio-button dictionary.
    type_of_clustering = values_of_clustering[int(self.Radio_Values3.get())]

    if type_of_clustering == "Hierarcial":
        clust = clusters.hcluster(data)
        self.All_Results_Part.delete(0, END)
        # Render the tree once instead of on every loop iteration.  The
        # piece after the final newline is an empty string, so skip it.
        rendered = clusters.clust2str(clust, labels=prof_names).split('\n')
        for line in rendered[:-1]:
            self.All_Results_Part.insert(END, line)
    elif type_of_clustering == "K-Means":
        # k comes from the entry widget.
        clust = clusters.kcluster(data, k=int(self.Value_of_k.get()))
        # list() keeps the keys indexable on Python 3 as well.
        # NOTE(review): presumably the key order of `database` matches the
        # row order of the clustered file -- confirm.
        prof_names = list(database.keys())
        sized = [(len(members), members) for members in clust]
        sized.sort(reverse=True)
        self.All_Results_Part.delete(0, END)
        for counter, (_, members) in enumerate(sized):
            # Look up the cluster's own members.  The original iterated
            # range(len(members)), which listed the first N names for
            # every cluster regardless of membership.
            names = [prof_names[k] for k in members]
            new_str = "".join(str(name) + " " for name in names)
            self.All_Results_Part.insert(
                END,
                "Cluster %d:{" % (counter + 1) + new_str + "}" + "\n")
def main(input_f):
    """Plot SSE against k for every k in the module-level ``range_clusters``.

    Args:
        input_f: input path handed to ``k_means.read_file``.

    For each k the data is clustered, a progress percentage (relative to
    50) is printed, and the SSE of the non-empty clusters is recorded.
    """
    countries, vectors = k_means.read_file(input_f)
    k_arr = []
    sse_arr = []
    for num in range_clusters:
        clusters = utils.kcluster(vectors, distance=k_means.distance_function,
                                  k=num)
        print(str(round((float(num) / 50) * 100)) + "%")
        # Empty clusters are excluded from the SSE.
        proper_clusters = [clusters[i] for i in range(num)
                           if len(clusters[i]) != 0]
        k_arr.append(num)
        sse_arr.append(k_means.sse(proper_clusters, vectors))

    plt.plot(k_arr, sse_arr)
    plt.ylabel("SSE")
    plt.xlabel("k")
    plt.xticks(np.arange(0, 50, 2))
    plt.show()
import clusters
import data_processing

# Import dataset
data = data_processing.open_csv_file('dataset.csv')

# Create a list of countries in the order of the similarity matrix
countries_list = data_processing.get_country_names(data)

# Create numerical attributes matrix
attr_matrix = data_processing.create_attribute_matrix(data)
data_processing.str_to_float(attr_matrix)

results = [['Country/Region', 'Cluster']]

# k-means clustering: euclidean distance
num_cluster = 5
resulting_clusters, centroids = clusters.kcluster(
    attr_matrix, distance=clusters.euclidean, k=num_cluster)
print('clusters by euclidean distance')
for i in range(num_cluster):
    print('cluster {}:'.format(i + 1))
    print([countries_list[r] for r in resulting_clusters[i]])
    print([r for r in resulting_clusters[i]])
# NOTE(review): the flattened source does not show whether this separator
# was printed after every cluster or once after the loop -- confirm.
print()

# k-means clustering: tanimoto coefficient
resulting_clusters, centroids = clusters.kcluster(
    attr_matrix, distance=clusters.tanimoto, k=num_cluster)
print('clusters by tanimoto coefficient')
for i in range(num_cluster):
    print('cluster {}:'.format(i + 1))
    print([countries_list[r] for r in resulting_clusters[i]])
def getKClusterRotated(inputFile, k):
    """Run k-means on the rotated data matrix read from ``inputFile``.

    Args:
        inputFile: path handed to ``clusters.readfile``.
        k: number of clusters.

    Returns:
        ``(blognames, words, converted)`` where ``converted`` is the
        result of ``getNumbersToString(words, clustering)``.
    """
    blognames, words, data = clusters.readfile(inputFile)
    rotated = clusters.rotatematrix(data)
    clustering = clusters.kcluster(rotated, k=k)
    return blognames, words, getNumbersToString(words, clustering)
import clusters # pprint(clusters.readfile('blogdata.txt')) blognames, words, data = clusters.readfile('blogdata.txt') clust = clusters.hcluster(data) # clusters.printclust(clust, labels=blognames) # k-means kclust1 = clusters.kcluster(data, k=10) # pprint([[blognames[i] for i in kclust[j]] for j in range(10)]) kclust2, clusters_pos = clusters.kcluster_exercise(data, k=10) # pprint(clusters_pos) # clusters on preferences wants, people, data = clusters.readfile('zebo.txt') clust = clusters.hcluster(data, distance=clusters.tanamoto) clusters.drawdendrogram(clust, wants)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import clusters

blognames, words, data = clusters.readfile('./../data/feed_list.csv')

# Cluster the feeds for k = 1..9 and print the blog names of each cluster.
for k in range(1, 10):
    kclust = clusters.kcluster(data, blognames, clusters.pearson, k)
    for k_id in range(len(kclust)):
        # print() with one argument works on both Python 2 and Python 3.
        print([blognames[r] for r in kclust[k_id]])
def kmean():
    """Reload ``clusters`` and run a 2-cluster k-means on ``blogdata.txt``.

    The row names of both clusters are looked up but the resulting lists
    are discarded; the function returns None.
    """
    reload(clusters)
    rownames, words, data = clusters.readfile('blogdata.txt')
    kclust = clusters.kcluster(data, k=2)
    for position in (0, 1):
        # Evaluated for its lookups only; the list itself is thrown away.
        [rownames[r] for r in kclust[position]]
for i in range(len(v1)): d += (v1[i] - v2[i])**2 return math.sqrt(d) #getBlogs() #main() blognames, words, data = clusters.readfile('similarblogdata.txt') print(blognames) print(words) print(data) for i in range(len(data[1:])): if len(data[i + 1]) != len(data[i]): print(blognames[i + 1]) print(len(data[i + 1])) print(blognames[i]) print(len(data[i])) clust = clusters.hcluster(data) clusters.printclust(clust, labels=blognames) clusters.drawdendrogram(clust, blognames, jpeg='sblogclust.jpg') kclust = clusters.kcluster(data, k=5) printkclustValues(kclust) kclust = clusters.kcluster(data, k=10) printkclustValues(kclust) kclust = clusters.kcluster(data, k=20) printkclustValues(kclust) coords = clusters.scaledown(data) clusters.draw2d(coords, blognames, jpeg='sblogs2d.jpg')
elif name == "U.S.A.": line += '{"Country": "' + "United States" + '", "Cluster": ' + str(group) + '}' elif name == "Slovak Rep": line += '{"Country": "' + "Slovakia" + '", "Cluster": ' + str(group) + '}' else: line += '{"Country": "' + name + '", "Cluster": ' + str(group) + '}' return line row_names, column_names, data = clusters.readfile('dataset_vectors.txt') # rdata = clusters.rotatematrix(data) num_clusters = 6 print('Grouping countries into {} clusters:'.format(num_clusters)) # print(rdata) print() clust, centers = clusters.kcluster(data, distance=clusters.pearson, k=num_clusters) print('clusters by pearson correlation') for i in range(num_clusters): print("cluster {}".format(i+1)) print([row_names[r] for r in clust[i]]) print("The SSE Error of pearson correlation is: ", sse_error(data, centers, clust, num_clusters)) print() clust, centers = clusters.kcluster(data, distance=clusters.euclidean, k=num_clusters) print('clusters by euclidean distance') for i in range(num_clusters): print("cluster {}".format(i+1)) print([row_names[r] for r in clust[i]]) print("The SSE Error of euclidean is: ", sse_error(data, centers, clust, num_clusters)) print()
def draw2d(data, labels, jpeg='mds2d.jpg'):
    """Draw each label at its 2-D coordinates and save the image as JPEG.

    Args:
        data: sequence of coordinate pairs; item ``i`` places ``labels[i]``.
        labels: text drawn for each point.
        jpeg: output file name.

    Coordinates are shifted by 0.5 and scaled by 1000 onto a 2000x2000
    white canvas; the text is drawn in black.
    """
    img = Image.new('RGB', (2000, 2000), (255, 255, 255))
    draw = ImageDraw.Draw(img)
    for i, point in enumerate(data):
        position = ((point[0] + 0.5) * 1000, (point[1] + 0.5) * 1000)
        draw.text(position, labels[i], (0, 0, 0))
    img.save(jpeg, 'JPEG')


if __name__ == "__main__":
    # Steps added from the slides.
    import clusters
    blognames, words, data = clusters.readfile('blogdata1.txt')

    # Calculating the k-means clustering (code from the slides).
    print("K=5")
    kclust = clusters.kcluster(data, k=5)
    print("\n")
    print("K=10")
    kclust = clusters.kcluster(data, k=10)
    print("\n")
    print("K=20")
    kclust = clusters.kcluster(data, k=20)
    print("\n")
# -*- coding: utf-8 -*- import clusters if __name__ == '__main__': blognames, terms, data = clusters.readfile('blog_term_matrix.csv') k = 5 print("K-Means with k=%s" % k) clust = clusters.kcluster(data, k=k) # Clust is an array containing some centroid # Centroid is an array containing some blog-id # Example output of clust = [[2, 6, 10, 11, 13, 14, 17, 27, 29, 40, 43, 44, 49, 54, 55, 61, 62, 65, 66, 67, 70, 71, 73, 85], [8, 22, 41, 47, 59, 93], [25, 34, 38, 84], [21, 39, 68, 74, 80], [18, 24, 58, 78, 82, 94, 98], [4, 15, 16, 28, 31, 33, 35, 53, 77, 79, 86], [3, 9, 23, 26, 36, 37, 48, 52, 56, 57, 60, 87, 88, 95], [5, 12, 20, 30, 45, 46, 50, 51, 63, 64, 72, 75, 76, 89, 90, 91, 92, 97], [19, 69, 81, 83], [0, 1, 7, 32, 42, 96, 99]] for centroid_idx in range(0,len(clust)): centroid = clust[centroid_idx] centroid_blognames = [] for idx in range(0, len(centroid)): blog_id = centroid[idx] blog_name = blognames[blog_id] centroid_blognames.append(blog_name) print("Blognames in centroid-{} = {}\n".format(centroid_idx+1, centroid_blognames)) k = 10 print("K-Means with k=%s" % k) clust = clusters.kcluster(data, k=k) for centroid_idx in range(0,len(clust)): centroid = clust[centroid_idx] centroid_blognames = []
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import clusters

blognames, words, data = clusters.readfile('./../data/feed_list.csv')

# For every k from 1 to 9, cluster the feeds and print the blog names
# belonging to each cluster.
for k in range(1, 10):
    kclust = clusters.kcluster(data, blognames, clusters.pearson, k)
    for k_id in range(len(kclust)):
        # A one-argument print() call runs on Python 2 and Python 3 alike.
        print([blognames[r] for r in kclust[k_id]])
# Command line: -f <blog data file> -k <k1> [<k2> ...]
parser = argparse.ArgumentParser(description='Get file name and extension')
parser.add_argument('-f',action='store',dest='text_file',nargs=1,help='Blog data text file')
parser.add_argument('-k',action='store',dest='number_of_clusters',nargs='+',help='Number of Clusters')
# Loop bound derived from the raw argument count.
# NOTE(review): presumably this is the number of values given to -k
# (program name, -f, file and -k account for the 4) -- confirm.
a = len(sys.argv) - 4
print a
# NOTE(review): `i` is not initialised in the visible code; it is assumed
# to be set before this point.
while i < a:
    args = parser.parse_args()
    if args.text_file:
        text_file = (args.text_file[0])
    # NOTE(review): this tests args.text_file again although the body
    # reads args.number_of_clusters -- looks like a copy-paste slip.
    if args.text_file:
        number_of_clusters = int(args.number_of_clusters[i])
    blognames,words,data=clusters.readfile(text_file)
    # Test input
    # User-defined input
    print "For k= " + str(args.number_of_clusters[i])
    kclust=clusters.kcluster(data, k= number_of_clusters)
    print
    print "Clusters"
    # One separator line and one list of blog names per cluster.
    for j in range(number_of_clusters):
        print '-----------------------------------------'
        print [blognames[r] for r in kclust[j]]
    i= i + 1
#!/usr/bin/python
import clusters

blognames, words, data = clusters.readfile('blogdata1.txt')
clust = clusters.hcluster(data)

# Question 2: hierarchical clustering as text and as a dendrogram image.
clusters.printclust(clust, labels=blognames)
clusters.drawdendrogram(clust, blognames, jpeg='dengrogram.jpg')

# Question 3: k-means for three values of k.  One-argument print() calls
# produce the same output on Python 2 and Python 3.
print("K = 5")
kclust5 = clusters.kcluster(data, k=5)
print("\nK = 10")
kclust10 = clusters.kcluster(data, k=10)
print("\nK = 20")
kclust20 = clusters.kcluster(data, k=20)

# Question 4: scale the data down to 2-D and draw it.
coords = clusters.scaledown(data)
clusters.draw2d(coords, blognames, jpeg='MDS.jpg')
#!/usr/bin/python
import clusters

blognames, words, data = clusters.readfile('blogdata1.txt')
clust = clusters.hcluster(data)

# Question 2: print the hierarchical clustering and draw its dendrogram.
clusters.printclust(clust, labels=blognames)
clusters.drawdendrogram(clust, blognames, jpeg='dengrogram.jpg')

# Question 3: k-means with k = 5, 10 and 20.  print() is called with a
# single argument so the script runs unchanged on Python 2 and Python 3.
print("K = 5")
kclust5 = clusters.kcluster(data, k=5)
print("\nK = 10")
kclust10 = clusters.kcluster(data, k=10)
print("\nK = 20")
kclust20 = clusters.kcluster(data, k=20)

# Question 4: 2-D projection of the data, saved as an image.
coords = clusters.scaledown(data)
clusters.draw2d(coords, blognames, jpeg='MDS.jpg')
vecsum = zeros(len(data[0])) for l in list: vecsum = add(data[l],vecsum) topwrds = sorted(range(len(vecsum)), key=lambda x: vecsum[x])[-5:] for r in topwrds: print vecsum[r] print "The top words of this cluster are: \n" for r in topwrds: print words[r] moviename, words, data = clusters.readfile('res/blogdata2.txt') print 'Processing......' kclust = clusters.kcluster( data, k = 5) print "\t\t******* CLUSTER 1 *******" printwords( kclust[0], data, words) print '\n' print [moviename[r] for r in kclust[0]] print '\n\n\n' print "\t\t******* CLUSTER 2 *******" printwords( kclust[1], data, words) print '\n' print [moviename[r] for r in kclust[1]] print '\n\n\n'
import clusters

blognames, words, data = clusters.readfile('blogdata.txt')
kcl = clusters.kcluster(data, k=20)

# Print the blog names of each of the 20 clusters, one list per line.
# (The original marked clusters 0-4, 0-9 and 0-19 as the ranges to keep
# when running with k = 5, 10 and 20 respectively.)
for cluster_no in range(20):
    print([blognames[r] for r in kcl[cluster_no]])
import clusters

docs, words, data = clusters.readfile('titles_vectors.txt')
# Cluster the rotated matrix; the resulting indexes are looked up in
# `words` below, so presumably each rotated row corresponds to one word.
rdata = clusters.rotatematrix(data)
num_clusters = 2
print('Grouping words into {} clusters:'.format(num_clusters))
print()

# NOTE(review): the flattened source does not show whether the blank
# print() of each section ran per cluster or once after the loop; it is
# placed after the loop here -- confirm.
clust = clusters.kcluster(rdata, distance=clusters.pearson, k=num_clusters)
print('clusters by pearson correlation')
for i in range(num_clusters):
    print("cluster {}".format(i + 1))
    # words[r], matching the other sections.  The original used
    # words[r - 1] here only, which shifts every word by one and maps
    # index 0 to the last word.
    print([words[r] for r in clust[i]])
print()

clust = clusters.kcluster(rdata, distance=clusters.tanimoto, k=num_clusters)
print('clusters by tanimoto coefficient')
for i in range(num_clusters):
    print("cluster {}".format(i + 1))
    print([words[r] for r in clust[i]])
print()

clust = clusters.kcluster(rdata, distance=clusters.euclidean, k=num_clusters)
print('clusters by euclidean distance')
for i in range(num_clusters):
    print("cluster {}".format(i + 1))
    print([words[r] for r in clust[i]])
print()

clust = clusters.kcluster(rdata, distance=clusters.cosine, k=num_clusters)
print('clusters by cosine distance')
import clusters

blogs, words, data = clusters.readfile('blogdata.txt')
kclust = clusters.kcluster(data, k=5)

# Print the blog names of each of the five clusters, one list per line.
for cluster_no in range(5):
    print([blogs[item] for item in kclust[cluster_no]])
import clusters

blognames, words, data = clusters.readfile('blogdata.txt')

# One-argument print() calls keep the output identical on Python 2 while
# also running on Python 3.
print('--------------------')
print('hierarchical clustering')
print('--------------------')
clust = clusters.hcluster(data)
clusters.printclust(clust, labels=blognames)

print('--------------------')
print('k-Means clustering')
print('--------------------')
numclusters = 3
clust = clusters.kcluster(data, k=numclusters)
for i in range(numclusters):
    print('cluster[%d]:' % i)
    currentcluster = clust[i]
    for r in currentcluster:
        print('\t%s' % blognames[r])
    # Blank line between clusters.
    # NOTE(review): the flattened source does not show whether this was
    # printed per cluster or once at the end -- confirm.
    print('')
def testNormal(self):
    """kcluster with k=2 separates the two proportional rows from the third.

    Rows 0 and 2 of the matrix are scalar multiples of each other, so they
    are expected to share a cluster and row 1 to form its own.  The result
    is sorted so the order of the clusters does not matter.
    """
    m = [[1, 2],
         [0, -1],
         [2, 4]]
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed in Python 3.12.
    self.assertEqual([[0, 2], [1]], sorted(clusters.kcluster(m, k=2)))
#!/usr/bin/python import clusters blognames, words,data=clusters.readfile('blogdata.txt') kclust=clusters.kcluster(data,k=xxx) for r in kclust[xxx]: print blognames[r]
label = 1 word += k[i][label].split(' ') d = {} for w in word: if w in d: d[w] += 1 else: d[w] = 1 word_counts = [(w, count/20) for w, count in d.items()] word_cloud.create_cloud("{}.png".format(str(j)), word_counts) if __name__ == "__main__": num_clusters = 7 country, values = read_file("processed_data.csv") cluster = c.kcluster(values, distance=c.cosine, k=num_clusters) nonempty_clusters = [] countrys= [] for i in range(num_clusters): if len(cluster[i]) == 0: continue nonempty_clusters.append(cluster[i]) print('cluster {}:'.format(i + 1)) print([country[r] for r in cluster[i]]) countrys.append([country[r][1] for r in cluster[i]]) print("Cosine SSE = " + str(get_sse(nonempty_clusters, values,c.cosine))) output(countrys,"country_clus.json") generate(cluster,values) #cluster = c.kcluster(values, distance=c.euclidean, k=num_clusters) #nonempty_clusters = []
if __name__ == '__main__':
    # Usage: -f <blog data file> -k <k1> [<k2> ...]
    textFileName = None
    blognames = None
    words = None
    data = None

    # Only one parser is needed; the original built a second one first
    # and immediately discarded it.
    parser = argparse.ArgumentParser(description='Get file name and extension')
    parser.add_argument('-f', action='store', dest='text_file', nargs=1,
                        help='Blog data text file')
    parser.add_argument('-k', action='store', dest='number_of_clusters',
                        nargs='+', help='Number of Clusters')
    args = parser.parse_args()

    if args.text_file:
        textFileName = args.text_file[0]
        blognames, words, data = clusters.readfile(textFileName)

    # Cluster only when both a k list and non-empty data are available.
    if args.number_of_clusters and data:
        for k in args.number_of_clusters:
            print("For k= {0}".format(k))
            kclust = clusters.kcluster(data, k=int(k))
            for j in range(int(k)):
                print('*' * 41)
                # print() call instead of the Python 2 statement the
                # original mixed in here, so the block parses on Python 3.
                print([blognames[r] for r in kclust[j]])