def main():
    """Run k-means on the blog data for k = 5, 10 and 20.

    Reads ``blogdata.txt`` through ``clusters.readfile`` and calls
    ``clusters.kcluster`` once per k, printing a banner before each run.
    The returned clusterings are not used by this function.
    """
    blognames, words, data = clusters.readfile('blogdata.txt')
    for k in (5, 10, 20):
        # A single-argument print(...) emits the same text on Python 2 and 3.
        print("K value is %d" % k)
        clusters.kcluster(data, k=k)
def main():
    """Cluster the blog data three times, with 5, 10 and 20 clusters.

    The data matrix comes from ``clusters.readfile('blogdata.txt')``; each
    run is announced with a "K value is N" line.  Nothing is returned.
    """
    blognames, words, data = clusters.readfile('blogdata.txt')
    for cluster_count in [5, 10, 20]:
        # Parenthesised form keeps the output identical while also parsing
        # on Python 3.
        print("K value is " + str(cluster_count))
        kclust = clusters.kcluster(data, k=cluster_count)
Beispiel #3
0
def main():
    """Repeat k-means 1001 times and keep the clustering with the lowest SSE.

    Each run clusters ``data`` with ``clus.kcluster`` (``k=best_k``,
    ``distance=func``), drops empty clusters and scores the remainder with
    ``sse``.  The country clusters of the best run are written to
    ``cluster_results.json`` (one ``['name', index],`` line per country) and
    the best index clusters are passed to ``word_cloud.cloud``.
    """
    data, countries = read_file()
    final_clusters = []
    final_country_clusters = []
    # NOTE(review): only runs scoring strictly below this starting value are
    # ever kept; if no run beats 5 the outputs stay empty -- confirm that this
    # threshold is intended rather than "infinity".
    best_sse = 5
    for run in range(1001):
        raw_clusters = clus.kcluster(data, distance=func, k=best_k)
        clusters = []
        country_clusters = []
        for i in range(best_k):
            members = raw_clusters[i]
            if len(members) == 0:
                continue
            clusters.append(members)
            # A dedicated name here keeps the run counter from being shadowed
            # by the comprehension variable.
            country_clusters.append([countries[idx][1] for idx in members])
        temp_sse = sse(clusters, data)
        print('process: ' + str(run))
        if temp_sse < best_sse:
            best_sse = temp_sse
            final_clusters = clusters
            final_country_clusters = country_clusters
    print('best sse: ' + str(best_sse))
    # The context manager closes the file even if a write fails.
    with open('cluster_results.json', "w") as out:
        for i, names in enumerate(final_country_clusters):
            for country in names:
                out.write("['" + country + "', " + str(i) + "],\n")
    word_cloud.cloud(final_clusters, data)
def kcluster_bisect(clusters, vectors, distance=utils.euclidean, k=4):
    """Bisecting k-means: split the highest-SSE cluster until k clusters exist.

    Args:
        clusters: list of clusters, each a list of indexes into ``vectors``.
        vectors: the data rows being clustered.
        distance: distance function used both for scoring a cluster and for
            the 2-way split performed by ``utils.kcluster``.
        k: target number of clusters.

    Returns:
        ``clusters`` itself when it already holds ``k`` or more clusters,
        otherwise a new list of clusters (lists of indexes into ``vectors``).
        The list passed in is not modified.
    """
    # ">=" rather than "==": a list that starts longer than k can never come
    # back down to exactly k, so the equality test recursed without end.
    if len(clusters) >= k:
        return clusters

    # Pick the cluster with the largest sum of squared distances to its centroid.
    max_sse = None
    cluster_index = None
    for i, cluster in enumerate(clusters):
        centroid = k_means.get_centroid(cluster, vectors)
        score = 0
        for country in cluster:
            score += pow(distance(vectors[country], centroid), 2)
        if max_sse is None or score > max_sse:
            max_sse = score
            cluster_index = i

    chosen = clusters[cluster_index]
    # Build the remainder by slicing instead of pop() so the caller's list
    # keeps all of its clusters.
    remaining = clusters[:cluster_index] + clusters[cluster_index + 1:]

    split = utils.kcluster([vectors[index] for index in chosen], distance=distance, k=2)
    # utils.kcluster saw only the chosen rows, so its indexes are positions
    # within ``chosen``; map them back to indexes into ``vectors``.
    new_clusters = [[chosen[local] for local in part] for part in split]

    return kcluster_bisect(remaining + new_clusters, vectors, distance=distance, k=k)
Beispiel #5
0
def bisekt(clusters, vectors, distance=utils.euclidean, k=4):
    """Bisecting k-means: keep splitting the worst cluster until k remain.

    Args:
        clusters: list of clusters, each a list of indexes into ``vectors``.
        vectors: the data rows being clustered.
        distance: distance function used for scoring and for the 2-way split.
        k: target number of clusters.

    Returns:
        ``clusters`` itself when it already has ``k`` or more entries,
        otherwise a new list of clusters of indexes into ``vectors``.
        The list passed in is left untouched.
    """
    # ">=" so that an input already larger than k terminates immediately
    # instead of recursing forever.
    if len(clusters) >= k:
        return clusters

    # Locate the cluster with the highest sum of squared errors.
    my_sse = None
    indi_clus = None
    for i, cluster in enumerate(clusters):
        centroid = kmeans.get_centroid(cluster, vectors)
        score = sum(pow(distance(vectors[country], centroid), 2)
                    for country in cluster)
        if my_sse is None or score > my_sse:
            my_sse = score
            indi_clus = i

    indi_orig = list(clusters[indi_clus])
    # Slice rather than pop() so the caller's list is not mutated.
    rest = clusters[:indi_clus] + clusters[indi_clus + 1:]

    newclus = utils.kcluster(
        [vectors[index] for index in indi_orig],
        distance=distance,
        k=2)
    # The split reports positions within the sub-list; translate them back to
    # indexes into ``vectors``.
    newclus = [[indi_orig[pos] for pos in part] for part in newclus]

    # Forward the caller's k.  The original passed the module-level
    # ``clusternum`` here, silently ignoring the ``k`` argument after the
    # first level of recursion.
    return bisekt(rest + newclus, vectors, distance=distance, k=k)
def kmeans(x):
    """Cluster the rows of ``job_projects`` into ``x`` groups.

    Returns a list of ``x`` lists; entry ``i`` holds the job names whose row
    indexes appear in ``matches[i]``, the second value returned by
    ``clusters.kcluster``.
    """
    jobnames, projects, data = clusters.readfile('job_projects')
    cl, matches = clusters.kcluster(data, k=x)
    return [[jobnames[row] for row in matches[group]] for group in range(x)]
Beispiel #7
0
def getKmeans():
    """Cluster the blogs with k = 5, 10 and 20 and print every cluster.

    For each k a "K value is N" banner is printed, followed by one
    tab-indented line per cluster listing the blog names assigned to it.
    """
    blognames, words, data = clusters.readfile("blogdata.txt")
    for k in (5, 10, 20):
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print("K value is %d" % k)
        kclust = clusters.kcluster(data, k=k)
        # Index explicitly up to k, exactly as the unrolled version did.
        for index in range(k):
            print("\t\t" + str([blognames[r] for r in kclust[index]]))
Beispiel #8
0
def getKmeans():
    """Print the blog names in each k-means cluster for k = 5, 10 and 20.

    The data matrix is read once from ``blogdata.txt``; every run prints a
    banner line and then one line per cluster, prefixed with two tabs.
    """
    blognames, words, data = clusters.readfile('blogdata.txt')
    for cluster_count in [5, 10, 20]:
        print("K value is " + str(cluster_count))
        kclust = clusters.kcluster(data, k=cluster_count)
        # One loop replaces the hand-numbered print lines; clusters are still
        # addressed by index 0 .. cluster_count - 1.
        for position in range(cluster_count):
            names = [blognames[r] for r in kclust[position]]
            print("\t\t" + str(names))
Beispiel #9
0
def k5():
    """Run k-means with k=5 on the module-level ``data`` and print the result.

    Prints the second value returned by ``clusters.kcluster`` (labelled as the
    iteration count) and then one "Centroid <n>: <blog name>" line for every
    member of each of the five clusters.
    """
    (kcluster, t) = clusters.kcluster(data, k=5)
    # Parenthesised prints produce the same text on Python 2 and also parse
    # on Python 3.
    print("Iteration for k=5 is: " + str(t))

    for k in range(5):
        for r in kcluster[k]:
            print("Centroid " + str(k) + ": " + blognames[r])
    print("\n")
def k5():
    """Print the members of a 5-cluster k-means run over the global ``data``.

    ``clusters.kcluster`` is expected to return a pair; the second element is
    printed as the iteration count and the first is indexed 0..4 to list the
    blog names per cluster.
    """
    (kcluster, t) = clusters.kcluster(data, k=5)
    print("Iteration for k=5 is: " + str(t))

    # range() replaces the hand-maintained while counter.
    for centroid_no in range(5):
        for r in kcluster[centroid_no]:
            print("Centroid " + str(centroid_no) + ": " + blognames[r])
    print("\n")
Beispiel #11
0
def compute(k):
    """Run k-means with ``k`` clusters on the global ``data`` and print it.

    Prints k, the first value returned by ``clusters.kcluster`` (labelled
    "interations"), and then each cluster numbered from 1 with its blog
    names tab-indented underneath.
    """
    interations, kclust = clusters.kcluster(data, k=k)
    # print(...) with one argument behaves the same on Python 2 and 3.
    print("k:{}".format(k))
    print("interations:{}".format(interations))
    for count, cluster in enumerate(kclust, 1):
        print("cluster:{}".format(count))
        for c in cluster:
            print("\t" + blognames[c])
    print("\n\n")
def initialize_cluster():
    """Cluster the stored ratings and persist the result.

    Loads preferences with ``get_data_from_db``, inverts them with
    ``transformPrefs``, converts them to arrays with ``dic_to_arr_cluster``,
    runs ``clusters.kcluster`` on the rating arrays with its default
    arguments, then stores the movie-name array and the clustering result.
    """
    prefs = get_data_from_db()
    # Key view/list is taken before the transform, as in the original order.
    initial_user_ids = prefs.keys()
    item_prefs = transformPrefs(prefs)

    arrays = dic_to_arr_cluster(item_prefs, initial_user_ids)
    movie_name_arr, user_id_arr, rating_arrs = arrays

    kcluster_result = clusters.kcluster(rating_arrs)
    store_name_arr_for_cluster(movie_name_arr)
    store_cluster_result(kcluster_result)
Beispiel #13
0
def bisect(data, number):
    """Bisecting k-means: repeatedly split the worst cluster in two.

    Args:
        data: sequence of numeric row vectors.
        number: desired number of clusters.

    Returns:
        ``(cluster_list, centers_list)`` -- the clusters as lists of row
        indexes into ``data`` and the centroid reported by
        ``clusters.kcluster`` for each.  For ``number <= 0`` both lists are
        empty.  The first step always bisects the whole data set, so
        ``number == 1`` still yields two clusters.
    """
    cluster_list = []
    centers_list = []
    count = 0
    # "<" rather than "!=": count jumps from 0 straight to 2, so a request for
    # 1 (or a negative number) could never be matched and the loop never ended.
    while count < number:
        if count == 0:
            # Initial split of the full data set.
            cluster, centroid = clusters.kcluster(data, distance=clusters.euclidean, k=2)
            cluster_list.append(cluster[0])
            cluster_list.append(cluster[1])
            centers_list.append(centroid[0])
            centers_list.append(centroid[1])
            count += 2
        else:
            # Find the cluster with the largest sum of squared errors; ties
            # keep the earliest cluster because the comparison is strict.
            max_error = 0
            index = 0
            for f, (rows, center) in enumerate(zip(cluster_list, centers_list)):
                error = 0
                for row in rows:
                    for m in range(len(center)):
                        error += (data[row][m] - center[m]) ** 2
                if max_error < error:
                    max_error = error
                    index = f

            members = cluster_list.pop(index)
            centers_list.pop(index)
            new_data = [data[row] for row in members]
            cluster, centroid = clusters.kcluster(new_data, distance=clusters.euclidean, k=2)
            # kcluster saw only ``new_data``; translate its positions back to
            # row indexes into ``data``.
            for part in cluster:
                cluster_list.append([members[local] for local in part])

            centers_list.append(centroid[0])
            centers_list.append(centroid[1])
            count += 1
    return cluster_list, centers_list
Beispiel #14
0
def view_clusters(request, template_name="babymaker/clusters.html"):
    """Render the k-means clusters of the requesting user's bookmarks.

    When the ``k`` query parameter is present and non-empty, the user's
    bookmark word vectors are clustered into ``k`` groups with
    ``clusters.kcluster`` and converted with ``render_kclusters``; otherwise
    the template is rendered with the empty defaults set below.
    """
    user = request.user
    # Defaults used when no ``k`` is supplied.
    kclusters = []
    k_value = 0
    cluster_list = None
    
    if request.GET.get('k', None):
        # int() raises ValueError for a non-numeric ``k``; it is not caught here.
        k_value = int(request.GET.get('k'))
        bookmarks, word_vectors = Bookmark.get_word_vectors_for_user(user)
        kclusters = clusters.kcluster(word_vectors, k=k_value)
        cluster_list = render_kclusters(kclusters, bookmarks)
    # locals() is the template context, so every local name above reaches the
    # template under exactly that name -- do not rename locals casually.
    return render(request, template_name, locals())
Beispiel #15
0
def initialize_cluster():
    """Build the rating arrays from the database, cluster them and store both.

    Steps, in order: load preferences, invert them with ``transformPrefs``,
    turn them into name/id/rating arrays, run ``clusters.kcluster`` with its
    defaults on the rating arrays, store the movie names, store the clusters.
    """
    prefs = get_data_from_db()
    user_id_arr = prefs.keys()
    rev_prefs = transformPrefs(prefs)

    # dic_to_arr_cluster hands back three values; the user-id array is rebound
    # to the one it returns.
    converted = dic_to_arr_cluster(rev_prefs, user_id_arr)
    (movie_name_arr, user_id_arr, rating_arrs) = converted

    kcluster_result = clusters.kcluster(rating_arrs)
    store_name_arr_for_cluster(movie_name_arr)
    store_cluster_result(kcluster_result)
Beispiel #16
0
def test_metrics(data):
    """Plot the SSE obtained with each distance metric at ``k=best_k``.

    For every metric name the function of the same name is looked up on
    ``clus``, the data is clustered, empty clusters are removed with
    ``eliminate`` and the result is scored with ``sse``.  The scores are
    plotted against the metric names, saved to ``metrics.png`` and shown.
    """
    metrics = ['manhattan', 'euclidean', 'cosine', 'pearson', 'tanimoto']
    sse_metrics = []
    for metric_name in metrics:
        metric = getattr(clus, metric_name)
        kept = eliminate(clus.kcluster(data, distance=metric, k=best_k))
        sse_metrics.append(sse(kept, data))

    fig, ax = plt.subplots()
    ax.plot(metrics, sse_metrics)

    ax.set(xlabel='metrics', ylabel='sse', title='measure distance metrics')
    fig.savefig("metrics.png")
    plt.show()
Beispiel #17
0
def createKMeansClusters(kValue):
    """Cluster the blogs in ``blogVector.txt`` into ``kValue`` groups and print them.

    Does nothing unless ``kValue`` is greater than zero.  Empty clusters are
    skipped and do not consume a cluster number; each printed cluster is
    followed by its blog names, one per line prefixed with "...".
    """
    if not kValue > 0:
        return

    blognames, words, data = clusters.readfile('blogVector.txt')
    kclust = clusters.kcluster(data, k=kValue)

    count = 0
    for cluster in kclust:
        if len(cluster) == 0:
            continue
        # %-formatting reproduces the space that the comma-separated print
        # statement inserted, and the call form also parses on Python 3.
        print('cluster %s' % (count,))
        for instance in cluster:
            print('... %s' % (blognames[instance],))
        count += 1
Beispiel #18
0
def main():
    """Cluster blogs by TF-IDF and write the assignment to ``out/blog_clus.txt``.

    Loads the term-document matrix from ``out/tdf.txt``, computes IDF and the
    TF-IDF matrix, clusters the rows into 15 groups using
    ``compute_similarity_normalized`` as the distance, and writes one
    ``<cluster index>\\t<row name>`` line per clustered row.
    """
    print("Loading term document matrix")
    rownames, colnames, counts = load_tdf("out/tdf.txt")
    print("Computing IDF")
    idf = compute_idf(counts)
    print("Computing TF-IDF matrix")
    tfidf = compute_tf_idf_matrix(counts, idf)
    n_clus = 15
    print("Computing " + str(n_clus) + " clusters")
    blog_clus = clusters.kcluster(tfidf, distance=compute_similarity_normalized, k=n_clus)
    # The context manager guarantees the file is closed even if a write or a
    # row-name lookup raises.
    with open("out/blog_clus.txt", 'w') as f:
        for i, members in enumerate(blog_clus):
            for row in members:
                f.write(str(i) + "\t" + rownames[row] + "\n")
Beispiel #19
0
def kmeans():
    """Run k-means for k = 5, 10, 20 and write each result to its own file.

    For every k the clustering is printed, and ``Outputs/kclust_<k>.txt``
    receives the iteration count followed by one bracketed, comma-separated
    list of blog names per cluster.
    """
    karr = [5, 10, 20]
    blogs, colnames, data = clusters.readfile('Outputs/blogdata.txt')
    for i in karr:
        kclust, itercount = clusters.kcluster(data, k=i)
        print(kclust)
        # The original never closed these files; ``with`` closes (and
        # flushes) each one before the next k is processed.
        with open("Outputs/kclust_%d.txt" % i, 'w') as f:
            f.write("Iteration count: %d \n" % itercount)
            print(len(kclust))
            for cluster in kclust:
                f.write("****************************\n")
                f.write("[")
                for blogid in cluster:
                    f.write(blogs[blogid] + ", ")
                f.write("]\n")
Beispiel #20
0
def elbow_method(data):
    """Plot SSE against k for k = 1..24 (elbow chart).

    Each k is clustered with ``clus.kcluster`` using ``func`` as distance;
    empty clusters are removed with ``eliminate`` before scoring with ``sse``.
    The chart is saved to ``test.png`` and then shown.
    """
    x_k = []
    y_sse = []
    for k in range(1, 25):
        kept = eliminate(clus.kcluster(data, distance=func, k=k))
        x_k.append(k)
        y_sse.append(sse(kept, data))

    fig, ax = plt.subplots()
    ax.plot(x_k, y_sse)

    ax.set(xlabel='k', ylabel='sse', title='elbow chart')
    fig.savefig("test.png")
    plt.show()
 def kclusterer(self):
     """Run k-means on ``self.data`` and list each cluster in ``self.t3``.

     Sets ``self.clst`` to 2.  When ``self.data`` is empty nothing else
     happens.  Otherwise ``self.t3`` is cleared, K is read from ``self.t2``
     and, if K was entered, one ``{name,name,...}`` line per cluster is
     inserted; an empty K shows an error dialog instead.
     """
     self.clst = 2
     if len(self.data) == 0:
         return

     self.t3.get("1.0", END)
     self.t3.delete("1.0", END)
     names = self.authors
     val = self.t2.get("1.0", "end-1c")
     if len(val) == 0:
         showerror("Error 202", "Please input a value for K")
         return

     val = int(val)
     kclust = clusters.kcluster(self.data, k=val)
     for i in range(val):
         # Every name is followed by a comma, including the last one.
         fin = "".join(names[r] + "," for r in kclust[i])
         self.t3.insert(END, "{%s}" % fin + "\n")
def kMean():
    """Run k-means for k = 5, 10, 20 and write a numbered report per k.

    ``kclust_<k>.txt`` receives the iteration count, then for each cluster a
    ``---`` separator, a "Cluster <n>" header and the blog names numbered
    from 1.  The clustering and its length are also printed.
    """
    kMeanValues = [5, 10, 20]
    blogs, colnames, data = clusters.readfile('blogdata.txt')
    for k in kMeanValues:
        kclust, itercount = clusters.kcluster(data, k=k)
        print(kclust)
        # ``with`` closes each report; the original left every file open.
        with open("kclust_%d.txt" % k, 'w') as f:
            f.write("Total Number Of Iterations: %d \n" % itercount)
            print(len(kclust))
            for clusterCount, cluster in enumerate(kclust, 1):
                f.write("---\n")
                f.write("Cluster %d \n" % clusterCount)
                # A separate counter name avoids reusing the outer loop
                # variable for the per-blog numbering.
                for position, blogid in enumerate(cluster, 1):
                    f.write(str(position) + ".\t" + blogs[blogid] + "\n")
                f.write("\n")
Beispiel #23
0
def main(nameof_file, output_f):
    """Cluster the countries in ``nameof_file`` and report the result.

    Prints the parsed input, clusters the vectors into ``clusternum`` groups
    with ``choice_dist``, prints every non-empty cluster and the SSE over the
    non-empty clusters, then hands the per-cluster ``countries[r][1]`` values
    to ``postdo`` and the raw clusters to ``make_clust``.
    """
    (countries, vectors) = readfile(nameof_file)
    print(countries)
    print(vectors)

    clusters = utils.kcluster(vectors, distance=choice_dist, k=clusternum)
    chcked_clustr = []
    final_clustr = []
    for position in range(clusternum):
        members = clusters[position]
        if len(members) != 0:
            chcked_clustr.append(members)
            # Cluster numbers shown to the user are 1-based and keep the gaps
            # left by empty clusters.
            print('cluster {}:'.format(position + 1))
            print([countries[r] for r in members])
            final_clustr.append([countries[r][1] for r in members])

    print("SSE: " + str(sse(chcked_clustr, vectors)))
    postdo(final_clustr, output_f)
    make_clust(clusters, vectors)
Beispiel #24
0
def main(input_f, output_f):
    """Run k-means over the countries in ``input_f`` and write the clusters.

    The vectors are clustered into ``num_clusters`` groups using
    ``distance_function``.  Non-empty clusters are printed (1-based numbering)
    and scored with ``sse``; their ``countries[r][1]`` values go to
    ``write_output`` and the unfiltered clusters to ``make_word_clouds``.
    """
    (countries, vectors) = read_file(input_f)
    print(countries)
    print(vectors)

    clusters = utils.kcluster(vectors, distance=distance_function, k=num_clusters)
    non_empty = [(i, clusters[i]) for i in range(num_clusters)
                 if len(clusters[i]) != 0]

    proper_clusters = []   # index clusters that contain at least one row
    country_clusters = []  # same clusters expressed through countries[r][1]
    for i, members in non_empty:
        proper_clusters.append(members)
        print('cluster {}:'.format(i + 1))
        print([countries[r] for r in members])
        country_clusters.append([countries[r][1] for r in members])

    print("SSE: " + str(sse(proper_clusters, vectors)))

    write_output(country_clusters, output_f)
    make_word_clouds(clusters, vectors)
Beispiel #25
0
def bisect(clusters, values, k=7):
    """Bisecting k-means with cosine distance.

    Args:
        clusters: list of clusters, each a list of indexes into ``values``.
        values: the data rows being clustered.
        k: target number of clusters (defaults to 7, the previously
            hard-coded value).

    Returns:
        ``clusters`` itself when it already holds ``k`` or more clusters,
        otherwise a new list of index clusters.  The input list is not
        modified.
    """
    # ">=" so that an input already beyond k terminates instead of recursing
    # until the interpreter's recursion limit is hit.
    if len(clusters) >= k:
        return clusters

    # Select the cluster with the largest sum of squared cosine distances to
    # its centroid.
    sse = None
    cluster_index = None
    for i, cluster in enumerate(clusters):
        centroid = get_centroid(cluster, values)
        current_sse = 0
        for country in cluster:
            current_sse += pow(c.cosine(values[country], centroid), 2)
        if sse is None or current_sse > sse:
            sse = current_sse
            cluster_index = i

    initial = list(clusters[cluster_index])
    # Slice instead of pop() so the caller's list keeps all of its clusters.
    remaining = clusters[:cluster_index] + clusters[cluster_index + 1:]

    split = c.kcluster([values[index] for index in initial], distance=c.cosine, k=2)
    # The split reports positions within ``initial``; map them back to
    # indexes into ``values``.
    new_clusters = [[initial[pos] for pos in part] for part in split]
    return bisect(remaining + new_clusters, values, k)
Beispiel #26
0
 def clustering_button(self):
     """Cluster the data in ``Will_be_Cluestered.txt`` and list the result.

     Shows the error message and returns when the module-level ``database``
     is empty.  Otherwise the clustering type selected through
     ``self.Radio_Values3`` decides between hierarchical clustering (the
     text rendering from ``clusters.clust2str`` is inserted line by line)
     and k-means with k taken from ``self.Value_of_k`` (one
     ``Cluster n:{...}`` line per cluster, largest cluster first).
     """
     if len(database) == 0:
         self.Error_Message_Function()
         return
     prof_names, words, data = clusters.readfile("Will_be_Cluestered.txt")
     # The radio value indexes the dictionary of clustering type names.
     type_of_clustering = values_of_clustering[int(
         self.Radio_Values3.get())]
     if type_of_clustering == "Hierarcial":
         clust = clusters.hcluster(data)
         self.All_Results_Part.delete(0, END)
         # Render the tree once instead of once per inserted line.
         rendered = clusters.clust2str(clust, labels=prof_names).split('\n')
         # The last element of the split is an empty string, so it is dropped.
         for line in rendered[:-1]:
             self.All_Results_Part.insert(END, line)
     elif type_of_clustering == "K-Means":
         # k comes from the entry widget.
         clust = clusters.kcluster(data, k=int(self.Value_of_k.get()))
         # NOTE(review): cluster members are row indexes of the file read
         # above, yet names are taken from database.keys(); this presumes
         # both share the same order -- confirm against the code that
         # writes the file.  list() keeps the keys indexable on Python 3.
         prof_names = list(database.keys())
         by_size = [(len(members), members) for members in clust]
         by_size.sort(reverse=True)
         self.All_Results_Part.delete(0, END)
         for counter, (size, members) in enumerate(by_size):
             # Look names up by the cluster's member indexes.  The original
             # used range(len(members)), which printed the first N names
             # for every cluster regardless of its actual members.
             new_str = "".join(str(prof_names[k]) + "  " for k in members)
             self.All_Results_Part.insert(
                 END, "Cluster %d:{" % (counter + 1) + new_str + "}" + "\n")
Beispiel #27
0
def main(input_f):
    """Plot SSE against k for every k in ``range_clusters``.

    For each k the vectors are clustered with ``k_means.distance_function``,
    a progress percentage (k relative to 50) is printed, empty clusters are
    discarded and the SSE of the rest is recorded.  The curve is then shown.
    """
    (countries, vectors) = k_means.read_file(input_f)
    k_arr = []
    sse_arr = []

    for num in range_clusters:
        clusters = utils.kcluster(vectors,
                                  distance=k_means.distance_function,
                                  k=num)

        print(str(round((float(num) / 50) * 100)) + "%")
        proper_clusters = [clusters[i] for i in range(num)
                           if len(clusters[i]) != 0]

        k_arr.append(num)
        sse_arr.append(k_means.sse(proper_clusters, vectors))

    plt.plot(k_arr, sse_arr)
    plt.ylabel("SSE")
    plt.xlabel("k")
    plt.xticks(np.arange(0, 50, 2))
    plt.show()
Beispiel #28
0
import clusters, data_processing

# Import dataset
data = data_processing.open_csv_file('dataset.csv')

# List of countries in the order of the similarity matrix
countries_list = data_processing.get_country_names(data)

# Numerical attributes matrix; str_to_float is called for its effect on
# attr_matrix (its return value is not used).
attr_matrix = data_processing.create_attribute_matrix(data)
data_processing.str_to_float(attr_matrix)
results = [['Country/Region', 'Cluster']]

# k-means clustering: euclidean distance
num_cluster = 5
resulting_clusters, centroids = clusters.kcluster(
    attr_matrix, distance=clusters.euclidean, k=num_cluster)
print('clusters by euclidean distance')
for i in range(num_cluster):
    print('cluster {}:'.format(i + 1))
    # Country names first, then the raw row indexes of the same cluster.
    print([countries_list[r] for r in resulting_clusters[i]])
    print([r for r in resulting_clusters[i]])

print()

# k-means clustering: tanimoto coefficient
resulting_clusters, centroids = clusters.kcluster(
    attr_matrix, distance=clusters.tanimoto, k=num_cluster)
print('clusters by tanimoto coefficient')
for i in range(num_cluster):
    print('cluster {}:'.format(i + 1))
    print([countries_list[r] for r in resulting_clusters[i]])
Beispiel #29
0
def getKClusterRotated(inputFile, k):
    """Do k-means clustering on the rotated data matrix.

    The matrix read from ``inputFile`` is passed through
    ``clusters.rotatematrix`` before clustering into ``k`` groups.  Returns
    the blog names, the words, and the clusters converted by
    ``getNumbersToString(words, ...)``.
    """
    blognames, words, data = clusters.readfile(inputFile)
    rotated = clusters.rotatematrix(data)
    word_clusters = clusters.kcluster(rotated, k=k)
    return blognames, words, getNumbersToString(words, word_clusters)
import clusters

# Hierarchical clustering of the blog data.
# pprint(clusters.readfile('blogdata.txt'))
blognames, words, data = clusters.readfile('blogdata.txt')
clust = clusters.hcluster(data)
# clusters.printclust(clust, labels=blognames)

# k-means with 10 clusters: once with kcluster, once with kcluster_exercise,
# which returns a second value in addition to the clusters.
kclust1 = clusters.kcluster(data, k=10)
# pprint([[blognames[i] for i in kclust[j]] for j in range(10)])
kclust2, clusters_pos = clusters.kcluster_exercise(data, k=10)
# pprint(clusters_pos)

# Clusters on preferences.  NOTE: ``data`` and ``clust`` are rebound here, so
# the blog matrix and blog tree above are no longer reachable afterwards.
wants, people, data = clusters.readfile('zebo.txt')
clust = clusters.hcluster(data, distance=clusters.tanamoto)
clusters.drawdendrogram(clust, wants)
Beispiel #31
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import clusters

blognames, words, data = clusters.readfile('./../data/feed_list.csv')

# Cluster with k = 1..9 and print the blog names of every cluster.
# NOTE(review): this kcluster takes (data, blognames, distance, k)
# positionally -- a project-specific signature; confirm before reordering.
for k in range(1, 10):
    kclust = clusters.kcluster(data, blognames, clusters.pearson, k)
    for k_id in range(len(kclust)):
        # Call form prints the same list on Python 2 and parses on Python 3.
        print([blognames[r] for r in kclust[k_id]])
Beispiel #32
0
def kmean():
  """Reload ``clusters`` and run a 2-cluster k-means over ``blogdata.txt``.

  The row names of both clusters are looked up but the resulting lists are
  discarded, so the function returns None.
  NOTE(review): the discarded lookups look like pasted interactive-session
  lines -- confirm whether they were meant to be printed or returned.
  """
  reload(clusters)
  rownames, words, data = clusters.readfile('blogdata.txt')
  kclust = clusters.kcluster(data, k=2)
  for cluster_no in (0, 1):
    [rownames[r] for r in kclust[cluster_no]]
Beispiel #33
0
    for i in range(len(v1)):
        d += (v1[i] - v2[i])**2
    return math.sqrt(d)


#getBlogs()
#main()

# Load the blog/word matrix and dump it for inspection.
blognames, words, data = clusters.readfile('similarblogdata.txt')
print(blognames)
print(words)
print(data)
# Sanity check: report every pair of neighbouring rows whose lengths differ
# (name and length of the later row first, then of the earlier one).
for i in range(len(data[1:])):
    if len(data[i + 1]) != len(data[i]):
        print(blognames[i + 1])
        print(len(data[i + 1]))
        print(blognames[i])
        print(len(data[i]))
# Hierarchical clustering: text rendering plus a dendrogram image.
clust = clusters.hcluster(data)
clusters.printclust(clust, labels=blognames)
clusters.drawdendrogram(clust, blognames, jpeg='sblogclust.jpg')

# k-means for k = 5, 10 and 20; each result is printed by printkclustValues.
kclust = clusters.kcluster(data, k=5)
printkclustValues(kclust)
kclust = clusters.kcluster(data, k=10)
printkclustValues(kclust)
kclust = clusters.kcluster(data, k=20)
printkclustValues(kclust)
# Scale the data down to coordinates and draw them as a 2-D image.
coords = clusters.scaledown(data)
clusters.draw2d(coords, blognames, jpeg='sblogs2d.jpg')
Beispiel #34
0
    elif name == "U.S.A.":
        line += '{"Country": "' + "United States" + '", "Cluster": ' + str(group) + '}'
    elif name == "Slovak Rep":
        line += '{"Country": "' + "Slovakia" + '", "Cluster": ' + str(group) + '}'
    else:
        line += '{"Country": "' + name + '", "Cluster": ' + str(group) + '}'
    return line


# Load the country vectors; row_names label the rows of ``data``.
row_names, column_names, data = clusters.readfile('dataset_vectors.txt')
# rdata = clusters.rotatematrix(data)
num_clusters = 6
print('Grouping countries into {} clusters:'.format(num_clusters))
# print(rdata)
print()
# Run 1: k-means with Pearson correlation as the distance.  This kcluster
# returns the clusters together with their centers.
clust, centers = clusters.kcluster(data, distance=clusters.pearson, k=num_clusters)
print('clusters by pearson correlation')
for i in range(num_clusters):
    print("cluster {}".format(i+1))
    print([row_names[r] for r in clust[i]])
print("The SSE Error of pearson correlation is: ", sse_error(data, centers, clust, num_clusters))

print()
# Run 2: same data and k with Euclidean distance; ``clust`` and ``centers``
# are rebound to this run's result.
clust, centers = clusters.kcluster(data, distance=clusters.euclidean, k=num_clusters)
print('clusters by euclidean distance')
for i in range(num_clusters):
    print("cluster {}".format(i+1))
    print([row_names[r] for r in clust[i]])
print("The SSE Error of euclidean is: ", sse_error(data, centers, clust, num_clusters))

print()
Beispiel #35
0
def draw2d(data, labels, jpeg='mds2d.jpg'):
    """Draw each label at its 2-D coordinate and save the image as a JPEG.

    Args:
        data: sequence of points; only elements 0 and 1 of each are used.
        labels: text drawn for the point at the same position in ``data``.
        jpeg: output file name.

    Coordinates are shifted by 0.5 and scaled by 1000 onto a 2000x2000 white
    canvas; text is drawn in black.
    """
    img = Image.new('RGB', (2000, 2000), (255, 255, 255))
    draw = ImageDraw.Draw(img)
    for i, point in enumerate(data):
        position = ((point[0] + 0.5) * 1000, (point[1] + 0.5) * 1000)
        draw.text(position, labels[i], (0, 0, 0))
    img.save(jpeg, 'JPEG')



if __name__ == "__main__":
    # Steps added from the slides: load the blog matrix, then run k-means
    # for k = 5, 10 and 20 with a banner before and a blank gap after each.
    import clusters
    blognames, words, data = clusters.readfile('blogdata1.txt')

    for k in (5, 10, 20):
        # Single-argument print(...) yields the same output on Python 2 and
        # also parses on Python 3.
        print("K=%d" % k)
        kclust = clusters.kcluster(data, k=k)
        print("\n")
# -*- coding: utf-8 -*- 

import clusters

if __name__ == '__main__':
    # Rows are blogs, columns are terms.
    blognames, terms, data = clusters.readfile('blog_term_matrix.csv')
    
    k = 5
    print("K-Means with k=%s" % k)
    clust = clusters.kcluster(data, k=k)
    
    # ``clust`` is a list of clusters (called "centroids" below); each one
    # is a list of blog ids, i.e. indexes into ``blognames``.
    # Example shape: [[2, 6, 10, ...], [8, 22, 41, 47, 59, 93], [25, 34, 38, 84], ...]
    for centroid_idx in range(0,len(clust)):
        centroid = clust[centroid_idx]
        centroid_blognames = []
        # Translate the blog ids of this cluster into blog names.
        for idx in range(0, len(centroid)):
            blog_id = centroid[idx]
            blog_name = blognames[blog_id]
            centroid_blognames.append(blog_name)
        
        print("Blognames in centroid-{} = {}\n".format(centroid_idx+1, centroid_blognames))
    
    k = 10
    print("K-Means with k=%s" % k)
    clust = clusters.kcluster(data, k=k)
    
    # NOTE(review): this second loop only resets the name list and never
    # fills or prints it -- it looks truncated compared with the k=5 loop.
    for centroid_idx in range(0,len(clust)):
        centroid = clust[centroid_idx]
        centroid_blognames = []
Beispiel #37
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import clusters

blognames, words, data = clusters.readfile('./../data/feed_list.csv')

# For every k from 1 to 9, cluster the feeds and print each cluster's names.
# NOTE(review): kcluster is called as (data, blognames, distance, k) -- a
# project-specific positional signature; verify against clusters.py.
for k in range(1, 10):
    kclust = clusters.kcluster(data, blognames, clusters.pearson, k)
    for k_id in range(len(kclust)):
        # ``print[...]`` was a Python 2 print statement; the call form prints
        # the same list there and is valid on Python 3 as well.
        print([blognames[r] for r in kclust[k_id]])
parser = argparse.ArgumentParser(description='Get file name and extension')
parser.add_argument('-f', action='store', dest='text_file', nargs=1, help='Blog data text file')
parser.add_argument('-k', action='store', dest='number_of_clusters', nargs='+', help='Number of Clusters')

# Parse once.  The original re-parsed on every loop pass and guarded the -k
# lookup with a second ``if args.text_file``, so a missing -k crashed on
# indexing None.
args = parser.parse_args()
if not args.text_file or not args.number_of_clusters:
    parser.error('both -f FILE and -k K [K ...] are required')
text_file = args.text_file[0]

# Number of k values requested (previously derived as len(sys.argv) - 4).
a = len(args.number_of_clusters)

print(a)

# The data file does not change between runs, so it is read a single time.
blognames, words, data = clusters.readfile(text_file)

# One k-means run per requested k; ``i`` is now bound by the loop itself
# instead of relying on an initialisation outside this section.
for i, requested_k in enumerate(args.number_of_clusters):
    number_of_clusters = int(requested_k)

    print("For k= " + str(requested_k))
    kclust = clusters.kcluster(data, k=number_of_clusters)
    # print("") emits the blank line on both Python 2 and 3.
    print("")
    print("Clusters")
    for j in range(number_of_clusters):
        print('-----------------------------------------')
        print([blognames[r] for r in kclust[j]])
Beispiel #39
0
#!/usr/bin/python

import clusters
blognames, words, data = clusters.readfile('blogdata1.txt')
clust = clusters.hcluster(data)

# Question 2: hierarchical clustering as text and as a dendrogram image.
clusters.printclust(clust, labels=blognames)
clusters.drawdendrogram(clust, blognames, jpeg='dengrogram.jpg')

# Question 3: k-means for k = 5, 10 and 20.  The print calls use the
# single-argument form, which prints the same text on Python 2 and 3.
print("K = 5")
kclust5 = clusters.kcluster(data, k=5)
print("\nK = 10")
kclust10 = clusters.kcluster(data, k=10)
print("\nK = 20")
kclust20 = clusters.kcluster(data, k=20)

# Question 4: scale the data down to coordinates and draw them in 2-D.
coords = clusters.scaledown(data)
clusters.draw2d(coords, blognames, jpeg='MDS.jpg')
Beispiel #40
0
#!/usr/bin/python

import clusters
blognames, words, data = clusters.readfile('blogdata1.txt')
clust = clusters.hcluster(data)

# Question 2 -- print the cluster tree and save its dendrogram.
clusters.printclust(clust, labels=blognames)
clusters.drawdendrogram(clust, blognames, jpeg='dengrogram.jpg')

# Question 3 -- three k-means runs; each result stays available under its
# own name.  Parenthesised prints keep the output and also run on Python 3.
print("K = 5")
kclust5 = clusters.kcluster(data, k=5)
print("\nK = 10")
kclust10 = clusters.kcluster(data, k=10)
print("\nK = 20")
kclust20 = clusters.kcluster(data, k=20)

# Question 4 -- 2-D drawing of the scaled-down data.
coords = clusters.scaledown(data)
clusters.draw2d(coords, blognames, jpeg='MDS.jpg')

    vecsum = zeros(len(data[0]))
    for l in list:
        vecsum = add(data[l],vecsum)

    topwrds =  sorted(range(len(vecsum)), key=lambda x: vecsum[x])[-5:]
    for r in topwrds:
        print vecsum[r]
    print "The top words of this cluster are: \n"
    for r in topwrds:
        print words[r]


	
# Split the data set into five k-means clusters and report on the first two.
moviename, words, data = clusters.readfile('res/blogdata2.txt')
print('Processing......')
kclust = clusters.kcluster(data, k=5)

for banner, cluster_index in (
    ("\t\t******* CLUSTER 1 *******", 0),
    ("\t\t******* CLUSTER 2 *******", 1),
):
    print(banner)
    # Word summary for the cluster, then the names of its members.
    printwords(kclust[cluster_index], data, words)
    print('\n')
    print([moviename[r] for r in kclust[cluster_index]])
    print('\n\n\n')

Beispiel #42
0
import clusters

blognames, words, data = clusters.readfile('blogdata.txt')

# Number of clusters to build. Change this one value to try another k
# instead of adding or removing print lines by hand.
num_clusters = 20
kcl = clusters.kcluster(data, k=num_clusters)

# One line of output per cluster: the names of the blogs assigned to it.
for cluster_index in range(num_clusters):
    print([blognames[r] for r in kcl[cluster_index]])
import clusters

docs, words, data = clusters.readfile('titles_vectors.txt')
# The clustering below runs on the rotated matrix and its members are used
# to index `words`, so each row of `rdata` is treated as one word.
rdata = clusters.rotatematrix(data)
num_clusters = 2
print('Grouping words into {} clusters:'.format(num_clusters))

# Run the same report for each distance measure. Sharing one loop keeps the
# word lookup identical for every measure; the pearson report previously
# used words[r - 1], which shifted every word by one and wrapped row 0
# round to the last word.
for heading, metric in (
    ('clusters by pearson correlation', clusters.pearson),
    ('clusters by tanimoto coefficient', clusters.tanimoto),
    ('clusters by euclidean distance', clusters.euclidean),
):
    print()
    clust = clusters.kcluster(rdata, distance=metric, k=num_clusters)
    print(heading)
    for i in range(num_clusters):
        print("cluster {}".format(i + 1))
        print([words[r] for r in clust[i]])

# NOTE(review): the cosine section prints only its heading in this listing;
# the per-cluster output appears to be cut off here - confirm against the
# full script before extending the loop above to cover it.
print()
clust = clusters.kcluster(rdata, distance=clusters.cosine, k=num_clusters)
print('clusters by cosine distance')
Beispiel #44
0
def getKClusterRotated(inputFile, k):
    """Run k-means with ``k`` clusters on the rotated matrix of ``inputFile``.

    Returns a 3-tuple: the first two values produced by
    ``clusters.readfile`` and the result of ``getNumbersToString`` applied
    to the words and the cluster assignment.
    """
    blognames, words, data = clusters.readfile(inputFile)

    rotated = clusters.rotatematrix(data)
    assignment = clusters.kcluster(rotated, k=k)
    labelled = getNumbersToString(words, assignment)

    return blognames, words, labelled
Beispiel #45
0
import clusters

blogs, words, data = clusters.readfile('blogdata.txt')

# Number of clusters to build. The report loop below follows this value, so
# changing k no longer means editing a list of hard-coded indices.
num_clusters = 5
kclust = clusters.kcluster(data, k=num_clusters)

# One line of output per cluster: the names of the blogs assigned to it.
for cluster_index in range(num_clusters):
    print([blogs[item] for item in kclust[cluster_index]])
Beispiel #46
0
import clusters


blognames, words, data = clusters.readfile('blogdata.txt')

# Section header for the hierarchical clustering output.
for header_line in ('--------------------',
                    'hierarchical clustering',
                    '--------------------'):
    print(header_line)

clust = clusters.hcluster(data)
clusters.printclust(clust, labels=blognames)


# Section header for the k-means output.
for header_line in ('--------------------',
                    'k-Means clustering',
                    '--------------------'):
    print(header_line)

numclusters = 3
clust = clusters.kcluster(data, k=numclusters)
for i in range(numclusters):
    print('cluster[%d]:' % i)
    # One tab-indented blog name per line, then a blank separator line.
    for r in clust[i]:
        print('\t%s' % blognames[r])
    print('')
  def testNormal(self):
    """kcluster with k=2 groups rows 0 and 2 together and row 1 alone."""
    m = [[ 1,  2],
         [ 0, -1],
         [ 2,  4]]

    # assertEqual replaces the deprecated assertEquals alias, which no
    # longer exists in Python 3.12+. sorted() makes the check independent
    # of the order in which the clusters are returned.
    self.assertEqual([[0, 2], [1]], sorted(clusters.kcluster(m, k=2)))
Beispiel #48
0
#!/usr/bin/python
import clusters

# NOTE(review): `xxx` is not defined anywhere in this snippet; it looks like
# a placeholder for the cluster count and the cluster index - confirm and
# substitute real values before running.
blognames, words, data = clusters.readfile('blogdata.txt')
kclust = clusters.kcluster(data, k=xxx)

# Print every blog name in the selected cluster, one per line.
selected_cluster = kclust[xxx]
for r in selected_cluster:
    print(blognames[r])

Beispiel #49
0
                label = 1
            word += k[i][label].split(' ')
        d = {}
        for w in word:
            if w in d:
                d[w] += 1
            else:
                d[w] = 1
        word_counts = [(w, count/20) for w, count in d.items()]
        word_cloud.create_cloud("{}.png".format(str(j)), word_counts)
if __name__ == "__main__":
    # Cluster the rows of processed_data.csv with cosine distance, report
    # every non-empty cluster, then write the results out.
    num_clusters = 7

    country, values = read_file("processed_data.csv")
    cluster = c.kcluster(values, distance=c.cosine, k=num_clusters)

    nonempty_clusters = []
    countrys = []
    for i in range(num_clusters):
        members = cluster[i]
        # Empty clusters are left out of the report and of the SSE input.
        if len(members) > 0:
            nonempty_clusters.append(members)
            print('cluster {}:'.format(i + 1))
            print([country[r] for r in members])
            # Only element 1 of each country record goes into the output.
            countrys.append([country[r][1] for r in members])

    sse_value = get_sse(nonempty_clusters, values, c.cosine)
    print("Cosine SSE = " + str(sse_value))
    output(countrys, "country_clus.json")
    # The full cluster list (empty clusters included) is passed on here.
    generate(cluster, values)
    #cluster = c.kcluster(values, distance=c.euclidean, k=num_clusters)
    #nonempty_clusters = []
Beispiel #50
0
if __name__ == '__main__':

	# These stay None unless -f is supplied; the clustering step below is
	# skipped when no data was loaded.
	textFileName = None
	blognames = None
	words = None
	data = None

	# Command line: -f <blog data file> -k <k> [<k> ...]
	parser = argparse.ArgumentParser(description='Get file name and extension')
	parser.add_argument('-f', action='store', dest='text_file', nargs=1, help='Blog data text file')
	parser.add_argument('-k', action='store', dest='number_of_clusters', nargs='+', help='Number of Clusters')

	args = parser.parse_args()

	if args.text_file:
		textFileName = args.text_file[0]
		blognames, words, data = clusters.readfile(textFileName)

	if args.number_of_clusters:
		if data:
			for k in args.number_of_clusters:
				# Echo k exactly as typed; convert it to int only once.
				print("For k= {0}".format(k))
				cluster_count = int(k)
				kclust = clusters.kcluster(data, k=cluster_count)
				for j in range(cluster_count):
					print('*' * 41)
					# Parenthesised so the line parses on Python 2 and 3 alike.
					print([blognames[r] for r in kclust[j]])