def main():
    """Explore the LFW faces and sanity-check the clustering code.

    Part 1 loads the LFW data through ``util.get_lfw_data``, runs a
    2-cluster ``kMeans`` (cheat initialisation) on every ordered pair of
    distinct class labels below ``num_people``, stores each run's
    ``score()`` in a matrix, and shows one image for each member of the
    lowest- and the highest-scoring pair.  Parts 2 and 4a exercise
    ``Cluster``, ``kMeans`` and ``kMedoids`` on the 2-D toy data produced by
    ``generate_points_2d``.
    """
    ### ========== TODO : START ========== ###
    # part 1: explore LFW data set
    X, y = util.get_lfw_data()
    #util.show_image(X[0])
    #util.show_image(X[1])
    #util.show_image(X[2])

    num_people = 19  # class labels 0 .. num_people - 1 are compared pairwise
    scores = np.zeros((num_people, num_people))
    for i in range(num_people):
        for j in range(num_people):
            if i == j:
                continue
            X1, y1 = util.limit_pics(X, y, [i, j], 40)
            face_points = build_face_image_points(X1, y1)
            cluster_set = kMeans(face_points, 2, "cheat", plot=False)
            scores[i, j] = cluster_set.score()

    # The diagonal is never computed; mask it with +inf / -inf so it can
    # never win the argmin / argmax, whatever range the scores live in.
    np.fill_diagonal(scores, np.inf)
    similar_tuple = np.unravel_index(np.argmin(scores), scores.shape)
    print("it did worst with:  {}".format(similar_tuple))

    np.fill_diagonal(scores, -np.inf)
    distinct_tuple = np.unravel_index(np.argmax(scores), scores.shape)
    print("it did best with:  {}".format(distinct_tuple))

    # Show one picture per person: worst-scoring pair first, then the
    # best-scoring pair.
    for person in tuple(similar_tuple) + tuple(distinct_tuple):
        X1, y1 = util.limit_pics(X, y, [person], 40)
        util.show_image(X1[0])

    #util.show_image(np.mean(X, axis=1))
    U, mu = util.PCA(X)
    #util.plot_gallery([util.vec_to_image(U[:,i]) for i in xrange(12)])
    # for i in [1,10,50, 100, 500, 1288]:
    #     Z, ul = util.apply_PCA_from_Eig(X, U, i, mu)
    #     new_X = util.reconstruct_from_PCA(Z, ul, mu)
    #     util.plot_gallery([new_X[j] for j in xrange(12)])
    ### ========== TODO : END ========== ###

    #========================================
    # part 2

    # part b: test Cluster implementation
    # centroid: [ 1.04022358  0.62914619]
    np.random.seed(1234)
    sim_points = generate_points_2d(20)
    cluster = Cluster(sim_points)
    print("centroid: {}".format(cluster.centroid().attrs))

    # parts c-d: test kMeans implementation using toy dataset
    np.random.seed(1234)
    sim_points = generate_points_2d(20)
    k = 3

    # test cluster using random initialization
    #kmeans_clusters = kMeans(sim_points, k, init='random', plot=True)

    # test cluster using cheat initialization
    kmeans_clusters = kMeans(sim_points, k, init='cheat', plot=True)

    ### ========== TODO : START ========== ###
    # part 3

    # part a: explore effect of lower-dimensional representations on clustering performance
    np.random.seed(1234)

    # part b: determine ``most discriminative'' and ``least discriminative'' pairs of images
    np.random.seed(1234)
    ### ========== TODO : END ========== ###

    #========================================
    # part 4a

    # test Cluster implementation
    # medoid: [ 1.05674064  0.71183522]
    np.random.seed(1234)
    sim_points = generate_points_2d(20)
    cluster = Cluster(sim_points)
    print("medoid: {}".format(cluster.medoid().attrs))

    # test kMedoids implementation using toy dataset
    np.random.seed(1234)
    sim_points = generate_points_2d(20)
    k = 3

    # test cluster using random initialization
    kmedoids_clusters = kMedoids(sim_points, k, init='random', plot=True)

    ### ========== TODO : START ========== ###
    # part 4

    # part b: compare k-means and k-medoids
    np.random.seed(1234)

    # part c: explore effect of lower-dimensional representations on clustering performance
    np.random.seed(1234)
def main():
    """Explore the LFW data, then rank pairs of people by clustering score.

    Part 1 prints the distinct labels in first-seen order, shows the average
    face, plots the first 12 eigenfaces and reconstructs the data for several
    numbers of principal components.  Part 3c projects every pair of the
    first 12 people onto ``l = 30`` principal components, clusters each pair
    with ``kMedoids`` and reports the highest- and lowest-scoring pair.

    The value returned by ``kMedoids`` is compared directly as the score.
    """
    ### ========== TODO : START ========== ###
    # part 1: explore LFW data set
    # part 1a:
    X, y = get_lfw_data()
    size = (50, 37)

    # Distinct labels in order of first appearance; the set makes the
    # membership test O(1) instead of scanning the list every time.
    y_range = []
    seen = set()
    for y_ in y:
        if y_ not in seen:
            seen.add(y_)
            y_range.append(y_)
    print(y_range)

    # show_image(X[0], size)
    # print y[0]
    # show_image(X[1], size)
    # print y[1]
    # show_image(X[2], size)
    # print y[2]
    # show_image(X[4], size)
    # print y[4]

    # Average over every sample actually present instead of assuming a
    # hard-coded sample count.
    average = np.mean(X, axis=0)
    show_image(average, size)

    # part 1b:
    U, mu = PCA(X)
    plot_gallery([vec_to_image(U[:, i]) for i in range(12)])

    # part 1c:
    l_range = [1, 10, 50, 100, 500, 1288]
    for l in l_range:
        Z, Ul = apply_PCA_from_Eig(X, U, l, mu)
        X_rec = reconstruct_from_PCA(Z, Ul, mu)
        #plot_gallery([X_rec[i] for i in xrange(12)])
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 2d-2f: cluster toy dataset
    np.random.seed(1234)
    points = generate_points_2d(20)
    #kMeans(points,3,plot = True)
    #kMedoids(points,3, plot = True)
    #kMeans(points,3,init='cheat', plot = True)
    #kMedoids(points,3, init='cheat', plot = True)
    ### ========== TODO : END ========== ###

    # ---- disabled: parts 3a and 3b -------------------------------------
    # ### ========== TODO : START ========== ###
    # # part 3a: cluster faces
    # np.random.seed(1234)
    # X1, y1 = util.limit_pics(X, y, [2,3,6,8], 40)
    # points = build_face_image_points(X1, y1)
    # plot = {}
    # for pt in points:
    #     if pt.label not in plot:
    #         plot[pt.label] = []
    #     plot[pt.label].append(pt)
    # clusters = ClusterSet()
    # for l in plot:
    #     clusters.add(Cluster(plot[l]))
    # plot_clusters(clusters, 'orig', ClusterSet.centroids)
    #
    # print "start"
    # # faces kMeans cluster
    # score = kMeans(points,k=4)
    # max = score
    # min = score
    # average = score
    # for i in range(9):
    #     score = kMeans(points, k=4)
    #     average = average + score
    #     if score>max:
    #         max = score
    #     if score<min:
    #         min = score
    # print "KMeans:"
    # print "average:"
    # print average/10.0
    # print "max"
    # print max
    # print "min"
    # print min
    #
    # print "start"
    # score = kMedoids(points, k=4)
    # max = score
    # min = score
    # average = score
    # for i in range(9):
    #     score = kMedoids(points, k=4)
    #     average = average + score
    #     if score > max:
    #         max = score
    #     if score < min:
    #         min = score
    # print "KMedoids"
    # print "average:"
    # print average / 10.0
    # print "max"
    # print max
    # print "min"
    # print min
    #
    # # part 3b: explore effect of lower-dimensional representations on clustering performance
    # np.random.seed(1234)
    # X1, y1 = util.limit_pics(X, y, [2, 8], 40)
    # U, mu = PCA(X)
    # l = 1
    # l_range = []
    # kMeans_score = []
    # kMedoids_score = []
    # while l <= 41:
    #     l_range.append(l)
    #     Z, Ul = apply_PCA_from_Eig(X1, U, l, mu)
    #     points = build_face_image_points(Z, y1)
    #     kMeans_score.append(kMeans(points, 2, init='cheat'))
    #     kMedoids_score.append(kMedoids(points, 2, init='cheat'))
    #     l = l + 2
    # mean_scatter = plt.scatter(l_range, kMeans_score,c='b', s=20)
    # medoid_scatter = plt.scatter(l_range,kMedoids_score,c='r',s=20)
    # plt.legend((mean_scatter,medoid_scatter),('kMeans', 'kMedoids'))
    # plt.show()
    # #X_rec = reconstruct_from_PCA(Z, Ul, mu)
    # ---------------------------------------------------------------------

    # part 3c: determine ``most discriminative'' and ``least discriminative'' pairs of images
    np.random.seed(1234)
    best_pair = []
    poorest_pair = []
    best_score = 0
    poorest_score = 100
    l = 30
    # U and mu from part 1b are reused: PCA is fitted on the whole of X, so
    # refitting it for every pair only repeated the same expensive work.
    for person1 in range(12):
        for person2 in range(person1 + 1, 12):
            X1, y1 = util.limit_pics(X, y, [person1, person2], 40)
            Z, Ul = apply_PCA_from_Eig(X1, U, l, mu)
            points = build_face_image_points(Z, y1)
            score = kMedoids(points, 2)
            if score > best_score:
                best_score = score
                best_pair = [person1, person2]
            if score < poorest_score:
                poorest_score = score
                poorest_pair = [person1, person2]

    print(best_pair)
    print(best_score)
    plot_representative_images(X, y, best_pair,
                               title='The most distinguished two persons')
    print(poorest_pair)
    print(poorest_score)
    plot_representative_images(X, y, poorest_pair,
                               title='The most undistinguished two persons')
def main():
    """Load the LFW data and summarise ten k-medoids runs on four people.

    Only part 3a is active: the images of classes 4, 6, 13 and 16 are
    clustered ten times with ``kMedoids`` (k = 4) and the summed, largest and
    smallest ``score()`` values are printed.  The remaining parts are
    placeholders that only reseed NumPy's random generator.
    """
    ### ========== TODO : START ========== ###
    # part 1: explore LFW data set
    x_in, y_in = get_lfw_data()
    #print(x_in.shape)
    #print("y size:", y_in.shape)
    #x_average = np.mean(x_in, axis=0)
    #show_image(x_in[3])
    #show_image(x_average)
    #U, mu = PCA(x_in)
    #plot_gallery([vec_to_image(U[:, i]) for i in range(12)])
    #l_comp = [1, 10, 50, 100, 500, 1288]
    #for i in range(len(l_comp)):
        #x_eigen, u_eigen = apply_PCA_from_Eig(x_in, U, l_comp[i], mu)
        #x_out = reconstruct_from_PCA(x_eigen, u_eigen, mu)
        #print(l_comp[i])
        #plot_gallery([x_out[i] for i in range(12)])
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 2d-2f: cluster toy dataset
    np.random.seed(1234)
    #points = generate_points_2d(20)
    #kMeans(points,k=3, plot=True)
    #kMedoids(points,k=3, plot=True)
    #print(tt.label)
    #tt = kMedoids(test_1, 3, init='cheat', plot=True)
    # for i in tt.members:
    #     print("--------------")
    #     print(i)
    # print(tt.score())
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 3a: cluster faces
    np.random.seed(1234)
    X1, y1 = util.limit_pics(x_in, y_in, [4, 6, 13, 16], 40)
    print(X1.shape)
    points = build_face_image_points(X1, y1)

    # Running statistics over the repeated k-medoids runs.
    med_min = float('inf')
    med_max = 0
    total_med_score = 0
    for _ in range(10):
        score = kMedoids(points, 4, plot=False).score()
        total_med_score += score
        med_max = max(med_max, score)
        med_min = min(med_min, score)
    print('total med score is', total_med_score)
    print('max is', med_max, 'min is', med_min)

    # part 3b: explore effect of lower-dimensional representations on clustering performance
    np.random.seed(1234)

    # part 3c: determine ``most discriminative'' and ``least discriminative'' pairs of images
    np.random.seed(1234)
def main():
    """Find the best- and worst-clustering pair of people with k-medoids.

    The active code loads the LFW data, computes the mean image and the
    ``PCA`` result, generates the toy points, and then (part 3c) runs a
    cheat-initialised 2-cluster ``kMedoids`` on every ordered pair of
    distinct labels in 0..18, reporting and plotting the pairs with the
    highest and the lowest ``score()``.  Parts 1c, 3a and 3b are disabled.
    """
    ### ========== TODO : START ========== ###
    # part 1: explore LFW data set
    # 1a: show images
    X, y = get_lfw_data()
    #show_image(X[0])
    #show_image(X[100])
    #show_image(X[1000])
    mu = X.mean(0)
    #show_image(mu)

    # 1b: eigenfaces
    # NOTE: U holds the whole return value of PCA; the disabled code below
    # indexes it as U[0] to get at the first element.
    U = PCA(X)
    #show_image(vec_to_image(U[0][:, 0]))
    #show_image(vec_to_image(U[0][:, 1]))
    #show_image(vec_to_image(U[0][:, 2]))
    #show_image(vec_to_image(U[0][:, 3]))
    #show_image(vec_to_image(U[0][:, 4]))
    #show_image(vec_to_image(U[0][:, 5]))
    #show_image(vec_to_image(U[0][:, 6]))
    #show_image(vec_to_image(U[0][:, 7]))
    #show_image(vec_to_image(U[0][:, 8]))
    #show_image(vec_to_image(U[0][:, 9]))
    #show_image(vec_to_image(U[0][:, 10]))
    #show_image(vec_to_image(U[0][:, 11]))

    # 1c: reconstruct from PCA (disabled)
    # li = [1, 10, 50, 100, 500, 1288]
    # for l in li:
    #     Z, Ul = apply_PCA_from_Eig(X, U[0], l, mu)
    #     for i in range(0, 12):
    #         im_name = "l{}_im{}".format(l, (i + 1))
    #         show_image(reconstruct_from_PCA(Z, Ul, mu)[i])
    #         print(im_name)
    #         plt.savefig("../../images/{}".format(im_name))
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 2d-2f: cluster toy dataset
    np.random.seed(1234)
    pts = generate_points_2d(20)
    #print("Using kmeans and random_init")
    #kMeans(pts, 3, plot=True)
    #print("Using kmedoids and random_init")
    #kMedoids(pts, 3, plot=True)
    #print("Using kmeans and cheat_init")
    #kMeans(pts, 3, init="cheat", plot=True)
    #print("Using kmedoids and cheat_init")
    #kMedoids(pts, 3, init="cheat", plot=True)
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 3a: cluster faces (disabled)
    # np.random.seed(1234)
    # X1, y1 = util.limit_pics(X, y, [4, 6, 13, 16], 40)
    # points = build_face_image_points(X1, y1)
    # kmeans_scores = []
    # kmeans_runtime = []
    # kmeds_scores = []
    # kmeds_runtime = []
    # for i in range(0, 10):
    #     start = time.time()
    #     kmeans_clusterset = kMeans(points, 4, plot=False)
    #     end = time.time()
    #     kmeans_runtime.append(end - start)
    #     kmeans_score = kmeans_clusterset.score()
    #     kmeans_scores.append(kmeans_score)
    #     start = time.time()
    #     kmeds_clusterset = kMedoids(points, 4, plot=False)
    #     end = time.time()
    #     kmeds_runtime.append(end - start)
    #     kmeds_score = kmeds_clusterset.score()
    #     kmeds_scores.append(kmeds_score)
    # print("kmeans average score: {}".format(sum(kmeans_scores) / float(len(kmeans_scores))))
    # print("kmeans max score: {}".format(max(kmeans_scores)))
    # print("kmeans min score: {}".format(min(kmeans_scores)))
    # print("kmeans average runtime: {}s".format((sum(kmeans_runtime) / float(len(kmeans_runtime)))))
    # print("kmeans max runtime: {}".format(max(kmeans_runtime)))
    # print("kmeans min runtime: {}".format(min(kmeans_runtime)))
    # print("kmeds average score: {}".format(sum(kmeds_scores) / float(len(kmeds_scores))))
    # print("kmeds max score: {}".format(max(kmeds_scores)))
    # print("kmeds min score: {}".format(min(kmeds_scores)))
    # print("kmeds average runtime: {}s".format((sum(kmeds_runtime) / float(len(kmeds_runtime)))))
    # print("kmeds max runtime: {}".format(max(kmeds_runtime)))
    # print("kmeds min runtime: {}".format(min(kmeds_runtime)))

    # part 3b: explore effect of lower-dimensional representations on clustering performance (disabled)
    # np.random.seed(1234)
    # X2, y2 = util.limit_pics(X, y, [4, 13], 40)
    # li = []
    # for i in range(1, 43, 2):
    #     li.append(i)
    # kmeans_face_scores = []
    # kmeds_face_scores = []
    # for l in li:
    #     print(l)
    #     Z, Ul = apply_PCA_from_Eig(X2, U[0], l, mu)
    #     points2 = build_face_image_points(Z, y2)
    #     kmeans_face_cset = kMeans(points2, 2, init="cheat", plot=False)
    #     kmeans_face_scores.append(kmeans_face_cset.score())
    #     kmeds_face_cset = kMedoids(points2, 2, init="cheat", plot=False)
    #     kmeds_face_scores.append(kmeds_face_cset.score())
    # plt.plot(li, kmeans_face_scores, label="K-Means")
    # plt.plot(li, kmeds_face_scores, label="K-Medoids")
    # plt.title("Clustering Score Vs. Number of Principal Components")
    # plt.xlabel("Number of Principal Components")
    # plt.ylabel("Score")
    # plt.legend()
    # plt.show()

    # part 3c: determine ``most discriminative'' and ``least discriminative'' pairs of images
    np.random.seed(1234)
    best_score, best_im = float("-inf"), [0, 0]
    worst_score, worst_im = float("inf"), [0, 0]
    people = range(0, 19)
    for first in people:
        for second in people:
            if first == second:
                continue  # a person is never paired with themselves
            X3, y3 = util.limit_pics(X, y, [first, second], 40)
            curr_points = build_face_image_points(X3, y3)
            curr_score = kMedoids(curr_points, 2, init="cheat", plot=False).score()
            # Strict comparisons: on ties the earliest pair is kept.
            if curr_score > best_score:
                best_score, best_im = curr_score, [first, second]
            if curr_score < worst_score:
                worst_score, worst_im = curr_score, [first, second]

    best_msg = "The Most Discriminative Images were {}, with a score of {}"
    print(best_msg.format(best_im, best_score))
    plot_representative_images(X, y, best_im, title="Most Discriminative")

    worst_msg = "The Least Discriminative Images {}, with a score of {}"
    print(worst_msg.format(worst_im, worst_score))
    plot_representative_images(X, y, worst_im, title="Least Discriminative")
def main():
    """Run the PCA / clustering experiments on the LFW faces.

    Part 1 fits ``PCA`` and reconstructs the data for several numbers of
    components (plots disabled).  Part 3b clusters classes 4 and 13 with
    cheat-initialised ``kMeans`` and ``kMedoids`` on images reconstructed
    from ``l = 1..41`` principal components and plots both score curves
    (the final ``plt.show()`` is disabled).  Part 3c scores every ordered
    pair of distinct labels in 0..18 with ``kMedoids`` and plots the pairs
    with the lowest and highest score.
    """
    ### ========== TODO : START ========== ###
    # part 1: explore LFW data set
    X, y = get_lfw_data()
    #show_image(im=X[2])
    #show_image(im=X[1])
    #show_image(im=X[3])
    #average_image = np.mean(X, axis=0)
    #show_image(im=average_image)
    U, mu = PCA(X)
    #plot_gallery([vec_to_image(U[:, i]) for i in xrange(12)])

    # Selecting the dimension, l, to map all features to
    for l in [1, 10, 50, 100, 500, 1288]:
        Z, Ul = apply_PCA_from_Eig(X, U, l, mu)
        X_rec = reconstruct_from_PCA(Z, Ul, mu)
        #plot_gallery([vec_to_image(X_rec[i]) for i in xrange(12)], subtitles=["l="+str(l)+",n="+str(j) for j in xrange(12)])

    # Original 12 Images
    #plot_gallery([vec_to_image(X[i]) for i in xrange(12)])
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 2d: cluster toy dataset
    #np.random.seed(1234)
    #points = generate_points_2d(20)
    #kMeans(points, 3, init='cheat', plot=True)
    #kMedoids(points, 3, init='cheat', plot=True)
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 3a: cluster faces
    np.random.seed(1234)
    # Qualified with ``util.`` like every other limit_pics call in this
    # function.
    X1, y1 = util.limit_pics(X, y, [4, 6, 13, 16], 40)
    points = build_face_image_points(X1, y1)
    k_means_scores = []
    k_medoids_scores = []
    #for _ in range(10):
        #clusters = kMeans(points, 4, init='random', plot=False)
        #k_means_scores.append(clusters.score())
        #clusters = kMedoids(points, 4, init='random', plot=False)
        #k_medoids_scores.append(clusters.score())
    #print('k-means average: {}'.format(np.mean(k_means_scores)))
    #print('k-means min: {}'.format(np.min(k_means_scores)))
    #print('k-means max: {}'.format(np.max(k_means_scores)))
    #print('k-medoids average: {}'.format(np.mean(k_medoids_scores)))
    #print('k-medoids min: {}'.format(np.min(k_medoids_scores)))
    #print('k-medoids max: {}'.format(np.max(k_medoids_scores)))

    # part 3b: explore effect of lower-dimensional representations on clustering performance
    np.random.seed(1234)
    X2, y2 = util.limit_pics(X, y, [4, 13], 40)

    # Parallel lists keep the x/y plot data in iteration order; the previous
    # dict-based version depended on dict ordering, which Python 2 does not
    # guarantee.
    l_values = []
    kmeans_scores = []
    kmedoids_scores = []
    for l in np.arange(1, 42):
        Z2, Ul2 = apply_PCA_from_Eig(X2, U, l, mu)
        X_rec2 = reconstruct_from_PCA(Z2, Ul2, mu)
        points = build_face_image_points(X_rec2, y2)

        cluster_set1 = kMeans(points, 2, "cheat")
        cluster_set2 = kMedoids(points, 2, "cheat")

        l_values.append(l)
        kmeans_scores.append(cluster_set1.score())
        kmedoids_scores.append(cluster_set2.score())

    plt.plot(l_values, kmeans_scores, 'r', label='K-means')
    plt.plot(l_values, kmedoids_scores, 'b', label='K-medoids')
    plt.title('Score for kMeans and kMedoids vs. # Principal Components')
    plt.xlabel('# Principal Components')
    plt.ylabel('score')
    plt.legend()
    #plt.show()

    # part 3c: determine ``most discriminative'' and ``least discriminative'' pairs of images
    np.random.seed(1234)
    # (score, i, j) triples; np.inf replaces the np.Inf alias that NumPy 2.0
    # removed.
    max_score = (-1, None, None)
    min_score = (np.inf, None, None)
    for i in np.arange(0, 19):
        for j in np.arange(0, 19):
            if i == j:
                continue
            X_ij, y_ij = util.limit_pics(X, y, [i, j], 40)
            points = build_face_image_points(X_ij, y_ij)
            cluster_set = kMedoids(points, 2, init='cheat')
            score = cluster_set.score()
            if score < min_score[0]:
                min_score = (score, i, j)
            if score > max_score[0]:
                max_score = (score, i, j)

    print(max_score)
    print(min_score)
    plot_representative_images(X, y, [min_score[1], min_score[2]],
                               title='min score images')
    plot_representative_images(X, y, [max_score[1], max_score[2]],
                               title='max score images')
def main():
    """Run all clustering experiments (toy data, face clustering, PCA sweep).

    Part 1 checks that the mean returned by ``PCA`` matches ``np.mean`` and
    reconstructs the data for several component counts.  Part 2 prints the
    scores of ``kMeans`` / ``kMedoids`` on the toy data for both
    initialisations.  Part 3a times and scores ten runs of each algorithm on
    four people, part 3b sweeps ``l = 1..41`` principal components for
    classes 4 and 13, and part 3c finds the lowest- and highest-scoring pair
    of people with cheat-initialised ``kMedoids``.

    A stray ``exit()`` after part 3a used to make parts 3b and 3c
    unreachable; it has been removed so the whole experiment runs.
    """
    import time

    ### ========== TODO : START ========== ###
    # part 1: explore LFW data set
    X, y = get_lfw_data()
    mean_face = np.mean(X, axis=0)
    U, mu = PCA(X)
    # Tolerance-based comparison: exact floating-point equality is fragile
    # when the two means are computed along different code paths.
    assert np.allclose(mean_face, mu)
    #show_image(vec_to_image(mu)) #PART A
    num_eigenfaces_to_plot = 12
    #plot_gallery([vec_to_image(U[:,i])
    #              for i in xrange(num_eigenfaces_to_plot)]) #PART B
    for l in [1, 10, 50, 100, 500, 1288]:
        Z, Ul = apply_PCA_from_Eig(X, U, l, mu)
        X_rec = reconstruct_from_PCA(Z, Ul, mu)
        # plot_gallery([vec_to_image(X_rec[i])
        #               for i in xrange(num_eigenfaces_to_plot)]) #PART C
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 2d-2f: cluster toy dataset
    print("generating data for clustering")
    np.random.seed(1234)
    pts = generate_points_2d(20)
    cluster_set = kMeans(pts, 3, plot=False, verbose=False)  # 2
    print("kmeans rand init score: {}".format(cluster_set.score()))
    another_cluster_set = kMedoids(pts, 3, plot=False, verbose=False)  # 2
    print("k medoids rand init score: {}".format(another_cluster_set.score()))
    km_clust_2 = kMeans(pts, 3, init='cheat', plot=False, verbose=False)  # 2
    print("k means cheat init score: {}".format(km_clust_2.score()))
    k_med_clust_2 = kMedoids(pts, 3, init='cheat', plot=False, verbose=False)  # 2
    print("k medoids cheat init score: {}".format(k_med_clust_2.score()))
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 3a: cluster faces
    np.random.seed(1234)
    X1, y1 = util.limit_pics(X, y, [4, 6, 13, 16], 40)
    points = build_face_image_points(X1, y1)
    kmeans_scores, kmed_scores = [], []
    kmeans_times, kmed_times = [], []
    for i in range(10):
        print("running k-means and k-medoids for the {}th time".format(i + 1))
        t = time.time()
        cluster_set = kMeans(points, 4)
        kmeans_times.append(time.time() - t)
        kmeans_scores.append(cluster_set.score())

        t = time.time()
        kmed_set = kMedoids(points, 4)
        kmed_times.append(time.time() - t)
        kmed_scores.append(kmed_set.score())

    means_avg, means_max, means_min = (
        np.mean(np.array(kmeans_scores)), max(kmeans_scores), min(kmeans_scores))
    med_avg, med_max, med_min = (
        np.mean(np.array(kmed_scores)), max(kmed_scores), min(kmed_scores))
    kmeans_time = np.mean(np.array(kmeans_times))
    kmed_time = np.mean(np.array(kmed_times))
    print("kmeans time: {}".format(kmeans_time))
    print("kmed time: {}".format(kmed_time))
    print("K means average: {}, max: {}, min: {}".format(
        means_avg, means_max, means_min))
    print("K medoids average: {}, max: {}, min: {}".format(
        med_avg, med_max, med_min))

    # part 3b: explore effect of lower-dimensional representations on clustering performance
    np.random.seed(1234)
    X2, y2 = util.limit_pics(X, y, [4, 13], 40)
    # Parallel lists keep the plot data in iteration order instead of
    # relying on dict ordering.
    l_values = []
    kmeans_curve = []
    kmed_curve = []
    for l in range(1, 42):
        if l % 5 == 0:
            print("iteration: l = {}".format(l))
        Z, Ul = apply_PCA_from_Eig(X2, U, l, mu)
        X2_rec = reconstruct_from_PCA(Z, Ul, mu)
        points = build_face_image_points(X2_rec, y2)
        kmeans_clust = kMeans(points, 2, init='cheat')
        kmed_clust = kMedoids(points, 2, init='cheat')
        l_values.append(l)
        kmeans_curve.append(kmeans_clust.score())
        kmed_curve.append(kmed_clust.score())

    plt.plot(l_values, kmeans_curve, 'r', label='K means')
    plt.plot(l_values, kmed_curve, 'b', label='K medoids')
    plt.title('K-means and K-medoids score with respect to principal components')
    plt.xlabel('Number of principal components')
    plt.ylabel('Clustering score')
    plt.legend()
    plt.show()
    print(list(zip(l_values, kmed_curve)))

    # part 3c: determine ``most discriminative'' and ``least discriminative'' pairs of images
    # (score, i, j) triples; np.inf replaces the np.Inf alias that NumPy 2.0
    # removed.
    max_score, min_score = (-1, None, None), (np.inf, None, None)
    np.random.seed(1234)
    for i in range(0, 19):
        for j in range(0, 19):
            if i == j:
                continue
            if i % 5 == 0 and j % 5 == 0:
                print("considering groups {} and {}".format(i, j))
            X_ij, y_ij = util.limit_pics(X, y, [i, j], 40)
            points = build_face_image_points(X_ij, y_ij)
            med_clust = kMedoids(points, 2, init='cheat')
            score = med_clust.score()
            if score < min_score[0]:
                min_score = (score, i, j)
            if score > max_score[0]:
                max_score = (score, i, j)

    print(max_score)
    print(min_score)
    # Regression checks on the pairs found in an earlier run.
    assert min_score[1] == 4 and min_score[2] == 5
    plot_representative_images(X, y, [min_score[1], min_score[2]],
                               title='min score images')
    assert max_score[1] == 9 and max_score[2] == 16
    plot_representative_images(X, y, [max_score[1], max_score[2]],
                               title='max score images')
def main():
    """Run the LFW exploration and the k-means / k-medoids comparisons.

    Part 1 shows the mean face and the first 12 eigenfaces.  Part 3a runs
    ``kMeans`` and ``kMedoids`` (k = 4, random init) ten times each on four
    people and prints min / max / average score and average wall time.
    Part 3b clusters classes 4 and 13 on images reconstructed from
    ``l = 1, 3, ..., 41`` principal components and plots both score curves.
    Part 3c scores every ordered pair of distinct labels in 0..18 with
    cheat-initialised ``kMedoids`` and plots the lowest- and highest-scoring
    pair.
    """
    runs = 10

    ### ========== TODO : START ========== ###
    # part 1: explore LFW data set
    X, y = get_lfw_data()
    mean = np.mean(X, axis=0)
    #print(mean)
    #show_image(mean)
    show_image(vec_to_image(mean))
    U, mu = PCA(X)
    plot_gallery([vec_to_image(U[:, i]) for i in range(12)])
    #plot_title = "1c-"
    #for l in [1,10,50,100,500,1288]:
    #    Z, Ul = apply_PCA_from_Eig(X, U, l, mu)
    #    X_rec = reconstruct_from_PCA(Z, Ul, mu)
    #    title = plot_title + str(l)
    #    plot_gallery([vec_to_image(X_rec[i]) for i in range(12)], title=title)
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 2d-2f: cluster toy dataset
    # np.random.seed(1234)
    # points = generate_points_2d(20)
    # kMeans(points, 3, init='random', plot=True)
    # np.random.seed(1234)
    # points = generate_points_2d(20)
    # kMedoids(points, 3, init='random', plot=True)
    # np.random.seed(1234)
    # points = generate_points_2d(20)
    # kMeans(points, 3, init='cheat', plot=True)
    # np.random.seed(1234)
    # points = generate_points_2d(20)
    # kMedoids(points, 3, init='cheat', plot=True)
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 3a: cluster faces
    np.random.seed(1234)
    X1, y1 = util.limit_pics(X, y, [4, 6, 13, 16], 40)
    points = build_face_image_points(X1, y1)

    means_scores = []
    totalTime = 0
    for _ in range(runs):
        start = time.time()
        scoreMeans = kMeans(points, 4, init='random').score()
        print(scoreMeans)
        means_scores.append(scoreMeans)
        totalTime += time.time() - start
    print("min score")
    print(min(means_scores))
    print("max score")
    print(max(means_scores))
    print("average score")
    print(sum(means_scores) / float(runs))
    print("average time")
    print(totalTime / runs)

    # The k-medoids statistics are accumulated from k-medoids scores only;
    # the previous version seeded the running average with the last k-means
    # score.
    medoids_scores = []
    totalTime = 0
    for _ in range(runs):
        start = time.time()
        scoreMedoids = kMedoids(points, 4, init='random').score()
        print(scoreMedoids)
        medoids_scores.append(scoreMedoids)
        totalTime += time.time() - start
    print("min score")
    print(min(medoids_scores))
    print("max score")
    print(max(medoids_scores))
    print("average")
    print(sum(medoids_scores) / float(runs))
    print("average time")
    print(totalTime / runs)

    # part 3b: explore effect of lower-dimensional representations on clustering performance
    np.random.seed(1234)
    scoresMeans = []
    scoresMedoids = []
    X1, y1 = util.limit_pics(X, y, [4, 13], 40)
    # Odd component counts 1, 3, ..., 41.  The original tried to step by two
    # with ``l += 2`` inside a ``for`` loop, which has no effect.
    l_values = list(range(1, 42, 2))
    for l in l_values:
        Z, U1 = apply_PCA_from_Eig(X1, U, l, mu)
        X_rec = reconstruct_from_PCA(Z, U1, mu)
        points = build_face_image_points(X_rec, y1)
        scoreM = kMeans(points, 2, init='cheat').score()
        scoreM2 = kMedoids(points, 2, init='cheat').score()
        scoresMeans.append(scoreM)
        scoresMedoids.append(scoreM2)
    plt.plot(l_values, scoresMeans, 'c', label='kMeans')
    plt.plot(l_values, scoresMedoids, 'b', label='kMedoids')
    plt.xlabel('# of principal components')
    plt.ylabel('score')
    plt.legend()
    plt.show()

    # part 3c: determine ``most discriminative'' and ``least discriminative'' pairs of images
    np.random.seed(1234)
    minScore = float('inf')
    maxScore = float('-inf')
    iMin = jMin = iMax = jMax = -1
    for i in range(0, 19):
        for j in range(0, 19):
            if i == j:
                continue
            X2, y2 = util.limit_pics(X, y, [i, j], 40)
            # Cluster only the two selected people (X2, y2); the original
            # passed the full data set, so every pair got the same score.
            points = build_face_image_points(X2, y2)
            score = kMedoids(points, 2, init='cheat').score()
            if score < minScore:
                minScore = score
                iMin = i
                jMin = j
            if score > maxScore:
                maxScore = score
                iMax = i
                jMax = j

    print("min:")
    print(minScore)
    print(iMin)
    print(jMin)
    # The pair is passed as a single list, as in every other call to
    # plot_representative_images in this file.
    plot_representative_images(X, y, [iMin, jMin], title="min")
    print("max:")
    print(maxScore)
    print(iMax)
    print(jMax)
    plot_representative_images(X, y, [iMax, jMax], title="max")
def main():
    """Run the LFW exploration, toy clustering and face-clustering parts.

    Part 1 shows the average face, the eigenfaces and reconstructions for
    several component counts.  Part 2 clusters the toy data with cheat
    initialisation.  Part 3a plots the true grouping of four people and
    prints avg / min / max score of ten random-init runs of ``kMeans`` and
    ``kMedoids``.  Part 3b sweeps ``l = 1, 3, ..., 41`` principal components
    for classes 4 and 13.  Part 3c scores every unordered pair of the 19
    people with medoid-based ``kAverages`` and plots the highest- and
    lowest-scoring pair.
    """
    ### ========== TODO : START ========== ###
    # part 1: explore LFW data set
    X, y = util.get_lfw_data()
    # Per-pixel mean over all images (replaces a manual column-by-column
    # loop computing the same values).
    avg_face = np.mean(X, axis=0)
    util.show_image(avg_face)
    ### ========== TODO : END ========== ###

    # 1b
    U, mu = util.PCA(X)
    n, d = U.shape
    plot_gallery([vec_to_image(U[:, i]) for i in range(12)])
    # NOTE(review): this shows one window per column of U, i.e. every
    # eigenface, not just the 12 in the gallery above - confirm that this is
    # intended.
    for column_index in range(d):
        col = U[:, column_index]
        util.show_image(util.vec_to_image(col))

    # 1c
    ls = [1, 10, 50, 100, 500, 1288]
    for l in ls:
        Z, Ul = util.apply_PCA_from_Eig(X, U, l, mu)
        X_rec = util.reconstruct_from_PCA(Z, Ul, mu)
        plot_gallery(X_rec[:12])

    # test centroid
    # p1 = Point('1', 1, np.array([5, 4]))
    # p2 = Point('2', 2, np.array([9, 10]))
    # p3 = Point('3', 3, np.array([3, 9]))
    # c = Cluster([p1, p2, p3])
    # print(str(c))
    # print(str(c.centroid()))
    # end test centroid

    ### ========== TODO : START ========== ###
    # part 2d-2f: cluster toy dataset
    np.random.seed(1234)
    k = 3
    pts_per_cluster = 20
    points = generate_points_2d(pts_per_cluster)
    k_clusters = kMeans(points, k, init="cheat", plot=True)
    k_clusters = kMedoids(points, k, init="cheat", plot=True)
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 3a: cluster faces
    np.random.seed(1234)
    k = 4
    X1, y1 = util.limit_pics(X, y, [4, 6, 13, 16], 40)
    points = build_face_image_points(X1, y1)

    # Group the points by their true label and plot that grouping.
    by_label = {}
    for pt in points:
        by_label.setdefault(pt.label, []).append(pt)
    clusters = ClusterSet()
    for label in by_label:
        clusters.add(Cluster(by_label[label]))
    plot_clusters(clusters, 'orig', ClusterSet.centroids)

    # Part 3a
    centroid_score = []
    for _ in range(10):
        k_clusters = kMeans(points, k, init="random", plot=False)
        centroid_score.append(k_clusters.score())
    centroid_mean = sum(centroid_score) / float(len(centroid_score))
    print('Centroid avg: {}'.format(centroid_mean))
    print('Centroid min: {}'.format(min(centroid_score)))
    print('Centroid max: {}'.format(max(centroid_score)))

    medoid_score = []
    for _ in range(10):
        k_clusters = kMedoids(points, k, init="random", plot=False)
        medoid_score.append(k_clusters.score())
    medoid_mean = sum(medoid_score) / float(len(medoid_score))
    print('Medoid avg: {}'.format(medoid_mean))
    print('Medoid min: {}'.format(min(medoid_score)))
    print('Medoid max: {}'.format(max(medoid_score)))

    # part 3b: explore effect of lower-dimensional representations on clustering performance
    np.random.seed(1234)
    # U and mu from part 1b are reused; PCA is fitted on the same X, so
    # refitting it here only repeated the work.
    X1, y1 = util.limit_pics(X, y, [4, 13], 40)
    k = 2
    ls = list(range(1, 42, 2))
    centroid_score = []
    medoid_score = []
    for l in ls:
        Z, Ul = util.apply_PCA_from_Eig(X1, U, l, mu)
        # X_rec = util.reconstruct_from_PCA(Z, Ul, mu)
        points = build_face_image_points(Z, y1)
        # plot_gallery(X_rec[:12])
        c = kMeans(points, k, init="cheat", plot=False)
        centroid_score.append(c.score())
        k_clusters = kMedoids(points, k, init="cheat")
        medoid_score.append(k_clusters.score())

    scatter = plt.scatter(ls, centroid_score, c='c', s=20)
    scatter2 = plt.scatter(ls, medoid_score, c='r', s=20)
    plt.suptitle('kMeans and kMedoids', fontsize=20)
    plt.xlabel('L', fontsize=16)
    plt.ylabel('Score', fontsize=16)
    plt.legend((scatter, scatter2), ('kMeans', 'kMedoids'),
               scatterpoints=1, loc='lower right', ncol=3, fontsize=14)
    plt.show()

    # part 3c: determine ``most discriminative'' and ``least discriminative'' pairs of images
    np.random.seed(1234)
    totalPeople = 19
    best_score = 0
    worst_score = float("inf")
    best_pair = None
    worst_pair = None
    for p1 in range(totalPeople):
        for p2 in range(p1 + 1, totalPeople):
            X3, y3 = util.limit_pics(X, y, [p1, p2], 40)
            points = build_face_image_points(X3, y3)
            clusters = kAverages(points, 2, ClusterSet.medoids,
                                 init='cheat', plot=False)
            score = clusters.score()
            if score > best_score:
                best_score = score
                best_pair = (p1, p2)
            if score < worst_score:
                worst_score = score
                worst_pair = (p1, p2)

    # The highest clustering score marks the pair that is easiest to tell
    # apart, i.e. the LEAST similar faces; the titles used to be swapped.
    print(best_pair)
    print(best_score)
    plot_representative_images(X, y, best_pair, title="Least Similar Face")
    print(worst_pair)
    print(worst_score)
    plot_representative_images(X, y, worst_pair, title="Most Similar Face")
def main():
    """Compare k-means and k-medoids on four LFW classes.

    Loads the data, fits ``util.PCA`` and then (part 3a) runs ``kMedoids``
    followed by ``kMeans`` ten times on classes 4, 6, 13 and 16, recording
    each run's ``score()`` and wall-clock duration, and finally prints
    min / max / average score and average time for both algorithms.
    """
    ### ========== TODO : START ========== ###
    # part 1: explore LFW data set
    X, y = get_lfw_data()
    # show_image(X[0])
    # show_image(np.mean(X, axis=0))
    U, mu = util.PCA(X)
    # plot_gallery([vec_to_image(U[:,i]) for i in xrange(12)])
    # l_values = [1, 10, 50, 100, 500, 1288]
    # for l_value in l_values:
    #     Z, UI = apply_PCA_from_Eig(X, U, l_value, mu)
    #     X_rec = reconstruct_from_PCA(Z, UI, mu)
    #     title_text = "Reconstructed for l = %d" %l_value
    #     plot_gallery(X_rec, title =title_text)
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 2d-2f: cluster toy dataset
    np.random.seed(1234)
    # print 'Problem 2(d)'
    # points_list = generate_points_2d(20)
    # only do one or the other, if you do both, the results appear to be wrong...
    # cluster_set_result = kMeans(points_list, 3, plot=True)
    # cluster_set_result2 = kMedoids(points_list, 3, plot=True)
    # using cheat_init
    # cluster_set_result3 = kMeans(points_list, 3, init='cheat', plot=True)
    # cluster_set_result4 = kMedoids(points_list, 3, init='cheat', plot=True)
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 3a: cluster faces
    np.random.seed(1234)
    X1, y1 = util.limit_pics(X, y, [4, 6, 13, 16], 40)
    points = build_face_image_points(X1, y1)

    medoid_scores, medoid_secs = [], []
    mean_scores, mean_secs = [], []
    for _ in range(10):  # repeat 10 times
        # k-medoids runs first so the random stream is consumed in the same
        # order on every repetition.
        tic = time.time()
        medoid_run = kMedoids(points, 4)
        toc = time.time()
        medoid_scores.append(medoid_run.score())
        medoid_secs.append(toc - tic)

        tic = time.time()
        mean_run = kMeans(points, 4)
        toc = time.time()
        mean_scores.append(mean_run.score())
        mean_secs.append(toc - tic)

    mean_summary = (min(mean_scores), max(mean_scores),
                    np.mean(np.asarray(mean_scores)))
    print('K-means min: %f, max: %f, avg: %f' % mean_summary)
    print('K-means avg time: %f' % np.mean(np.asarray(mean_secs)))

    medoid_summary = (min(medoid_scores), max(medoid_scores),
                      np.mean(np.asarray(medoid_scores)))
    print('K-medoids min: %f, max: %f, avg: %f' % medoid_summary)
    print('K-medoids avg time: %f' % np.mean(np.asarray(medoid_secs)))
def main():
    """Find the best- and worst-clustering pair of people with k-medoids.

    The active code loads the LFW data, fits ``util.PCA``, generates the toy
    points, and then (part 3c) runs a cheat-initialised 2-cluster
    ``kMedoids`` on every ordered pair of distinct labels in 0..18, plotting
    representative images for the pairs with the highest and the lowest
    ``score()``.  Parts 1c, 2d-2f, 3a and 3b are disabled.
    """
    ### ========== TODO : START ========== ###
    # part 1: explore LFW data set
    X, y = get_lfw_data()
    #show_image(np.mean(X, axis=0)) #axis 0 is finds the average of the column for all of the images
    U, mu = util.PCA(X)

    # ---- disabled: part 1c ----------------------------------------------
    # l_values = [1,10, 50,100,500, 1288]
    # image_arr = np.arange(start=0, stop=12)
    # for l in l_values:
    #     Z, Ul = util.apply_PCA_from_Eig(X, U, l, mu) # to lower the dimension of the images
    #     X_rec = reconstruct_from_PCA(Z,Ul,mu)
    #     title = "Reconstructed images for l = %d" % (l)
    #     print title
    #     plot_gallery(X_rec, title= title)
    ### ========== TODO : END ========== ###

    ### ========== TODO : START ========== ###
    # part 2d-2f: cluster toy dataset
    #part 2d
    np.random.seed(1234)
    points = generate_points_2d(N=20)
    print("2d")
    #uncomment this
    # clusters = kMeans(points,3,'random',True)
    # #end of 2d
    # #part 2e
    # medoid_cluster = kMedoids(points,3,'random',True)
    # #end of 2e
    # #part 2f, cheat initialization
    # kmeans_cheat = kMeans(points,3,'cheat',True)
    # kmedoid_cheat = kMedoids(points,3,'cheat', True)
    ### ========== TODO : END ========== ###

    #IMPORTANT
    # ---- disabled: part 3a ----------------------------------------------
    # ### ========== TODO : START ========== ###
    # # part 3a: cluster faces
    # np.random.seed(1234)
    # X1, y1 = util.limit_pics(X, y, [4, 6, 13, 16], 40)
    # points = build_face_image_points(X1, y1)
    # kmean_score_list = []
    # kmedoid_score_list = []
    # for i in np.arange(0,10):
    #     kmean_cluster = kMeans(points, 4, 'random', False)
    #     kmean_score_list.append(kmean_cluster.score())
    #     kmedoid_cluster = kMedoids(points, 4, 'random', False)
    #     kmedoid_score_list.append(kmedoid_cluster.score())
    # kmean_avg = np.mean(kmean_score_list)
    # kmean_max = max(kmean_score_list)
    # kmean_min = min(kmean_score_list)
    # kmedoid_avg = np.mean(kmedoid_score_list)
    # kmedoid_max = max(kmedoid_score_list)
    # kmedoid_min = min(kmedoid_score_list)

    #IMPORTANT
    # ---- disabled: part 3b ----------------------------------------------
    # # part 3b: explore effect of lower-dimensional representations on clustering performance
    # print ("3b")
    # np.random.seed(1234)
    # # Use PCA to get the the eigenfaces (and eigenvectors)
    # U, mu = util.PCA(X)
    # l_range = np.arange(1,42)
    # k = 2
    # X2, y2 = util.limit_pics(X, y, [4, 13], 40)
    # kmean_score_dict = {}
    # kmedoid_score_dict = {}
    # for l in l_range:
    #     Z1, Ul1 = apply_PCA_from_Eig(X2,U,l, mu) #reduce the dimension
    #     X2_reconstructed = reconstruct_from_PCA(Z1,Ul1,mu)
    #     points = build_face_image_points(X2_reconstructed, y2)
    #     kmeans_clust = kMeans(points,k,'cheat',False)
    #     kmedoid_clust = kMedoids(points,k,'cheat',False)
    #     kmean_score_dict[l] = kmeans_clust.score()
    #     kmedoid_score_dict[l] = kmedoid_clust.score()
    # print "3b here"
    # plt.plot(list(kmean_score_dict.keys()),list(kmean_score_dict.values()), color= 'b', label='kMeans')
    # plt.plot(list(kmedoid_score_dict.keys()),list(kmedoid_score_dict.values()),color= 'g', label='kMedoid')
    # plt.title("kMean and kMedoid Scores vs l (Number of Principal Components)")
    # plt.xlabel("l (Number of Principal Components)")
    # plt.ylabel("kMean and kMedoid Scores")
    # plt.legend()
    # plt.show()

    # part 3c: determine ``most discriminative'' and ``least discriminative'' pairs of images
    np.random.seed(1234)
    # (score, i, j) triples for the extreme pairs found so far.
    min_score = (np.inf, None, None)
    max_score = (-1, None, None)
    #we know there are 19 people
    print("Starting")
    for i in range(0, 19):
        for j in range(0, 19):
            if i == j:  #if on the same person
                continue
            X3, y3 = util.limit_pics(X, y, [i, j], 40)  #receive the images
            points = build_face_image_points(X3, y3)
            kmedoid_clust = kMedoids(points, 2, 'cheat', False)
            # Evaluate the score once and store the VALUE.  The original
            # stored the bound method ``kmedoid_clust.score``, so every later
            # comparison was a number against a method object.
            score = kmedoid_clust.score()
            if score < min_score[0]:
                min_score = (score, i, j)
            if score > max_score[0]:
                max_score = (score, i, j)

    #now we have the min and max clusters
    print("before the plot")
    plot_representative_images(
        X, y, [max_score[1], max_score[2]],
        title="Images with Maximum Cluster Score (Best Clustering)")
    plot_representative_images(
        X, y, [min_score[1], min_score[2]],
        title="Images with Minumum Cluster Score (Worst Clustering)")