# Flat clustering of the PCE distance matrix, compared against the labels
# derived from the file names in filelist.txt.
# NOTE(review): the same steps appear again further down in this file;
# confirm whether this copy is still needed.
matrix = matrix_pce

#hierarchical clustering part starts here
linkage = dendro.compute_linkage(matrix)

#methods = ['single', 'complete', 'average', 'weighted', 'centroid', 'median', 'ward']
#for method in methods:
#    linkage = sch.linkage(matrix, method=method)

# default threshold used in sch.dendrogram is 0.7*linkage[:,2].max()
threshold = 0.7 * linkage[:, 2].max()

#dendrogram = dendro.compute_dendrogram(linkage)
dendrogram = dendro.plot_dendrogram_and_matrix(linkage, matrix, color_threshold=threshold)

#compute flat clustering in the exact same way as sch.dendrogram colors the clusters
# The builtin int replaces the removed numpy.int alias (same dtype).
cluster = numpy.array(sch.fcluster(linkage, threshold, criterion='distance'), dtype=int)

# make numpy print the full array; a NaN threshold is rejected by current
# NumPy, sys.maxsize is the documented way to disable summarization
import sys
numpy.set_printoptions(threshold=sys.maxsize)
print("flat clustering:\n", cluster)

#get the actual clustering
# dtype=str keeps the names as text so str.split works on Python 3;
# ndmin=1 keeps a one-entry file list iterable.
filelist = numpy.loadtxt(directory + "/filelist.txt", dtype=str, ndmin=1)
# Everything before the last underscore of a file name identifies its cluster.
true_clustering = ["_".join(s.split("_")[:-1]) for s in filelist]
true_clusters = sorted(set(true_clustering))
# Map each cluster name to its position in the sorted list once, instead of
# calling list.index for every file.
label_of = {name: index for index, name in enumerate(true_clusters)}
true_labels = numpy.array([label_of[name] for name in true_clustering], dtype=int)
print("true clustering:\n", true_labels)
def get_ground_truth():
    """Build the reference distance matrix from ``filelist.txt``.

    Reads the file names listed in ``directory + "/filelist.txt"`` (using the
    module-level ``directory`` at call time). The part of each name before its
    last underscore is used as the group key.

    Returns:
        A square float array with one row/column per listed file, holding
        0.0 on the diagonal, 1.0 where two different files share the same
        key, and 100.0 everywhere else.
    """
    # dtype=str keeps names as text so str.split works on Python 3;
    # ndmin=1 keeps a one-entry file list indexable.
    filelist = numpy.loadtxt(directory + "/filelist.txt", dtype=str, ndmin=1)
    # Compute each key once instead of twice per (i, j) pair.
    cameras = numpy.array(["_".join(name.split("_")[:-1]) for name in filelist], dtype=str)
    same_camera = cameras[:, None] == cameras[None, :]
    matrix_ans = numpy.where(same_camera, 1.0, 100.0)
    numpy.fill_diagonal(matrix_ans, 0.0)
    return matrix_ans


#if __name__ == "__main__":
#    import sys
#    if len(sys.argv) != 2:
#        print("Usage: ./camera_identification <name-of-dataset>")
#        exit()
#    import os
#    if not os.path.isdir(directory + "/" + sys.argv[1]):
#        print("incorrect dataset name, cannot find " + directory + "/" + sys.argv[1])
#        exit()

dataset = "pentax" #sys.argv[1]
directory = directory + "/" + dataset

#load the distance matrices from files
matrix_pce = numpy.fromfile(directory + "/matrix-" + dataset + "-pce.dat", dtype='>d')
matrix_pce0 = numpy.fromfile(directory + "/matrix-" + dataset + "-pce0.dat", dtype='>d')
matrix_ncc = numpy.fromfile(directory + "/matrix-" + dataset + "-ncc.dat", dtype='>d')

matrix_pce, matrix_ncc = map_ncc_scores_to_pce_domain(matrix_pce, matrix_ncc)

matrix_ncc = convert_similarity_to_distance(matrix_ncc)
matrix_pce = convert_similarity_to_distance(matrix_pce)
matrix_pce0 = convert_similarity_to_distance(matrix_pce0)

matrix_ans = get_ground_truth()

plot_distance_matrices(matrix_ncc, matrix_pce, matrix_pce0, matrix_ans)

#set metric to use for clustering
matrix = matrix_pce

#hierarchical clustering part starts here
linkage = dendro.compute_linkage(matrix)

#methods = ['single', 'complete', 'average', 'weighted', 'centroid', 'median', 'ward']
#for method in methods:
#    linkage = sch.linkage(matrix, method=method)

# default threshold used in sch.dendrogram is 0.7*linkage[:,2].max()
threshold = 0.7 * linkage[:, 2].max()

#dendrogram = dendro.compute_dendrogram(linkage)
dendrogram = dendro.plot_dendrogram_and_matrix(linkage, matrix, color_threshold=threshold)

#compute flat clustering in the exact same way as sch.dendrogram colors the clusters
# The builtin int replaces the removed numpy.int alias (same dtype).
cluster = numpy.array(sch.fcluster(linkage, threshold, criterion='distance'), dtype=int)

# make numpy print the full array; a NaN threshold is rejected by current
# NumPy, sys.maxsize is the documented way to disable summarization
import sys
numpy.set_printoptions(threshold=sys.maxsize)
print("flat clustering:\n", cluster)

#get the actual clustering
filelist = numpy.loadtxt(directory + "/filelist.txt", dtype=str, ndmin=1)
# Everything before the last underscore of a file name identifies its cluster.
true_clustering = ["_".join(s.split("_")[:-1]) for s in filelist]
true_clusters = sorted(set(true_clustering))
# Map each cluster name to its position in the sorted list once, instead of
# calling list.index for every file.
label_of = {name: index for index, name in enumerate(true_clusters)}
true_labels = numpy.array([label_of[name] for name in true_clustering], dtype=int)
print("true clustering:\n", true_labels)

print_metrics(true_labels, cluster)