def clusterize_aggroCluster(videos_hash, nb_clusters):
    """Cluster video hashes with Ward-linkage agglomerative clustering.

    Args:
        videos_hash: Passed unchanged to ``createInput``, which yields the
            video index and the feature matrix ``X``.
        nb_clusters: Number of clusters to form.

    Returns:
        A 4-tuple ``(ri, videos_indice, aggroCluster, listCluster)``: the
        value returned by ``rand_index`` (also printed), the index returned
        by ``createInput``, the fitted estimator, and the cluster list built
        by ``createClusterList``.
    """
    videos_indice, X = createInput(videos_hash)

    # Ward linkage only works with Euclidean distances, which is also the
    # estimator's default, so the distance keyword is left implicit. This
    # keeps the call valid on scikit-learn releases that renamed
    # ``affinity`` to ``metric``.
    aggroCluster = AgglomerativeClustering(
        n_clusters=nb_clusters, linkage='ward').fit(X)

    # fit() has already stored the assignment in ``labels_``; calling
    # fit_predict() here would rebuild the whole tree a second time.
    listCluster = createClusterList(aggroCluster.labels_, videos_indice)

    ri = rand_index(listCluster)
    print(ri)
    return ri, videos_indice, aggroCluster, listCluster
def clusterize_kmeans(videos_hash, nb_clusters):
    """Cluster video hashes with k-means and score the result.

    Args:
        videos_hash: Passed unchanged to ``createInput``, which yields the
            video index and the feature matrix.
        nb_clusters: Number of clusters requested from ``KMeans``.

    Returns:
        A 4-tuple: the value returned by ``rand_index`` (also printed), the
        index returned by ``createInput``, the fitted ``KMeans`` estimator,
        and the cluster list built by ``createClusterList``.
    """
    videos_indice, features = createInput(videos_hash)

    # Fixed seed so repeated runs give the same partition.
    model = KMeans(n_clusters=nb_clusters, random_state=0, n_init=5,
                   max_iter=300)
    model.fit(features)

    assignments = model.predict(features)
    grouped = createClusterList(assignments, videos_indice)

    score = rand_index(grouped)
    print(score)
    return score, videos_indice, model, grouped
def main():
    """Group observation names by class and hand the groups to ``rand_index``.

    Lines are read through ``fileinput.input()``. Each non-blank line is
    split on single spaces and then on dots, and must yield exactly three
    parts: class name, observation name and extension. Blank lines are
    skipped. Every class and its set of observation names is printed, then
    the list of sets is passed to ``rand_index`` (its return value is not
    used here).

    Raises:
        ValueError: If a non-blank line does not split into exactly three
            parts.
    """
    classes = {}
    for line in fileinput.input():
        if not line.strip():
            # A trailing or stray empty line would otherwise break the
            # three-way unpacking below.
            continue
        # The extension is parsed only to validate the line shape.
        class_name, observation_name, _extension = [
            part for chunk in line.split(' ') for part in chunk.split('.')
        ]
        classes.setdefault(class_name, set()).add(observation_name)

    for class_name in classes:
        print(class_name, classes[class_name])

    clusters = list(classes.values())
    rand_index(clusters)
def comparison(labels, names, n_clusters=970):
    """Bucket names by cluster label and score the buckets with ``rand_index``.

    Args:
        labels: Sequence of integer cluster labels, one per name. Each label
            is used directly as an index into the bucket list.
        names: Sequence of names aligned with ``labels``.
        n_clusters: Number of buckets to allocate. Defaults to 970, the
            value that used to be hard-coded.

    Returns:
        Whatever ``rand_index`` returns for the list of ``n_clusters`` sets.

    Raises:
        IndexError: If a label is ``>= n_clusters``.
    """
    sets = [set() for _ in range(n_clusters)]

    # Walk every (label, name) pair instead of assuming exactly 9700 items.
    for label, name in zip(labels, names):
        sets[label].add(name)

    # Single-argument call form prints the same under Python 2 and Python 3.
    print(sets)
    return rand_index(sets)
def main(n_clusters, do_weight, cluster):
    """Hash videos in parallel over MPI, cluster them on rank 0 and print the score.

    Uses the module-level ``comm``, ``rank`` and ``size``. Rank 0 loads the
    file names and deals them round-robin into ``size`` chunks. Every rank
    receives one chunk, builds a representation for each of its videos, and
    the results are gathered back on rank 0, which clusters and scores them.

    Args:
        n_clusters: Forwarded to ``load_filenames``, to the clustering
            function and to ``rand_index``.
        do_weight: Forwarded to ``generate_video_representation``.
        cluster: Clustering method; one of ``'kmeans'``, ``'gmm'``, ``'ac'``.

    Raises:
        ValueError: If ``cluster`` is not one of the supported methods.
    """
    if cluster not in ('kmeans', 'gmm', 'ac'):
        # Checked on every rank before any work or communication, so a typo
        # neither wastes the hashing pass nor leaves ranks waiting.
        raise ValueError(
            "unknown clustering method %r; expected 'kmeans', 'gmm' or 'ac'"
            % (cluster,))

    # Divide video file paths into chunks to be sent to the processes.
    if rank == 0:
        start_time = time.time()
        video_files = load_filenames(n_clusters=n_clusters)
        chunks = [[] for _ in range(size)]
        for i, video_file in enumerate(video_files):
            chunks[i % size].append((video_file, path_to_name(video_file)))
    else:
        chunks = None
    comm.Barrier()

    # Scatter the chunks; each process hashes its own share.
    video_files_and_names = comm.scatter(chunks, root=0)
    local_results = []
    # A plain loop also copes with an empty chunk (more ranks than videos).
    for video_file, name in video_files_and_names:
        video, weight = generate_video_representation(video_file, do_weight)
        local_results.append((video, weight, name))

    # Gather a concrete list from each process.
    data = comm.gather(local_results, root=0)

    # Only process 0, which gathered all data, does the clustering.
    if rank == 0:
        videos, weights, video_names = zip(
            *[triple for rank_results in data for triple in rank_results])
        videos = np.asarray(list(videos))

        if cluster == 'kmeans':
            clusters = cluster_videos_kmeans(videos, weights, video_names,
                                             n_clusters)
        elif cluster == 'gmm':
            clusters = cluster_videos_gmm(videos, weights, video_names,
                                          n_clusters, cov_type="diag")
        else:  # 'ac' -- the only remaining validated value
            clusters = cluster_videos_ac(videos, weights, video_names,
                                         n_clusters)

        elapsed = time.time() - start_time

        # Score the clustering method.
        score = rand_index(clusters, n_clusters)
        # One pre-formatted string yields the same bytes under the Python 2
        # print statement and the Python 3 print function.
        print("Scores:  %s \nExecution time: %s"
              % (np.round(score, 2), elapsed))
def main(n_clusters, do_weight, cluster):
    """Build video representations, cluster them and print the score.

    Args:
        n_clusters: Forwarded to ``load_filenames``, to the clustering
            function and to ``rand_index``.
        do_weight: Forwarded to ``generate_video_representation``.
        cluster: Clustering method; one of ``'kmeans'``, ``'gmm'``, ``'ac'``.

    Raises:
        ValueError: If ``cluster`` is not one of the supported methods.
    """
    if cluster not in ('kmeans', 'gmm', 'ac'):
        # Reject a bad method name before the expensive representation pass
        # rather than failing afterwards on an unbound ``clusters``.
        raise ValueError(
            "unknown clustering method %r; expected 'kmeans', 'gmm' or 'ac'"
            % (cluster,))

    video_files = load_filenames(n_clusters=n_clusters)
    # Concrete list: a lazy ``map`` could only be consumed once downstream.
    video_names = [path_to_name(path) for path in video_files]

    # Each call yields a (video, weight) pair; transpose into two tuples.
    videos, weights = zip(*[
        generate_video_representation(path, do_weight)
        for path in video_files
    ])
    videos = np.asarray(list(videos))

    # Cluster the videos.
    if cluster == 'kmeans':
        clusters = cluster_videos_kmeans(videos, weights, video_names,
                                         n_clusters)
    elif cluster == 'gmm':
        clusters = cluster_videos_gmm(videos, weights, video_names,
                                      n_clusters)
    else:  # 'ac' -- the only remaining validated value
        clusters = cluster_videos_ac(videos, weights, video_names,
                                     n_clusters)

    # Score the clustering method.
    score = rand_index(clusters, n_clusters)
    print(score)
def pred_to_clusters(list_filenames, pred, n_clusters):
    """Turn predicted labels into name sets and score them.

    Args:
        list_filenames: File paths; each is converted with ``path_to_name``.
        pred: Iterable of integer cluster labels, positionally aligned with
            ``list_filenames``.
        n_clusters: Number of sets to allocate; also forwarded to
            ``adjusted_rand_index.rand_index``.

    Returns:
        The value returned by ``adjusted_rand_index.rand_index``.
    """
    names = list(map(path_to_name, list_filenames))

    buckets = []
    for _ in range(n_clusters):
        buckets.append(set())

    # The label selects the bucket; the position selects the name.
    for position, label in enumerate(pred):
        members = buckets[label]
        members.add(names[position])

    return adjusted_rand_index.rand_index(buckets, n_clusters)
def check_res(result_sets):
    """Print the score ``adjusted_rand_index.rand_index`` gives ``result_sets``.

    Args:
        result_sets: Passed unchanged to ``adjusted_rand_index.rand_index``.
    """
    # Imported inside the function, so the module is only needed when the
    # check is actually run.
    import adjusted_rand_index

    # The scoring script supplied by the teacher does all the work; this
    # function only reports its result.
    score = adjusted_rand_index.rand_index(result_sets)
    print(score)
# ---- Persist intermediate results -------------------------------------
# Both files hold pickled objects despite the ``.txt`` suffix.
with open("features.txt", "wb") as fp:
    pickle.dump(features, fp)
with open("names.txt", "wb") as fb:
    pickle.dump(names, fb)

# ---- Clustering -------------------------------------------------------
labels = cluster_videos(features, names)
t2 = datetime.now()

# ---- Adjusted Rand index ----------------------------------------------
# Order the video names by their cluster label, then cut the ordered list
# at every position where the label increases: one chunk per cluster.
sidx = np.argsort(labels)
sorted_labels = np.take(labels, sidx)
sorted_names = np.take(names, sidx, axis=0)
split_idx = np.flatnonzero(np.diff(sorted_labels) > 0) + 1
out = np.split(sorted_names, split_idx)
clusters = [set(group) for group in out]

# Score against the reference clusters.
score = ARI.rand_index(clusters)

# ---- Report -----------------------------------------------------------
print('Computational time')
timing_report = 'Processing: {}\nClustering: {}\nOverall: {}\n'.format(
    t1 - t0, t2 - t1, t2 - t0)
print(timing_report)
print('Adj. Rand Index: {:.5f}'.format(score))
print("Using " + str(full_data_feat.shape[1]) + " features.")

# Pairwise Hamming distance (count of differing features) via broadcasting.
# NOTE(review): assumes full_data_feat is 2-D (samples x features); the
# comparison then builds a samples x samples x features boolean temporary
# before the sum, which can be very large -- confirm it fits in memory.
print("Computing distances...")
sim_matrix = (full_data_feat[:, None, :] != full_data_feat).sum(2)

# Bottom-up hierarchical clustering with complete (maximal) linkage on the
# precomputed distance matrix.
# NOTE(review): recent scikit-learn releases rename ``affinity`` to
# ``metric``; confirm the installed version still accepts this keyword.
print("Clustering data...")
model = AgglomerativeClustering(n_clusters=970, linkage="complete",
                                affinity="precomputed")
model.fit(sim_matrix)

# Print a snippet of the assigned clusters.
print(model.labels_)

# One set of video names per cluster id. Sizing from the estimator keeps
# the cluster count defined in a single place.
clusters = [set() for _ in range(model.n_clusters)]
for i, name in enumerate(full_data_name):
    cluster_idx = model.labels_[i]
    # Only the part of the name before the first dot identifies the video.
    clusters[cluster_idx].add(name.split('.')[0])

# Report the score and the time used.
print("Final rand index = ", rand_index(clusters))
print("Query took %0.2f seconds" % (time.time() - t0))