def domanda_6():
    """
    Print the distortion obtained on dataset_212 by both clustering methods.

    Hierarchical clustering is run first, then k-means; each call returns a
    3-tuple of which only the last element (the distortion) is reported.
    The module-level ``weighted`` flag is forwarded to both algorithms.
    NOTE(review): the arguments 9 and 5 are presumably the cluster count and
    the number of k-means iterations -- confirm against the callee signatures.
    """
    print("Computing: hierchical clustering on dataset_212...")
    _, _, hierarchical_distortion = hierarchical_clustering(
        dataset_212, 9, weighted)
    print("Distortion for hierchical clustering:", hierarchical_distortion)

    print("Computing: kmeans clustering on dataset_212...")
    _, _, kmeans_distortion = kmeans_clustering(dataset_212, 9, 5, weighted)
    print("Distortion for kmeans clustering:", kmeans_distortion)
def visualize_data(cluster_input, data, method=None, display_centers=False):
    """
    Load a data table, cluster it with the selected method and plot the result.

    Parameters:
        cluster_input: argument forwarded to the clustering routine. It is
            passed unchanged to sequential and hierarchical clustering; for
            'kmeans_clustering' it must be indexable, because
            cluster_input[0] and cluster_input[1] are passed as the two
            k-means arguments.
        data: location handed to load_data_table.
        method: None (sequential clustering), 'hierarchical_clustering' or
            'kmeans_clustering'.
        display_centers: forwarded to plot_clusters as its third argument.

    Raises:
        ValueError: if ``method`` is not one of the recognized values.
            Previously the function printed the message and then failed with
            an UnboundLocalError because no cluster list had been computed.
    """
    data_table = load_data_table(data)
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]

    if method is None:
        cluster_list = sequential_clustering(singleton_list, cluster_input)
        print("Displaying", len(cluster_list), "sequential clusters")
    elif method == 'hierarchical_clustering':
        cluster_list = clustering.hierarchical_clustering(
            singleton_list, cluster_input)
        print("Displaying", len(cluster_list), "hierarchical clusters")
    elif method == 'kmeans_clustering':
        cluster_list = clustering.kmeans_clustering(
            singleton_list, cluster_input[0], cluster_input[1])
        print("Displaying", len(cluster_list), "k-means clusters")
    else:
        # Nothing to plot without a clustering, so fail with a clear error.
        raise ValueError(
            "ERROR: method entered into visualize_data not recognized")

    alg_clusters_matplotlib.plot_clusters(data_table, cluster_list,
                                          display_centers)
def run_example():
    """
    Load the DATA_111_URL table, cluster it with k-means and plot the result.

    The module-level DESKTOP flag selects the plotting backend: matplotlib
    when it is true (two plots, without and with cluster centers), simplegui
    otherwise.
    """
    data_table = load_data_table(DATA_111_URL)
    singleton_list = [
        alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4])
        for line in data_table
    ]

    # Alternatives to try here instead of k-means:
    #   sequential_clustering(singleton_list, 15)
    #   clustering.hierarchical_clustering(singleton_list, 15)
    cluster_list = clustering.kmeans_clustering(singleton_list, 9, 5)

    # A single pre-formatted string prints identically under Python 2 and 3
    # (the former print statement was a syntax error on Python 3).
    print("Displaying {} k-means clusters".format(len(cluster_list)))

    # draw the clusters using matplotlib or simplegui
    if DESKTOP:
        alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, False)
        # second plot adds the cluster centers
        alg_clusters_matplotlib.plot_clusters(data_table, cluster_list, True)
    else:
        # use the toggle in the GUI to add cluster centers
        alg_clusters_simplegui.PlotClusters(data_table, cluster_list)
def domanda_5():
    """
    Run k-means on dataset_212 and draw the resulting clustering.

    Only the first element of the 3-tuple returned by kmeans_clustering is
    used. The plot title gets a " (v. pesata)" suffix when the module-level
    ``weighted`` flag is set.
    """
    print("Computing: kmeans clustering on dataset_212...")
    clusters, _, _ = kmeans_clustering(dataset_212, 9, 5, weighted)

    print("Drawing...")
    title = "Clustering k-means su 212 contee"
    if weighted:
        title += " (v. pesata)"
    draw_clustering(clusters, title)
def domanda_2():
    """
    Run k-means on dataset_full and draw the resulting clustering.

    Only the first element of the 3-tuple returned by kmeans_clustering is
    used. The plot title gets a " (v. pesata)" suffix when the module-level
    ``weighted`` flag is set.
    """
    print("Computing: kmeans clustering on dataset_full...")
    clusters, _, _ = kmeans_clustering(dataset_full, 15, 5, weighted)

    print("Drawing...")
    title = "Clustering k-means sull'intero dataset"
    if weighted:
        title += " (v. pesata)"
    draw_clustering(clusters, title)
def kmeans_dist(data_url):
    """
    Compute the k-means distortion for every cluster count from 6 to 20.

    Returns a dict mapping the cluster count to the value returned by
    ca.compute_distortion for that clustering.
    """
    data_table = load_data_table(data_url)
    distortion_by_count = {}
    for cluster_count in range(6, 21):
        # A fresh list of singleton clusters is built for every run, exactly
        # as before, so no run sees clusters touched by a previous one.
        singletons = [
            alg_cluster.Cluster({row[0]}, row[1], row[2], row[3], row[4])
            for row in data_table
        ]
        clustered = alg_project3_solution.kmeans_clustering(
            singletons, cluster_count, 5)
        distortion_by_count[cluster_count] = ca.compute_distortion(
            clustered, data_table)
    return distortion_by_count
def domanda_9():
    """
    Plot hierarchical vs. k-means distortion for the three county datasets.

    For each dataset the hierarchical distortion list is sliced to the
    [6, 21) range and k-means is run once per cluster count in that range;
    the third element of each k-means result is taken as its distortion.
    Plot titles get a " (v. pesata)" suffix when ``weighted`` is set.
    """
    print("Computing...")
    min_c, max_c = 6, 21
    cluster_counts = range(min_c, max_c)
    datasets = {
        "dataset con 212 contee": dataset_212,
        "dataset con 562 contee": dataset_562,
        "dataset con 1041 contee": dataset_1041
    }

    for label, dataset in datasets.items():
        all_h_distortions = hierarchical_clustering_distortion_list(
            dataset, min_c, weighted)
        h_distortions = all_h_distortions[min_c:max_c]

        k_distortions = []
        for count in cluster_counts:
            k_distortions.append(
                kmeans_clustering(dataset, count, 5, weighted)[2])

        title = label + (" (v. pesata)" if weighted else "")
        draw_distortion(list(cluster_counts), h_distortions, k_distortions,
                        title)
def test_kmeans():
    """
    Test student.kmeans_clustering against known results on the 24 county set.

    Each fixture entry is [num_clusters, num_iterations, expected], where
    ``expected`` is the set of county-code tuples that set_of_county_tuples
    should produce for the student's clustering. kmeans_clustering should not
    mutate cluster_list, but a new list of singleton clusters is built for
    each case anyway so that one case can never influence the next.
    Results are reported through a poc_simpletest.TestSuite.
    """
    # load small data table
    # (print calls with a single string behave the same on Python 2 and 3)
    print("")
    print("Testing kmeans_clustering on 24 county set")
    data_24_table = load_data_table(DATA_24_URL)

    kmeansdata_24 = [
        [15, 1, set([('34017', '36061'), ('06037',), ('06059',), ('36047',),
                     ('36081',), ('06071', '08031'), ('36059',), ('36005',),
                     ('55079',), ('34013', '34039'), ('06075',), ('01073',),
                     ('06029',), ('41051', '41067'),
                     ('11001', '24510', '51013', '51760', '51840',
                      '54009')])],
        [15, 3, set([('34017', '36061'), ('06037', '06059'), ('06071',),
                     ('36047',), ('36081',), ('08031',), ('36059',),
                     ('36005',), ('55079',), ('34013', '34039'), ('06075',),
                     ('01073',), ('06029',), ('41051', '41067'),
                     ('11001', '24510', '51013', '51760', '51840',
                      '54009')])],
        [15, 5, set([('34017', '36061'), ('06037', '06059'), ('06071',),
                     ('36047',), ('36081',), ('08031',), ('36059',),
                     ('36005',), ('55079',), ('34013', '34039'), ('06075',),
                     ('01073',), ('06029',), ('41051', '41067'),
                     ('11001', '24510', '51013', '51760', '51840',
                      '54009')])],
        [10, 1, set([('34017', '36061'), ('06029', '06037', '06075'),
                     ('11001', '24510', '34013', '34039', '51013', '51760',
                      '51840', '54009'),
                     ('06059',), ('36047',), ('36081',),
                     ('06071', '08031', '41051', '41067'), ('36059',),
                     ('36005',), ('01073', '55079')])],
        [10, 3, set([('34013', '34017', '36061'),
                     ('06029', '06037', '06075'),
                     ('08031', '41051', '41067'), ('06059', '06071'),
                     ('34039', '36047'), ('36081',), ('36059',), ('36005',),
                     ('01073', '55079'),
                     ('11001', '24510', '51013', '51760', '51840',
                      '54009')])],
        [10, 5, set([('34013', '34017', '36061'),
                     ('06029', '06037', '06075'),
                     ('08031', '41051', '41067'), ('06059', '06071'),
                     ('34039', '36047'), ('36081',), ('36059',), ('36005',),
                     ('01073', '55079'),
                     ('11001', '24510', '51013', '51760', '51840',
                      '54009')])],
        [5, 1, set([('06029', '06037', '06075'),
                    ('01073', '11001', '24510', '34013', '34017', '34039',
                     '36047', '51013', '51760', '51840', '54009', '55079'),
                    ('06059',), ('36005', '36059', '36061', '36081'),
                    ('06071', '08031', '41051', '41067')])],
        [5, 3, set([('06029', '06037', '06075'),
                    ('11001', '24510', '34013', '34017', '34039', '36005',
                     '36047', '36059', '36061', '36081', '51013'),
                    ('08031', '41051', '41067'), ('06059', '06071'),
                    ('01073', '51760', '51840', '54009', '55079')])],
        [5, 5, set([('06029', '06037', '06075'),
                    ('08031', '41051', '41067'), ('06059', '06071'),
                    ('01073', '55079'),
                    ('11001', '24510', '34013', '34017', '34039', '36005',
                     '36047', '36059', '36061', '36081', '51013', '51760',
                     '51840', '54009')])],
    ]

    suite = poc_simpletest.TestSuite()

    for num_clusters, num_iterations, expected_county_tuple in kmeansdata_24:
        # fresh singleton clusters for every case (see docstring)
        cluster_list = [
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4])
            for line in data_24_table
        ]

        # compute student answer
        student_clustering = student.kmeans_clustering(
            cluster_list, num_clusters, num_iterations)
        student_county_tuple = set_of_county_tuples(student_clustering)

        # Prepare test
        error_message = "Testing kmeans_custering on 24 county table, num_clusters = " + str(num_clusters)
        error_message += " num_iterations = " + str(num_iterations)
        error_message += "\nStudent county tuples: " + str(student_county_tuple)
        error_message += "\nExpected county tuples: " + str(expected_county_tuple)
        suite.run_test(student_county_tuple == expected_county_tuple, True,
                       error_message)

    suite.report_results()
def get_cluster_labels(utterances):
    """
    Calculates cluster labels for all the utterances.

    With more than 100 documents, word2vec SIF embeddings are reduced with
    UMAP and clustered with HDBSCAN. With 100 documents or fewer, pretrained
    transformer embeddings are reduced with PCA (at most 10 components) and
    clustered with k-means (at most min(n_docs, 80) clusters).

    :utterances: list of strings
    :return: whatever the selected clustering routine returns; presumably a
             list of integer cluster labels, one per utterance -- confirm
             against the clustering module.
    """
    n_docs = len(utterances)

    # Large collections: UMAP + HDBSCAN.
    if n_docs > 100:
        sif_embeddings = embeddings.word2vec_sif_embeddings(
            utterances, model_name=None)
        umap_reduced = clustering.reduce_dimensions_umap(sif_embeddings)
        return clustering.hdbscan_clustering(umap_reduced)

    # Small collections: PCA + k-means.
    transformer_embeddings = embeddings.pretrained_transformer_embeddings(
        utterances)
    optimal_components = clustering.get_optimal_n_components(
        transformer_embeddings)
    n_components = min(optimal_components, 10)
    pca_reduced = clustering.reduce_dimensions_pca(
        transformer_embeddings, dimensions=n_components)
    return clustering.kmeans_clustering(
        pca_reduced, max_num_clusters=min(n_docs, 80))
def compute_q5_q6():
    """
    Print the k-means and hierarchical distortion for the DATA_111_URL table.

    Both methods are asked for 9 clusters; k-means gets 5 as its third
    argument. The same list of singleton clusters is handed to both calls.
    """
    # Load the data and wrap every row in a singleton cluster
    table111 = viz_tools.load_data_table(DATA_111_URL)
    singleton_list = [
        alg_cluster.Cluster({row[0]}, row[1], row[2], row[3], row[4])
        for row in table111
    ]

    # Order matters: k-means goes first because
    # clustering.hierarchical_clustering mutates the list of clusters.
    kmeans_clusters = clustering.kmeans_clustering(singleton_list, 9, 5)
    k_distortion = compute_distortion(kmeans_clusters, table111)
    print("K-means Distortion: {}".format(k_distortion))

    hierarchical_clusters = clustering.hierarchical_clustering(
        singleton_list, 9)
    h_distortion = compute_distortion(hierarchical_clusters, table111)
    print("Hierarchical Distortion: {}".format(h_distortion))
def test_compute_distortion():
    """
    Print the k-means and hierarchical distortion for the DATA_290_URL table.

    Both methods are asked for 16 clusters. Nothing is asserted; the printed
    values are meant to be compared by eye with the expected figures noted
    in the comments below.
    """
    # Load the data and wrap every row in a singleton cluster
    table290 = viz_tools.load_data_table(DATA_290_URL)
    singleton_list = [
        alg_cluster.Cluster({row[0]}, row[1], row[2], row[3], row[4])
        for row in table290
    ]

    # Order matters: k-means goes first because
    # clustering.hierarchical_clustering mutates the list of clusters.

    # Test 2: expect 2.323x10^11
    kmeans_clusters = clustering.kmeans_clustering(singleton_list, 16, 5)
    k_distortion = compute_distortion(kmeans_clusters, table290)
    print("K-means Distortion: {}".format(k_distortion))

    # Test 1: expect 2.575x10^11
    hierarchical_clusters = clustering.hierarchical_clustering(
        singleton_list, 16)
    h_distortion = compute_distortion(hierarchical_clusters, table290)
    print("Hierarchical Distortion: {}".format(h_distortion))
def q10(plot_key):
    """
    Plot k-means vs. hierarchical distortion for one dataset.

    Parameters:
        plot_key: 111, 290 or 896, selecting the dataset loaded from
            DATA_111_URL, DATA_290_URL or DATA_896_URL. Any other value
            produces the same titled but empty plot as before.

    For the selected dataset both algorithms are run for every cluster count
    from 20 down to 6 and the distortion of each result is plotted against
    the cluster count. Only the selected table is loaded; the earlier
    version loaded all three tables and built six cluster lists on every
    call.
    """
    # plot_key -> (data URL, k-means style, k-means label,
    #              hierarchical style, hierarchical label)
    plot_specs = {
        111: (DATA_111_URL, '-bo', 'K-means (111)',
              '-co', 'Hierarchical (111)'),
        290: (DATA_290_URL, '-go', 'K-means (290)',
              '-mo', 'Hierarchical (290)'),
        896: (DATA_896_URL, '-ro', 'K-means (896)',
              '-yo', 'Hierarchical (896)'),
    }
    spec = plot_specs.get(plot_key)

    cluster_counts = list(range(20, 5, -1))
    k_distortions, h_distortions = [], []

    if spec is not None:
        data_url, k_style, k_label, h_style, h_label = spec
        table = viz_tools.load_data_table(data_url)

        # Each algorithm gets its own list of singleton clusters, and that
        # list is reused across all cluster counts (as in the original).
        # NOTE(review): presumably kept separate because
        # hierarchical_clustering mutates its input -- confirm.
        k_list = [
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4])
            for line in table
        ]
        h_list = [
            alg_cluster.Cluster(set([line[0]]), line[1], line[2], line[3],
                                line[4])
            for line in table
        ]

        for num in cluster_counts:
            k_clusters = clustering.kmeans_clustering(k_list, num, 5)
            h_clusters = clustering.hierarchical_clustering(h_list, num)
            k_distortions.append(compute_distortion(k_clusters, table))
            h_distortions.append(compute_distortion(h_clusters, table))

    # Plot results
    plt.figure('Distortion for Different Clustering Methods')
    plt.title('Distortion for Different Clustering Methods: {} Points'.format(
        plot_key))
    plt.xlabel('Number of Clusters')
    plt.ylabel('Distortion')
    if spec is not None:
        plt.plot(cluster_counts, k_distortions, k_style, markersize=1,
                 label=k_label)
        plt.plot(cluster_counts, h_distortions, h_style, markersize=1,
                 label=h_label)
    plt.legend(loc='best')
    plt.show()
def initial_data_clustering(data_for_clustering, data_for_lda, customer_ids,
                            category_names):
    """
    Run every clustering experiment on the raw dataset and time each step.

    Steps, numbered as in the console output:
        0. DBSCAN
        1. OPTICS
        2. k-means, cluster count chosen by the Silhouette method
        3. k-means, cluster count chosen by the Elbow (Knee) method
        4. LDA topic modeling
        5. k-means using the LDA topics as auxiliary data
        6. clustering by dominant LDA topic
        7. Gaussian mixture model

    After each clustering step the 'cluster' (or 'dominant_topic') column is
    dropped from data_for_clustering before the next step runs.
    NOTE(review): this implies the clustering helpers add that column to the
    frame they are given -- confirm in the clustering module.

    param:
        1. data_for_clustering - pandas DataFrame (10000, 82), where values
            are the mean spendings of customers for every category
        2. data_for_lda - pandas DataFrame (10000, 82), where values are
            the transactions number of customers for every category
        3. customer_ids - numpy array (10000, ) with all customer ids
        4. category_names - numpy array (82, ) with names of categories
    """
    plots_root = "plots/clustering/initial data/"

    def _minutes_since(started):
        # Elapsed wall-clock time in minutes since the given time.time() stamp.
        return (time.time() - started) / 60

    def _report(minutes, images_folder):
        # Closing two lines printed after every step.
        print("\t Images folder: " + images_folder)
        print("\t Done. With time " + str(minutes) + " min")

    # Step 0: DBSCAN
    print("\t0. DBSCAN clustering with DMDBSCAN method")
    started = time.time()
    clustering.dbscan_clustering(data_for_clustering, "initial data")
    data_for_clustering = data_for_clustering.drop(columns='cluster')
    _report(_minutes_since(started), plots_root + "DBSCAN with DMDBSCAN/")

    # Step 1: OPTICS
    print("\t1. OPTICS clustering")
    started = time.time()
    clustering.optics_clustering(data_for_clustering, "initial data")
    data_for_clustering = data_for_clustering.drop(columns='cluster')
    _report(_minutes_since(started), plots_root + "OPTICS/")

    # Step 2: k-means, number of clusters from the Silhouette method
    # (choosing the cluster count is part of the timed section)
    print("\t2. k-means clustering (Silhouette method)")
    started = time.time()
    silhouette_clusters = clustering.silhouette_method(
        data_for_clustering,
        "clustering/initial data/k-means with Silhouette")
    clustering.kmeans_clustering(data_for_clustering, "initial data",
                                 "k-means with Silhouette",
                                 clusters_number=silhouette_clusters)
    data_for_clustering = data_for_clustering.drop(columns='cluster')
    _report(_minutes_since(started), plots_root + "k-means with Silhouette/")

    # Step 3: k-means, number of clusters from the Elbow (Knee) method
    print("\t3. k-means clustering (Elbow (Knee) method)")
    started = time.time()
    elbow_clusters = clustering.elbow_method(
        data_for_clustering, "clustering/initial data/k-means with Elbow")
    clustering.kmeans_clustering(data_for_clustering, "initial data",
                                 "k-means with Elbow",
                                 clusters_number=elbow_clusters)
    data_for_clustering = data_for_clustering.drop(columns='cluster')
    _report(_minutes_since(started), plots_root + "k-means with Elbow/")

    # Step 4: topic modeling with LDA
    print("\t4. Performing LDA")
    started = time.time()
    lda_results, topics_number = clustering.lda_performing(
        data_for_lda, customer_ids, category_names)
    lda_minutes = _minutes_since(started)
    print("\t Data folder: data/output/")
    _report(lda_minutes, "plots/LDA/")

    # Step 5: k-means with the LDA topics as auxiliary data
    print("\t5. k-means clustering using LDA topics")
    started = time.time()
    topics_only = lda_results.drop(columns=["dominant_topic"])
    clustering.kmeans_clustering(data_for_clustering, "initial data",
                                 "k-means with LDA",
                                 clusters_number=topics_number,
                                 auxiliary_data=topics_only)
    data_for_clustering = data_for_clustering.drop(columns='cluster')
    _report(_minutes_since(started), plots_root + "k-means with LDA/")

    # Step 6: dominant LDA topic used directly as the cluster label
    print("\t6. Clustering using dominant LDA topics")
    started = time.time()
    clustering.topic_clustering(data_for_clustering, lda_results,
                                topics_number)
    data_for_clustering = data_for_clustering.drop(columns='dominant_topic')
    _report(_minutes_since(started), plots_root + "clustering by LDA topics/")

    # Step 7: Gaussian mixture model with Bayes classifier
    print("\t7. Gaussian mixture clustering")
    started = time.time()
    clustering.gaussian_mixture(data_for_clustering, "initial data")
    data_for_clustering = data_for_clustering.drop(columns='cluster')
    _report(_minutes_since(started), plots_root + "gaussian mixture/")
valid_vocab = vocab[args.lang_to_cluster] valid_vocab = [w for w in valid_vocab if w in embeddings] # cluster vocabulary embeddings from 1 language only print("\nComputing word weights using", args.weight_type) weight_type = args.weight_type vocab_scores = get_vocab_scores(valid_docs, valid_vocab, score_type=weight_type) valid_vocab = list(vocab_scores.keys()) emb_to_cluster = np.array([embeddings[w] for w in valid_vocab]) weights = np.array([vocab_scores[w] for w in valid_vocab]) print("Valid vocab:", len(valid_vocab)) print("\nClustering vocab words") if args.cluster_method == 'kmeans': if args.weight_type != 'none': labels, centers, model = kmeans_clustering(vocab_embeddings=emb_to_cluster, vocab=valid_vocab, n_topics=args.n_clusters, weights=weights) else: labels, centers, model = kmeans_clustering(vocab_embeddings=emb_to_cluster, vocab=valid_vocab, n_topics=args.n_clusters) elif args.cluster_method == 'affprop': if args.weight_type != 'none': labels, centers = affprop_clustering(vocab_embeddings=emb_to_cluster, vocab=valid_vocab, weights=weights) else: labels, centers = affprop_clustering(vocab_embeddings=emb_to_cluster, vocab=valid_vocab) elif args.cluster_method == 'gmm': labels, means, cov = GMM_clustering(vocab_embeddings=emb_to_cluster, vocab=valid_vocab, cov_type=args.cov_type, n_topics=args.n_clusters) else: labels, centers, model = kmeans_clustering(vocab_embeddings=emb_to_cluster, vocab=valid_vocab,
import sys

# Make the shared clustering package importable; this relative path must be
# on sys.path before the project imports below are resolved.
sys.path.append('../../3_closest_pairs_&_clustering_algorithms')

import data.load_clusters as lc
import clustering as clr
import data.cluster as cl
import alg_clusters_matplotlib as cp

# Load the DATA_3108_URL table and wrap every row in a singleton cluster.
data_table = lc.load_data_table(lc.DATA_3108_URL)
singleton_list = [
    cl.Cluster({row[0]}, row[1], row[2], row[3], row[4])
    for row in data_table
]

# Run k-means with arguments (7, 7) and plot the result with centers shown.
cluster_list = clr.kmeans_clustering(singleton_list, 7, 7)
cp.plot_clusters(data_table, cluster_list, True)
w = np.array([vocab_scores[lang][w] for w in valid_vocab[lang]]) combined_weights.append(w) combined_vocab.extend(valid_vocab[lang]) combined_emb = np.vstack(combined_emb) combined_weights = np.concatenate(combined_weights) print("Embs to cluster:", combined_emb.shape) print("Weights:", combined_weights.shape) print("Combined vocab:", len(combined_vocab)) print("\nClustering vocab words") if args.cluster_method == 'kmeans': if args.weight_type != 'none': labels, centers, model = kmeans_clustering( vocab_embeddings=combined_emb, vocab=combined_vocab, topics=args.n_clusters, weights=combined_weights) else: labels, centers, model = kmeans_clustering( vocab_embeddings=combined_emb, vocab=combined_vocab, topics=args.n_clusters) save_path = "/proj/zosa/results/cldr/" dump_file = "denews_" + args.emb_type + "_" + args.cluster_method + "_" + args.weight_type + "_" + str( args.n_clusters) + "clusters" + "_bilingual.pkl" dump_file_model = dump_file[:-4] + "_model.pkl" results = {'labels': labels, 'centers': centers, 'vocab': vocab_scores} with open(save_path + dump_file, 'wb') as f: pickle.dump(results, f) f.close()
def embedded_data_clustering(data_for_clustering, customer_ids, name):
    """
    Embed the dataset, then run every clustering experiment with the
    embedding passed as auxiliary data, timing each step.

    Steps, numbered as in the console output:
        0. data embedding with the method called ``name``
        1. DBSCAN
        2. OPTICS
        3. k-means, cluster count chosen by the Silhouette method
        4. k-means, cluster count chosen by the Elbow (Knee) method
        5. Gaussian mixture model

    After each clustering step the 'cluster' column is dropped from
    data_for_clustering before the next step runs.
    NOTE(review): this implies the clustering helpers add that column to the
    frame they are given -- confirm in the clustering module.

    param:
        1. data_for_clustering - pandas DataFrame (10000, 82), where values
            are the mean spendings of customers for every category
        2. customer_ids - numpy array (10000, ) with all customer ids
        3. name - string that represents name of embedding algorithm
    """

    def _minutes_since(started):
        # Elapsed wall-clock time in minutes since the given time.time() stamp.
        return (time.time() - started) / 60

    def _report(minutes, images_folder):
        # Closing two lines printed after every clustering step.
        print("\t Images folder: " + images_folder)
        print("\t Done. With time " + str(minutes) + " min")

    # Step 0: data embedding
    print("\t0. Transforming data using " + name + " method")
    started = time.time()
    data_embedded = dimensionality_reduction.data_embedding(
        data_for_clustering, customer_ids, name)
    embedding_minutes = _minutes_since(started)
    print("\t Data folder: data/output/")
    print("\t Done. With time " + str(embedding_minutes) + " min")

    plots_root = "plots/clustering/" + name

    # Step 1: DBSCAN
    print("\t1. DBSCAN clustering with DMDBSCAN method")
    started = time.time()
    clustering.dbscan_clustering(data_for_clustering, name,
                                 auxiliary_data=data_embedded)
    data_for_clustering = data_for_clustering.drop(columns='cluster')
    _report(_minutes_since(started), plots_root + "/DBSCAN with DMDBSCAN/")

    # Step 2: OPTICS
    print("\t2. OPTICS clustering")
    started = time.time()
    clustering.optics_clustering(data_for_clustering, name,
                                 auxiliary_data=data_embedded)
    data_for_clustering = data_for_clustering.drop(columns='cluster')
    _report(_minutes_since(started), plots_root + "/OPTICS/")

    # Step 3: k-means, number of clusters from the Silhouette method
    # (computed on the embedded data, inside the timed section)
    print("\t3. k-means clustering (Silhouette method)")
    started = time.time()
    silhouette_clusters = clustering.silhouette_method(
        data_embedded, "clustering/" + name + "/k-means with Silhouette")
    clustering.kmeans_clustering(data_for_clustering, name,
                                 "k-means with Silhouette",
                                 clusters_number=silhouette_clusters,
                                 auxiliary_data=data_embedded)
    data_for_clustering = data_for_clustering.drop(columns='cluster')
    _report(_minutes_since(started), plots_root + "/k-means with Silhouette/")

    # Step 4: k-means, number of clusters from the Elbow (Knee) method
    print("\t4. k-means clustering (Elbow (Knee) method)")
    started = time.time()
    elbow_clusters = clustering.elbow_method(
        data_embedded, "clustering/" + name + "/k-means with Elbow")
    clustering.kmeans_clustering(data_for_clustering, name,
                                 "k-means with Elbow",
                                 clusters_number=elbow_clusters,
                                 auxiliary_data=data_embedded)
    data_for_clustering = data_for_clustering.drop(columns='cluster')
    _report(_minutes_since(started), plots_root + "/k-means with Elbow/")

    # Step 5: Gaussian mixture model with Bayes classifier
    print("\t5. Gaussian mixture clustering")
    started = time.time()
    clustering.gaussian_mixture(data_for_clustering, name,
                                auxiliary_data=data_embedded)
    data_for_clustering = data_for_clustering.drop(columns='cluster')
    _report(_minutes_since(started), plots_root + "/gaussian mixture/")