import copy
import random


def test_agnes(db, k=2):
    # Shuffle a deep copy so the caller's data is left untouched
    db = copy.deepcopy(db)
    random.shuffle(db)
    # Agglomerative clustering is at least quadratic; cap the sample at 300 points
    agnes = AgnesMax(db[:300], k)
    clusters = agnes.cluster()
    plot_clusters(clusters)
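# plot_clusters is called throughout these snippets but never shown.
# A minimal matplotlib sketch, assuming each cluster is an iterable of
# (x, y) points -- hypothetical, not any repo's actual helper:
import matplotlib.pyplot as plt

def plot_clusters(clusters):
    # Scatter each cluster in its own colour
    for i, cluster_pts in enumerate(clusters):
        xs = [p[0] for p in cluster_pts]
        ys = [p[1] for p in cluster_pts]
        plt.scatter(xs, ys, label="cluster %d" % i)
    plt.legend()
    plt.show()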
def run_example():
    """
    Load a data table, compute a list of clusters, and plot the clusters.
    Set DESKTOP = True/False to use either matplotlib or simplegui.
    """
    data_table = load_data_table(DATA_3108_URL)

    singleton_list = []
    for line in data_table:
        singleton_list.append(
            cluster.Cluster(set([line[0]]), line[1], line[2], line[3], line[4]))

    cluster_list = sequential_clustering(singleton_list, 15)
    print("Displaying " + str(len(cluster_list)) + " sequential clusters")

    #cluster_list = algos.hierarchical_clustering(singleton_list, 9)
    #print("Displaying", len(cluster_list), "hierarchical clusters")

    #cluster_list = algos.kmeans_clustering(singleton_list, 9, 5)
    #print("Displaying", len(cluster_list), "k-means clusters")

    # Draw the clusters using matplotlib or simplegui
    if DESKTOP:
        plot.plot_clusters(data_table, cluster_list, False)
        #plot.plot_clusters(data_table, cluster_list, True)  # add cluster centers
    else:
        alg_clusters_simplegui.PlotClusters(data_table, cluster_list)
        # use toggle in GUI to add cluster centers
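# sequential_clustering is called above but not defined in this excerpt.
# A minimal sketch, assuming it buckets the ordered singleton list into
# num_clusters contiguous groups and merges each group (a merge_clusters
# method on the Cluster class is assumed):
import math

def sequential_clustering(singleton_list, num_clusters):
    cluster_list = []
    group_size = len(singleton_list) / num_clusters
    for index, singleton in enumerate(singleton_list):
        if math.floor(index / group_size) == len(cluster_list):
            cluster_list.append(singleton)  # start a new group
        else:
            cluster_list[-1] = cluster_list[-1].merge_clusters(singleton)
    return cluster_list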
import argparse
import os
import sys

from sklearn import metrics


def main(argv):
    parser = argparse.ArgumentParser(description='SCV Spam Classification Viaduct')
    parser.add_argument('FEATURE', type=str,
                        help='Desired feature upon which the clustering algorithm is '
                             'going to be trained: POS, BOW, BIGRAMS, TRIGRAMS, W2V, '
                             'NAMED_ENTITIES, D2V')
    parser.add_argument('k', type=int, help='K number for K-means')
    parser.add_argument('-eo', '--english', action='store_true',
                        help='Filter to work only with English.')
    parser.add_argument('-sw', '--stopword', action='store_true',
                        help='Remove stopwords.')
    parser.add_argument('-p', '--punctuation', action='store_true',
                        help='Remove punctuation marks.')
    parser.add_argument('-l', '--lematize', action='store_true',
                        help='Lemmatize tokens.')
    parser.add_argument('-m', '--models', action='store_true',
                        help='Use already trained model in the Models directory.')
    args = parser.parse_args()

    if args.FEATURE not in ['POS', 'BOW', 'BIGRAMS', 'TRIGRAMS', 'W2V',
                            'NAMED_ENTITIES', 'D2V']:
        print('Feature not recognized by program.')
        sys.exit()

    if not os.path.isfile('../Resources/spam.txt'):
        prs.parse_raw_spam()
    if not os.path.isfile('../Resources/ham.txt'):
        prs.parse_raw_ham()

    spam_messages = pre_process(args.stopword, args.punctuation, args.lematize,
                                isSpam=True)
    print('Data pre-processed')
    spam_features = generate_features(spam_messages, args.FEATURE, args.english)
    print('Features generated')
    results, labels = train_models_clustering(spam_messages, spam_features,
                                              args.FEATURE, args.k)
    print('Model trained')
    score = metrics.silhouette_score(results, labels, metric='euclidean')
    print('Plotting...')
    plot_clusters(results, args.k, labels, args.FEATURE)
    print('K-means with ' + str(args.k) + ' clusters using ' + str(args.FEATURE) +
          ' silhouette score: ' + str(score))
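# A hypothetical invocation (the script name is an assumption): cluster
# the spam corpus into 5 groups on bag-of-words features, removing
# stopwords and punctuation and lemmatizing the tokens:
#
#   python scv.py BOW 5 -sw -p -l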
def kmeans(principal_components, names, embeds, viz=True):
    # k-means++ seeding spreads the initial centroids apart, which
    # typically converges faster than purely random initialisation
    kplus = KMeans(n_clusters=12, init='k-means++').fit(embeds)
    if viz:
        plot_clusters(kplus, pc=principal_components, text=True, names=names,
                      n_names=15, figsize=(16, 4))
        plot_label_dist(kplus, palette='Reds', figsize=(4, 2))
        plot_3d_clusters([('Kmeans++', kplus)], pc=principal_components,
                         figsize=(6, 4.5))
    return kplus
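# The principal_components argument is assumed to be a PCA projection of
# the same embeddings. A sketch of preparing both inputs (the component
# count is an assumption; three components feed the 3-D plot):
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
principal_components = pca.fit_transform(embeds)
model = kmeans(principal_components, names, embeds, viz=True)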
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import fcluster
from sklearn.decomposition import PCA


def getFlatLabels(model, embeds, names, urls, large_cutoff=100, medium_cutoff=75,
                  small_cutoff=50, tiny_cutoff=30, viz=False):
    # Project the embeddings for plotting
    pca = PCA(n_components=30)
    principal_components = pca.fit_transform(embeds)
    pca0 = principal_components[:, 0]
    pca1 = principal_components[:, 1]
    pca2 = principal_components[:, 2]

    # Cut the dendrogram at several distance thresholds
    t_values = [('Large clusters', large_cutoff),
                ('Medium clusters', medium_cutoff),
                ('Small clusters', small_cutoff),
                ('Tiny clusters', tiny_cutoff)]
    agglom_labels = []
    for label, t_value in t_values:
        print(label, 'n:')
        clusters = fcluster(model, t=t_value, criterion='distance')
        print(len(np.unique(clusters)), '\n')
        agglom_labels += [clusters]
    agglom_labels = np.array(agglom_labels)

    for i in range(len(agglom_labels)):
        plot_clusters(model, pc=principal_components, labels=agglom_labels[i],
                      names=names, text=True, figsize=(20, 4),
                      title='{} derived from cutting at distance {}'.format(
                          t_values[i][0], t_values[i][1]))
        plt.show()
    return agglom_labels
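# For reference, fcluster with criterion='distance' cuts a linkage matrix
# wherever the cophenetic distance exceeds t. A self-contained sketch of
# the same cut on toy data (the 'model' above is such a linkage matrix):
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster

rng = np.random.default_rng(0)
toy = np.vstack([rng.normal(0, 1, (20, 5)), rng.normal(8, 1, (20, 5))])
Z = linkage(toy, method='ward')
flat = fcluster(Z, t=10, criterion='distance')
print(len(np.unique(flat)), 'clusters')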
dfs.append(density_stats)
tgt_image_name = constants.analysis_config[
    'FIGURE_NAME_FORMAT_MTOC_ENRICHMENT'].format(molecule_type=molecule_type)
tgt_fp = pathlib.Path(
    constants.analysis_config['FIGURE_OUTPUT_PATH'].format(root_dir=global_root_dir),
    tgt_image_name)
plot.enrichment_violin_plot(density_stats, molecule_type, tgt_fp,
                            groupby_key=conf[1],
                            limit_threshold=OUTLIERS_THRESHOLD)
logger.info("Created figure {}", tgt_fp)

plot.plot_clusters(molecule_type, density_stats.df, peripheral_flag=peripheral_flag)

tgt_image_name = constants.analysis_config[
    'FIGURE_NAME_FORMAT_MPI'].format(molecule_type=molecule_type)
tgt_fp = pathlib.Path(
    constants.analysis_config['FIGURE_OUTPUT_PATH'].format(root_dir=global_root_dir),
    tgt_image_name)
plot.plot_MPI(density_stats, molecule_type, tgt_fp, use_mean=False)
logger.info("Created figure {}", tgt_fp)

if "original" in conf[0]:
    plot.plot_boxplot_MPI(dfs[0], dfs[1],
                          constants.analysis_config['PROTEINS'],
                          tp_mrna, tp_proteins)
def test_dbscan(db, radius=0.3, min_pts=50):
    dbscan = DBScan(db, radius, min_pts)
    clusters = dbscan.cluster()
    plot_clusters(clusters)
    print('Found %d clusters' % len(clusters))
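# DBSCAN handles non-convex shapes well. One way to exercise test_dbscan
# on such data; sklearn's make_moons is used only to fabricate input, and
# the point-tuple format is an assumption about the DBScan class:
from sklearn.datasets import make_moons

points, _ = make_moons(n_samples=500, noise=0.05, random_state=0)
db = [tuple(p) for p in points]
test_dbscan(db, radius=0.2, min_pts=10)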
def test_kmeans(db, k=2):
    # Cap the sample at 500 points to keep the run fast
    kmeans = KMeans(db[:500], k)
    clusters = kmeans.cluster()
    plot_clusters(clusters)
    return clusters
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, v_measure_score
import matplotlib.pyplot as plt

results = []
for configuration in parameter_grid:
    model = AgglomerativeClustering(**configuration)
    predicted_clusters = model.fit_predict(features)
    v_score = v_measure_score(labels, predicted_clusters)
    results.append({'params': configuration, 'score': v_score})

# Refit with the best-scoring configuration
results = sorted(results, key=lambda r: r['score'], reverse=True)
best_params = results[0]['params']
model = AgglomerativeClustering(**best_params)
predicted_clusters = model.fit_predict(features)
v_score = v_measure_score(labels, predicted_clusters)
# Use a distinct variable name so the silhouette_score function is not shadowed
sil_score = silhouette_score(features, predicted_clusters)

print('V-measure score (external metric): {}'.format(v_score))
print('Silhouette score (internal metric): {}'.format(sil_score))

# Project to 2-D for side-by-side plots of predicted vs. true clusters
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(features)
plot_clusters(best_params['n_clusters'], principalComponents, predicted_clusters,
              plt, 'predicted_clusters')
plot_clusters(num_classes, principalComponents, labels, plt, 'true_clusters')
plt.show()
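# parameter_grid is not defined in this excerpt. One way to build it is
# sklearn's ParameterGrid; the swept values here are assumptions, not the
# repo's actual settings:
from sklearn.model_selection import ParameterGrid

parameter_grid = list(ParameterGrid({
    'n_clusters': [2, 4, 8, 16],
    'linkage': ['ward', 'complete', 'average'],
}))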
# Save labels
with open(output_folder + "results.txt", "w") as f:
    for c in clone.centers:
        f.write("%i " % c)
    f.write("\n")
    for l, core, r in zip(clone.labels_, clone.core_card, clone.rho):
        f.write("%i %i %f\n" % (l, core, r))

# Display results
# > Statistics on unscaled data for better interpretability
# > If PCA was done, choose which data to visualize. Sometimes it makes sense
#   to look at the PCs, sometimes to look at the original coords...
if not pca:
    # Stats
    show_cluster_info(clone, original_coords, output_folder, headers)
    # Plot
    plot_clusters(clone, original_coords, output_folder, headers)
else:
    # Stats
    show_original = -1
    while show_original not in [1, 2, 3]:
        show_original = int(input(
            "> Show statistics on:\n"
            "  1. Original coords (%i dimensions)\n"
            "  2. PCA coords (%i dimensions)\n"
            "  3. Both\n"
            "  > Choice: " % (len(headers), len(pca_headers))))
    if show_original == 1:
        show_cluster_info(clone, original_coords, output_folder, headers)
    elif show_original == 2:
        show_cluster_info(clone, coords, output_folder, pca_headers)
    else:
        show_cluster_info(clone, original_coords, output_folder, headers)
        show_cluster_info(clone, coords, output_folder, pca_headers)
print("> Clustering %s..."%name) t = time.time() clone = CLoNe(pdc=pdc, verbose=False) clone.fit(data) print("> Done: %.2f sec"%(time.time() - t)) # Get data from clustering centers = clone.centers core_card = clone.core_card labels = clone.labels_ labels_all = clone.labels_all rho = clone.rho # Summary header = " | #center | Dens #Core | # el | -outl |" subh = " |-----------|-------------------|--------|--------|" top = " " + "-" * (len(header) - 4) print(top + "\n" + header + "\n" + subh + "\n" + top) for c in range(len(centers)): elem = len(labels_all[labels_all == c]) outl = len(labels[labels == c]) line = " |%2i - %5i | %7.2f %7i | %6i | %6i |"%(c+1, centers[c]+1, rho[centers[c]], core_card[centers[c]], elem, outl) print(line) print(top) # Plot if data.shape[1] > 3: print("> WARNING: data has more than 3 dimensions. Not plotting.") else: plot_clusters(clone, data, ".")
file.write("Cluster " + str(i) + ":\n" + str(c)) file.close() # Normalize data in all features (1e-5 padding is added because clustering works on [0,1) interval) def normalize_features(data): normalized_data = data num_feat = np.shape(normalized_data)[1] for f in range(num_feat): normalized_data[:, f] -= min(normalized_data[:, f]) - 1e-5 normalized_data[:, f] *= 1 / (max(normalized_data[:, f]) + 1e-5) return normalized_data def visualize_clusters(features, data, clusters, title, xi, tau, file) title = ("Dataset: " + file + "tau = " +str(tau) + "xi = " + str(xi)) if len(features) <= 2: plot_clusters(data, clusters, title, xi) if __name__ == "__main__": xi = 3 tau = 0.1 ds_file = "data.txt" feature_columns = [4, 5, 6, 7] label_column = 3 delimiter = ' ' cluster_file = "clusters_info.txt" # ds_file = "mouse.csv" # feature_columns = [0, 1] # label_column = 2 # xi = 3 # tau = 0.1