def test_precomputed():
    """Clustering a precomputed distance matrix must match clustering raw points.

    The same ``cutoff`` is used for both estimators; the only difference is
    that the second one is told the input is already a distance matrix.
    """
    points, _ = make_blobs(100, random_state=42)
    distances = pairwise_distances(points)

    labels_from_points = MSTClustering(cutoff=0.1).fit_predict(points)
    labels_from_distances = MSTClustering(
        cutoff=0.1, metric='precomputed'
    ).fit_predict(distances)

    assert_equal(labels_from_points, labels_from_distances)
def do_clustering():
    """Cluster the rows of ``./file16.csv`` with an MST and save a scatter plot.

    The first CSV column is discarded; the remaining columns are clustered
    and the first two of them are plotted, coloured by cluster label, to
    ``./mst.png``.
    """
    # A synthetic alternative that was used previously:
    # X, y = make_blobs(200, centers=4, random_state=42)
    table = np.genfromtxt('./file16.csv', delimiter=',')
    print(table.shape)
    features = table[:, 1:]

    # Predict the labels with the MST algorithm.
    cluster_ids = MSTClustering(cutoff_scale=2).fit_predict(features)

    # Plot the results.
    plt.scatter(
        features[:, 0],
        features[:, 1],
        c=cluster_ids,
        cmap='rainbow',
        marker='.',
    )
    plt.savefig('./mst.png')
def PRIM_algo(X_train, y_train):
    """Run MST clustering for cutoff scales 2..9 and score each clustering.

    ``X_train`` is first projected onto two PCA components.  For every
    cutoff scale the clustering is plotted, a silhouette score is recorded
    (``-1`` when only a single label was produced) and ``t_Test`` is run on
    the labels.  Afterwards the knee of the score curve is printed and the
    scores are passed to ``create_graph``.

    Parameters
    ----------
    X_train : array-like
        Samples to cluster; replaced locally by its 2-component PCA
        projection.
    y_train : array-like
        Forwarded unchanged as the second argument of
        ``MSTClustering.fit_predict``.

    Returns
    -------
    None
    """
    silhouette_score_list = []
    X_train = PCA(2, svd_solver="full").fit_transform(X_train)

    # The projected coordinates do not depend on the cutoff scale, so they
    # are extracted once instead of on every iteration.
    x = [item[0] for item in X_train]
    y = [item[1] for item in X_train]

    for i in range(2, 10):
        # predict the labels with the MST algorithm
        model = MSTClustering(cutoff_scale=i)
        labels = model.fit_predict(X_train, y_train)

        print("this is x: ", x)
        print("this is y: ", y)
        plt.scatter(x, y, c=labels, cmap=cm.jet)
        plt.title("PRIM - " + str(i) + " scatter")
        plt.show()

        try:
            if len(set(labels)) > 1:
                silhouette_score_list.append(
                    metrics.silhouette_score(X_train, labels,
                                             metric='euclidean'))
            else:
                # The silhouette score is undefined for a single label.
                silhouette_score_list.append(-1)
        except Exception:
            # Best effort: keep going with the remaining cutoff scales, but
            # do not swallow KeyboardInterrupt/SystemExit as a bare
            # ``except`` would.
            # NOTE(review): nothing is appended for this cutoff scale, so
            # later entries shift one position to the left in the score list.
            print("silhouette_score did not work")

        t_Test(X_train, labels)
        print(labels)

    if silhouette_score_list:
        kn = KneeLocator(list(range(1, len(silhouette_score_list) + 1)),
                         silhouette_score_list,
                         curve='convex', direction='decreasing')
        print(kn.elbow)
        create_graph(silhouette_score_list, y_text="SSE", start_point=2)
def cluster_positions(positions, plots=False, cutoff_scale_min=120,
                      cutoff_scale_max=350, cutoff_scale_resolution=150,
                      n_neighbors_max=5, min_cluster_size=4):
    """
    Find clusters in the positions produced by
    `~shampoo.track2d.locate_from_hologram` using a Minimum Spanning Tree.

    A grid search over ``(cutoff_scale, n_neighbors)`` picks the parameters
    with the lowest score; the clustering obtained with those parameters is
    returned.

    Parameters
    ----------
    positions : `~numpy.ndarray`
        Positions for each detected specimen in each frame, with values
        specified by `~shampoo.track2d.locate_from_hologram`. Only the
        first four columns are used.
    plots : bool (optional)
        Plot the scores for the grid search in ``(cutoff_scale, n_neighbors)``
        space and the best clusters
    cutoff_scale_min : float (optional)
        Smallest ``cutoff_scale`` tried in the grid search.
    cutoff_scale_max : float (optional)
        Largest ``cutoff_scale`` tried in the grid search.
    cutoff_scale_resolution : int (optional)
        Number of evenly spaced ``cutoff_scale`` values between the minimum
        and the maximum.
    n_neighbors_max : int (optional)
        Exclusive upper bound on ``n_neighbors``; values
        ``1 .. n_neighbors_max - 1`` are tried.
    min_cluster_size : int (optional)
        Minimum cluster size passed to ``MSTClustering`` both during the
        grid search and for the final clustering.

    Returns
    -------
    labels : `~numpy.ndarray`
        Cluster labels for each position in ``positions``. `-1` represents
        positions without a cluster.
    """
    # Scale the time and max_intensity dims similarly to the spatial dimensions
    X = positions[:, 0:4].copy()
    # ``np.ptp`` is used because the ``ndarray.ptp`` method no longer exists
    # in NumPy 2.0.
    spatial_range = np.ptp(positions[:, 1])
    X[:, 0] *= 5 * spatial_range / np.ptp(positions[:, 0])
    X[:, 3] *= spatial_range / np.ptp(positions[:, 3])

    # Grid search in (cutoff_scales, n_neighbors) for the best clustering params
    cutoff_scales = np.linspace(cutoff_scale_min, cutoff_scale_max,
                                cutoff_scale_resolution)
    n_neighbors = np.arange(1, n_neighbors_max)

    # Start from +inf so that parameter combinations which yield no cluster
    # at all can never be selected as the best ones.
    scores = np.full((len(cutoff_scales), len(n_neighbors)), np.inf,
                     dtype=np.float64)

    for i, cutoff_scale in enumerate(cutoff_scales):
        for j, k_neighbors in enumerate(n_neighbors):
            model = MSTClustering(cutoff_scale=cutoff_scale,
                                  approximate=True,
                                  n_neighbors=k_neighbors,
                                  min_cluster_size=min_cluster_size)
            labels = model.fit_predict(X)

            distance_stds = []
            for label in set(labels):
                if label == -1:
                    continue
                members = X[labels == label, 0:3]
                pca = PCA(n_components=3)
                pca.fit(members)
                X_prime = pca.transform(members)
                # Spread along the second principal axis relative to the
                # extent along the first one.
                distance_stds.append(X_prime[:, 1].std() /
                                     np.ptp(X_prime[:, 0]))

            if not distance_stds:
                # Every point is unclustered: the mean of an empty list and
                # the labelled fraction of zero would both be undefined.
                continue

            f_labeled = np.count_nonzero(labels != -1) / float(len(labels))
            scores[i, j] = np.mean(distance_stds) / f_labeled

    # With the best clustering parameters, label the clusters. NaN scores
    # (e.g. from a 0/0 ratio) are treated as "worst" so that they cannot
    # break the search for the minimum.
    comparable_scores = np.where(np.isnan(scores), np.inf, scores)
    x_min_ind, y_min_ind = np.unravel_index(np.argmin(comparable_scores),
                                            comparable_scores.shape)
    n_neighbors_min = n_neighbors[y_min_ind]
    best_cutoff_scale = cutoff_scales[x_min_ind]
    print(n_neighbors_min, best_cutoff_scale)

    # Use the caller's ``min_cluster_size`` here too, so the final clustering
    # is consistent with the one that was scored in the grid search.
    model = MSTClustering(cutoff_scale=best_cutoff_scale, approximate=True,
                          n_neighbors=n_neighbors_min,
                          min_cluster_size=min_cluster_size)
    labels = model.fit_predict(X)

    if plots:
        # Plot the scores in (cutoff_scales, n_neighbors) space
        fig, ax = plt.subplots(figsize=(16, 10))
        ax.imshow(np.log(scores).T, interpolation='nearest',
                  origin='lower', cmap=plt.cm.viridis)
        ax.set_xticks(range(len(cutoff_scales))[::5])
        ax.set_xticklabels(["{0:.2f}".format(cutoff_scale)
                            for cutoff_scale in cutoff_scales[::5]])
        ax.set_yticks(range(len(n_neighbors)))
        ax.set_yticklabels(range(1, len(n_neighbors) + 1))

        for tick_label in ax.get_xticklabels():
            tick_label.set_rotation(45)
            tick_label.set_ha('right')

        ax.set_xlabel('cutoff')
        ax.set_ylabel('n_neighbors')
        ax.set_aspect(10)

        # Plot the best clusters
        fig, ax = plt.subplots(1, 3, figsize=(16, 6))
        kwargs = dict(s=100, alpha=0.6, edgecolor='none',
                      cmap=plt.cm.Spectral, c=labels)
        ax[0].scatter(X[:, 0], X[:, 1], **kwargs)
        ax[1].scatter(X[:, 0], X[:, 2], **kwargs)
        ax[2].scatter(X[:, 1], X[:, 2], **kwargs)
        ax[0].set(xlabel='t', ylabel='x')
        ax[1].set(xlabel='t', ylabel='y')
        ax[2].set(xlabel='x', ylabel='y')

        # Overlay the edges of the trimmed spanning tree.
        segments = model.get_graph_segments(full_graph=False)
        ax[0].plot(segments[0], segments[1], '-k')
        ax[1].plot(segments[0], segments[2], '-k')
        ax[2].plot(segments[1], segments[2], '-k')

        fig.tight_layout()
        plt.show()

    return labels
def cluster(self):
    """Cluster ``self.positions`` with an exact MST and store the labels.

    The cutoff scale is read from ``self.classifyer``; the resulting label
    array is written to ``self.colors``.
    """
    clusterer = MSTClustering(
        approximate=False,
        cutoff_scale=self.classifyer,
    )
    cluster_labels = clusterer.fit_predict(self.positions)
    self.colors = cluster_labels
segments = model.get_graph_segments(full_graph=full_graph) axi.plot(segments[0], segments[1], '-ok', zorder=1, lw=1) axi.scatter(X[:, 0], X[:, 1], c=colors, cmap=cmap, zorder=2) axi.axis('tight') ax[0].set_title('Full Minimum Spanning Tree', size=16) ax[1].set_title('Trimmed Minimum Spanning Tree', size=16) # create some data X, y = make_blobs(100, centers=5, cluster_std=0.90) print(X) # predict the labels with the MST algorithm model = MSTClustering(cutoff_scale=1.5, approximate=True, n_neighbors=100) labels = model.fit_predict(X) counts = np.bincount(labels) print("No. of clusters: ") clusters = len(counts) print(len(counts)) print("No. of elements in each Clusters: ") print(counts) # plot the results plt.scatter(X[0:, 0], X[0:, 1], marker='o', c=labels, cmap='rainbow') plt.show() # plot the brief model plot_mst(model) wcss = [] for i in range(1, 11):
def mst(instance_path, res_folder, strategy = 2):
    """Cluster the rows of an instance with MST clustering over a range of cutoffs.

    The instance is read with ``parse.read``, a distance matrix is loaded
    from ``res_folder`` (or computed with ``pp.strategy`` and written there
    when loading fails), and ``MSTClustering`` is run for up to 49 cutoff
    values taken from ``get_eps_list_geom``.  Two candidate iterations are
    tracked: the one with the fewest non-clustered points ("best") and the
    one with the smallest adjusted relative standard deviation of cluster
    sizes ("second best").  Each is written out through ``dec.write``.

    Parameters:
        instance_path: path of the instance file; split on ``'/'`` and ``'.'``
            to obtain the directory, base name and extension.
        res_folder: prefix used for the cached distance matrix and for the
            written ``.dec`` files (concatenated directly with the file name).
        strategy: forwarded to ``pp.strategy`` and ``get_eps_list_geom`` and
            embedded in the output file names.

    Returns:
        An 11-tuple ``(best_found, best_n_clusters, best_non_clusterd,
        best_distro, best_dec, data.shape[0], s_best_found,
        s_best_n_clusters, s_best_non_clusterd, s_best_distro, s_dec)``.
        When no iteration qualified, the ``*_found`` flags are ``False`` and
        the remaining entries keep their initial placeholder values.
    """
    # Split the path into directory, base file name and extension.
    instances = instance_path.rsplit('/', 1)[0] + '/'
    file = instance_path.rsplit('/', 1)[1]
    input_type = '.' + file.rsplit('.', 1)[1]
    file = file.rsplit('.', 1)[0]

    data, row_names = parse.read(instances + file + input_type)
    print 'Size of data matrix: ', data.shape
    # Only reported; execution continues even when the lengths differ.
    if len(data) <> len(row_names):
        print 'MST error: data and row_names have diff. lens', len(data), len(row_names)
    #save_matrix_fig(data, res_folder, file+'_in')

    dist_matrix = []
    # The string below is a disabled, older .npy-based variant of the cache
    # lookup; it is a bare string expression and has no effect at runtime.
    '''OLD try: dist_matrix = np.load(res_folder+file+'_dist'+str(strategy)+'.npy') print 'Distance matrix %s found.' %(res_folder+file+'_dist'+str(strategy)+'.npy') except: print 'Distance matrix %s NOT found!!!!' %(res_folder+file+'_dist'+str(strategy)+'.npy') dist_matrix = pp.strategy(data, 'distance',strategy) np.save(res_folder+file+'_dist'+str(strategy), dist_matrix)'''
    # Try the cached Matrix Market file first; on any failure recompute the
    # distance matrix and write it back.
    # NOTE(review): the bare ``except`` also hides errors unrelated to a
    # missing file (and KeyboardInterrupt) - consider narrowing it.
    try:
        dist_matrix = scipy.io.mmread(res_folder+file+'_dist'+str(strategy)).tocsr()
        print 'Distance matrix %s found.' %(res_folder+file+'_dist'+str(strategy))
    except:
        print 'Distance matrix %s NOT found!!!!' %(res_folder+file+'_dist'+str(strategy))
        dist_matrix = pp.strategy(data, 'distance',strategy)
        scipy.io.mmwrite(res_folder+file+'_dist'+str(strategy),dist_matrix)

    # NOTE(review): ``occupancy`` is never read afterwards; also, unless true
    # division is enabled for this module, this is integer division.
    occupancy = len(dist_matrix.data) / (dist_matrix.shape[0] * dist_matrix.shape[1]) * 100

    # The 10th percentile of the stored distances is the centre of the cutoff
    # range; if it is zero, the 1st percentile is tried instead.
    q = 10
    dist_percentile = np.percentile(a=dist_matrix.data, q=q, axis=None)
    if dist_percentile == 0: # or strategy == 6:
        q = 1
        print 'Recalculating dist_percentile..'
        #dist_percentile = np.percentile(a=dist_matrix, q=q)
        dist_percentile = np.percentile(a=dist_matrix.data, q=q, axis=None)
    print 'dist_percentile = ', dist_percentile

    old_n_clusters = 0
    old_non_clustered = 0
    # Per-iteration results, keyed by iteration index, so we can later pick
    # the best clustering.
    res_from_diff_params = {}
    nr_clusters_from_diff_params = {}
    non_clustered_from_diff_params = {}
    distribution_from_diff_params = {}
    best_iteration = -1
    sec_best_iteration = -1
    n = dist_matrix.shape[0]
    # Running minima start at ``n`` as an upper bound.
    min_non_clusterd = n
    s_min_non_clusterd = n
    min_std_dev = n
    sec_threshold = 0.0001
    n_iterations = 49 # must be an odd number
    eps_list = get_eps_list_geom(mid=dist_percentile, length=n_iterations, strategy=strategy)

    # If the distance graph already has several connected components, those
    # components are used directly as the labels ("pure diagonal" case).
    pure_diagonal = False
    n_components, p_d_labels = connected_components(dist_matrix, directed=False)
    if n_components > 1:
        print 'MST: pure diagonal found...'
        pure_diagonal = True

    # cluster the data with MST ---------------------------------------------
    for iteration in range(n_iterations):
        gc.collect()
        if dist_percentile == 0:
            print 'dist_percentile = %i, -> we cannot use MST for clustering this instance.' %dist_percentile
            break
        # eps is in range: [dist_percentile - 0.5, dist_percentile + 0.5] but with geometric progression
        eps = eps_list[iteration]
        # Non-positive cutoffs are skipped; cutoffs of 1 or more end the search.
        if eps <= 0:
            continue
        if eps >= 1:
            break
        # for distance strategy 1: 0.054...
        #eps = 0.1 + (iteration / 10)
        min_samples = 4
        #print 'DEBUG: eps = ', eps
        labels = []
        print '_______________________________________________________'
        print 'Running MST...'
        print 'iteration= ', iteration
        print 'eps = ', eps
        print 'min_samples = ', min_samples
        if pure_diagonal:
            labels = p_d_labels
        else:
            # cutoff_scale is min size of edges to cut
            model = MSTClustering(cutoff_scale=eps, min_cluster_size = min_samples, metric = "precomputed")
            labels = model.fit_predict(dist_matrix)
        #print 'labels = ', labels
        #raise Exception('Wait....')

        # Count clusters (label -1 is not counted as a cluster) and the
        # number of points carrying each label 0..n_clusters-1.
        n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
        num_per_cluster = {}
        for i in range(n_clusters):
            num_per_cluster[i] = 0
        for label in labels:
            for i in range(n_clusters):
                if label == i:
                    num_per_cluster[i] += 1;
        non_clustered = 0;
        for label in labels:
            if label == -1:
                non_clustered += 1

        # criteria for skipping or breaking the loop ---------------------------------------------
        # skip the iteration if the number of clusters is as before
        if iteration == 0:
            old_n_clusters = n_clusters
            old_non_clustered = non_clustered
        if n_clusters == old_n_clusters and non_clustered == old_non_clustered and iteration > 0:
            print 'Same clustering...'
            continue
        old_n_clusters = n_clusters
        old_non_clustered = non_clustered
        if n_clusters == 1 and non_clustered == 0:
            print 'Stopping because bigger EPS will be the same.'
            break
        # ---------------------------------------------------------------------------------------
        # display some information
        print 'Estimated number of clusters: ', n_clusters
        print 'Number of points per cluster: ', num_per_cluster
        print 'Number of non clustered points:', non_clustered
        #draw(A=sim_matrix, colors=labels)
        # ---------------------------------------------------------------------------------------
        sorted_data, sotred_labels, sorted_names, column_labels = sort_matrix(data, labels, row_names)
        #print 'DEBUG:'
        #print 'column_labels = ', column_labels
        #print 'sotred_labels = ', sotred_labels
        #save_matrix_fig(sorted_data, res_folder, file + '_B_dec' + str(iteration))
        # pull down the points which have non-zero value that collides with points from other clusters
        sorted_data2, sotred_labels2, sorted_names2 = postp.remove_colision_points2(sorted_data, sotred_labels, sorted_names, column_labels)

        # Recount after the collision removal; this time label -1 gets its own
        # entry in ``num_per_cluster`` when it is present.
        num_per_cluster = {}
        n_clusters = len(set(sotred_labels2)) - (1 if -1 in sotred_labels2 else 0)
        if -1 in sotred_labels2:
            all_clusters_list = range(-1, n_clusters)
        else:
            all_clusters_list = range(n_clusters)
        for i in all_clusters_list:
            num_per_cluster[i] = 0
        for label in sotred_labels2:
            for i in all_clusters_list:
                if label == i:
                    num_per_cluster[i] += 1;
        non_clustered = 0;
        for label in sotred_labels2:
            if label == -1:
                non_clustered += 1
        print 'Estimated number of clusters after removal: ', n_clusters
        print 'Number of points per cluster after removal: ', num_per_cluster
        print 'Number of non clustered points after removal:', non_clustered
        # An empty cluster is unexpected here; dump the labels for inspection.
        if 0 in num_per_cluster.values():
            print 'TIME TO DEBUG:'
            print 'sotred_labels2 = ', sotred_labels2
        # save picture of end matrix
        #save_matrix_fig(sorted_data2, res_folder, file + '_A_dec' + str(iteration))
        #if res2_folder <> 'none':
            #save_matrix_fig(sorted_data2, res2_folder, file + '_A_dec' + str(iteration))

        # find the best iteration, so we only save the best one --------------------------
        label_name_pairs = zip(sotred_labels2, sorted_names2)
        # First criterion: strictly fewer non-clustered points than any
        # previous iteration.
        if non_clustered < min_non_clusterd:
            res_from_diff_params[iteration] = label_name_pairs
            nr_clusters_from_diff_params[iteration] = n_clusters
            non_clustered_from_diff_params[iteration] = non_clustered
            distribution_from_diff_params[iteration] = num_per_cluster
            min_non_clusterd = non_clustered
            best_iteration = iteration
            print 'this is best iteration currently'

        # find the best iteration (according variance of cluster sizes), ----------------
        # so we only save the best one
        temp_num_per_cluster = num_per_cluster.copy()
        # The non-clustered bucket is excluded from the size statistics.
        if -1 in temp_num_per_cluster.keys():
            del temp_num_per_cluster[-1]
        if len(temp_num_per_cluster.values()) > 1:
            std_dev = np.std(temp_num_per_cluster.values())
            mean = np.mean(temp_num_per_cluster.values())
            rel_std_dev = std_dev / mean
            # NOTE(review): ``non_clustered`` and ``n`` are both integers, so
            # unless true division is enabled for this module the ratio is
            # floored and the factor is 0 whenever non_clustered < n - confirm.
            rel_std_dev *= pow(non_clustered/n, 2)
            print 'DEBUG: adjusted rel_std_dev = ', rel_std_dev
            std_dev = rel_std_dev
            # we accept the iteration if adjusted rel_std_dev is smaller, or
            # if it is within the threshold and number of nonclustered points is smaller
            if (std_dev - min_std_dev) <= sec_threshold and non_clustered < s_min_non_clusterd:
                sec_criteria_fulfiled = True
            else:
                sec_criteria_fulfiled = False
            if std_dev < min_std_dev or sec_criteria_fulfiled:
                res_from_diff_params[iteration] = label_name_pairs
                nr_clusters_from_diff_params[iteration] = n_clusters
                non_clustered_from_diff_params[iteration] = non_clustered
                distribution_from_diff_params[iteration] = num_per_cluster
                min_std_dev = std_dev
                s_min_non_clusterd = non_clustered
                sec_best_iteration = iteration
                print 'this is second best iteration currently'

        # In the pure-diagonal case the labels do not depend on eps, so one
        # pass is enough.
        if pure_diagonal:
            break
    # ----------------------------------------------------------------------------------
    print '_______________________________________________________'

    # Placeholder results, returned as-is when no iteration qualified.
    best_found = False
    best_n_clusters = 0
    best_non_clusterd = data.shape[0]
    best_distro = {-1:data.shape[0]}
    best_dec = '' # name of dec file for best iteration
    s_best_found = False
    s_best_n_clusters = 0
    s_best_non_clusterd = data.shape[0]
    s_best_distro = {-1:data.shape[0]}
    s_dec = '' # name of dec file for second best iteration

    # save .dec from best iteration
    print 'best_iteration= ', best_iteration
    print 'sec best iteration = ', sec_best_iteration
    if best_iteration >= 0:
        best_found = True
        best_n_clusters = nr_clusters_from_diff_params[best_iteration]
        best_non_clusterd = non_clustered_from_diff_params[best_iteration]
        best_distro = distribution_from_diff_params[best_iteration]
        best_dec = file + '_mst_' + str(best_n_clusters) + '_' + str(best_non_clusterd) +'_dist'+str(strategy)
        dec.write(path = res_folder, filename = best_dec, label_name_pairs = res_from_diff_params[best_iteration])
        print '.dec file %s for iteration %i saved.' %(res_folder+best_dec, best_iteration)
    # The second-best result is only written when it is a different
    # iteration than the best one.
    if sec_best_iteration >= 0:
        if sec_best_iteration <> best_iteration:
            s_best_found = True
            s_best_n_clusters = nr_clusters_from_diff_params[sec_best_iteration]
            s_best_non_clusterd = non_clustered_from_diff_params[sec_best_iteration]
            s_best_distro = distribution_from_diff_params[sec_best_iteration]
            s_dec = file + '_mstSTD_' + str(s_best_n_clusters) + '_' + str(s_best_non_clusterd) +'_dist'+str(strategy)
            dec.write(path = res_folder, filename = s_dec, label_name_pairs = res_from_diff_params[sec_best_iteration])
            print '.dec file %s for iteration %i saved.' %(res_folder+s_dec, sec_best_iteration)
    print '_______________________________________________________'
    print '_______________________________________________________'
    gc.collect()
    return best_found, best_n_clusters, best_non_clusterd, best_distro, best_dec, data.shape[0], \
        s_best_found, s_best_n_clusters, s_best_non_clusterd, s_best_distro, s_dec
row = [float(w)] else: row.append(float(w)) old_sample=sample matrix.append(row) mat=np.array(matrix) mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=3, dissimilarity="precomputed", n_jobs=1) pos = mds.fit(mat).embedding_ clf = PCA(n_components=2) pos = clf.fit_transform(pos) fig, ax = plt.subplots() model = MSTClustering(cutoff_scale=200, approximate=False) labels = model.fit_predict(pos) #### Om man vill ha kanter: #X = model.X_fit_ #segments = model.get_graph_segments(full_graph=False) #ax.plot(segments[0], segments[1], '-k', zorder=1, lw=1) #ax.scatter(X[:, 0], X[:, 1], c=model.labels_, cmap='rainbow', zorder=2) #ax.axis('tight') ##### #### Utan kanter: plt.scatter(pos[:, 0], pos[:, 1], c=labels, s=100, lw=0) #### for i, n in enumerate(nodes):
class MST:
    """
    Minimum Spanning Tree Class

    Compute MST for a set of input points using the MSTClustering code from
    jakevdp, calculate branch lengths from the MST and generate plots of the
    MST and cumulative distribution of branch lengths.

    ---- Inputs ----
    data frame "df", which has two columns present:
        - ra: right ascension (deg)
        - dec: declination (deg)
    cutoff_scale (float): minimum size of edges, also known as the critical
        branch length. All edges larger than cutoff_scale will be removed.
    min_cluster_size (int): min number of galaxies in a cluster.
    n_neighbors (int): maximum number of neighbors of each point used for
        approximate Euclidean MST algorithm.
    set_mst, labels, segments, seps: accepted for backward compatibility
        only; their values are ignored because the corresponding attributes
        are always recomputed from ``df``.

    ---- Attributes ----
    set_mst: the fitted MSTClustering estimator.
    labels: integer specifying the structure to which a given galaxy has
        been assigned. It will have a -1 if no membership was assigned.
    segments: sets of ra, dec coordinates for the MST branch segments
    seps: base-10 log of branch lengths (in degrees)
    """

    def __init__(self, df, cutoff_scale=None, min_cluster_size=None,
                 n_neighbors=None, set_mst=None, labels=None, segments=None,
                 seps=None):
        self.df = df
        self.cutoff_scale = cutoff_scale
        self.min_cluster_size = min_cluster_size
        self.n_neighbors = n_neighbors

        # Forward only the hyper-parameters the caller actually supplied, so
        # that MSTClustering falls back to its own defaults instead of
        # receiving ``None`` for them.
        supplied = (
            ('cutoff_scale', cutoff_scale),
            ('min_cluster_size', min_cluster_size),
            ('n_neighbors', n_neighbors),
        )
        mst_kwargs = {name: value for name, value in supplied
                      if value is not None}
        self.set_mst = MSTClustering(**mst_kwargs)

        # One (ra, dec) row per object.
        pos = np.column_stack((df.ra, df.dec))
        self.labels = self.set_mst.fit_predict(pos)
        self.segments = self.set_mst.get_graph_segments(full_graph=True)
        self.seps = self.get_sep_mst()

    def get_sep_mst(self):
        """
        Calculate branch lengths (in base-10 log(degrees)) from the MST
        segments.

        ``self.segments[0]`` holds the ra values and ``self.segments[1]`` the
        dec values of the branches; index 0 / 1 along their first axis is
        one end / the other end of every branch.
        """
        mst_coord0_ra = np.asarray(self.segments[0][0])
        mst_coord1_ra = np.asarray(self.segments[0][1])
        mst_coord0_dec = np.asarray(self.segments[1][0])
        mst_coord1_dec = np.asarray(self.segments[1][1])

        c0 = SkyCoord(mst_coord0_ra, mst_coord0_dec, unit=u.deg)
        c1 = SkyCoord(mst_coord1_ra, mst_coord1_dec, unit=u.deg)
        return np.log10(c0.separation(c1).degree)

    def plot_mst(self, model, cmap='rainbow', *args, **kwargs):
        """
        Plot the MST diagram (left) and the labeled structures identified
        from the MST (right).

        Parameters
        ----------
        model : fitted MSTClustering estimator to visualize.
        cmap : colormap used for the cluster labels in the right panel.
        **kwargs : optional settings
            xlim, ylim : axis limits (default: left untouched)
            s : marker size (default 8)
            savefigure : save the figure when truthy (default False)
            figname : output file name (default 'MST_figure.png')
        """
        xlim = kwargs.get('xlim', None)
        ylim = kwargs.get('ylim', None)
        ssize = kwargs.get('s', 8)
        savefigure = kwargs.get('savefigure', False)
        figname = kwargs.get('figname', 'MST_figure.png')

        X = model.X_fit_

        # One little hack to get more clear color differentiation between the
        # points with cluster membership and without: add 50 to the label
        # numbers of cluster members. This is done on a copy so the fitted
        # model's labels are not modified (and not shifted again on every
        # call).
        member_colors = np.array(model.labels_)
        member_colors[member_colors > -1] += 50

        fig, ax = plt.subplots(1, 2, figsize=(20, 7), sharex=True, sharey=True)

        for axi, full_graph, colors in zip(ax, [True, False],
                                           ['lightblue', member_colors]):
            segments = model.get_graph_segments(full_graph=full_graph)
            axi.plot(segments[0], segments[1], '-k', zorder=1, lw=1)
            # Label and limit the axes being drawn, not whichever axes
            # pyplot currently considers active.
            axi.set_xlabel('Right Ascension (deg)', size=14)
            axi.set_ylabel('Declination (deg)', size=14)
            axi.scatter(X[:, 0], X[:, 1], c=colors, cmap=cmap, zorder=2,
                        s=ssize)
            axi.axis('tight')
            if xlim is not None:
                axi.set_xlim(xlim)
            if ylim is not None:
                axi.set_ylim(ylim)

        ax[0].set_title('Full Minimum Spanning Tree', size=16)
        ax[1].set_title('Trimmed Minimum Spanning Tree', size=16)

        # Leave an option to save all the plots to output PNG files.
        if savefigure:
            fig.savefig(figname, bbox_inches='tight', dpi=250)

    def plot_mst_cumul(self, *args, **kwargs):
        """
        Plot the cumulative distribution of MST branch lengths.

        Parameters
        ----------
        **kwargs : optional settings
            savefigure : save the figure when truthy (default False)
            figname : output file name (default 'MST_cumul_dist.png')
        """
        savefigure = kwargs.get('savefigure', False)
        figname = kwargs.get('figname', 'MST_cumul_dist.png')

        # NOTE(review): ``distplot`` is deprecated in recent seaborn
        # releases; kept so the figure stays unchanged.
        sns.distplot(self.seps, hist_kws=dict(cumulative=False),
                     kde_kws=dict(cumulative=True))
        plt.xlabel('log$_{10}$ (MST branch length)', fontsize=15)
        plt.ylabel('Norm. Counts/Cumul. Dist.', fontsize=15)

        # Leave an option to save all the plots to output PNG files.
        if savefigure:
            plt.savefig(figname, bbox_inches='tight')