def check_shape(ndim, cutoff, N=10):
    """Check the shapes of the graph segments returned by MSTClustering.

    Fits ``MSTClustering(cutoff=cutoff)`` on ``N`` uniformly random points
    in ``ndim`` dimensions and asserts that ``get_graph_segments`` returns
    one array per dimension, each of shape ``(2, n_edges)``, where
    ``n_edges`` is ``N - 1 - cutoff`` for the trimmed graph and ``N - 1``
    for the full graph.
    """
    points = np.random.rand(N, ndim)
    fitted = MSTClustering(cutoff=cutoff).fit(points)

    # Trimmed graph: the assertions expect `cutoff` fewer edges than the
    # full spanning tree.
    trimmed = fitted.get_graph_segments()
    print(ndim, cutoff, trimmed[0].shape)
    assert len(trimmed) == ndim
    trimmed_shape = (2, N - 1 - cutoff)
    assert all(segment.shape == trimmed_shape for segment in trimmed)

    # Full graph: a spanning tree over N points has N - 1 edges.
    full = fitted.get_graph_segments(full_graph=True)
    print(full[0].shape)
    assert len(full) == ndim
    full_shape = (2, N - 1)
    assert all(segment.shape == full_shape for segment in full)
def check_graph_segments_vals():
    """Check the segment coordinates for a small one-dimensional data set.

    The input is the column vector ``[0, 1, 4, 9, 16]``; with ``cutoff=0``
    the returned segments must match the expected start/end coordinates.
    """
    squares = np.arange(5) ** 2
    column = squares[:, np.newaxis]

    clusterer = MSTClustering(cutoff=0)
    clusterer.fit(column)
    segments = clusterer.get_graph_segments()

    # One-dimensional input -> exactly one coordinate array.
    assert len(segments) == 1

    expected = [[0, 4, 4, 9],
                [1, 1, 9, 16]]
    assert_allclose(segments[0], expected)
def check_graph_segments_vals():
    """Verify graph-segment values on the squares 0, 1, 4, 9, 16.

    Fits an ``MSTClustering`` with ``cutoff=0`` to the five points (as a
    single-feature column) and compares the single returned segment array
    against the known coordinates.
    """
    X = (np.arange(5) ** 2).reshape(-1, 1)
    model = MSTClustering(cutoff=0).fit(X)
    result = model.get_graph_segments()

    assert len(result) == 1
    (only_dim,) = result
    assert_allclose(only_dim, [[0, 4, 4, 9], [1, 1, 9, 16]])
def _mean_cluster_linearity(X, labels):
    """Return the mean width-to-length ratio of the labelled clusters.

    For every label other than ``-1`` the first three columns of the member
    points are rotated onto their principal axes; the ratio is the standard
    deviation along the second axis divided by the extent along the first.
    Returns ``None`` when ``labels`` contains no cluster at all.
    """
    ratios = []
    for label in np.unique(labels):
        if label == -1:
            continue
        members = X[labels == label, 0:3]
        pca = PCA(n_components=3)
        pca.fit(members)
        X_prime = pca.transform(members)
        ratios.append(X_prime[:, 1].std() / np.ptp(X_prime[:, 0]))
    if not ratios:
        return None
    return np.mean(ratios)


def cluster_positions(positions, plots=False, cutoff_scale_min=120,
                      cutoff_scale_max=350, cutoff_scale_resolution=150,
                      n_neighbors_max=5, min_cluster_size=4):
    """
    Find clusters in the positions produced by
    `~shampoo.track2d.locate_from_hologram` using a Minimum Spanning Tree.

    A grid search over ``(cutoff_scale, n_neighbors)`` picks the clustering
    that minimizes the mean cluster width-to-length ratio divided by the
    fraction of positions that received a label; that clustering is then
    recomputed and returned.

    Parameters
    ----------
    positions : `~numpy.ndarray`
        Positions for each detected specimen in each frame, with values
        specified by `~shampoo.track2d.locate_from_hologram`. Only the first
        four columns are used (time, x, y, max intensity, judging by the
        scaling comment and plot labels below).
    plots : bool (optional)
        Plot the scores for the grid search in
        ``(cutoff_scale, n_neighbors)`` space and the best clusters
    cutoff_scale_min, cutoff_scale_max : float (optional)
        Bounds of the ``cutoff_scale`` grid.
    cutoff_scale_resolution : int (optional)
        Number of ``cutoff_scale`` values sampled between the bounds.
    n_neighbors_max : int (optional)
        Exclusive upper bound of the ``n_neighbors`` grid, i.e. the values
        ``1 .. n_neighbors_max - 1`` are tried.
    min_cluster_size : int (optional)
        Minimum cluster size, used both during the grid search and for the
        final clustering.

    Returns
    -------
    labels : `~numpy.ndarray`
        Cluster labels for each position in ``positions``. `-1` represents
        positions without a cluster.
    """
    # Scale the time and max_intensity dims similarly to the spatial
    # dimensions. ``np.ptp`` is used instead of the ``ndarray.ptp`` method,
    # which no longer exists in NumPy 2.
    X = positions.copy()
    X = X[:, 0:4]
    X[:, 0] *= 5 * np.ptp(positions[:, 1]) / np.ptp(positions[:, 0])
    X[:, 3] *= np.ptp(positions[:, 1]) / np.ptp(positions[:, 3])

    # Grid search in (cutoff_scales, n_neighbors) for the best clustering
    # params
    cutoff_scales = np.linspace(cutoff_scale_min, cutoff_scale_max,
                                cutoff_scale_resolution)
    n_neighbors = np.arange(1, n_neighbors_max)
    scores = np.zeros((len(cutoff_scales), len(n_neighbors)),
                      dtype=np.float64)

    for i, cutoff_scale in enumerate(cutoff_scales):
        for j, k_neighbors in enumerate(n_neighbors):
            model = MSTClustering(cutoff_scale=cutoff_scale,
                                  approximate=True,
                                  n_neighbors=k_neighbors,
                                  min_cluster_size=min_cluster_size)
            labels = model.fit_predict(X)

            linearity = _mean_cluster_linearity(X, labels)
            if linearity is None:
                # Nothing was clustered: this parameter pair can never be
                # the best one (and would otherwise evaluate to 0/0 = NaN).
                scores[i, j] = np.inf
                continue

            f_labeled = np.count_nonzero(labels != -1) / float(len(labels))
            scores[i, j] = linearity / f_labeled

    # With the best clustering parameters, label the clusters. NaN scores
    # (e.g. from a zero-length cluster) are ranked last so they cannot hide
    # the real minimum; ties resolve to the first grid cell in row-major
    # order.
    ranked = np.where(np.isnan(scores), np.inf, scores)
    best_i, best_j = np.unravel_index(np.argmin(ranked), ranked.shape)
    n_neighbors_min = n_neighbors[best_j]
    cutoff_scale_best = cutoff_scales[best_i]

    print(n_neighbors_min, cutoff_scale_best)

    model = MSTClustering(cutoff_scale=cutoff_scale_best, approximate=True,
                          n_neighbors=n_neighbors_min,
                          min_cluster_size=min_cluster_size)
    labels = model.fit_predict(X)

    if plots:
        # Plot the scores in (cutoff_scales, n_neighbors) space
        fig, ax = plt.subplots(figsize=(16, 10))
        ax.imshow(np.log(scores).T, interpolation='nearest',
                  origin='lower', cmap=plt.cm.viridis)
        ax.set_xticks(range(len(cutoff_scales))[::5])
        ax.set_xticklabels(["{0:.2f}".format(cutoff_scale)
                            for cutoff_scale in cutoff_scales[::5]])
        ax.set_yticks(range(len(n_neighbors)))
        ax.set_yticklabels(range(1, len(n_neighbors) + 1))

        for tick_label in ax.get_xticklabels():
            tick_label.set_rotation(45)
            tick_label.set_ha('right')

        ax.set_xlabel('cutoff')
        ax.set_ylabel('n_neighbors')
        ax.set_aspect(10)

        # Plot the best clusters
        plot_segments = True

        fig, ax = plt.subplots(1, 3, figsize=(16, 6))
        kwargs = dict(s=100, alpha=0.6, edgecolor='none',
                      cmap=plt.cm.Spectral, c=labels)
        ax[0].scatter(X[:, 0], X[:, 1], **kwargs)
        ax[1].scatter(X[:, 0], X[:, 2], **kwargs)
        ax[2].scatter(X[:, 1], X[:, 2], **kwargs)
        ax[0].set(xlabel='t', ylabel='x')
        ax[1].set(xlabel='t', ylabel='y')
        ax[2].set(xlabel='x', ylabel='y')

        if plot_segments:
            segments = model.get_graph_segments(full_graph=False)
            ax[0].plot(segments[0], segments[1], '-k')
            ax[1].plot(segments[0], segments[2], '-k')
            ax[2].plot(segments[1], segments[2], '-k')

        fig.tight_layout()
        plt.show()

    return labels
class MST:
    """
    Minimum Spanning Tree Class

    Compute MST for a set of input points using the MSTClustering code from
    jakevdp, calculate branch lengths from the MST and generate plots of the
    MST and cumulative distribution of branch lengths.

    ---- Inputs ----

    data frame "df", which has two columns present:
       - ra: right ascension (deg)
       - dec: declination (deg)

    cutoff_scale (float): minimum size of edges, also known as the critical
    branch length. All edges larger than cutoff_scale will be removed.

    min_cluster_size (int): min number of galaxies in a cluster.

    n_neighbors (int): maximum number of neighbors of each point used for
    approximate Euclidean MST algorithm.

    set_mst, labels, segments, seps: accepted for backward compatibility
    only; the values passed in are ignored and the attributes of the same
    name are always computed from ``df``.

    ---- Attributes ----

    set_mst: the fitted MSTClustering instance.

    labels: integer specifying the structure to which a given galaxy has
    been assigned. It will have a -1 if no membership was assigned.

    segments: sets of ra, dec coordinates for the MST branch segments

    seps: base-10 log of branch lengths (in degrees)
    """

    def __init__(self, df, cutoff_scale=None, min_cluster_size=None,
                 n_neighbors=None, set_mst=None, labels=None, segments=None,
                 seps=None):
        """Fit the MST to ``df.ra`` / ``df.dec`` and derive all attributes."""
        self.df = df
        self.cutoff_scale = cutoff_scale
        self.min_cluster_size = min_cluster_size
        self.n_neighbors = n_neighbors

        # NOTE(review): ``None`` defaults are forwarded verbatim to
        # MSTClustering -- confirm that it accepts them.
        self.set_mst = MSTClustering(cutoff_scale=cutoff_scale,
                                     min_cluster_size=min_cluster_size,
                                     n_neighbors=n_neighbors)

        # One (ra, dec) row per object.
        pos = np.column_stack((df.ra, df.dec))

        self.labels = self.set_mst.fit_predict(pos)
        self.segments = self.set_mst.get_graph_segments(full_graph=True)
        self.seps = self.get_sep_mst()

    def get_sep_mst(self):
        """
        Calculate branch lengths (in base-10 log(degrees)) from the MST
        segments
        """
        # segments[0] holds the ra of both branch end points, segments[1]
        # the matching dec values.
        mst_coord0_ra = np.asarray(self.segments[0][0])
        mst_coord1_ra = np.asarray(self.segments[0][1])
        mst_coord0_dec = np.asarray(self.segments[1][0])
        mst_coord1_dec = np.asarray(self.segments[1][1])

        c0 = SkyCoord(mst_coord0_ra, mst_coord0_dec, unit=u.deg)
        c1 = SkyCoord(mst_coord1_ra, mst_coord1_dec, unit=u.deg)
        return np.log10(c0.separation(c1).degree)

    def plot_mst(self, model, cmap='rainbow', *args, **kwargs):
        """
        Plot the MST diagram (left) and the labeled structures identified
        from the MST (right)

        ``model`` is a fitted MSTClustering instance (e.g. ``self.set_mst``);
        it is only read, never modified.

        Recognized keyword arguments:
           - xlim, ylim: axis limits applied to both panels (default: None)
           - s: scatter marker size (default: 8)
           - savefigure: save the figure to ``figname`` when true
             (default: False)
           - figname: output file name (default: 'MST_figure.png')
        """
        xlim = kwargs.get('xlim', None)
        ylim = kwargs.get('ylim', None)
        ssize = kwargs.get('s', 8)
        savefigure = kwargs.get('savefigure', False)
        figname = kwargs.get('figname', 'MST_figure.png')

        X = model.X_fit_

        # One little hack to get more clear color differentiation between
        # the points with cluster membership and without: add 50 to the
        # label numbers of those that are cluster members. This is done on
        # a copy so that the model's labels are left untouched and repeated
        # calls do not keep shifting them.
        member_colors = np.where(model.labels_ > -1,
                                 model.labels_ + 50,
                                 model.labels_)

        fig, ax = plt.subplots(1, 2, figsize=(20, 7),
                               sharex=True, sharey=True)

        panels = zip(ax, [True, False], ['lightblue', member_colors])
        for axi, full_graph, colors in panels:
            segments = model.get_graph_segments(full_graph=full_graph)
            axi.plot(segments[0], segments[1], '-k', zorder=1, lw=1)
            # Label the panel being drawn, not whichever axes happen to be
            # current in pyplot's global state.
            axi.set_xlabel('Right Ascension (deg)', size=14)
            axi.set_ylabel('Declination (deg)', size=14)
            axi.scatter(X[:, 0], X[:, 1], c=colors, cmap=cmap, zorder=2,
                        s=ssize)
            axi.axis('tight')
            if xlim is not None:
                axi.set_xlim(xlim)
            if ylim is not None:
                axi.set_ylim(ylim)

        ax[0].set_title('Full Minimum Spanning Tree', size=16)
        ax[1].set_title('Trimmed Minimum Spanning Tree', size=16)

        # Leave an option to save all the plots to output PNG files.
        if savefigure:
            fig.savefig(figname, bbox_inches='tight', dpi=250)

    def plot_mst_cumul(self, *args, **kwargs):
        """
        Plot the cumulative distribution of MST branch lengths

        Recognized keyword arguments:
           - savefigure: save the figure to ``figname`` when true
             (default: False)
           - figname: output file name (default: 'MST_cumul_dist.png')
        """
        savefigure = kwargs.get('savefigure', False)
        figname = kwargs.get('figname', 'MST_cumul_dist.png')

        # NOTE(review): seaborn has deprecated ``distplot``; migrating to
        # ``histplot`` / ``ecdfplot`` would change the figure, so it is
        # left as is.
        sns.distplot(self.seps, hist_kws=dict(cumulative=False),
                     kde_kws=dict(cumulative=True))
        plt.xlabel('log$_{10}$ (MST branch length)', fontsize=15)
        plt.ylabel('Norm. Counts/Cumul. Dist.', fontsize=15)

        # Leave an option to save all the plots to output PNG files.
        if savefigure:
            plt.savefig(figname, bbox_inches='tight')