def analyze_zN(z, outdir, vg, skip_umap=False):
    '''
    Run PCA, k-means (K=20) and optionally UMAP on latent encodings, generate
    volumes along PC1/PC2 and at the k-means centers, and save summary plots.

    Inputs:
        z: 2D array of latent encodings; z.shape[1] is the latent dimensionality
        outdir: base directory for all outputs
        vg: object exposing gen_volumes(output_dir, z_values)
        skip_umap: if True, do not run UMAP (UMAP is also skipped when zdim <= 2)

    Outputs (all under outdir)
        kmeans20/labels.pkl, kmeans20/centers.txt, kmeans20/centers_ind.txt
        volumes in pc1/, pc2/ and kmeans20/ (written by vg.gen_volumes)
        umap.pkl, umap.png, kmeans20/umap.png (only when UMAP is run)
        z_pca.png, kmeans20/z_pca.png
    '''
    zdim = z.shape[1]
    # UMAP is only computed, saved and plotted under this single condition
    do_umap = zdim > 2 and not skip_umap

    # Principal component analysis
    log('Perfoming principal component analysis...')
    pc, pca = analysis.run_pca(z)
    # Traverse each PC between the 5th and 95th percentile of the projected data
    start, end = np.percentile(pc[:, 0], (5, 95))
    z_pc1 = analysis.get_pc_traj(pca, zdim, 10, 1, start, end)
    start, end = np.percentile(pc[:, 1], (5, 95))
    z_pc2 = analysis.get_pc_traj(pca, zdim, 10, 2, start, end)

    # kmeans clustering
    log('K-means clustering...')
    K = 20
    kmeans_dir = f'{outdir}/kmeans{K}'
    kmeans_labels, centers = analysis.cluster_kmeans(z, K)
    # Replace each center by a point of z (and keep that point's index)
    centers, centers_ind = analysis.get_nearest_point(z, centers)
    # exist_ok avoids the check-then-create race and tolerates reruns
    os.makedirs(kmeans_dir, exist_ok=True)
    utils.save_pkl(kmeans_labels, f'{kmeans_dir}/labels.pkl')
    np.savetxt(f'{kmeans_dir}/centers.txt', centers)
    np.savetxt(f'{kmeans_dir}/centers_ind.txt', centers_ind, fmt='%d')

    # Generate volumes
    log('Generating volumes...')
    vg.gen_volumes(f'{outdir}/pc1', z_pc1)
    vg.gen_volumes(f'{outdir}/pc2', z_pc2)
    vg.gen_volumes(kmeans_dir, centers)

    # UMAP -- slow step
    if do_umap:
        log('Running UMAP...')
        umap_emb = analysis.run_umap(z)
        utils.save_pkl(umap_emb, f'{outdir}/umap.pkl')

    # Make some plots
    log('Generating plots...')
    plt.figure(1)
    plt.scatter(pc[:, 0], pc[:, 1], alpha=.1, s=2)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.savefig(f'{outdir}/z_pca.png')

    if do_umap:
        plt.figure(2)
        plt.scatter(umap_emb[:, 0], umap_emb[:, 1], alpha=.1, s=2)
        plt.xlabel('UMAP1')
        plt.ylabel('UMAP2')
        plt.savefig(f'{outdir}/umap.png')

    # Same embeddings, coloured by k-means cluster with the centers annotated
    analysis.plot_by_cluster(pc[:, 0], pc[:, 1], K, kmeans_labels,
                             centers_ind=centers_ind, annotate=True)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.savefig(f'{kmeans_dir}/z_pca.png')

    if do_umap:
        analysis.plot_by_cluster(umap_emb[:, 0], umap_emb[:, 1], K, kmeans_labels,
                                 centers_ind=centers_ind, annotate=True)
        plt.xlabel('UMAP1')
        plt.ylabel('UMAP2')
        plt.savefig(f'{kmeans_dir}/umap.png')
def generate_volumes(z, outdir, vg, K):
    '''
    Sketch the latent distribution with k-means and generate one volume per center.

    Inputs:
        z: array of latent encodings passed to k-means
        outdir: base directory; outputs go to {outdir}/kmeans{K}
        vg: object exposing gen_volumes(output_dir, z_values)
        K: number of k-means clusters

    Outputs (under {outdir}/kmeans{K})
        labels.pkl with the k-means label array
        centers.txt with the latent values of the on-data centers
        centers_ind.txt with the integer indices of those centers in z
        volumes written by vg.gen_volumes
    '''
    # kmeans clustering
    log('Sketching distribution...')
    kmeans_labels, centers = analysis.cluster_kmeans(z, K, on_data=True, reorder=True)
    # Replace each center by a point of z (and keep that point's index)
    centers, centers_ind = analysis.get_nearest_point(z, centers)

    kmeans_dir = f'{outdir}/kmeans{K}'
    # exist_ok avoids the check-then-create race and tolerates reruns
    os.makedirs(kmeans_dir, exist_ok=True)
    utils.save_pkl(kmeans_labels, f'{kmeans_dir}/labels.pkl')
    np.savetxt(f'{kmeans_dir}/centers.txt', centers)
    np.savetxt(f'{kmeans_dir}/centers_ind.txt', centers_ind, fmt='%d')

    log('Generating volumes...')
    vg.gen_volumes(kmeans_dir, centers)
def follow_candidate_particles(workdir, outdir, epochs, n_dim, binned_ptcls_mask, labels, LOG):
    '''
    Monitor how the labeled set of particles migrates within latent space at selected epochs over training

    Inputs:
        workdir: path to directory containing cryodrgn training results
        outdir: path to base directory to save outputs
        epochs: array of epochs for which to calculate UMAPs
        n_dim: latent dimensionality
        binned_ptcls_mask: (n_particles, len(labels)) binary mask of which particles belong to which class
        labels: unique identifier for each class of representative latent encodings
        LOG: log destination forwarded to flog

    Outputs
        plot.png tracking representative latent encodings through epochs
        latent.txt of representative latent encodings for each epoch

    Raises
        ValueError: if epochs is empty

    Reads {outdir}/ind_subset.pkl, {outdir}/umaps/umap.{epoch}.pkl and {workdir}/z.{epoch}.pkl.
    Writes into {outdir}/repr_particles and {outdir}/plots; this function does not create them.
    '''
    n_epochs = len(epochs)
    if n_epochs == 0:
        # Without this the grid computation below divides by zero
        raise ValueError('epochs must contain at least one epoch')

    # track sketched points from epoch E through selected previous epochs and plot overtop UMAP embedding
    n_cols = int(np.ceil(n_epochs ** 0.5))
    n_rows = int(np.ceil(n_epochs / n_cols))

    # squeeze=False keeps `axes` a 2D array even for a 1x1 or 1xN grid, so a
    # single epoch no longer yields a bare Axes object without .flat/.shape
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(2 * n_cols, 2 * n_rows),
                             sharex='all', sharey='all', squeeze=False)
    fig.tight_layout()

    ind_subset = utils.load_pkl(f'{outdir}/ind_subset.pkl')
    # zip stops at the last epoch, leaving surplus grid cells blank without
    # needing to swallow IndexError (which would also hide genuine indexing bugs)
    for epoch, ax in zip(epochs, axes.flat):
        umap = utils.load_pkl(f'{outdir}/umaps/umap.{epoch}.pkl')
        z = utils.load_pkl(f'{workdir}/z.{epoch}.pkl')[ind_subset, :]

        # find median latent value of each maximum in a given epoch
        z_maxima_median = np.zeros((len(labels), n_dim))
        for k in range(len(labels)):
            z_maxima_median[k, :] = np.median(z[binned_ptcls_mask[:, k]], axis=0)

        # find on-data latent encoding of each median latent value
        z_maxima_median_ondata, z_maxima_median_ondata_ind = analysis.get_nearest_point(z, z_maxima_median)
        # find on-data UMAP embedding of each median latent encoding
        umap_maxima_median_ondata = umap[z_maxima_median_ondata_ind]

        # Write out the on-data median latent values of each labeled set of particles for each epoch in epochs
        latent_path = f'{outdir}/repr_particles/latent_representative.{epoch}.txt'
        with open(latent_path, 'w') as f:
            np.savetxt(f, z_maxima_median_ondata, delimiter=' ', newline='\n',
                       header='', footer='', comments='# ')
        flog(f'Saved representative latent encodings for epoch {epoch} to {latent_path}', LOG)

        for k, label in enumerate(labels):
            # offset the text slightly so it does not sit on top of its marker
            ax.text(x=umap_maxima_median_ondata[k, 0] + 0.3,
                    y=umap_maxima_median_ondata[k, 1] + 0.3,
                    s=label,
                    fontdict=dict(color='r', size=10))
        toplot = ax.hexbin(*umap.T, bins='log', mincnt=1)
        ax.scatter(umap_maxima_median_ondata[:, 0], umap_maxima_median_ondata[:, 1],
                   s=10, linewidth=0, c='r', alpha=1)
        ax.set_title(f'epoch {epoch}')

    # Axes are shared, so only label the left column and the bottom row
    for a in axes[:, 0]:
        a.set_ylabel('UMAP2')
    for a in axes[-1, :]:
        a.set_xlabel('UMAP1')

    # Colorbar in its own axes to the right, using the last hexbin drawn
    fig.subplots_adjust(right=0.96)
    cbar_ax = fig.add_axes([0.98, 0.15, 0.02, 0.7])
    cbar = fig.colorbar(toplot, cax=cbar_ax)
    cbar.ax.set_ylabel('Particle Density', rotation=90)

    fig.subplots_adjust(wspace=0.1)
    fig.subplots_adjust(hspace=0.25)
    plot_path = f'{outdir}/plots/04_decoder_maxima-sketch-consistency.png'
    fig.savefig(plot_path, dpi=300, format='png', transparent=True, bbox_inches='tight')
    flog(f'Saved plot tracking representative latent encodings through epochs {epochs} to {plot_path}', LOG)
def analyze_zN(z, outdir, vg, skip_umap=False, num_pcs=2, num_ksamples=20):
    '''
    Run PCA, k-means and optionally UMAP on latent encodings, generate volumes
    along the leading PCs and at the k-means centers, and save summary plots.

    Inputs:
        z: 2D array of latent encodings; z.shape[1] is the latent dimensionality
        outdir: base directory for all outputs
        vg: object exposing gen_volumes(output_dir, z_values)
        skip_umap: if True, do not run UMAP (UMAP is also skipped when zdim <= 2)
        num_pcs: number of principal components to traverse and plot
        num_ksamples: number of k-means clusters (K)

    Outputs (all under outdir)
        volumes in pc{i}/ for i = 1..num_pcs and in kmeans{K}/ (written by vg.gen_volumes)
        kmeans{K}/labels.pkl, kmeans{K}/centers.txt, kmeans{K}/centers_ind.txt
        z_pca.png, z_pca_hexbin.png, kmeans{K}/z_pca.png, kmeans{K}/z_pca_hex.png
        when UMAP is run: umap.pkl, umap.png, umap_hexbin.png,
            kmeans{K}/umap.png, kmeans{K}/umap_hex.png, pc{i}/umap.png
    '''
    zdim = z.shape[1]
    # umap_emb only exists under this condition; every UMAP plot must honour it
    do_umap = zdim > 2 and not skip_umap

    # Principal component analysis
    log('Perfoming principal component analysis...')
    pc, pca = analysis.run_pca(z)

    log('Generating volumes...')
    for i in range(num_pcs):
        # Traverse each PC between the 5th and 95th percentile of the projected data
        start, end = np.percentile(pc[:, i], (5, 95))
        z_pc = analysis.get_pc_traj(pca, zdim, 10, i + 1, start, end)
        vg.gen_volumes(f'{outdir}/pc{i+1}', z_pc)

    # kmeans clustering
    log('K-means clustering...')
    K = num_ksamples
    kmeans_dir = f'{outdir}/kmeans{K}'
    kmeans_labels, centers = analysis.cluster_kmeans(z, K)
    # Replace each center by a point of z (and keep that point's index)
    centers, centers_ind = analysis.get_nearest_point(z, centers)
    # exist_ok avoids the check-then-create race and tolerates reruns
    os.makedirs(kmeans_dir, exist_ok=True)
    utils.save_pkl(kmeans_labels, f'{kmeans_dir}/labels.pkl')
    np.savetxt(f'{kmeans_dir}/centers.txt', centers)
    np.savetxt(f'{kmeans_dir}/centers_ind.txt', centers_ind, fmt='%d')
    log('Generating volumes...')
    vg.gen_volumes(kmeans_dir, centers)

    # UMAP -- slow step
    if do_umap:
        log('Running UMAP...')
        umap_emb = analysis.run_umap(z)
        utils.save_pkl(umap_emb, f'{outdir}/umap.pkl')

    # Make some plots
    # NOTE(review): sns.jointplot builds its own figure, so the plt.figure(n)
    # calls below look redundant; kept as-is pending confirmation.
    log('Generating plots...')
    plt.figure(1)
    g = sns.jointplot(x=pc[:, 0], y=pc[:, 1], alpha=.1, s=2)
    g.set_axis_labels('PC1', 'PC2')
    plt.tight_layout()
    plt.savefig(f'{outdir}/z_pca.png')

    plt.figure(2)
    g = sns.jointplot(x=pc[:, 0], y=pc[:, 1], kind='hex')
    g.set_axis_labels('PC1', 'PC2')
    plt.tight_layout()
    plt.savefig(f'{outdir}/z_pca_hexbin.png')

    if do_umap:
        plt.figure(3)
        g = sns.jointplot(x=umap_emb[:, 0], y=umap_emb[:, 1], alpha=.1, s=2)
        g.set_axis_labels('UMAP1', 'UMAP2')
        plt.tight_layout()
        plt.savefig(f'{outdir}/umap.png')

        plt.figure(4)
        g = sns.jointplot(x=umap_emb[:, 0], y=umap_emb[:, 1], kind='hex')
        g.set_axis_labels('UMAP1', 'UMAP2')
        plt.tight_layout()
        plt.savefig(f'{outdir}/umap_hexbin.png')

    # PCA embedding with the k-means centers annotated
    analysis.scatter_annotate(pc[:, 0], pc[:, 1], centers_ind=centers_ind, annotate=True)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.savefig(f'{kmeans_dir}/z_pca.png')

    g = analysis.scatter_annotate_hex(pc[:, 0], pc[:, 1], centers_ind=centers_ind, annotate=True)
    g.set_axis_labels('PC1', 'PC2')
    plt.tight_layout()
    plt.savefig(f'{kmeans_dir}/z_pca_hex.png')

    if do_umap:
        # UMAP embedding with the k-means centers annotated
        analysis.scatter_annotate(umap_emb[:, 0], umap_emb[:, 1], centers_ind=centers_ind, annotate=True)
        plt.xlabel('UMAP1')
        plt.ylabel('UMAP2')
        plt.savefig(f'{kmeans_dir}/umap.png')

        g = analysis.scatter_annotate_hex(umap_emb[:, 0], umap_emb[:, 1], centers_ind=centers_ind, annotate=True)
        g.set_axis_labels('UMAP1', 'UMAP2')
        plt.tight_layout()
        plt.savefig(f'{kmeans_dir}/umap_hex.png')

        # UMAP coloured by each PC. Previously guarded only by `not skip_umap`,
        # which raised NameError on umap_emb whenever zdim <= 2.
        # The pc{i} directories are presumably created by vg.gen_volumes above.
        for i in range(num_pcs):
            analysis.scatter_color(umap_emb[:, 0], umap_emb[:, 1], pc[:, i], label=f'PC{i+1}')
            plt.xlabel('UMAP1')
            plt.ylabel('UMAP2')
            plt.tight_layout()
            plt.savefig(f'{outdir}/pc{i+1}/umap.png')