Ejemplo n.º 1
0
def analyze_zN(z, outdir, vg, skip_umap=False, num_ksamples=20):
    """Analyze latent embeddings with PCA, k-means and (optionally) UMAP.

    Outputs written under ``outdir``:

    * ``pc1/`` and ``pc2/`` -- volumes generated by ``vg`` for trajectories
      along the first two principal components, each spanning the 5th to
      95th percentile of that component's values.
    * ``kmeans{K}/`` -- ``labels.pkl``, ``centers.txt``, ``centers_ind.txt``,
      the volumes generated at the cluster centers, and cluster-colored
      ``z_pca.png`` / ``umap.png`` plots.
    * ``umap.pkl``, ``z_pca.png`` and ``umap.png`` at the top level.

    UMAP outputs are only produced when ``z`` has more than two columns
    and ``skip_umap`` is false.

    Args:
        z: Array whose second axis is the latent dimension (``z.shape[1]``
            is read); presumably shaped (n_points, zdim) -- confirm
            against the caller.
        outdir: Output directory prefix for every file written here.
        vg: Object exposing ``gen_volumes(path, z_values)``.
        skip_umap: If true, skip the (slow) UMAP embedding and its plots.
        num_ksamples: Number of k-means clusters ``K``; also determines the
            ``kmeans{K}`` sub-directory name. Defaults to 20, which
            reproduces the previously hard-coded behavior.
    """
    zdim = z.shape[1]
    # Single source of truth for "was a UMAP embedding computed?".
    do_umap = zdim > 2 and not skip_umap

    # Principal component analysis
    log('Perfoming principal component analysis...')
    pc, pca = analysis.run_pca(z)
    # The literal 10 is forwarded to get_pc_traj unchanged; presumably the
    # number of points sampled along the trajectory -- TODO confirm.
    start, end = np.percentile(pc[:, 0], (5, 95))
    z_pc1 = analysis.get_pc_traj(pca, zdim, 10, 1, start, end)
    start, end = np.percentile(pc[:, 1], (5, 95))
    z_pc2 = analysis.get_pc_traj(pca, zdim, 10, 2, start, end)

    # kmeans clustering
    log('K-means clustering...')
    K = num_ksamples
    kmeans_dir = f'{outdir}/kmeans{K}'
    kmeans_labels, centers = analysis.cluster_kmeans(z, K)
    # Presumably snaps each centroid to its nearest point in z and returns
    # that point's index -- verify against analysis.get_nearest_point.
    centers, centers_ind = analysis.get_nearest_point(z, centers)
    # exist_ok avoids the exists()/mkdir() race and also creates outdir
    # itself when it is missing.
    os.makedirs(kmeans_dir, exist_ok=True)
    utils.save_pkl(kmeans_labels, f'{kmeans_dir}/labels.pkl')
    np.savetxt(f'{kmeans_dir}/centers.txt', centers)
    np.savetxt(f'{kmeans_dir}/centers_ind.txt', centers_ind, fmt='%d')

    # Generate volumes
    log('Generating volumes...')
    vg.gen_volumes(f'{outdir}/pc1', z_pc1)
    vg.gen_volumes(f'{outdir}/pc2', z_pc2)
    vg.gen_volumes(kmeans_dir, centers)

    # UMAP -- slow step
    if do_umap:
        log('Running UMAP...')
        umap_emb = analysis.run_umap(z)
        utils.save_pkl(umap_emb, f'{outdir}/umap.pkl')

    # Make some plots
    log('Generating plots...')
    plt.figure(1)
    plt.scatter(pc[:, 0], pc[:, 1], alpha=.1, s=2)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.savefig(f'{outdir}/z_pca.png')

    if do_umap:
        plt.figure(2)
        plt.scatter(umap_emb[:, 0], umap_emb[:, 1], alpha=.1, s=2)
        plt.xlabel('UMAP1')
        plt.ylabel('UMAP2')
        plt.savefig(f'{outdir}/umap.png')

    # Same projections again, this time colored/annotated by k-means cluster.
    analysis.plot_by_cluster(pc[:, 0], pc[:, 1], K, kmeans_labels,
                             centers_ind=centers_ind, annotate=True)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.savefig(f'{kmeans_dir}/z_pca.png')

    if do_umap:
        analysis.plot_by_cluster(umap_emb[:, 0], umap_emb[:, 1], K,
                                 kmeans_labels, centers_ind=centers_ind,
                                 annotate=True)
        plt.xlabel('UMAP1')
        plt.ylabel('UMAP2')
        plt.savefig(f'{kmeans_dir}/umap.png')
Ejemplo n.º 2
0
def generate_volumes(z, outdir, vg, K):
    """Cluster ``z`` with k-means and generate one volume per cluster center.

    Writes ``labels.pkl``, ``centers.txt`` and ``centers_ind.txt`` into
    ``{outdir}/kmeans{K}`` and then asks ``vg`` to generate volumes for the
    centers in that same directory.

    Args:
        z: Data passed to ``analysis.cluster_kmeans``; presumably the
            latent embeddings, one row per point -- confirm with caller.
        outdir: Parent directory for the ``kmeans{K}`` output directory.
        vg: Object exposing ``gen_volumes(path, z_values)``.
        K: Number of k-means clusters; also used in the directory name.
    """
    # kmeans clustering
    log('Sketching distribution...')
    kmeans_labels, centers = analysis.cluster_kmeans(z,
                                                     K,
                                                     on_data=True,
                                                     reorder=True)
    # Presumably maps each center to its nearest point in z and returns
    # that point's index -- verify against analysis.get_nearest_point.
    centers, centers_ind = analysis.get_nearest_point(z, centers)

    kmeans_dir = f'{outdir}/kmeans{K}'
    # exist_ok avoids the exists()/mkdir() race and also creates outdir
    # itself when it is missing.
    os.makedirs(kmeans_dir, exist_ok=True)
    utils.save_pkl(kmeans_labels, f'{kmeans_dir}/labels.pkl')
    np.savetxt(f'{kmeans_dir}/centers.txt', centers)
    np.savetxt(f'{kmeans_dir}/centers_ind.txt', centers_ind, fmt='%d')

    log('Generating volumes...')
    vg.gen_volumes(kmeans_dir, centers)
Ejemplo n.º 3
0
def main(args):
    """Cluster pickled data with k-means and plot the clusters in PC space.

    Attributes read from ``args``:
        input: Path to a pickle file containing the data ``z``.
        stride: If truthy, keep only every ``stride``-th row of ``z``.
        k: Number of clusters.
        on_data, reorder: Forwarded to ``analysis.cluster_kmeans``. When
            ``on_data`` is truthy, each center is additionally replaced by
            its nearest data point.
        out_k_ind: Optional path for the indices of those nearest points
            (only written when ``on_data`` is truthy).
        o: Optional path for the pickled cluster labels.
        out_k: Optional path for the cluster centers (text).
        out_png: Optional path for the figure; if falsy the figure is
            shown interactively instead.
    """
    # Created first so the plt.* calls below draw into this figure's axes.
    _, ax = plt.subplots()
    print(args)
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only run this on trusted inputs.
    with open(args.input, 'rb') as f:
        z = pickle.load(f)
    if args.stride:
        z = z[::args.stride]
    print('{} points'.format(len(z)))

    # k-means clustering
    labels, centers = analysis.cluster_kmeans(z,
                                              args.k,
                                              on_data=args.on_data,
                                              reorder=args.reorder)

    # use the nearest data point instead of cluster centroid
    if args.on_data:
        centers_zi = cdist(centers, z).argmin(axis=1)
        print(centers_zi)
        centers = z[centers_zi]
        if args.out_k_ind:
            np.savetxt(args.out_k_ind, centers_zi, fmt='%d')

    if args.o:
        with open(args.o, 'wb') as f:
            pickle.dump(labels, f)

    if args.out_k:
        np.savetxt(args.out_k, centers)

    # dimensionality reduction for viz
    pca = PCA(z.shape[1])
    pca.fit(z)
    print('PCA explained variance ratio:')
    print(pca.explained_variance_ratio_)
    pc = pca.transform(z)

    # One scatter call per cluster so each gets its own color and label.
    for i in range(args.k):
        ii = np.where(labels == i)
        pc_sub = pc[ii]
        plt.scatter(pc_sub[:, 0],
                    pc_sub[:, 1],
                    s=2,
                    alpha=0.1,
                    label='cluster {}'.format(i))

    # Overlay the cluster centers in black and number them.
    c = pca.transform(centers)
    plt.scatter(c[:, 0], c[:, 1], c='k')
    for i in range(args.k):
        ax.annotate(str(i), c[i, 0:2])

    # Axis labels carry the explained-variance ratio of the plotted PCs.
    # The original spec '{:3f}' set a minimum width of 3 (a no-op for a
    # float printed with 6 decimals); '{:.3f}' gives the evidently intended
    # three decimal places.
    xx, yy = 0, 1
    plt.xlabel('PC{} ({:.3f})'.format(xx + 1,
                                      pca.explained_variance_ratio_[xx]))
    plt.ylabel('PC{} ({:.3f})'.format(yy + 1,
                                      pca.explained_variance_ratio_[yy]))

    if args.out_png:
        plt.savefig(args.out_png)
    else:
        plt.show()
Ejemplo n.º 4
0
def analyze_zN(z, outdir, vg, skip_umap=False, num_pcs=2, num_ksamples=20):
    """Analyze latent embeddings with PCA, k-means and (optionally) UMAP.

    Outputs written under ``outdir``:

    * ``pc{i}/`` for ``i`` in ``1..num_pcs`` -- volumes generated by ``vg``
      for a trajectory along principal component ``i`` spanning the 5th to
      95th percentile of that component's values, plus (when UMAP is run)
      ``umap.png`` colored by that component.
    * ``kmeans{K}/`` -- ``labels.pkl``, ``centers.txt``, ``centers_ind.txt``,
      the volumes generated at the cluster centers, and annotated
      scatter/hexbin plots in PCA and UMAP space.
    * ``z_pca.png``, ``z_pca_hexbin.png``, and when UMAP is run
      ``umap.pkl``, ``umap.png``, ``umap_hexbin.png``.

    UMAP outputs are only produced when ``z`` has more than two columns
    and ``skip_umap`` is false.

    Args:
        z: Array whose second axis is the latent dimension (``z.shape[1]``
            is read); presumably shaped (n_points, zdim) -- confirm
            against the caller.
        outdir: Output directory prefix for every file written here.
        vg: Object exposing ``gen_volumes(path, z_values)``.
        skip_umap: If true, skip the (slow) UMAP embedding and its plots.
        num_pcs: Number of leading principal components to traverse.
        num_ksamples: Number of k-means clusters ``K``.
    """
    zdim = z.shape[1]
    # Single source of truth for "was a UMAP embedding computed?". The final
    # loop previously tested only ``not skip_umap`` and raised NameError on
    # ``umap_emb`` whenever zdim <= 2.
    do_umap = zdim > 2 and not skip_umap

    # Principal component analysis
    log('Perfoming principal component analysis...')
    pc, pca = analysis.run_pca(z)
    log('Generating volumes...')
    for i in range(num_pcs):
        start, end = np.percentile(pc[:, i], (5, 95))
        # The literal 10 is forwarded to get_pc_traj unchanged; presumably
        # the number of points along the trajectory -- TODO confirm.
        z_pc = analysis.get_pc_traj(pca, zdim, 10, i + 1, start, end)
        vg.gen_volumes(f'{outdir}/pc{i+1}', z_pc)

    # kmeans clustering
    log('K-means clustering...')
    K = num_ksamples
    kmeans_dir = f'{outdir}/kmeans{K}'
    kmeans_labels, centers = analysis.cluster_kmeans(z, K)
    # Presumably snaps each centroid to its nearest point in z and returns
    # that point's index -- verify against analysis.get_nearest_point.
    centers, centers_ind = analysis.get_nearest_point(z, centers)
    # exist_ok avoids the exists()/mkdir() race and also creates outdir
    # itself when it is missing.
    os.makedirs(kmeans_dir, exist_ok=True)
    utils.save_pkl(kmeans_labels, f'{kmeans_dir}/labels.pkl')
    np.savetxt(f'{kmeans_dir}/centers.txt', centers)
    np.savetxt(f'{kmeans_dir}/centers_ind.txt', centers_ind, fmt='%d')
    log('Generating volumes...')
    vg.gen_volumes(kmeans_dir, centers)

    # UMAP -- slow step
    if do_umap:
        log('Running UMAP...')
        umap_emb = analysis.run_umap(z)
        utils.save_pkl(umap_emb, f'{outdir}/umap.pkl')

    # Make some plots
    log('Generating plots...')
    # NOTE(review): sns.jointplot appears to create its own figure, so the
    # plt.figure(n) calls may be redundant; kept to preserve behavior.
    plt.figure(1)
    g = sns.jointplot(x=pc[:, 0], y=pc[:, 1], alpha=.1, s=2)
    g.set_axis_labels('PC1', 'PC2')
    plt.tight_layout()
    plt.savefig(f'{outdir}/z_pca.png')

    plt.figure(2)
    g = sns.jointplot(x=pc[:, 0], y=pc[:, 1], kind='hex')
    g.set_axis_labels('PC1', 'PC2')
    plt.tight_layout()
    plt.savefig(f'{outdir}/z_pca_hexbin.png')

    if do_umap:
        plt.figure(3)
        g = sns.jointplot(x=umap_emb[:, 0], y=umap_emb[:, 1], alpha=.1, s=2)
        g.set_axis_labels('UMAP1', 'UMAP2')
        plt.tight_layout()
        plt.savefig(f'{outdir}/umap.png')

        plt.figure(4)
        g = sns.jointplot(x=umap_emb[:, 0], y=umap_emb[:, 1], kind='hex')
        g.set_axis_labels('UMAP1', 'UMAP2')
        plt.tight_layout()
        plt.savefig(f'{outdir}/umap_hexbin.png')

    # Same projections again, annotated with the k-means center indices.
    analysis.scatter_annotate(pc[:, 0],
                              pc[:, 1],
                              centers_ind=centers_ind,
                              annotate=True)
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.savefig(f'{kmeans_dir}/z_pca.png')

    g = analysis.scatter_annotate_hex(pc[:, 0],
                                      pc[:, 1],
                                      centers_ind=centers_ind,
                                      annotate=True)
    g.set_axis_labels('PC1', 'PC2')
    plt.tight_layout()
    plt.savefig(f'{kmeans_dir}/z_pca_hex.png')

    if do_umap:
        analysis.scatter_annotate(umap_emb[:, 0],
                                  umap_emb[:, 1],
                                  centers_ind=centers_ind,
                                  annotate=True)
        plt.xlabel('UMAP1')
        plt.ylabel('UMAP2')
        plt.savefig(f'{kmeans_dir}/umap.png')

        g = analysis.scatter_annotate_hex(umap_emb[:, 0],
                                          umap_emb[:, 1],
                                          centers_ind=centers_ind,
                                          annotate=True)
        g.set_axis_labels('UMAP1', 'UMAP2')
        plt.tight_layout()
        plt.savefig(f'{kmeans_dir}/umap_hex.png')

        # UMAP embedding colored by each principal component's value.
        for i in range(num_pcs):
            analysis.scatter_color(umap_emb[:, 0],
                                   umap_emb[:, 1],
                                   pc[:, i],
                                   label=f'PC{i+1}')
            plt.xlabel('UMAP1')
            plt.ylabel('UMAP2')
            plt.tight_layout()
            plt.savefig(f'{outdir}/pc{i+1}/umap.png')