Ejemplo n.º 1
0
def hc_plot_dendrogram(data,
                       row_colours,
                       mad=None,
                       n_ftr=3000,
                       metric='correlation',
                       **kwargs):
    """
    Plot a single dendrogram showing the result of hierarchical clustering of the data, using only the first
    n_ftr features (selected in descending MAD order).
    :param data: Cols are samples, rows are genes (or similar)
    :param row_colours: As passed to dendrogram routine
    :param mad: Optional pre-computed MAD values indexed like the rows of data. It is used in the order given and
    is NOT re-sorted here, so callers should supply it sorted in descending order. If None, it is computed from
    data and sorted in descending order.
    :param n_ftr: Number of leading entries of mad.index used to select rows of data
    :param metric: Forwarded as the `metric` argument of clustering.dendrogram_with_colours
    :param kwargs: Forwarded to clustering.dendrogram_with_colours. If 'fig_kws' is not supplied, it defaults to
    {'figsize': (5.5, 10)}.
    :return: The return value of clustering.dendrogram_with_colours
    """
    # Only fill in a figure size when the caller has not provided their own figure kwargs
    kwargs.setdefault('fig_kws', {'figsize': (5.5, 10)})

    if mad is None:
        mad = transformations.median_absolute_deviation(data).sort_values(
            ascending=False)

    # Keep the rows corresponding to the first n_ftr entries in the MAD ranking
    the_dat = data.loc[mad.index[:n_ftr]]
    return clustering.dendrogram_with_colours(the_dat,
                                              row_colours,
                                              vertical=False,
                                              metric=metric,
                                              **kwargs)
def hc_plot_dendrogram_vary_n_gene(data,
                                   row_colours,
                                   mad=None,
                                   n_ftr=(1000, 2000, 3000, 5000, 10000),
                                   metric='correlation'):
    """
    Produce one dendrogram per entry of n_ftr, each showing the hierarchical clustering of the data restricted
    to that many features (taken from the front of the MAD ranking).
    :param data: Cols are samples, rows are genes (or similar)
    :param row_colours: As passed to dendrogram routine
    :param mad: Optional pre-computed MAD values; used in the order given. If None, computed from data and sorted
    in descending order.
    :param n_ftr: Iterable of feature counts to test
    :param metric: Forwarded as the `metric` argument of clustering.dendrogram_with_colours
    :return: Dictionary keyed by the entries of n_ftr; each value is the result of
    clustering.dendrogram_with_colours for that feature count
    """
    ranking = mad
    if ranking is None:
        ranking = transformations.median_absolute_deviation(data)
        ranking = ranking.sort_values(ascending=False)

    def _plot_top(n_top):
        # Cluster using only the first n_top rows of the ranking
        subset = data.loc[ranking.index[:n_top]]
        return clustering.dendrogram_with_colours(
            subset,
            row_colours,
            fig_kws={'figsize': (5.5, 10)},
            vertical=False,
            metric=metric)

    return {n_top: _plot_top(n_top) for n_top in n_ftr}
Ejemplo n.º 3
0
def plot_dendrogram(obj_arr,
                    n_by_mad=None,
                    qn_method=None,
                    eps=0.01,
                    min_val=1,
                    n_above_min=3,
                    vertical=False,
                    figsize=(7, 8),
                    **kwargs):
    """
    Filter, log2-transform and (optionally) normalise the supplied data, then draw a dendrogram with colour bars.
    :param obj_arr: Sequence of loader objects. If it holds more than one, they are wrapped in a
    loader.MultipleBatchLoader; otherwise the single element is used directly.
    :param n_by_mad: If not None, keep only this many rows, taken in descending MAD order
    :param qn_method: If not None, apply transformations.quantile_normalisation with this method
    :param eps: Offset added to the data before taking log2
    :param min_val: Forwarded to filter_loader
    :param n_above_min: Forwarded to filter_loader
    :param vertical: Forwarded to clustering.dendrogram_with_colours
    :param figsize: Used as the 'figsize' entry of fig_kws
    :param kwargs: Forwarded to clustering.dendrogram_with_colours
    :return: The return value of clustering.dendrogram_with_colours
    """
    combined = (loader.MultipleBatchLoader(obj_arr)
                if len(obj_arr) > 1 else obj_arr[0])
    combined = filter_loader(combined,
                             min_val=min_val,
                             n_above_min=n_above_min)

    # eps keeps the log finite where the data are zero
    log_dat = np.log2(combined.data + eps)
    if qn_method is not None:
        log_dat = transformations.quantile_normalisation(log_dat,
                                                         method=qn_method)

    if n_by_mad is not None:
        ranking = transformations.median_absolute_deviation(log_dat)
        ranking = ranking.sort_values(ascending=False)
        top_ftr = ranking.index[:n_by_mad]
        log_dat = log_dat.loc[top_ftr]

    # The second returned element is not needed here
    colour_arr, _, legend = construct_colour_array_legend_studies(
        combined.meta)

    return clustering.dendrogram_with_colours(log_dat,
                                              colour_arr,
                                              vertical=vertical,
                                              legend_labels=legend,
                                              fig_kws={'figsize': figsize},
                                              **kwargs)
Ejemplo n.º 4
0
        ax.figure.savefig(os.path.join(
            outdir, "pca_top%d_by_mad_with_names.png" % n_t),
                          dpi=200)

    row_colours = pd.DataFrame('gray', index=our_dat.columns, columns=[''])
    row_colours.loc[row_colours.index.str.contains(
        r'eNSC[0-9]med')] = '#66c2a5'
    row_colours.loc[row_colours.index.str.contains(
        r'eNSC[0-9]mouse')] = '#fc8d62'
    row_colours.loc[row_colours.index.str.contains(
        r'mDura.[AN0-9]*mouse')] = '#8da0cb'
    row_colours.loc[row_colours.index.str.contains(
        r'mDura.[AN0-9]*human')] = '#e78ac3'

    for n_t in n_gene_try:
        fname = "clustering_by_gene_corr_log_top%d_by_mad.{ext}" % n_t

        d = clustering.dendrogram_with_colours(our_dat.loc[mad.index[:n_t]],
                                               row_colours,
                                               fig_kws={'figsize': (10, 5.5)})
        d['fig'].savefig(os.path.join(outdir, fname.format(ext='png')),
                         dpi=200)

        cm = clustering.plot_clustermap(our_dat.loc[mad.index[:n_t]],
                                        cmap='RdBu_r',
                                        metric='correlation',
                                        col_colors=row_colours)
        cm.gs.update(bottom=0.2)
        cm.savefig(os.path.join(
            outdir, "clustermap_by_gene_corr_log_top%d_by_mad.png" % n_t),
                   dpi=200)
                        'Cell type'] = '#f6ffaa'  # pale yellow
    row_colours_all.loc[row_colours_all.index.str.contains(r'ESC'),
                        'Cell type'] = '#8b33dd'  # pale purple
    row_colours_all.loc[row_colours_all.index.str.contains(r'[iI]PSC'),
                        'Cell type'] = '#8b33dd'  # pale purple
    row_colours_all.loc[row_colours_all.index.str.contains(r'[Mm]icroglia'),
                        'Cell type'] = '#ffd8af'  # pale orange
    row_colours_all.loc[row_colours_all.index.str.contains(r'Yanez'),
                        'Cell type'] = '#ffa03d'  # orange

    # these override previously-defined colours
    row_colours_all.loc[row_colours_all.index.str.contains(r'eNSC[0-9]med'),
                        'Cell type'] = '#96ff9d'  # pale green
    row_colours_all.loc[row_colours_all.index.str.contains(r'eNSC[0-9]mouse'),
                        'Cell type'] = '#008408'  # dark green
    row_colours_all.loc[
        row_colours_all.index.str.contains(r'mDura.[AN0-9]*mouse'),
        'Cell type'] = '#3543ff'  # dark blue
    row_colours_all.loc[
        row_colours_all.index.str.contains(r'mDura.[AN0-9]*human'),
        'Cell type'] = '#c4c8ff'  # pale blue

    d = clustering.dendrogram_with_colours(dat.loc[mad.index[:3000]],
                                           row_colours_all,
                                           fig_kws={'figsize': (5.5, 10)},
                                           vertical=False)

    # cm = clustering.plot_correlation_clustermap(
    #     dat.loc[mad.index[:3000]],
    #     row_colors=row_colours_all,
    # )
    fig.savefig(os.path.join(outdir, "filtering_effect.pdf"))

    idx = gene_idx & ((obj.data > 10).sum(axis=1) > 10)
    data = obj.data.loc[idx]
    mad = transformations.median_absolute_deviation(data).sort_values(
        ascending=False)
    logdata = np.log(data + 1)

    # start with a dendrogram
    col_colours = clustering.generate_colour_map_dict(obj.meta,
                                                      'sample',
                                                      matches,
                                                      label='Patient',
                                                      non_matching='gray')
    out = clustering.dendrogram_with_colours(logdata,
                                             col_colours=col_colours,
                                             vertical=False)
    dist = clustering.dendrogram_threshold_by_nclust(out['linkage'], 3)
    out['dendrogram_ax'].axvline(dist, ls='--', c='gray')
    out['fig'].savefig(os.path.join(outdir, "dendrogram_all_genes.png"),
                       dpi=200)
    out['fig'].savefig(os.path.join(outdir, "dendrogram_all_genes.pdf"))

    # repeat but now only use N genes (by MAD)
    # tested and the result is unchanged for most values in the region [500, 5000]
    n_gene = 1500
    out = clustering.dendrogram_with_colours(logdata.loc[mad.index[:n_gene]],
                                             col_colours=col_colours,
                                             vertical=False)
    dist = clustering.dendrogram_threshold_by_nclust(out['linkage'], 3)
    out['dendrogram_ax'].axvline(dist, ls='--', c='gray')
        r'Fibroblast')] = '#fff89e'
    row_colours_all.loc[row_colours_all.index.str.contains(
        r'Fetal')] = 'yellow'
    row_colours_all.loc[row_colours_all.index.str.contains('ES1')] = '#ff7777'
    row_colours_all.loc[row_colours_all.index.str.contains('PSC')] = '#ff7777'
    row_colours_all.loc[row_colours_all.index.str.contains(
        r'DURA[0-9]*_NSC')] = '#7fc97f'  # green
    row_colours_all.loc[row_colours_all.index.str.contains(
        r'DURA[0-9]*_IPSC')] = '#fdc086'  # orange

    n_gene = 3000

    fname = "hier_clust_by_gene_log_corr_top%d_by_mad.{ext}" % n_gene

    cm, mad_all = cluster_logdata_with_threshold(abg,
                                                 n=n_gene,
                                                 eps=eps,
                                                 col_colors=row_colours_all)
    cm.gs.update(bottom=0.3)
    cm.savefig(os.path.join(outdir, fname.format(ext='png')), dpi=300)
    cm.savefig(os.path.join(outdir, fname.format(ext='tiff')), dpi=200)

    fname = "hier_clust_dendrogram_log_corr_top%d_by_mad.{ext}" % n_gene
    d = clustering.dendrogram_with_colours(
        abg_log.loc[amad_log.index[:n_gene]],
        row_colours_all,
        fig_kws={'figsize': (5.5, 10)},
        vertical=False)
    d['fig'].savefig(os.path.join(outdir, fname.format(ext='png')), dpi=300)
    d['fig'].savefig(os.path.join(outdir, fname.format(ext='tiff')), dpi=200)
                                    vmin=-10,
                                    vmax=10)
    clustering.add_legend(leg_dict, cm.ax_heatmap, loc='right')
    cm.gs.update(bottom=0.25, right=0.85, left=0.03)

    cm.savefig(os.path.join(
        outdir, "nogbm_clustermap_M_corr_linkage%d_heatmap%d.png" %
        (clust_n_ftr, n_probe_to_show)),
               dpi=200)
    cm.savefig(os.path.join(
        outdir, "nogbm_clustermap_M_corr_linkage%d_heatmap%d.tiff" %
        (clust_n_ftr, n_probe_to_show)),
               dpi=200)

    ## our samples only, all probes, beta values

    idx = ((bdat.columns.str.contains('GBM')) |
           (bdat.columns.str.contains('DURA')))
    bdat_ours = bdat.loc[:, idx]
    bmad_ours = transformations.median_absolute_deviation(bdat_ours)

    row_colours_ours = row_colours_all.loc[bdat_ours.columns]
    x = clustering.dendrogram_with_colours(bdat_ours,
                                           row_colours_ours,
                                           fig_kws={'figsize': (5.5, 10)},
                                           vertical=False,
                                           metric='correlation')
    fname = "ours_dendrogram_B_corr_all.{ext}"
    # Save both formats. The second call previously repeated ext='png' and
    # simply rewrote the same file; tiff matches the png/tiff pairing used for
    # the clustermap saved earlier in this section.
    x['fig'].savefig(os.path.join(outdir, fname.format(ext='png')), dpi=200)
    x['fig'].savefig(os.path.join(outdir, fname.format(ext='tiff')), dpi=200)
Ejemplo n.º 9
0
# Normalised version: every column is divided by its own column total,
# i.e. counts / sum(counts).
# NOTE(review): datan is not used by any of the calls visible below, which all
# pass the raw `data` - confirm whether that is intended.
datan = data.divide(data.sum(axis=0), axis=1)

# Patterns used to assign samples to groups, in the same order as group_names.
# The second entry is a compiled regex anchored with `$`; presumably this is
# to stop 'RTK_I' from also matching 'RTK_II' labels - TODO confirm against
# generate_colour_map_dict.
contains_arr = ['MES', re.compile(r'RTK_I$'), 'RTK_II']
col_colours, legend_labels = clustering.generate_colour_map_dict(
    meta,
    'subgroup',
    contains_arr,
    label='group',
    sample_names=data.columns,
    non_matching='gray',
    group_names=['Mesenchymal', 'RTK I', 'RTK II'])

# Draw the same dendrogram under different metric / linkage method
# combinations. The return values are discarded.
# euclidean metric, average linkage
clustering.dendrogram_with_colours(data,
                                   col_colours,
                                   legend_labels=legend_labels,
                                   metric='euclidean',
                                   method='average')
# euclidean metric, single linkage
clustering.dendrogram_with_colours(data,
                                   col_colours,
                                   legend_labels=legend_labels,
                                   metric='euclidean',
                                   method='single')
# correlation metric, average linkage
clustering.dendrogram_with_colours(data,
                                   col_colours,
                                   legend_labels=legend_labels,
                                   metric='correlation',
                                   method='average')
clustering.dendrogram_with_colours(data,
                                   col_colours,
                                   legend_labels=legend_labels,
Ejemplo n.º 10
0
    row_colours = pd.DataFrame('gray', index=our_dat.columns, columns=[''])
    row_colours.loc[row_colours.index.str.contains(
        r'eNSC[0-9]med')] = '#66c2a5'
    row_colours.loc[row_colours.index.str.contains(
        r'eNSC[0-9]mouse')] = '#fc8d62'
    row_colours.loc[row_colours.index.str.contains(
        r'mDura.[AN0-9]*mouse')] = '#8da0cb'
    row_colours.loc[row_colours.index.str.contains(
        r'mDura.[AN0-9]*human')] = '#e78ac3'

    for n_t in n_gene_try:
        fname = "clustering_by_gene_corr_log_top%d_by_mad.{ext}" % n_t

        d = clustering.dendrogram_with_colours(our_dat.loc[mad.index[:n_t]],
                                               row_colours,
                                               fig_kws={'figsize': (10, 5.5)})
        d['fig'].savefig(os.path.join(outdir, fname.format(ext='png')),
                         dpi=200)

        cm, _ = cluster_data_with_threshold(our_dat,
                                            n=n_t,
                                            mad=mad,
                                            col_colors=row_colours)
        cm.savefig(os.path.join(
            outdir, "clustermap_by_gene_corr_log_top%d_by_mad.png" % n_t),
                   dpi=200)

    raise Exception("TODO: complete the script refactor")

    # bring in reference data
Ejemplo n.º 11
0
    # normalise by read count
    cpm = (data + 1).divide((data + 1).sum(axis=0), axis=1) * 1e6

    # transform
    log_data = np.log2(cpm)
    vst_data = variance_stabilizing_transform(cpm)
    mad_log_srt = median_absolute_deviation(log_data).sort_values(
        ascending=False)
    mad_vst_srt = median_absolute_deviation(vst_data).sort_values(
        ascending=False)

    for NGENE in [500, 1000, 1500, 2000, 2500]:
        out = clustering.dendrogram_with_colours(
            log_data.loc[mad_log_srt.index[:NGENE]],
            col_colours=col_colours,
            vertical=False,
            metric='correlation',
            method='average',
        )
        out['fig'].savefig(os.path.join(
            outdir,
            "gbm_nsc_correlation_dendrogram_logtransform_top%d.png" % NGENE),
                           dpi=200)
        out['fig'].savefig(
            os.path.join(
                outdir,
                "gbm_nsc_correlation_dendrogram_logtransform_top%d.pdf" %
                NGENE))

        cg = clustering.plot_correlation_clustermap(
            log_data.loc[mad_log_srt.index[:NGENE]],
Ejemplo n.º 12
0
    ax.figure.savefig(os.path.join(outdir, "pca_ribozero_polya.pdf"))

    # by cell line: strip every non-digit character from the column names.
    # regex=True is passed explicitly because the pattern is a regular
    # expression; pandas >= 2.0 treats the pattern as a literal string by
    # default, which would leave the names unchanged.
    subgroups = data_all_n.columns.str.replace(r'[^0-9]*', '', regex=True)
    ax = pca_plot_by_group_2d(y, subgroups=subgroups, ellipses=False, auto_scale=False)
    ax.legend(loc='upper left', frameon=True, facecolor='w', edgecolor='b')
    plt.tight_layout()
    ax.figure.savefig(os.path.join(outdir, "pca_ribozero_polya_byline.png"), dpi=200)
    ax.figure.savefig(os.path.join(outdir, "pca_ribozero_polya_byline.pdf"))

    # hierarchical clustering
    subgroups = pd.DataFrame(
        ['b'] * 3 + ['r'] * 2 + ['g'] * 5,
        index=data_all_n.columns,
        columns=['Prep type']
    )
    legend_lbl = {'FFPE': 'b', 'frozen': 'r', 'Poly(A)': 'g'}
    res = clustering.dendrogram_with_colours(
        comp_data,
        subgroups,
        legend_labels=legend_lbl,
        metric='correlation',
        method='average'
    )
    res['fig'].savefig(os.path.join(outdir, 'clustering_dendrogram.png'), dpi=200)
    res['fig'].savefig(os.path.join(outdir, 'clustering_dendrogram.pdf'))

    # can add n_gene kwarg here to pick top N genes by MAD:
    cg = clustering.plot_correlation_clustermap(comp_data)
    cg.savefig(os.path.join(outdir, 'clustering_corr_map.png'), dpi=200)
    cg.savefig(os.path.join(outdir, 'clustering_corr_map.pdf'))
Ejemplo n.º 13
0
    # Spearman rank correlation distance
    # pdist = spearman_rank_corr(dat)
    # dist = hc.distance.squareform(1 - pdist.values)
    # lnk = hc.linkage(dist)
    # dend = clustering.dendrogram_with_colours(
    #     dat,
    #     cc,
    #     linkage=lnk,
    #     vertical=True,
    #     legend_labels=leg_dict,
    #     fig_kws={'figsize': [14, 6]}
    # )

    # Pearson correlation distance
    dend = clustering.dendrogram_with_colours(dat, cc, vertical=True, legend_labels=leg_dict, fig_kws={'figsize': [14, 6]})

    # Pearson with a limited number of probes
    # dend = clustering.dendrogram_with_colours(dat.loc[mad.index[:5000]], cc, vertical=True, legend_labels=leg_dict, fig_kws={'figsize': [14, 6]})

    dend['fig'].savefig(os.path.join(outdir, "cluster_ipsc_esc_fb_all_probes.png"), dpi=200)

    # similar, but clustermap (dendrogram + heatmap)
    gc = clustering.plot_clustermap(
        dat.loc[mad.index[:5000]],
        cmap='RdBu_r',
        col_linkage=dend['linkage'],
        col_colors=cc
    )
    clustering.add_legend(leg_dict, gc.ax_heatmap, loc='right')
    gc.gs.update(bottom=0.2, right=0.82)
Ejemplo n.º 14
0
    obj_salmon = loader.load_by_patient(['ICb1299', '3021'], source='salmon', type='cell_culture', include_control=False)

    # cluster plot
    tpm = filter.filter_by_cpm(obj_salmon.data, min_cpm=1, min_n_samples=4)

    batch_colours = common.COLOUR_BREWERS[len(obj_salmon.meta.batch.unique())]
    line_colours = common.COLOUR_BREWERS[2]
    cc = pd.DataFrame(line_colours[0], index=tpm.columns, columns=['Batch', 'Cell line'])

    # aa holds one integer code per sample, bb the unique batch labels
    aa, bb = obj_salmon.meta.batch.factorize()
    # Codes run from 0 to len(bb) - 1, so visit every one of them. The previous
    # range(aa.max()) stopped one short and left the last batch with the
    # default colour.
    for i in range(len(bb)):
        cc.loc[aa == i, 'Batch'] = batch_colours[i]
    cc.loc[cc.index.str.contains('3021'), 'Cell line'] = line_colours[1]

    cg = clustering.dendrogram_with_colours(
        np.log2(tpm + eps),
        cc,
    )
    cg['fig'].savefig(os.path.join(outdir, "dendrogram_pearson_log_tpm_all_genes.png"), dpi=200)

    # pca plot
    p = PCA()
    y = p.fit_transform(np.log2(tpm + eps).transpose())
    fig = plt.figure()
    ax = fig.add_subplot(111)
    for a, b in enumerate(bb):
        ax.scatter(
            y[aa == a, 0],
            y[aa == a, 1],
            facecolor=batch_colours[a],
            edgecolor='k',
            s=30,
Ejemplo n.º 15
0
    pdbg_log = np.log2(pdbg + eps)
    mad_log = transformations.median_absolute_deviation(pdbg_log).sort_values(
        ascending=False)
    row_colours = pd.DataFrame('gray', index=pdbg_log.columns, columns=[''])
    row_colours.loc[row_colours.index.str.contains('IPSC')] = '#fdc086'
    row_colours.loc[row_colours.index.str.contains(
        r'DURA[0-9]*_NSC')] = '#7fc97f'
    row_colours.loc[row_colours.index.str.contains('GIBCO')] = '#96daff'

    for n_t in n_gene_try:
        fname = "clustering_by_gene_corr_top%d_by_mad.{ext}" % n_t
        fname_log = "clustering_by_gene_corr_log_top%d_by_mad.{ext}" % n_t

        d = clustering.dendrogram_with_colours(
            pdbg_log.loc[mad_log.index[:n_t]],
            row_colours,
            fig_kws={'figsize': (5.5, 10)},
            vertical=False)
        d['fig'].savefig(os.path.join(outdir, fname_log.format(ext='png')),
                         dpi=200)

    plt.draw()
    plt.close('all')

    # bring in reference data
    # IDs (if req), lab (appears in label), loader

    ref_dats = [
        (
            None,
            'Barres et al.',