# matching samples between two different preps

    # Restrict to the columns whose name contains 'NSC', then apply the
    # project's CPM filter with the caller-supplied `min_cpm` threshold and
    # min_n_samples=1.
    # NOTE(review): `filter` here is used as a module-like object and shadows
    # the Python builtin of the same name.
    matched_data = filter.filter_by_cpm(
        data.loc[:, data.columns.str.contains('NSC')],
        min_cpm=min_cpm,
        min_n_samples=1)
    matched_log_cpm = log_cpm(matched_data)
    # One colour per sample in a single 'Library' column: every sample starts
    # with COLOUR_BREWERS[2][0]; samples whose name contains 'smartseq' are
    # switched to COLOUR_BREWERS[2][1].
    row_colours = pd.DataFrame(common.COLOUR_BREWERS[2][0],
                               index=matched_data.columns,
                               columns=['Library'])
    row_colours.loc[row_colours.index.str.contains(
        'smartseq')] = common.COLOUR_BREWERS[2][1]

    # clustering plot (all rows that survived the CPM filter)
    cg = clustering.plot_correlation_clustermap(matched_log_cpm,
                                                row_colors=row_colours)
    # Shrink the plot grid away from the bottom/right figure edges
    # (presumably to leave room for long sample labels -- TODO confirm).
    cg.gs.update(bottom=0.35, right=0.65)
    cg.savefig(os.path.join(outdir, "cluster_log_cpm_corr_all_genes.png"),
               dpi=200)

    # Rank rows by median absolute deviation (largest first) and repeat the
    # clustermap using only the 3000 highest-ranked rows.
    mad = transformations.median_absolute_deviation(
        matched_log_cpm).sort_values(ascending=False)
    cg = clustering.plot_correlation_clustermap(
        matched_log_cpm.loc[mad.index[:3000]], row_colors=row_colours)
    cg.gs.update(bottom=0.35, right=0.65)
    cg.savefig(os.path.join(outdir, "cluster_log_cpm_corr_3000_genes.png"),
               dpi=200)

    # repeat with TMM norming
    # NOTE(review): the variable name suggests log-CPM output, but whether the
    # helper log-transforms cannot be seen from here -- confirm.
    matched_log_cpm_n = transformations.edger_tmm_normalisation_cpm(
        matched_data)
    # Recolour samples by name. Order matters: every 'mDURA' sample is first
    # set to black ('k'), then the two more specific patterns below override
    # that with yellow ('y').
    row_colours.loc[row_colours.index.str.contains('mDURA')] = 'k'
    # NOTE(review): str.contains interprets the pattern as a regular
    # expression by default, so the '.' in this pattern matches any character,
    # not only a literal dot.
    row_colours.loc[row_colours.index.str.contains(
        'mDURA5_NSCmus_N3BE50.2')] = 'y'
    row_colours.loc[row_colours.index.str.contains('mDURA6_NSCmus')] = 'y'

    # Spearman distance

    # Pairwise Spearman correlation computed on the transposed data; the
    # p-values (`pval`) are not used in the lines visible here.
    cor, pval = pairwise_correlation(log_dat_filt.transpose(),
                                     method='spearman')
    # Turn correlation into a distance: 1 - r.
    di = 1 - cor
    # Force the diagonal to exactly zero (presumably so that squareform's
    # zero-diagonal validation is not tripped by rounding -- TODO confirm).
    for i in range(di.shape[0]):
        di.iloc[i, i] = 0.
    # presumably scipy's squareform, converting the square matrix into
    # condensed form -- verify what `hc` is bound to.
    di = hc.distance.squareform(di)

    # Clustermap using the precomputed distance with average linkage.
    cg = clustering.plot_correlation_clustermap(log_dat_filt,
                                                row_colors=row_colours,
                                                distance=di,
                                                method='average')
    cg.gs.update(bottom=0.3, right=0.7)
    cg.savefig(os.path.join(outdir,
                            "log_tpm_spearman_distance_average_linkage.png"),
               dpi=200)

    # Euclidean distance

    # cg2 = clustering.plot_correlation_clustermap(
    #     log_dat_filt,
    #     row_colors=row_colours,
    #     method='complete',
    #     metric='euclidean',
    # )
    # cg2.gs.update(bottom=0.3, right=0.7)
    # Override the 'subgroup' label of every sample whose name contains the
    # given pattern. Entries are applied in the order listed, so a later entry
    # wins if a sample name matches more than one pattern.
    for _name_part, _subgroup in (
            ('024', 'Unknown'),
            ('026', 'Unknown'),
            ('044', 'Mesenchymal'),
            ('GIBCO', 'NSC'),
    ):
        meta.loc[meta.index.str.contains(_name_part), 'subgroup'] = _subgroup

    # Three-component PCA fitted on the transposed data, i.e. each column of
    # `data` is treated as one observation.
    # (assumes PCA is scikit-learn's estimator -- TODO confirm the import)
    p = PCA(n_components=3)
    p.fit(data.transpose())
    y = p.transform(data.transpose())

    # 3D scatter of the projected samples, grouped by meta.cell_type.
    pca.pca_plot_by_group_3d(y, meta.cell_type, plot_ellipsoids=False)

    # Rank rows by standard deviation across columns, most variable first.
    s = data.std(axis=1).sort_values(ascending=False)

    # plot a correlation clustermap of GBM without 024 (which is a definite outlier)
    # Uses the 8000 most variable rows. The column mask is built from `meta`,
    # so this relies on meta's index lining up with data's columns
    # -- TODO confirm.
    clustering.plot_correlation_clustermap(
        data.loc[s.index[:8000], (meta.cell_type == 'GBM') &
                 (~meta.index.str.contains('024'))])

    # Load the reference dataset and its metadata via the project loader.
    ref_data, ref_meta = methylation_array.gse36278()

    # only keep intersecting probes
    probe_idx = ref_data.index.intersection(data.index)

    # Random train/test split over the reference samples (columns).
    # NOTE(review): no seed is set in the lines visible here, so the split
    # differs between runs unless the RNG is seeded elsewhere.
    sample_names = np.random.permutation(ref_data.columns)
    n_train = 120
    # NOTE(review): negative if the reference has fewer than 120 samples, in
    # which case the test slices below are empty.
    n_test = ref_data.shape[1] - n_train

    # First n_train shuffled samples form the training set, the rest the test
    # set; both are restricted to the shared probes. The metadata is selected
    # by the same sample names, so ref_meta must be indexed by them.
    ref_train = ref_data.loc[probe_idx, sample_names[:n_train]]
    ref_test = ref_data.loc[probe_idx, sample_names[n_train:]]
    ref_meta_train = ref_meta.loc[sample_names[:n_train]]
    ref_meta_test = ref_meta.loc[sample_names[n_train:]]
        # Joint plot of log2(absolute difference + 1) against relative
        # difference; `abs_diff`, `rel_diff` and `m` are defined above the
        # visible part of this loop body.
        # NOTE(review): `stat_func` and positional x/y arguments are not
        # accepted by recent seaborn releases -- confirm against the installed
        # version.
        out = sns.jointplot(np.log2(abs_diff + 1.),
                            rel_diff,
                            alpha=0.3,
                            stat_func=None)
        # Clamp the relative-difference axis to [0, 1] on both the joint axes
        # and the y marginal so the two stay aligned.
        out.ax_joint.set_ylim([0, 1])
        out.ax_marg_y.set_ylim([0, 1])
        ax = out.ax_joint

        ax.set_xlabel("Log2 absolute difference")
        ax.set_ylabel("Relative difference = absolute difference / mean")
        ax.set_title(m)
        plt.tight_layout()
        # Save a 200 dpi PNG and a PDF, both named after `m`.
        ax.figure.savefig(os.path.join(outdir,
                                       "abs_rel_difference_%s.png" % m),
                          dpi=200)
        ax.figure.savefig(os.path.join(outdir,
                                       "abs_rel_difference_%s.pdf" % m))
        # list the genes for which one is zero
        # Select the rows of `twodata` in which at least one column is exactly
        # zero and annotate them by approved gene symbol.
        # NOTE(review): the message below also mentions ">100"; that threshold
        # is not applied in these lines, so presumably `twodata` was filtered
        # upstream -- confirm.
        the_list = rnaseq_data.annotate(
            twodata.loc[(twodata == 0).any(axis=1)],
            annotate_by='Approved Symbol')
        if the_list.size > 0:
            # Single-argument call form: valid on Python 3 and prints the
            # identical text on Python 2 (where the parentheses simply group
            # the expression).
            print("%s. Genes for which one is zero and one >100: %s" % (
                m, ', '.join(the_list.index)))

    # Correlation clustermap of `logdata`, saved as a 200 dpi PNG and a PDF.
    cg = clustering.plot_correlation_clustermap(logdata)
    # Pull the plot grid in from the bottom/right figure edges before saving
    # (presumably to make room for labels -- TODO confirm).
    cg.gs.update(bottom=0.25, right=0.7)
    cg.fig.savefig(os.path.join(outdir, "clustermap_correlation_coeff.png"),
                   dpi=200)
    cg.fig.savefig(os.path.join(outdir, "clustermap_correlation_coeff.pdf"))
Example #5
0
    meta_all = obj_all.meta

    # Counts per million: each column is divided by its own column total and
    # scaled by 1e6.
    cpm_all = data_all.divide(data_all.sum(axis=0), axis=1) * 1e6
    # Keep rows whose CPM exceeds 0.5 in more than 5 columns.
    keep = (cpm_all > .5).sum(axis=1) > 5

    # NOTE(review): the log2(x + 1) transform is applied to the raw values in
    # `data_all`, not to `cpm_all`; CPM is only used for the filter above.
    # Confirm this is intended.
    the_dat = np.log2(data_all.loc[keep] + 1)

    # Correlation-coefficient array, colour scale starting at 0.4, with
    # vertical x tick labels; saved as 200 dpi PNG and PDF.
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)
    ax = corr.plot_correlation_coefficient_array(the_dat, vmin=0.4, ax=ax)
    plt.setp(ax.xaxis.get_ticklabels(), rotation=90)
    fig.tight_layout()
    fig.savefig(os.path.join(outdir, 'corr_coeff.png'), dpi=200)
    fig.savefig(os.path.join(outdir, 'corr_coeff.pdf'))

    # Clustermap of the same filtered, log-transformed data.
    cg = clustering.plot_correlation_clustermap(the_dat)
    cg.fig.savefig(os.path.join(outdir, 'corr_clustermap_all_genes.png'),
                   dpi=200)
    cg.fig.savefig(os.path.join(outdir, 'corr_clustermap_all_genes.pdf'))

    # correlation clustermap with only the CL57BL/6 samples
    # NOTE(review): "CL57BL/6" is presumably a typo for the C57BL/6 mouse
    # strain -- confirm.
    # Combine the five loader objects into a single multi-batch loader.
    obj_all2 = rnaseq_data.MultipleBatchLoader(
        [obj, obj64411, obj43916, obj86248, obj36114])
    # remove some samples
    # The names are built from three groups: eNSC{3,5,6}mouse,
    # mDura{3N1,5N24A,6N6}mouse, and three explicitly named samples.
    to_remove = ['eNSC%dmouse' % i for i in (3, 5, 6)] + \
                ['mDura%smouse' % i for i in ('3N1', '5N24A', '6N6')] + \
                ['E14_Day4_RNA-seq', 'EmbryonicNSC1', 'EmbryonicNSC2']
    # Drop those rows from the combined metadata; with pandas' default
    # settings `drop` raises KeyError if any listed name is missing.
    meta2 = obj_all2.meta.drop(
        to_remove,
        axis=0,
    )
Example #6
0
            cm.gs.update(bottom=0.3)
            cm.savefig(os.path.join(outdir, fname.format(ext='png')), dpi=200)

        # File name template: `n_t` is substituted immediately via %-format,
        # while the `{ext}` placeholder is filled in later with str.format.
        fname = "all_samples_dendrogram_log_corr_top%d_by_mad.{ext}" % n_t
        # Dendrogram of the first `n_t` rows in `amad_log` order, with sample
        # colours from `row_colours_all`.
        d = clustering.dendrogram_with_colours(
            abg_log.loc[amad_log.index[:n_t]],
            row_colours_all,
            fig_kws={'figsize': (5.5, 10)},
            vertical=False)
        d['fig'].savefig(os.path.join(outdir, fname.format(ext='png')),
                         dpi=200)

        # Correlation clustermap on the same subset of rows.
        # NOTE(review): the data are already cut to `n_t` rows before the
        # call, and `n_gene=n_t` is passed as well; whether the helper then
        # re-selects rows cannot be seen from here.
        fname = "all_samples_corrplot_log_top%d_by_mad.{ext}" % n_t
        cm = clustering.plot_correlation_clustermap(
            abg_log.loc[amad_log.index[:n_t]],
            row_colors=row_colours_all,
            n_gene=n_t,
        )
        # Vertical x tick labels, horizontal y tick labels, both at 10pt.
        plt.setp(cm.ax_heatmap.get_xticklabels(), rotation=90, fontsize=10)
        plt.setp(cm.ax_heatmap.get_yticklabels(), rotation=0, fontsize=10)
        cm.gs.update(bottom=0.35, right=0.65)
        cm.savefig(os.path.join(outdir, fname.format(ext='png')), dpi=200)

    # data subset: our data and Pten/P53 samples

    # Load the mouse NSC salmon quantification in the requested `units`, then
    # aggregate from transcript to gene level using tax_id=10090
    # (presumably the NCBI taxonomy id for mouse -- TODO confirm).
    mouse_data = rnaseq_data.mouse_nsc_salmon(units=units)
    mouse_data_by_gene = general.ensembl_transcript_quant_to_gene(mouse_data,
                                                                  tax_id=10090)

    dat_pten = general.ensembl_transcript_quant_to_gene(
        rnaseq_data.mouse_gbm_pten_p53(source='salmon', units=units),
Example #7
0
            metric='correlation',
            method='average',
        )
        # Save the dendrogram figure held in `out` (created above the visible
        # part of this fragment) as a 200 dpi PNG and a PDF; NGENE is embedded
        # in the file name.
        out['fig'].savefig(os.path.join(
            outdir,
            "gbm_nsc_correlation_dendrogram_logtransform_top%d.png" % NGENE),
                           dpi=200)
        out['fig'].savefig(
            os.path.join(
                outdir,
                "gbm_nsc_correlation_dendrogram_logtransform_top%d.pdf" %
                NGENE))

        # Correlation clustermap of the first NGENE rows in `mad_log_srt`
        # order, with vmin=0 and vmax=1 passed through to the helper.
        cg = clustering.plot_correlation_clustermap(
            log_data.loc[mad_log_srt.index[:NGENE]],
            vmin=0.,
            vmax=1.,
            metric='correlation')
        cg.gs.update(bottom=0.3, right=0.7)
        cg.fig.savefig(os.path.join(
            outdir,
            "gbm_nsc_correlation_clustermap_logtransform_top%d.png" % NGENE),
                       dpi=200)
        cg.fig.savefig(
            os.path.join(
                outdir,
                "gbm_nsc_correlation_clustermap_logtransform_top%d.pdf" %
                NGENE))

        out = clustering.dendrogram_with_colours(
            vst_data.loc[mad_vst_srt.index[:NGENE]],
Example #8
0
    ax.figure.savefig(os.path.join(outdir, "pca_ribozero_polya.pdf"))

    # by cell line
    # Reduce each sample name to just its digits (presumably the cell line
    # number -- TODO confirm) and use that as the PCA subgroup label.
    # The digits are extracted with findall/join rather than
    # str.replace(r'[^0-9]*', ''): str.replace only treats its pattern as a
    # regular expression when `regex=True`, and the implicit default changed
    # to literal matching in pandas 2.0, which would silently leave the names
    # unchanged. findall is always regex-based, so this gives the same
    # digits-only labels on every pandas version.
    subgroups = data_all_n.columns.str.findall(r'[0-9]').str.join('')
    ax = pca_plot_by_group_2d(y, subgroups=subgroups, ellipses=False, auto_scale=False)
    ax.legend(loc='upper left', frameon=True, facecolor='w', edgecolor='b')
    plt.tight_layout()
    # Save as a 200 dpi PNG and a PDF.
    ax.figure.savefig(os.path.join(outdir, "pca_ribozero_polya_byline.png"), dpi=200)
    ax.figure.savefig(os.path.join(outdir, "pca_ribozero_polya_byline.pdf"))

    # hierarchical clustering
    # One colour per sample, assigned purely by column position: the first 3
    # columns are blue, the next 2 red, the last 5 green.
    # NOTE(review): this hard-codes exactly 10 columns in `data_all_n`, in that
    # order; the DataFrame constructor fails if the column count differs.
    subgroups = pd.DataFrame(
        ['b'] * 3 + ['r'] * 2 + ['g'] * 5,
        index=data_all_n.columns,
        columns=['Prep type']
    )
    # Legend text -> colour, using the same colour codes as above.
    legend_lbl = {'FFPE': 'b', 'frozen': 'r', 'Poly(A)': 'g'}
    # Dendrogram of `comp_data` with correlation metric and average linkage
    # passed to the helper.
    res = clustering.dendrogram_with_colours(
        comp_data,
        subgroups,
        legend_labels=legend_lbl,
        metric='correlation',
        method='average'
    )
    res['fig'].savefig(os.path.join(outdir, 'clustering_dendrogram.png'), dpi=200)
    res['fig'].savefig(os.path.join(outdir, 'clustering_dendrogram.pdf'))

    # can add n_gene kwarg here to pick top N genes by MAD:
    # As written, all rows of `comp_data` are passed to the helper; the result
    # is saved as a 200 dpi PNG and a PDF.
    cg = clustering.plot_correlation_clustermap(comp_data)
    cg.savefig(os.path.join(outdir, 'clustering_corr_map.png'), dpi=200)
    cg.savefig(os.path.join(outdir, 'clustering_corr_map.pdf'))