Ejemplo n.º 1
0
def load_cnc_colors(df):
    '''Returns the cancer-type label of each sample and the color lookup.

    df: data frame whose index is handed to utils.load_metadata_df to
        select the metadata rows

    Returns (cnc_series, cnc_to_color): the 'cnc' column of the loaded
    metadata and the object returned by utils.load_color_scheme.
    '''
    metadata_df = utils.load_metadata_df(config.metadata_path, df.index)

    # Get color scheme
    cnc_to_color = utils.load_color_scheme(config.color_scheme_path)
    return metadata_df['cnc'], cnc_to_color
Ejemplo n.º 2
0
def load_mrna_colors(df):
    '''Returns lower-cased mRNA subtype labels and a color per subtype.

    df: data frame whose index selects the samples to label

    Only subtypes observed in more than 30 samples are kept.  Returns
    (subtype_series, color_lut): the filtered labels and a dict pairing
    the alphabetically sorted subtype names with the values of the color
    scheme loaded from config.color_scheme_path.
    '''
    key = 'Subtype_mRNA'
    base_md = utils.load_metadata_df(config.metadata_path, df.index)
    metadata_df, subtype_names = utils.append_subtype(config.subtype_path,
                                                      base_md)
    assert key in subtype_names
    assert key in metadata_df.columns

    # Lower-case the labels of the requested samples, ignoring missing ones
    labels = metadata_df[key].loc[df.index].dropna()
    labels = labels.apply(lambda name: name.lower())

    # Keep only subtypes with more than 30 samples
    counts = labels.value_counts()
    frequent = counts[counts > 30].index
    kept_labels = labels[labels.isin(frequent)]

    palette = utils.load_color_scheme(config.color_scheme_path).values()
    return kept_labels, dict(zip(sorted(frequent), palette))
Ejemplo n.º 3
0
    # Add Expression
    # NOTE(review): the guard below is the literal True, so this section
    # always runs.
    if True:
        path = os.path.join(config.embed_dir, 'expression', 'data.tsv')
        outdir = os.path.join(config.plot_dir, 'expression', 'heatmaps')
        if not os.path.exists(outdir): os.makedirs(outdir)
        desc = 'Expression'
        if NORM: desc = 'Normalized ' + desc
        # Try the helper loader on the path without its '.tsv' suffix first;
        # fall back to parsing the TSV directly when that raises IOError.
        try:
            df = utils.load_large_df(path.replace('.tsv', ''))
        except IOError:
            df = pd.read_csv(path, sep='\t', index_col=0)

        # Cap every column at its own 99th percentile (axis=0 yields one
        # threshold per column, which broadcasts across the rows).
        df.iloc[:] = np.minimum(df.values, np.percentile(df.values, 99, axis=0))
        # keep_cols is used positionally below; presumably it selects up to
        # MAX_EVENTS high-variance columns -- helper not shown, confirm.
        keep_cols = filter_to_high_var(df.values, df.columns, MAX_EVENTS)
        df = df.iloc[:, keep_cols]
        metadata_df = utils.load_metadata_df(config.metadata_path, df.index)
        medians = collapse_to_median(df, metadata_df['cnc'])
        heatmap_dists_with_dendro(medians, norm=NORM)
        # File name is the lower-cased description with spaces replaced by '_'
        outpath = os.path.join(outdir, desc.lower().replace(' ', '_') +'_rep_dists_heatmap.png')
        plot_utils.save(outpath, do_pdf=True)

    # Add AltSplice
    # NOTE(review): the guard below is the literal False, so this section
    # never runs; the visible listing also ends right after the print.
    if False:
        altsplice_event_list= ['alt_3prime', 'alt_5prime', 'intron_retention', 'exon_skip']
        for event in altsplice_event_list:
            path = os.path.join(config.embed_dir, 'altsplice', event, 'data.tsv')
            outdir = os.path.join(config.plot_dir, 'altsplice', event, 'heatmap')
            if not os.path.exists(outdir): os.makedirs(outdir)
            desc = 'AltSplice %s'%event.title()
            if NORM: desc = 'Normalized ' + desc
            print desc
Ejemplo n.º 4
0
def _plot_qc_embeddings(outdir, desc, effect_name, values_df,
                        tsne_embeds_dict):
    '''Writes one continuous-color tSNE plot per embedding for a QC covariate.

    outdir: directory to write plots (created if missing)
    desc: identifies embedding (used for plot titles)
    effect_name: covariate name inserted into the plot title
    values_df: data frame whose first column colors the samples; the name
        of that column is used as the colorbar title
    tsne_embeds_dict: maps (perplexity, learning_rate) to an embedding
    '''
    if not os.path.exists(outdir): os.makedirs(outdir)
    cbar_title = values_df.columns[0]
    for (pp, lr), embed_df in tsne_embeds_dict.items():
        outpath = os.path.join(outdir,
                               'tsne_embeds_pp_%d_lr_%d.png' % (pp, lr))
        axis_title = '%s %s Effects\ntSNE(perplexity=%d, learning_rate=%d)' % (
            desc.title(), effect_name, pp, lr)
        fig, ax = plt.subplots()
        plot_utils.plot_continuous_color_embeddings(embed_df,
                                                    values_df.iloc[:, 0],
                                                    ax=ax,
                                                    axis_title=axis_title,
                                                    cbar_title=cbar_title)
        plot_utils.save(outpath, do_pdf=DO_PDF)


def main(embed_dir, plot_dir, desc):
    '''Runs all tasks on a single embedding.

    embed_dir: location of pca & tsne embeddings
    plot_dir: directory to write plots
    desc: identifies embedding (used for e.g. plot titles)

    Which tasks run is controlled by the module-level flags ALL_TASKS,
    COLOR_CNC, HL_CNC, TUMOR_NORMAL, LIBSIZE and DEGSCORE.
    '''
    assert os.path.exists(embed_dir), embed_dir
    if not os.path.exists(plot_dir): os.makedirs(plot_dir)

    # The QC tables must be loaded whenever their plotting section will run.
    # ALL_TASKS enables those sections too, so gating the load on the
    # individual flag alone left the frame as None and crashed later.
    do_degscore = ALL_TASKS or DEGSCORE
    do_libsize = ALL_TASKS or LIBSIZE
    rnadeg_df = utils.load_rnadeg(config.rnadeg_path) if do_degscore else None
    libsize_df = utils.load_libsize(config.libsize_path) if do_libsize else None

    pca_model, pca_embeds, tsne_embeds_dict = utils.load_embeds(
        embed_dir,
        whitelist=WHITELIST,
        pp_set=TSNE_PP_PLOT_SET,
        lr_set=TSNE_LR_PLOT_SET)

    metadata_df = utils.load_metadata_df(config.metadata_path,
                                         pca_embeds.index)
    metadata_df, subtype_names = utils.append_subtype(config.subtype_path,
                                                      metadata_df)

    if ALL_TASKS or COLOR_CNC:
        outdir = os.path.join(plot_dir, 'complete')
        if not os.path.exists(outdir): os.makedirs(outdir)
        plot_configs = plot_args.cnc_plotting_args(metadata_df)
        run_embedding_proc(outdir, plot_configs, desc, tsne_embeds_dict)

    if ALL_TASKS or HL_CNC is not None:
        if ALL_TASKS or HL_CNC == 'all':
            cnc_list = np.unique(metadata_df['cnc'].values)
        else:
            cnc_list = HL_CNC
        cnc_groups = metadata_df.groupby('cnc')
        for cnc in cnc_list:
            # Kept only as a fail-fast check: get_group raises KeyError for
            # a cancer type that is absent from the metadata.
            cnc_groups.get_group(cnc)
            for subtype in subtype_names:
                if DEBUG: print('%s %s' % (cnc, subtype))
                subtype_configs = plot_args.load_subtype_color_tumor_marker_kwargs(
                    metadata_df, subtype, cnc)
                outdir = os.path.join(plot_dir, 'highlights_subtype', cnc)
                if not os.path.exists(outdir): os.makedirs(outdir)
                subtype_desc = ' '.join([subtype, desc])
                run_embedding_proc(outdir,
                                   subtype_configs,
                                   subtype_desc,
                                   tsne_embeds_dict,
                                   prefix=subtype)

    if ALL_TASKS or TUMOR_NORMAL:
        if DEBUG: print("start tumor normal")
        tn_configs = plot_args.tumor_normal_plotting_args(metadata_df)
        outdir = os.path.join(plot_dir, 'complete_tumor_normal')
        if not os.path.exists(outdir): os.makedirs(outdir)
        tn_desc = '%s Tumor/Normal' % desc
        run_embedding_proc(outdir, tn_configs, tn_desc, tsne_embeds_dict)

    if do_libsize:
        if DEBUG: print(" start libsize")
        _plot_qc_embeddings(os.path.join(plot_dir, 'qc', 'libsize'), desc,
                            'Library Size', libsize_df, tsne_embeds_dict)

    if do_degscore:
        if DEBUG: print(" start degscore")
        _plot_qc_embeddings(os.path.join(plot_dir, 'qc', 'degscore'), desc,
                            'RNADeg', rnadeg_df, tsne_embeds_dict)
# Script entry point: per-event alt-splice embeddings followed by loading and
# summarising expression counts.
# NOTE(review): this listing looks like two truncated scripts fused together
# (see the note at the empty loop header below); read it as a fragment.
if __name__ == '__main__':

    # load data
    map_event_to_file = {
        'exon_skip': config.alt_splice_exon_skip_path,
        'intron_retention': config.alt_splice_intron_retention_path,
        # NOTE(review): 'alt_splce' differs from the 'alt_splice' spelling of
        # the two attributes above -- confirm the attribute names in config.
        'alt_3prime': config.alt_splce_alt_3prime_path,
        'alt_5prime': config.alt_splce_alt_5prime_path
    }

    if DO_INDIVIDUAL_EVENTS:
        for event, path in map_event_to_file.items():
            print "Loading %s data from %s" % (event, path)
            psi, strains, gene_idx = preproc.load_data(path)
            metadata_df = utils.load_metadata_df(config.metadata_path, strains)
            # One PCA embedding per (event, cancer type) pair
            cnc_groups = metadata_df.groupby('cnc')
            for cnc, cnc_md in cnc_groups:
                cnc_psi, cnc_strains = subset_to_cnc(psi, strains,
                                                     cnc_md.index)
                # NOTE(review): this only warns; execution continues and
                # passes cnc_psi (None) to run_pca_embeddings below --
                # confirm whether a `continue` is intended.
                if cnc_psi is None:
                    print "WARNING: Did not find %s samples in %s" % (cnc,
                                                                      event)
                cnc_event_embed_dir = os.path.join(EMBED_DIR, event,
                                                   'cancer_only', cnc)
                if not os.path.exists(cnc_event_embed_dir):
                    os.makedirs(cnc_event_embed_dir)
                pca_embeds_df, pca = run_pca_embeddings(cnc_psi, cnc_strains)
                save_pca_embeddings(cnc_event_embed_dir, pca_embeds_df, pca)
                if DO_TSNE:
                    for lr in TSNE_LR_LIST:
        # NOTE(review): the loop header above has no body and the statement
        # below is dedented -- the listing appears truncated here and the
        # following lines seem to belong to a different script.
        counts_raw_df = pd.read_csv(input_cache, sep='\t', index_col=0)
    else:
        print "Loading expression count data from %s" % config.expression_count_path
        wl = utils.load_whitelist(config.whitelist_path)
        counts_raw, gids, sids = load_raw_counts(config.expression_count_path)
        # Keep only the sample columns whose id is in the whitelist
        wl_mask = np.in1d(sids, wl)
        assert counts_raw.shape[1] == sids.size
        counts_raw = counts_raw[:, wl_mask]
        sids = sids[wl_mask]
        counts_raw, gids, sids = filter_counts(counts_raw, gids, sids)
        assert np.all(np.isfinite(counts_raw))
        # log10(count + 1), transposed so that rows become samples
        counts_raw = np.log10(counts_raw + 1).T
        counts_raw_df = pd.DataFrame(counts_raw, index=sids, columns=gids)
        # df is sids x gids
        # Cache the processed table at input_cache for later runs
        counts_raw_df.to_csv(input_cache, sep='\t')
    metadata_df = utils.load_metadata_df(config.metadata_path)
    metadata_df = utils.translate_tcga_to_strain_index(metadata_df,
                                                       counts_raw_df.index)

    if DO_CNC_BARS:
        bar_outpath = os.path.join(PLOT_DIR, 'cancer_type_barplot.png')
        cnc = metadata_df['cnc'].loc[counts_raw_df.index].values
        plot_cnc_bars(bar_outpath, cnc)

    if DO_BOXPLOTS:
        box_outpath = os.path.join(PLOT_DIR, 'expression_boxplot.png')
        plot_box_raw(box_outpath, counts_raw_df.values)

    if DO_PCA:
        # Output locations for the PCA embeddings and the fitted model
        pca_embeds_raw_out = os.path.join(EMBED_DIR, 'pca_embeds.tsv')
        pca_model_raw_out = os.path.join(EMBED_DIR, 'pca_model.npy')
Ejemplo n.º 7
0
def add_normal_label(md):
    '''Appends ' (Normal)' to the 'cnc' label of every non-tumor row.

    md: data frame with 'is_tumor' and 'cnc' columns; modified in place

    Returns the same md object that was passed in.
    '''
    def _tag_normal(label):
        return label + ' (Normal)'

    is_normal = np.invert(md['is_tumor'])
    relabeled = md.loc[is_normal, 'cnc'].map(_tag_normal)
    md.loc[is_normal, 'cnc'] = relabeled
    return md

# Module-level debug switch
DEBUG = False
# Script entry point: '--event <arg> [<suffix>]' draws a strip plot for one
# event; any other first argument takes the else branch.
if __name__ == '__main__':

    # NOTE(review): sys.argv[1] is read without a length check, so running
    # the script with no arguments raises IndexError.
    if sys.argv[1] == '--event':

        ### load actual data
        panc_df, gtex_df = load_data(sys.argv[2], is_event=True)

        ### load metadata
        panc_md = utils.load_metadata_df(config.metadata_path, panc_df.index).dropna()
        gtex_md = load_gtex_metadata(config.gtex_metadata_path, gtex_df.index).dropna()
        # add_normal_label edits its argument in place and returns it, so
        # panc_md_wnorm and panc_md refer to the same relabeled frame.
        panc_md_wnorm = add_normal_label(panc_md)
        # dropna() above may have removed rows; require data and metadata to
        # still cover exactly the same samples in the same order.
        assert panc_df.index.equals(panc_md.index)
        assert gtex_df.index.equals(gtex_md.index)

        # An optional third argument becomes a suffix of the output directory
        if len(sys.argv) > 3:
            PLOT_DIR = os.path.join(config.plot_dir, 'altsplice', 'outlier_events_%s' % sys.argv[3])
        else:    
            PLOT_DIR = os.path.join(config.plot_dir, 'altsplice', 'outlier_events')
        if not os.path.exists(PLOT_DIR): os.makedirs(PLOT_DIR)

        # NOTE(review): PLOT_DIR is not passed explicitly; presumably
        # run_strip_plot reads it as a module global -- confirm.
        run_strip_plot(panc_df, gtex_df, panc_md_wnorm['cnc'], gtex_md['histological_type_s'])
    else:
        sf_interest = get_sf_interest()
Ejemplo n.º 8
0
def do_embedding(gtdf):
    '''Plots tSNE embeddings of each alt-splice event type colored by genotype.

    For every event type and every column of gtdf, the stored tSNE
    embeddings with perplexity 50 and learning rate 500 are drawn with one
    color per genotype class.  Samples without a genotype call get the
    extra 'Missing' class.  Figures are saved as PNG under
    <config.plot_dir>/altsplice/<event>/embeds/sqtl/<gene id>/ and are also
    passed to ebhelp.save_as_pdf.

    gtdf: numeric data frame with one row per sample and one column per
        variant; column names start with a gene id followed by '_'.
        Values index the color palette, so they must lie in 0..2 (3 is
        reserved for missing calls).  The caller's frame is not modified.
    '''
    EVENT_LIST = [
        'exon_skip', 'intron_retention', 'alt_3prime', 'alt_5prime',
        'concatenated'
    ]

    TSNE_PP_PLOT_SET = [50]
    TSNE_LR_PLOT_SET = [500]
    do_pdf = True

    nan_class = 3
    if gtdf.isnull().values.any():
        assert not nan_class in gtdf.values
        # fillna returns a new frame.  Writing through `.values` is not
        # guaranteed to reach the frame (it may be a copy or read-only).
        gtdf = gtdf.fillna(nan_class)
    gtdf = gtdf.apply(lambda x: pd.to_numeric(x, downcast='integer'))
    mut_cmap = sns.color_palette('Set2', nan_class + 1)

    def _clean_sample_id(name):
        # Strip the file-name suffixes so ids match the metadata / genotypes
        return name.replace('.aligned', '').replace('.npz', '')

    for event in EVENT_LIST:
        embed_base_dir = os.path.join(config.embed_dir, 'altsplice', event)
        pca_model, pca_embeds, tsne_embeds_dict = utils.load_embeds(
            embed_base_dir)
        pca_embeds.index = pca_embeds.index.map(_clean_sample_id)
        for df in tsne_embeds_dict.values():
            df.index = df.index.map(_clean_sample_id)
        metadata_df = utils.load_metadata_df(config.metadata_path,
                                             pca_embeds.index)

        # Define plotting kwargs
        default_kwargs = {'alpha': .25, 's': 10}
        plotting_df = pd.DataFrame(index=metadata_df.index)
        assert isinstance(mut_cmap[nan_class], tuple)
        plotting_df['facecolors'] = pd.Series([mut_cmap[nan_class]],
                                              index=plotting_df.index)
        plotting_df['marker'] = pd.Series('o', index=plotting_df.index)

        # List of kwargs used to create a legend
        legend_kwargs_list = list()
        legend_defaults = {'lw': 0, 's': 30, 'alpha': .75}
        for i, color in enumerate(mut_cmap):
            label = str(i) if i != nan_class else 'Missing'
            legend_kwargs_list.append(
                _copy_and_update(legend_defaults, {
                    'c': color,
                    'label': label
                }))

        embed_base_out = os.path.join(config.plot_dir, 'altsplice', event,
                                      'embeds', 'sqtl')
        for gene in gtdf:
            ensg = gene.split('_')[0].split('.')[0]
            mut_status = gtdf[gene]
            tag = gene.replace('.', '-')
            # Same for every (pp, lr) pair, so computed once per gene
            gene_colors = mut_status.map(lambda x: mut_cmap[x])
            for (pp, lr), tsne_embeds in tsne_embeds_dict.items():
                if TSNE_PP_PLOT_SET != 'all' and pp not in TSNE_PP_PLOT_SET:
                    continue
                if TSNE_LR_PLOT_SET != 'all' and lr not in TSNE_LR_PLOT_SET:
                    continue
                tsne_plot_out = os.path.join(
                    embed_base_out, ensg,
                    '%s_tsne_embeds_pp_%d_lr_%d.png' % (tag, pp, lr))
                plot_subdir = os.path.dirname(tsne_plot_out)
                if not os.path.exists(plot_subdir):
                    os.makedirs(plot_subdir)
                # Reset every sample to the 'Missing' color, then overwrite
                # the genotyped ones.  A single .loc[rows, col] assignment
                # writes into plotting_df itself; the chained form
                # plotting_df[col].loc[rows] = ... may update a temporary.
                plotting_df['facecolors'] = pd.Series([mut_cmap[nan_class]],
                                                      index=plotting_df.index)
                plotting_df.loc[mut_status.index, 'facecolors'] = gene_colors
                fig, ax = ebhelp.plot_tsne_embeddings(tsne_embeds, plotting_df,
                                                      legend_kwargs_list,
                                                      default_kwargs)
                fig.suptitle(
                    'AltSplice %s tSNE(perplexity=%d, learning_rate=%d)' %
                    (tag, pp, lr))
                print("Writing %s" % tsne_plot_out)
                plt.savefig(tsne_plot_out, bbox_inches='tight', dpi=300)
                if do_pdf: ebhelp.save_as_pdf(tsne_plot_out)
                plt.close()
Ejemplo n.º 9
0
def do_heatmap_full(gtdf):
    '''Draws clustered heatmaps of alt-splice events with rows colored by genotype.

    gtdf: genotype data frame; it is re-indexed to the data index with
        _match_gtid_to_data_index and missing calls are recoded as class 3.

    Relies on module-level names not shown here (etype_desc, MAX_EVENTS,
    map_etype_to_file, _HEATMAP_BASE, _VERSION, DEBUG).  Returns None; the
    output is the set of PNG files written under _HEATMAP_BASE.

    NOTE(review): in its current state only one hard-coded variant is
    plotted, and only the 'sampled' and 'averaged' views are produced.
    '''

    # Load the data
    combined_df_desc = '%s_%d_high_var_events_concat' % (etype_desc,
                                                         MAX_EVENTS)
    combined_df = hmhelp.load_full_dataset(map_etype_to_file,
                                           MAX_EVENTS,
                                           combined_df_desc,
                                           reset=False)

    metadata_df = utils.load_metadata_df(config.metadata_path,
                                         combined_df.index)
    gtdf = _match_gtid_to_data_index(gtdf, combined_df.index)

    # Drop missing samples
    # (keep rows with at least one genotype call that also have metadata)
    index_with_gt_data = gtdf.index[~np.all(gtdf.isnull(), axis=1)]
    index_with_all_data = index_with_gt_data.intersection(metadata_df.index)
    combined_df = combined_df.loc[index_with_all_data]
    gtdf = gtdf.loc[index_with_all_data]
    metadata_df = metadata_df.loc[index_with_all_data]

    # Order by cancer type, event_type
    combined_df = combined_df.loc[metadata_df['cnc'].sort_values().index]
    combined_df = combined_df.iloc[:, combined_df.columns.argsort()]

    # Get color scheme
    # NOTE(review): cnc_to_color is not referenced again in this function.
    cnc_to_color = utils.load_color_scheme(config.color_scheme_path)
    col_cmap = np.array(
        sns.color_palette('Set2', len(map_etype_to_file.keys())))
    col_cmap_lut = dict(zip(map_etype_to_file.keys(), col_cmap))

    # Recode missing genotype calls as an extra class so every value can
    # index the row color palette.
    gtdf = gtdf.copy()
    nan_class = 3
    if np.any(gtdf.isnull()):
        assert not nan_class in gtdf.values
        # NOTE(review): this writes through `.values`; it only reaches the
        # frame when pandas returns a writable view -- confirm for the
        # pandas version in use.
        gtdf.values[np.isnan(gtdf.values)] = nan_class
    gtdf = gtdf.apply(lambda x: pd.to_numeric(x, downcast='integer'))
    row_cmap = sns.color_palette('Set2', nan_class + 1)
    all_col_colors = hmhelp.ash.map_col_colors(combined_df.columns,
                                               col_cmap_lut)
    # Column linkage is computed once on the full data and reused per plot
    full_col_linkage = get_col_linkage(combined_df)

    # NOTE(review): the loop below does nothing; only the hard-coded variant
    # in the second loop is plotted (looks like a debugging shortcut).
    for gene in gtdf.columns:
        pass
    for gene in ['ENSG00000138413.9_2_209113112']:
        # TODO: normalize -- subtract mean and divide by stdv
        # TODO: change color
        # sort by mutation status
        mut_status = gtdf[gene].sort_values()
        all_row_colors = mut_status.map(lambda x: row_cmap[x])

        for viz in ['full', 'sampled', 'averaged']:
            if viz == 'full':
                # NOTE(review): the 'full' view is skipped; the assignment
                # after `continue` is unreachable.
                continue
                graph_df = combined_df.loc[mut_status.index]

            counts = mut_status.value_counts()
            if viz == 'sampled':
                # Per-class sample size: the smallest non-missing class
                # count, raised to at least 50 and capped by class size.
                min_count = counts.iloc[counts.index != nan_class].min()
                if min_count < 50: min_count = 50
                index = list()
                for val, cnt in counts.sort_index().iteritems():
                    mask = mut_status == val
                    nsamples = min(min_count, mask.sum())
                    # Random draw without replacement (no seed is set here)
                    index.extend(
                        np.random.choice(mut_status.index[mask],
                                         nsamples,
                                         replace=False))
                graph_df = combined_df.loc[index]

            if viz == 'averaged':
                # One row per genotype class holding the column means; the
                # row is labeled with the first sample id of that class so
                # the row-color lookup below still works.
                avgs = list()
                index = list()
                for val, cnt in counts.sort_index().iteritems():
                    mask = mut_status == val
                    avgs.append(combined_df.loc[mask].mean(0))
                    index.append(combined_df.loc[mask].index[0])
                graph_df = pd.concat(avgs, axis=1).T
                graph_df.index = index

            # Center every column on its mean over the plotted rows
            graph_df.iloc[:] = (graph_df.values - graph_df.values.mean(0))
            row_colors = all_row_colors.loc[graph_df.index]
            col_colors = all_col_colors.loc[graph_df.columns]

            # And finally plot the data
            # Raised before plotting -- presumably for the recursive
            # dendrogram code on large inputs; confirm.
            sys.setrecursionlimit(100000)
            print "Plotting data ... "
            graph = sns.clustermap(graph_df,
                                   row_colors=row_colors,
                                   col_colors=col_colors,
                                   row_cluster=False,
                                   col_linkage=full_col_linkage,
                                   cmap='BrBG')

            # Hide tick labels and ticks on both heatmap axes
            graph.ax_heatmap.xaxis.set_ticklabels([])
            graph.ax_heatmap.yaxis.set_ticklabels([])
            graph.ax_heatmap.xaxis.set_ticks([])
            graph.ax_heatmap.yaxis.set_ticks([])

            # The variant name must split into exactly three '_' fields
            mut_event = "Gene: %s, Chr %s Loc %s" % tuple(gene.split('_'))
            graph.ax_col_dendrogram.set_title(
                "%s AltSplice %s Clustering" %
                (mut_event, combined_df_desc.replace('_', ' ').title()))
            graph.ax_heatmap.set_xlabel("Events")
            graph.ax_heatmap.set_ylabel("Samples")
            graph.cax.set_title("psi")
            # Relies on Python 2 map() returning a list (list + list)
            row_labels = map(str, range(nan_class)) + ['Missing']
            hmhelp.ash.add_legend(graph, dict(zip(row_labels, row_cmap)))
            hmhelp.ash.add_col_legend(graph, col_cmap_lut)

            # Output: <_HEATMAP_BASE>/<gene id>/[<viz>_]<variant>_<desc>_v<N>.png
            ensg = gene.split('_')[0].split('.')[0]
            outpath = os.path.join(
                _HEATMAP_BASE, ensg,
                gene.replace('.', '-') + '_' + combined_df_desc +
                '_v%d.png' % _VERSION)
            if viz != 'full':
                outpath = os.path.join(os.path.dirname(outpath),
                                       viz + '_' + os.path.basename(outpath))
            if not os.path.exists(os.path.dirname(outpath)):
                os.makedirs(os.path.dirname(outpath))

            print "Saving heatmap to: %s" % outpath
            plt.savefig(outpath, bbox_inches='tight', dpi=300)
            plt.close()
            # In debug mode stop after the first ('sampled') plot
            if DEBUG and viz == 'sampled': return