def load_cnc_colors(df):
    """Return per-sample cancer-type labels and the cancer-type color lookup.

    Args:
        df: Object whose ``index`` is handed to ``utils.load_metadata_df`` to
            select the metadata rows (presumably a samples-by-features
            DataFrame -- confirm against callers).

    Returns:
        A ``(labels, cnc_to_color)`` tuple: the ``'cnc'`` column of the loaded
        metadata and the mapping returned by
        ``utils.load_color_scheme(config.color_scheme_path)``.
    """
    metadata_df = utils.load_metadata_df(config.metadata_path, df.index)
    cnc_to_color = utils.load_color_scheme(config.color_scheme_path)
    # The labels are returned unmapped, together with the lookup table.
    return metadata_df['cnc'], cnc_to_color
def load_mrna_colors(df):
    """Return lower-cased mRNA subtype labels and a subtype-to-color lookup.

    Only samples in ``df.index`` with a non-null ``'Subtype_mRNA'`` entry are
    considered, and only subtypes observed more than 30 times are kept.

    Args:
        df: Object whose ``index`` selects the samples of interest.

    Returns:
        A ``(subtype_series, color_lut)`` tuple.  ``subtype_series`` holds the
        retained lower-cased labels; ``color_lut`` pairs the sorted retained
        subtype names with the values of the project color scheme.
    """
    key = 'Subtype_mRNA'
    metadata_df, subtype_names = utils.append_subtype(
        config.subtype_path,
        utils.load_metadata_df(config.metadata_path, df.index))
    assert key in subtype_names and key in metadata_df.columns

    # Restrict to the requested samples, drop unlabeled ones, normalize case.
    lowered = metadata_df[key].loc[df.index].dropna().apply(
        lambda name: name.lower())

    # Keep only subtypes with more than 30 samples.
    counts = lowered.value_counts()
    frequent = counts[counts > 30].index
    subtype_series = lowered[lowered.isin(frequent)]

    palette = utils.load_color_scheme(config.color_scheme_path).values()
    color_lut = dict(zip(sorted(frequent), palette))
    return subtype_series, color_lut
# Add Expression if True: path = os.path.join(config.embed_dir, 'expression', 'data.tsv') outdir = os.path.join(config.plot_dir, 'expression', 'heatmaps') if not os.path.exists(outdir): os.makedirs(outdir) desc = 'Expression' if NORM: desc = 'Normalized ' + desc try: df = utils.load_large_df(path.replace('.tsv', '')) except IOError: df = pd.read_csv(path, sep='\t', index_col=0) df.iloc[:] = np.minimum(df.values, np.percentile(df.values, 99, axis=0)) keep_cols = filter_to_high_var(df.values, df.columns, MAX_EVENTS) df = df.iloc[:, keep_cols] metadata_df = utils.load_metadata_df(config.metadata_path, df.index) medians = collapse_to_median(df, metadata_df['cnc']) heatmap_dists_with_dendro(medians, norm=NORM) outpath = os.path.join(outdir, desc.lower().replace(' ', '_') +'_rep_dists_heatmap.png') plot_utils.save(outpath, do_pdf=True) # Add AltSplice if False: altsplice_event_list= ['alt_3prime', 'alt_5prime', 'intron_retention', 'exon_skip'] for event in altsplice_event_list: path = os.path.join(config.embed_dir, 'altsplice', event, 'data.tsv') outdir = os.path.join(config.plot_dir, 'altsplice', event, 'heatmap') if not os.path.exists(outdir): os.makedirs(outdir) desc = 'AltSplice %s'%event.title() if NORM: desc = 'Normalized ' + desc print desc
def main(embed_dir, plot_dir, desc):
    '''Runs all tasks on a single embedding.

    embed_dir: location of pca & tsne embeddings
    plot_dir: directory to write plots
    desc: identifies embedding (used for e.g. plot titles)

    Which tasks run is controlled by the module-level flags ALL_TASKS,
    COLOR_CNC, HL_CNC, TUMOR_NORMAL, LIBSIZE and DEGSCORE.  ALL_TASKS implies
    every task, so the RNA-degradation and library-size tables are loaded
    whenever the corresponding QC section is going to run.
    '''
    assert os.path.exists(embed_dir), embed_dir
    _ensure_dir(plot_dir)

    # Load the QC tables under the same condition that later uses them;
    # loading only on DEGSCORE/LIBSIZE left them None when ALL_TASKS was set.
    rnadeg_df = (utils.load_rnadeg(config.rnadeg_path)
                 if ALL_TASKS or DEGSCORE else None)
    libsize_df = (utils.load_libsize(config.libsize_path)
                  if ALL_TASKS or LIBSIZE else None)

    _pca_model, pca_embeds, tsne_embeds_dict = utils.load_embeds(
        embed_dir,
        whitelist=WHITELIST,
        pp_set=TSNE_PP_PLOT_SET,
        lr_set=TSNE_LR_PLOT_SET)
    metadata_df = utils.load_metadata_df(config.metadata_path,
                                         pca_embeds.index)
    metadata_df, subtype_names = utils.append_subtype(config.subtype_path,
                                                      metadata_df)

    # All samples, colored by cancer type.
    if ALL_TASKS or COLOR_CNC:
        outdir = os.path.join(plot_dir, 'complete')
        _ensure_dir(outdir)
        plot_configs = plot_args.cnc_plotting_args(metadata_df)
        run_embedding_proc(outdir, plot_configs, desc, tsne_embeds_dict)

    # One set of subtype-highlight plots per cancer type.
    if ALL_TASKS or HL_CNC is not None:
        if ALL_TASKS or HL_CNC == 'all':
            cnc_list = np.unique(metadata_df['cnc'].values)
        else:
            cnc_list = HL_CNC
        for cnc in cnc_list:
            for subtype in subtype_names:
                if DEBUG:
                    print("%s %s" % (cnc, subtype))
                subtype_configs = (
                    plot_args.load_subtype_color_tumor_marker_kwargs(
                        metadata_df, subtype, cnc))
                outdir = os.path.join(plot_dir, 'highlights_subtype', cnc)
                _ensure_dir(outdir)
                subtype_desc = ' '.join([subtype, desc])
                run_embedding_proc(outdir,
                                   subtype_configs,
                                   subtype_desc,
                                   tsne_embeds_dict,
                                   prefix=subtype)

    # All samples, tumor vs. normal.
    if ALL_TASKS or TUMOR_NORMAL:
        if DEBUG:
            print("start tumor normal")
        tn_configs = plot_args.tumor_normal_plotting_args(metadata_df)
        outdir = os.path.join(plot_dir, 'complete_tumor_normal')
        _ensure_dir(outdir)
        tn_desc = '%s Tumor/Normal' % desc
        run_embedding_proc(outdir, tn_configs, tn_desc, tsne_embeds_dict)

    # QC: color the tSNE embeddings by library size.
    if ALL_TASKS or LIBSIZE:
        if DEBUG:
            print(" start libsize")
        _plot_qc_embeddings(os.path.join(plot_dir, 'qc', 'libsize'),
                            tsne_embeds_dict, libsize_df, desc,
                            'Library Size')

    # QC: color the tSNE embeddings by RNA degradation score.
    if ALL_TASKS or DEGSCORE:
        if DEBUG:
            print(" start degscore")
        _plot_qc_embeddings(os.path.join(plot_dir, 'qc', 'degscore'),
                            tsne_embeds_dict, rnadeg_df, desc, 'RNADeg')


def _ensure_dir(path):
    '''Create directory `path` (and parents) unless it already exists.'''
    if not os.path.exists(path):
        os.makedirs(path)


def _plot_qc_embeddings(outdir, tsne_embeds_dict, values_df, desc,
                        effect_label):
    '''Write one continuous-color tSNE plot per (perplexity, learning rate).

    outdir: directory receiving the plots (created if missing)
    tsne_embeds_dict: maps (perplexity, learning_rate) to an embedding table
    values_df: table whose first column supplies the colors and whose first
        column name is used as the colorbar title
    desc: identifies embedding (title-cased into the axis title)
    effect_label: name of the QC quantity shown in the axis title
    '''
    _ensure_dir(outdir)
    cbar_title = values_df.columns[0]
    for (pp, lr), embed_df in tsne_embeds_dict.items():
        outpath = os.path.join(outdir,
                               'tsne_embeds_pp_%d_lr_%d.png' % (pp, lr))
        axis_title = (
            '%s %s Effects\ntSNE(perplexity=%d, learning_rate=%d)' %
            (desc.title(), effect_label, pp, lr))
        fig, ax = plt.subplots()
        plot_utils.plot_continuous_color_embeddings(embed_df,
                                                    values_df.iloc[:, 0],
                                                    ax=ax,
                                                    axis_title=axis_title,
                                                    cbar_title=cbar_title)
        plot_utils.save(outpath, do_pdf=DO_PDF)
if __name__ == '__main__': # load data map_event_to_file = { 'exon_skip': config.alt_splice_exon_skip_path, 'intron_retention': config.alt_splice_intron_retention_path, 'alt_3prime': config.alt_splce_alt_3prime_path, 'alt_5prime': config.alt_splce_alt_5prime_path } if DO_INDIVIDUAL_EVENTS: for event, path in map_event_to_file.items(): print "Loading %s data from %s" % (event, path) psi, strains, gene_idx = preproc.load_data(path) metadata_df = utils.load_metadata_df(config.metadata_path, strains) cnc_groups = metadata_df.groupby('cnc') for cnc, cnc_md in cnc_groups: cnc_psi, cnc_strains = subset_to_cnc(psi, strains, cnc_md.index) if cnc_psi is None: print "WARNING: Did not find %s samples in %s" % (cnc, event) cnc_event_embed_dir = os.path.join(EMBED_DIR, event, 'cancer_only', cnc) if not os.path.exists(cnc_event_embed_dir): os.makedirs(cnc_event_embed_dir) pca_embeds_df, pca = run_pca_embeddings(cnc_psi, cnc_strains) save_pca_embeddings(cnc_event_embed_dir, pca_embeds_df, pca) if DO_TSNE: for lr in TSNE_LR_LIST:
counts_raw_df = pd.read_csv(input_cache, sep='\t', index_col=0) else: print "Loading expression count data from %s" % config.expression_count_path wl = utils.load_whitelist(config.whitelist_path) counts_raw, gids, sids = load_raw_counts(config.expression_count_path) wl_mask = np.in1d(sids, wl) assert counts_raw.shape[1] == sids.size counts_raw = counts_raw[:, wl_mask] sids = sids[wl_mask] counts_raw, gids, sids = filter_counts(counts_raw, gids, sids) assert np.all(np.isfinite(counts_raw)) counts_raw = np.log10(counts_raw + 1).T counts_raw_df = pd.DataFrame(counts_raw, index=sids, columns=gids) # df is sids x gids counts_raw_df.to_csv(input_cache, sep='\t') metadata_df = utils.load_metadata_df(config.metadata_path) metadata_df = utils.translate_tcga_to_strain_index(metadata_df, counts_raw_df.index) if DO_CNC_BARS: bar_outpath = os.path.join(PLOT_DIR, 'cancer_type_barplot.png') cnc = metadata_df['cnc'].loc[counts_raw_df.index].values plot_cnc_bars(bar_outpath, cnc) if DO_BOXPLOTS: box_outpath = os.path.join(PLOT_DIR, 'expression_boxplot.png') plot_box_raw(box_outpath, counts_raw_df.values) if DO_PCA: pca_embeds_raw_out = os.path.join(EMBED_DIR, 'pca_embeds.tsv') pca_model_raw_out = os.path.join(EMBED_DIR, 'pca_model.npy')
def add_normal_label(md):
    """Append ' (Normal)' to the cancer-type label of every non-tumor sample.

    Args:
        md: Metadata DataFrame with an ``is_tumor`` column (truthy for tumor
            samples) and a string ``cnc`` column.

    Returns:
        The same ``md`` object; its ``cnc`` column is modified in place.
    """
    # Cast before negating: on an object or integer column an element-wise
    # invert is a bitwise NOT (~True == -2), which is not a boolean mask.
    normal_mask = ~md['is_tumor'].astype(bool)
    md.loc[normal_mask, 'cnc'] = md.loc[normal_mask, 'cnc'].map(
        lambda label: label + ' (Normal)')
    return md


DEBUG = False

if __name__ == '__main__':
    if sys.argv[1] == '--event':
        ### load actual data
        panc_df, gtex_df = load_data(sys.argv[2], is_event=True)

        ### load metadata
        panc_md = utils.load_metadata_df(config.metadata_path,
                                         panc_df.index).dropna()
        gtex_md = load_gtex_metadata(config.gtex_metadata_path,
                                     gtex_df.index).dropna()
        # add_normal_label edits panc_md in place and returns it, so both
        # names refer to the relabeled table from here on.
        panc_md_wnorm = add_normal_label(panc_md)
        assert panc_df.index.equals(panc_md.index)
        assert gtex_df.index.equals(gtex_md.index)

        # An optional third argument becomes a suffix of the output directory.
        if len(sys.argv) > 3:
            PLOT_DIR = os.path.join(config.plot_dir, 'altsplice',
                                    'outlier_events_%s' % sys.argv[3])
        else:
            PLOT_DIR = os.path.join(config.plot_dir, 'altsplice',
                                    'outlier_events')
        if not os.path.exists(PLOT_DIR):
            os.makedirs(PLOT_DIR)
        run_strip_plot(panc_df, gtex_df, panc_md_wnorm['cnc'],
                       gtex_md['histological_type_s'])
    else:
        sf_interest = get_sf_interest()
def do_embedding(gtdf):
    """Plot stored alt-splice tSNE embeddings colored by genotype class.

    For every event type and every column of ``gtdf`` one scatter plot is
    written per selected (perplexity, learning rate) embedding, as PNG (and
    PDF) below ``config.plot_dir/altsplice/EVENT/embeds/sqtl/ENSG/``.

    Args:
        gtdf: DataFrame with one column per variant.  Column names start with
            a gene id followed by ``_``; values are used as indices into a
            4-color palette, with NaN mapped to the last ("Missing") class
            (presumably genotypes 0/1/2 -- confirm against the caller).
            The argument itself is not modified.
    """
    event_list = [
        'exon_skip', 'intron_retention', 'alt_3prime', 'alt_5prime',
        'concatenated'
    ]
    # Either the string 'all' or the list of values to plot.
    tsne_pp_plot_set = [50]
    tsne_lr_plot_set = [500]
    do_pdf = True
    nan_class = 3

    # Encode missing genotypes as their own class.  fillna returns a new
    # frame, so this neither depends on ``.values`` being a writable view nor
    # touches the caller's data.
    if gtdf.isnull().values.any():
        assert nan_class not in gtdf.values
        gtdf = gtdf.fillna(nan_class)
    # Classes index into the palette below, so they must be integers.
    # NOTE(review): SOURCE lost its indentation; the downcast is applied
    # unconditionally here, which also covers inputs without missing values.
    gtdf = gtdf.apply(lambda col: pd.to_numeric(col, downcast='integer'))

    mut_cmap = sns.color_palette('Set2', nan_class + 1)
    missing_color = mut_cmap[nan_class]
    assert isinstance(missing_color, tuple)

    # Plotting kwargs shared by every figure.
    default_kwargs = {'alpha': .25, 's': 10}

    # List of kwargs used to create a legend
    legend_defaults = {'lw': 0, 's': 30, 'alpha': .75}
    legend_kwargs_list = list()
    for i, color in enumerate(mut_cmap):
        label = str(i) if i != nan_class else 'Missing'
        legend_kwargs_list.append(
            _copy_and_update(legend_defaults, {
                'c': color,
                'label': label
            }))

    def _clean_id(sample_id):
        # Strip file suffixes from the stored embedding index entries.
        return sample_id.replace('.aligned', '').replace('.npz', '')

    for event in event_list:
        embed_base_dir = os.path.join(config.embed_dir, 'altsplice', event)
        _pca_model, pca_embeds, tsne_embeds_dict = utils.load_embeds(
            embed_base_dir)
        pca_embeds.index = pca_embeds.index.map(_clean_id)
        for df in tsne_embeds_dict.values():
            df.index = df.index.map(_clean_id)

        metadata_df = utils.load_metadata_df(config.metadata_path,
                                             pca_embeds.index)

        plotting_df = pd.DataFrame(index=metadata_df.index)
        # Replicate the color explicitly instead of relying on a length-1
        # list being broadcast against the index.
        default_facecolors = pd.Series(
            [missing_color] * len(plotting_df.index),
            index=plotting_df.index)
        plotting_df['facecolors'] = default_facecolors
        plotting_df['marker'] = pd.Series('o', index=plotting_df.index)

        embed_base_out = os.path.join(config.plot_dir, 'altsplice', event,
                                      'embeds', 'sqtl')
        for gene in gtdf:
            ensg = gene.split('_')[0].split('.')[0]
            tag = gene.replace('.', '-')
            mut_status = gtdf[gene]

            # Start from "Missing" everywhere, then color genotyped samples.
            # Assigning on a standalone Series avoids chained indexing on the
            # frame, which may silently write to a temporary copy.
            facecolors = default_facecolors.copy()
            facecolors.loc[mut_status.index] = mut_status.map(
                lambda cls: mut_cmap[cls])
            plotting_df['facecolors'] = facecolors

            for (pp, lr), tsne_embeds in tsne_embeds_dict.items():
                if tsne_pp_plot_set != 'all' and pp not in tsne_pp_plot_set:
                    continue
                if tsne_lr_plot_set != 'all' and lr not in tsne_lr_plot_set:
                    continue
                tsne_plot_out = os.path.join(
                    embed_base_out, ensg,
                    '%s_tsne_embeds_pp_%d_lr_%d.png' % (tag, pp, lr))
                plot_dir = os.path.dirname(tsne_plot_out)
                if not os.path.exists(plot_dir):
                    os.makedirs(plot_dir)

                fig, ax = ebhelp.plot_tsne_embeddings(tsne_embeds,
                                                      plotting_df,
                                                      legend_kwargs_list,
                                                      default_kwargs)
                fig.suptitle(
                    'AltSplice %s tSNE(perplexity=%d, learning_rate=%d)' %
                    (tag, pp, lr))
                print("Writing %s" % tsne_plot_out)
                plt.savefig(tsne_plot_out, bbox_inches='tight', dpi=300)
                if do_pdf:
                    ebhelp.save_as_pdf(tsne_plot_out)
                plt.close()
def do_heatmap_full(gtdf):
    """Plot alt-splice heatmaps with rows ordered by genotype class.

    Loads the combined high-variance event table, keeps samples that have
    both genotype data and metadata, and writes one clustermap per
    visualization mode ('sampled', 'averaged'; 'full' is currently skipped)
    for the hard-coded variant below.  Rows are colored by genotype class and
    columns by event type; columns use a linkage computed once on the whole
    table.

    Args:
        gtdf: Genotype DataFrame with one column per variant, named
            ``GENE_CHR_POS``.  It is re-indexed to the data index by
            ``_match_gtid_to_data_index``; NaN is treated as a separate
            "Missing" class.  The argument itself is not modified.

    Reads the module-level names ``etype_desc``, ``MAX_EVENTS``,
    ``map_etype_to_file``, ``_HEATMAP_BASE``, ``_VERSION`` and ``DEBUG``.
    """
    # Load the data
    combined_df_desc = '%s_%d_high_var_events_concat' % (etype_desc,
                                                         MAX_EVENTS)
    combined_df = hmhelp.load_full_dataset(map_etype_to_file,
                                           MAX_EVENTS,
                                           combined_df_desc,
                                           reset=False)
    metadata_df = utils.load_metadata_df(config.metadata_path,
                                         combined_df.index)
    gtdf = _match_gtid_to_data_index(gtdf, combined_df.index)

    # Drop missing samples
    index_with_gt_data = gtdf.index[~gtdf.isnull().all(axis=1)]
    index_with_all_data = index_with_gt_data.intersection(metadata_df.index)
    combined_df = combined_df.loc[index_with_all_data]
    gtdf = gtdf.loc[index_with_all_data]
    metadata_df = metadata_df.loc[index_with_all_data]

    # Order by cancer type, event_type
    combined_df = combined_df.loc[metadata_df['cnc'].sort_values().index]
    combined_df = combined_df.iloc[:, combined_df.columns.argsort()]

    # Column colors: one per event type.
    etypes = list(map_etype_to_file.keys())
    col_cmap = np.array(sns.color_palette('Set2', len(etypes)))
    col_cmap_lut = dict(zip(etypes, col_cmap))

    # Encode missing genotypes as their own class.  fillna returns a new
    # frame instead of writing through ``.values``, which is only safe when
    # that array happens to be a writable view.
    nan_class = 3
    if gtdf.isnull().values.any():
        assert nan_class not in gtdf.values
        gtdf = gtdf.fillna(nan_class)
    # NOTE(review): SOURCE lost its indentation; the integer downcast is
    # applied unconditionally because classes index into row_cmap below.
    gtdf = gtdf.apply(lambda col: pd.to_numeric(col, downcast='integer'))

    row_cmap = sns.color_palette('Set2', nan_class + 1)
    all_col_colors = hmhelp.ash.map_col_colors(combined_df.columns,
                                               col_cmap_lut)
    full_col_linkage = get_col_linkage(combined_df)

    # Raised once up front; the original set it before every plot.
    sys.setrecursionlimit(100000)

    # Only this one variant is plotted at the moment.
    genes_to_plot = ['ENSG00000138413.9_2_209113112']
    for gene in genes_to_plot:
        # TODO: normalize -- subtract mean and divide by stdv
        # TODO: change color
        # sort by mutation status
        mut_status = gtdf[gene].sort_values()
        all_row_colors = mut_status.map(lambda cls: row_cmap[cls])
        counts = mut_status.value_counts()
        classes = counts.sort_index().index

        mut_event = "Gene: %s, Chr %s Loc %s" % tuple(gene.split('_'))
        ensg = gene.split('_')[0].split('.')[0]
        base_outpath = os.path.join(
            _HEATMAP_BASE, ensg,
            gene.replace('.', '-') + '_' + combined_df_desc +
            '_v%d.png' % _VERSION)

        for viz in ['full', 'sampled', 'averaged']:
            if viz == 'full':
                continue
            graph_df = combined_df.loc[mut_status.index]

            if viz == 'sampled':
                # Draw the same number of samples per class: the size of the
                # smallest non-missing class, but at least 50 (capped by the
                # class size itself).
                min_count = counts.iloc[counts.index != nan_class].min()
                if min_count < 50:
                    min_count = 50
                index = list()
                for val in classes:
                    mask = mut_status == val
                    nsamples = min(min_count, mask.sum())
                    index.extend(
                        np.random.choice(mut_status.index[mask],
                                         nsamples,
                                         replace=False))
                graph_df = combined_df.loc[index]

            if viz == 'averaged':
                # One row per class: the column-wise mean of its samples,
                # labeled with the first sample of that class so the row
                # color lookup below still works.
                avgs = list()
                index = list()
                for val in classes:
                    members = combined_df.loc[mut_status == val]
                    avgs.append(members.mean(0))
                    index.append(members.index[0])
                graph_df = pd.concat(avgs, axis=1).T
                graph_df.index = index

            # Center every column on its mean over the plotted rows.
            # NOTE(review): indentation was lost in SOURCE; this is assumed
            # to apply to every viz mode, not only 'averaged' -- confirm.
            graph_df.iloc[:] = (graph_df.values - graph_df.values.mean(0))

            row_colors = all_row_colors.loc[graph_df.index]
            col_colors = all_col_colors.loc[graph_df.columns]

            # And finally plot the data
            print("Plotting data ... ")
            graph = sns.clustermap(graph_df,
                                   row_colors=row_colors,
                                   col_colors=col_colors,
                                   row_cluster=False,
                                   col_linkage=full_col_linkage,
                                   cmap='BrBG')
            graph.ax_heatmap.xaxis.set_ticklabels([])
            graph.ax_heatmap.yaxis.set_ticklabels([])
            graph.ax_heatmap.xaxis.set_ticks([])
            graph.ax_heatmap.yaxis.set_ticks([])
            graph.ax_col_dendrogram.set_title(
                "%s AltSplice %s Clustering" %
                (mut_event, combined_df_desc.replace('_', ' ').title()))
            graph.ax_heatmap.set_xlabel("Events")
            graph.ax_heatmap.set_ylabel("Samples")
            graph.cax.set_title("psi")

            # Built as a list so the concatenation also works where map()
            # returns an iterator.
            row_labels = [str(i) for i in range(nan_class)] + ['Missing']
            hmhelp.ash.add_legend(graph, dict(zip(row_labels, row_cmap)))
            hmhelp.ash.add_col_legend(graph, col_cmap_lut)

            outpath = base_outpath
            if viz != 'full':
                outpath = os.path.join(os.path.dirname(outpath),
                                       viz + '_' + os.path.basename(outpath))
            if not os.path.exists(os.path.dirname(outpath)):
                os.makedirs(os.path.dirname(outpath))
            print("Saving heatmap to: %s" % outpath)
            plt.savefig(outpath, bbox_inches='tight', dpi=300)
            plt.close()
            if DEBUG and viz == 'sampled':
                return