def sorted_stripplot(prepped_df, out_path, gray, now):
    """Plots waterfall plot/sorted stripplot.

    Input:
        prepped_df: df prepared by function prep_sorted_stripplot (pandas df);
            the 'color', 'x_val' and 'neojx_count' columns are read here, and
            every value of 'color' must be a key of _CANCER_COLORS.
        out_path: path for storing output files (string)
        gray: collection of 'color' values whose x-range is shaded with a
            light gray band; None or an empty collection draws no bands.
        now: value formatted into the output file name.

    Writes 'fig1B_ncnjx_count_sorted_stripplot_<now>.pdf' into out_path.

    Returns none
    """
    plt.rcParams.update({'figure.autolayout': True})
    plt.rcParams['figure.figsize'] = mm2inch((178, 69.8))
    sns.set_context("paper")
    sns.set_style("whitegrid")

    # `gray` is optional; a bare `abbr in None` would raise TypeError, so
    # normalize None to "nothing highlighted".
    highlighted = () if gray is None else gray

    cancer_abbrs = prepped_df.color.unique()
    x_labels = cancer_abbrs.tolist()
    color_set = []
    x_locs = []
    gray_ranges = []
    for abbr in cancer_abbrs:
        color_set.append(_CANCER_COLORS[abbr])
        can_xrange = prepped_df[prepped_df.color == abbr]['x_val']
        # The tick for each group sits at the mean x position of its points.
        x_locs.append(can_xrange.mean())
        if abbr in highlighted:
            gray_ranges.append((can_xrange.min(), can_xrange.max()))

    ax = sns.scatterplot(
        x='x_val', y='neojx_count', hue='color', data=prepped_df,
        legend=False, palette=sns.xkcd_palette(color_set),
        edgecolor='None', s=4
    )
    # gray_ranges is empty whenever nothing was highlighted.
    for range_start, range_end in gray_ranges:
        plt.axvspan(range_start, range_end, color='gray', alpha=0.1)

    fig_name = 'fig1B_ncnjx_count_sorted_stripplot_{}.pdf'.format(now)
    plt.xlabel('')
    plt.xticks(x_locs, x_labels)
    plt.setp(ax.get_xticklabels(), rotation=90, fontsize=6)
    plt.yscale('log')
    ax.xaxis.grid(False)
    ax.yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda y, _: '{:,g}'.format(y)))
    ylabel_text = ('junctions/sample (#)')
    plt.ylabel(ylabel_text, fontsize=7)
    plt.setp(ax.yaxis.get_majorticklabels(), fontsize=5, color='black')
    fig = plt.gcf()
    fig_file = os.path.join(out_path, fig_name)
    fig.savefig(fig_file, dpi=300)
    plt.close()
    return
def cluster_tcga_jxs(db_jx_df, out_path, now):
    """Cluster junctions by per-cancer prevalence and draw the TCGA heatmap.

    :param db_jx_df: pandas DataFrame with a 'jx' column plus prevalence
        columns; the part of each column name before '_Sample_Percents' must
        be a key of _ALL_ABBR. The prevalence columns are renamed to their
        abbreviations in place, so the caller's frame is modified.
    :param out_path: directory in which the figure file is written
    :param now: value formatted into the output file name
    :return: None
    """
    tcga_cols = db_jx_df.drop(['jx'], axis=1).columns.values.tolist()
    renames = {
        name: _ALL_ABBR[name.split('_Sample_Percents')[0]]
        for name in tcga_cols
    }
    db_jx_df.rename(renames, axis=1, inplace=True)
    # NOTE(review): the 'jx' column takes part in this comparison as well;
    # confirm that is intended for the "any non-zero value" row filter.
    db_jx_df = db_jx_df[(db_jx_df != 0).any(axis=1)]
    full_df = db_jx_df.fillna(0)

    # NOTE(review): Ward linkage is formally defined for Euclidean distance;
    # confirm the clustering backend in use accepts this metric.
    metric = 'seuclidean'
    method = 'ward'
    xlabelset = [
        'ESCA', 'STAD', 'OV', 'BLCA', 'LUSC', 'HNSC', 'CESC', 'THYM', 'DLBC',
        'ACC', 'PCPG', 'PAAD', 'LUAD', 'THCA', 'MESO', 'SARC', 'UCS', 'UCEC',
        'BRCA', 'TGCT', 'PRAD', 'LIHC', 'CHOL', 'KIRC', 'KIRP', 'KICH',
        'LAML', 'GBM', 'LGG', 'READ', 'COAD', 'UVM', 'SKCM'
    ]
    full_cluster = sns.clustermap(
        full_df.drop('jx', axis=1), method=method, metric=metric,
        standard_scale=1, col_cluster=False
    )
    row_order = full_cluster.dendrogram_row.reordered_ind
    # reordered_ind lists row *positions* of the clustered matrix, so rows
    # must be picked positionally. A label-based reindex only matches when
    # the frame happens to carry a default 0..n-1 index.
    tcga_cluster = full_df.iloc[row_order].reset_index(drop=True)
    full_merge = tcga_cluster.drop('jx', axis=1)
    merge_df = full_merge[xlabelset]
    logging.info('total junction top_x is: {}'.format(len(merge_df)))

    fig_name = ('fig2A_TCGA_heatmap_{}.pdf'.format(now))
    fig_file = os.path.join(out_path, fig_name)

    # One unit-wide color bar segment per heatmap column.
    df_cols = merge_df.columns.values.tolist()
    label_ranges = np.array([[i, i + 1] for i in range(len(df_cols))])
    colors = ['xkcd:{}'.format(_CANCER_COLORS[x]) for x in df_cols]
    colorbar_dict = {
        'ranges': label_ranges,
        'colors': colors,
        'labels': None,
        'fontcolors': None,
        'height_percent': '2.0%'
    }
    fig_size = mm2inch(89, 84)
    masked_double_heatmap(
        merge_df, df_cols, fig_file, size=fig_size,
        colorbar=colorbar_dict, cbarticks=[0.001, .01, .1, .8],
        cbar_font_adder=2, xlabel_fontsize=4, bottom_pad=-3.5
    )
    logging.info('saving figure at {}'.format(fig_file))
    return
def cluster_subtype_jxs(db_jx_df, out_path, now):
    """Cluster junctions and draw the cancer-type vs. subtype heatmap.

    :param db_jx_df: pandas DataFrame with a 'jx' column plus prevalence
        columns; the part of each column name before '_Sample_Percents' must
        be a key of _ALL_ABBR. The prevalence columns are renamed to their
        abbreviations in place, so the caller's frame is modified.
    :param out_path: directory in which the figure file is written
    :param now: value formatted into the output file name
    :return: None
    """
    # Parent cancer types; passed to the heatmap separately from the full
    # column list and also used as the color bar labels.
    tcga_master_cols = ['CESC', 'PCPG', 'SARC', 'LGG', 'ESCA']

    tcga_cols = db_jx_df.drop(['jx'], axis=1).columns.values.tolist()
    renames = {
        name: _ALL_ABBR[name.split('_Sample_Percents')[0]]
        for name in tcga_cols
    }
    db_jx_df.rename(renames, axis=1, inplace=True)
    # NOTE(review): the 'jx' column takes part in this comparison as well;
    # confirm that is intended for the "any non-zero value" row filter.
    db_jx_df = db_jx_df[(db_jx_df != 0).any(axis=1)]
    full_df = db_jx_df.fillna(0)

    # NOTE(review): Ward linkage is formally defined for Euclidean distance;
    # confirm the clustering backend in use accepts this metric.
    metric = 'cityblock'
    method = 'ward'
    xlabelset = [
        'CESC', 'CSC', 'ECAD', 'CASC',
        'PCPG', 'PCHC', 'PGG',
        'SARC', 'LMS', 'UPLS', 'DT', 'SYNS', 'MFS', 'MPNT',
        'LGG', 'AC', 'OAC', 'ODG',
        'ESCA', 'ESSC', 'ESAD'
    ]
    full_cluster = sns.clustermap(
        full_df.drop('jx', axis=1), method=method, metric=metric,
        standard_scale=1, col_cluster=False
    )
    row_order = full_cluster.dendrogram_row.reordered_ind
    # reordered_ind lists row *positions* of the clustered matrix, so rows
    # must be picked positionally. A label-based reindex only matches when
    # the frame happens to carry a default 0..n-1 index.
    tcga_cluster = full_df.iloc[row_order].reset_index(drop=True)
    full_merge = tcga_cluster.drop('jx', axis=1)
    merge_df = full_merge[xlabelset]
    logging.info('total junction top_x is: {}'.format(len(merge_df)))

    fig_name = ('fig2B_TCGA_subtype_heatmap_{}.pdf'.format(now))
    fig_file = os.path.join(out_path, fig_name)

    # Column spans in xlabelset covered by each parent cancer type.
    label_ranges = np.array([[0, 4], [4, 7], [7, 14], [14, 18], [18, 21]])
    texts = ['CESC', 'PCPG', 'SARC', 'LGG', 'ESCA']
    colors = ['xkcd:{}'.format(_CANCER_COLORS[x]) for x in texts]
    fontcolors = {text: _FONT_COLORS[text] for text in texts}
    colorbar_dict = {
        'ranges': label_ranges,
        'colors': colors,
        'labels': texts,
        'fontcolors': fontcolors
    }
    # Interior boundaries between the spans above.
    vline_pos = [4, 7, 14, 18]
    fig_size = mm2inch(89, 80)
    masked_double_heatmap(
        merge_df, tcga_master_cols, fig_file, masked_cmap=cm.Greys,
        other_cmap=cm.Blues, colorbar=colorbar_dict, size=fig_size,
        vline_pos=vline_pos, cbarticks=[.01, .1, 1],
        other_cbar_label='cancer subtype prevalence', cbar_font_adder=2,
        xlabel_fontsize=5, bottom_pad=-3.5, cbar_fraction=0.1,
        cbar_pad=0.01
    )
    logging.info('saving figure at: {}'.format(fig_file))
    return
def barplots_with_table(data_dict, plot_dict, out_path, now, figsize):
    """Draw three grouped bar series on a log axis above a colored label row.

    :param data_dict: mapping with 'paired', 'gtex' and 'other' entries (bar
        heights, one value per group) and an 'abbr' entry (one label per
        group, shown in the table under the bars)
    :param plot_dict: mapping whose 'light colors' entry supplies the colors
        of the 'paired', 'gtex' and 'other' bars, in that order
    :param out_path: directory in which the figure file is written
    :param now: value formatted into the output file name
    :param figsize: figure size handed to mm2inch
    :return: None
    """
    plt.rcParams.update({'figure.autolayout': True})
    plt.rcParams['figure.figsize'] = mm2inch(figsize)
    sns.set_context("paper")
    sns.set_style("whitegrid")
    sns.set(style="ticks")

    bot_data = data_dict['paired']
    mid_data = data_dict['gtex']
    top_data = data_dict['other']
    bot_color = plot_dict['light colors'][0]
    mid_color = plot_dict['light colors'][1]
    top_color = plot_dict['light colors'][2]

    fig, (ax, ax2) = plt.subplots(
        nrows=2, ncols=1, gridspec_kw={'height_ratios': [10000, 1]}
    )
    plt.sca(ax)

    # Each group occupies three x units: left, middle and right bar.
    num_groups = len(data_dict['paired'])
    ind_l = [x - 0.15 for x in range(1, (3 * num_groups) + 1, 3)]
    ind_m = [x + 0.85 for x in ind_l]
    ind_r = [x + 0.85 for x in ind_m]
    barwidths = 0.75
    plt.bar(ind_l, bot_data, barwidths, color=bot_color, linewidth=0)
    plt.bar(ind_m, mid_data, barwidths, color=mid_color, linewidth=0)
    plt.bar(ind_r, top_data, barwidths, color=top_color, linewidth=0)

    ax.set_xticklabels([])
    ax.set_xticks([])
    ax.xaxis.grid(False)
    ax.yaxis.grid(True)
    edge_buffer = .7
    ax.set_xlim(left=ind_l[0] - edge_buffer, right=ind_r[-1] + edge_buffer)
    plt.yscale('log')
    plt.ylabel('relative abundance (%)', fontsize=7)
    ax.yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda y, _: '{:g}'.format(y)))
    plt.setp(ax.yaxis.get_majorticklabels(), fontsize=5, color='black')

    # Add Table
    columns = data_dict['abbr']
    rows = ['']
    whitefont_cols = []
    col_cols = []
    for i, abbr in enumerate(columns):
        # Reset per label so a label without any match can neither reuse the
        # previous label's font color nor leave the color list too short.
        font_color = None
        cell_color = None
        try:
            font_color = _FONT_COLORS[abbr]
            cell_color = 'xkcd:{}'.format(_CANCER_COLORS[abbr])
        except KeyError:
            # Fall back to known abbreviations contained in the label: the
            # last matching font color and the first matching cell color.
            for can_abbr in _FONT_COLORS:
                if can_abbr in abbr:
                    font_color = _FONT_COLORS[can_abbr]
            for can_abbr in _CANCER_COLORS:
                if can_abbr in abbr:
                    cell_color = 'xkcd:{}'.format(_CANCER_COLORS[can_abbr])
                    break
        if cell_color is None:
            # colColours needs exactly one entry per column.
            logging.warning(
                'no table color found for {}; using white'.format(abbr))
            cell_color = 'white'
        col_cols.append(cell_color)
        if font_color == 'white':
            whitefont_cols.append(i)

    table_vals = [['' for _ in columns]]
    row_label_cols = ['white']
    the_table = ax.table(
        cellText=table_vals, rowLabels=rows, colLabels=columns,
        loc='bottom', cellLoc='center', colColours=col_cols,
        rowColours=row_label_cols, bbox=[0, -0.09, 1, .075]
    )
    the_table.auto_set_font_size(False)
    the_table.set_fontsize(5)
    for (row, col), cell in the_table.get_celld().items():
        if row == 0:
            # Header row: the only row meant to stay visible.
            cell.set_text_props(
                fontproperties=FontProperties(weight='bold', size=3.75))
            if len(columns[0]) > 5:
                cell.set_height(cell.get_height() * 1.5)
            if col in whitefont_cols:
                cell.get_text().set_color('white')
        else:
            # Collapse the empty data row.
            cell.set_height(0)
        if col == -1:
            # Collapse the row-label column.
            cell.set_width(0)
            cell.set_height(0)

    # The second axes is stripped of grid, spines, ticks and background.
    ax2.yaxis.grid(False)
    ax2.spines['left'].set_visible(False)
    ax2.spines['bottom'].set_visible(False)
    ax2.spines['right'].set_visible(False)
    ax2.spines['top'].set_visible(False)
    ax2.patch.set_alpha(0)
    plt.sca(ax2)
    plt.plot([])
    plt.xticks([], [])
    plt.yticks([], [])

    fig_name = 'fig1A_grouped_barplots_{}.pdf'.format(now)
    fig_file = os.path.join(out_path, fig_name)
    logging.info('saving figure at {}'.format(fig_file))
    fig.savefig(fig_file, dpi=300)
    plt.close()
    return
def antisense_boxplot(jx_dir, out_path, now):
    """Tabulate and plot the antisense junction percentage per category.

    For every cancer in _TCGA_ABBR with a junction file, junctions are split
    into five categories (core normals, other adult non-cancer,
    developmental, stem cell, unexplained) and the percentage flagged
    antisense is computed for each. The per-cancer values are written to
    'antisense_table_S5.csv' and drawn as grouped boxplots.

    :param jx_dir: directory handed to get_jx_prev_filename to locate each
        cancer's junction file
    :param out_path: directory in which the table and figure are written
    :param now: value formatted into the figure file name
    :return: None
    """
    all_cancers = list(_TCGA_ABBR.keys())
    main_header = 'median junction count\nacross cancer types'

    # Single source of truth for category order: ratio rows, count table
    # strings, legend entries and row labels all follow this sequence.
    category_order = ('gtex', 'adult', 'dev', 'sc', 'un')
    data_dict = {main_header: {'data': [[] for _ in category_order]}}
    ratio_rows = data_dict[main_header]['data']
    counts = {category: [] for category in category_order}

    # Cancers without a junction file are skipped, so track the ones that
    # actually contributed a value to keep the table columns equal length.
    processed_cancers = []
    for cancer in all_cancers:
        logging.info('starting cancer {}'.format(cancer))
        jx_file, _flag, prev_glob = get_jx_prev_filename(jx_dir, cancer)
        if not jx_file:
            continue

        jx_df = jx_df_from_file(
            jx_file, 0, 1, True, glob_form=prev_glob, sample=False,
            top_x=False, drop_ann=False, cancer=cancer
        )
        gtex = jx_df[(jx_df.gtex == 1)]
        adult = jx_df[(jx_df.gtex == 0) & (jx_df.sra_adult == 1)]
        un = jx_df[
            (jx_df.gtex == 0) & (jx_df.sra_stemcells == 0)
            & (jx_df.sra_adult == 0) & (jx_df.sra_developmental == 0)
        ]
        # Junctions not already assigned to gtex, adult or unexplained.
        unassigned = (
            (~jx_df.jx.isin(un.jx)) & (~jx_df.jx.isin(adult.jx))
            & (~jx_df.jx.isin(gtex.jx))
        )
        dev = jx_df[unassigned & (jx_df.sra_developmental == 1)]
        sc = jx_df[unassigned & (jx_df.sra_stemcells == 1)]
        subsets = {'gtex': gtex, 'adult': adult, 'dev': dev, 'sc': sc,
                   'un': un}

        sizes = []
        ratios = []
        for category in category_order:
            subset = subsets[category]
            n_antisense = len(subset[subset.antisense == 1])
            # NOTE: raises ZeroDivisionError if a category has no junctions
            # for this cancer.
            ratios.append(100 * n_antisense / len(subset))
            sizes.append(len(subset))

        processed_cancers.append(cancer)
        for row, category in enumerate(category_order):
            counts[category].append(sizes[row])
            ratio_rows[row].append(ratios[row])

    if not processed_cancers:
        logging.warning(
            'no junction files found in {}; skipping antisense '
            'boxplot'.format(jx_dir)
        )
        return

    print_df = pd.DataFrame({
        'Cancer type': processed_cancers,
        'Core normals jx top_x': counts['gtex'],
        'Core normals anti %': ratio_rows[0],
        'Other adult non-cancer jx top_x': counts['adult'],
        'Other adult non-cancer anti %': ratio_rows[1],
        'Developmental jx top_x': counts['dev'],
        'Developmental anti %': ratio_rows[2],
        'Stem cell jc': counts['sc'],
        'Stem cell ar': ratio_rows[3],
        'Unexplained jc': counts['un'],
        'Unexplained ar': ratio_rows[4]
    })
    supp_table = os.path.join(out_path, 'antisense_table_S5.csv')
    with open(supp_table, 'w') as output:
        print_df.to_csv(output, index=False)

    # Assemble boxplot
    plt.rcParams.update({'figure.autolayout': True})
    sns.set_context("paper")
    sns.set_style("whitegrid")

    counts_df = pd.DataFrame(counts)
    table_data = []
    # Iterate in category_order (not dict/column order) so each count string
    # lines up with the matching ratio row and row label.
    for category in category_order:
        category_counts = counts_df[category]
        table_data.append(
            '{:,}\n(IQR: {:,}-{:,})'.format(
                int(round(category_counts.median())),
                int(category_counts.quantile(0.25)),
                int(category_counts.quantile(0.75))
            )
        )
    data_dict[main_header]['table_data'] = table_data

    light_colors = [
        'xkcd:kermit green', 'xkcd:light teal', 'xkcd:pale purple',
        'xkcd:apricot', 'xkcd:light red'
    ]
    legend = [
        'Core normals', 'Other adult non-cancer', 'Developmental',
        'Stem cell', 'Unexplained'
    ]
    plot_info_dict = {
        'light colors': light_colors,
        'dark colors': light_colors,
        'legend': legend,
        'row colors': light_colors,
        'row font color': ['black', 'black', 'black', 'black', 'black'],
        'row labels': legend,
    }

    fig_name = 'fig3B_antisense_ratios_boxplot_{}.pdf'.format(now)
    fig_file = os.path.join(out_path, fig_name)
    logging.info('saving figure at {}'.format(fig_file))
    fig_size = mm2inch(60, 70)
    grouped_boxplots_with_table(
        data_dict, plot_info_dict, fig_file, logscale=False,
        y_label='antisense junctions (%)', percent=False, right_lim_shift=3,
        fig_size=fig_size, intab_fontsize=7, tabrow_fontsize=7,
        tabcol_fontsize=7, expand_rows=1.4
    )
    return
def cluster_jxs_together(db_jx_df, expt_jx_dict, out_path, count_dict,
                         sra_threshold, now):
    """Draw the heatmap pairing cancer types with their SRA cell types.

    :param db_jx_df: pandas DataFrame with a 'jx' column plus prevalence
        columns; the part of each column name before '_Sample_Percents' must
        be a key of _ALL_ABBR. The prevalence columns are renamed to their
        abbreviations in place, so the caller's frame is modified.
    :param expt_jx_dict: mapping from SRA experiment key to a mapping of
        junction -> value; keys missing from _SRA_ABBR are logged and skipped
    :param out_path: directory in which the figure file is written
    :param count_dict: mapping from SRA experiment key to the divisor applied
        to that experiment's junction values
    :param sra_threshold: multiplied by the number of junction rows to get
        the minimum number of non-null entries an SRA column needs to be kept
    :param now: value formatted into the output file name
    :return: None
    """
    metric = 'seuclidean'
    method = 'ward'

    # Rename the prevalence columns to abbreviations (in place).
    prevalence_names = db_jx_df.drop(['jx'], axis=1).columns.values.tolist()
    renames = {
        column: _ALL_ABBR[column.split('_Sample_Percents')[0]]
        for column in prevalence_names
    }
    db_jx_df.rename(renames, axis=1, inplace=True)
    db_jx_df = db_jx_df[(db_jx_df != 0).any(axis=1)]

    # Union of the junctions collect_mutual_jxs reports for each experiment.
    all_sra_jxs = set()
    for key, expt_jxs in expt_jx_dict.items():
        all_sra_jxs.update(
            collect_mutual_jxs(db_jx_df, expt_jxs, key, ret_type='jxs')
        )

    full_df = db_jx_df.fillna(0)
    full_df = full_df[full_df['jx'].isin(all_sra_jxs)].reset_index(drop=True)
    tcga_master_cols = full_df.drop('jx', axis=1).columns.values.tolist()

    # One column per recognized SRA experiment: junction value / divisor.
    sra_master = pd.DataFrame(full_df['jx'])
    for key in expt_jx_dict:
        if key not in _SRA_ABBR:
            logging.info('key error: {}'.format(key))
            continue
        sra_master[_SRA_ABBR[key]] = sra_master['jx'].apply(
            lambda jx: expt_jx_dict[key].get(jx, 0) / count_dict[key]
        )

    # Zeros count as missing when deciding which columns are too sparse.
    null_thresh = floor(sra_threshold * len(sra_master))
    sra_master = sra_master.replace(to_replace=0, value=np.nan)
    sra_master = sra_master.dropna(thresh=null_thresh, axis=1)
    sra_master = sra_master.fillna(0)

    xlabelset = [
        'PAAD', 'pancreatic islet primary cell',
        'THYM', 'thymus primary cell', 'thymus tissue',
        'LIHC', 'hepatocyte primary cell', 'epithelial primary cell',
        'SKCM', 'melanocyte cell line', 'melanocyte primary cell',
    ]
    full_merge = cluster_by_tcga(sra_master, full_df, metric, method)
    merge_df = full_merge[xlabelset]
    logging.info('total junction top_x is: {}'.format(len(merge_df)))

    fig_file = os.path.join(
        out_path, 'fig2C_TCGA_cell_of_origin_heatmap_{}.pdf'.format(now)
    )

    # One unit-wide color bar segment per heatmap column.
    heatmap_cols = merge_df.columns.values.tolist()
    colorbar_dict = {
        'ranges': np.array([[i, i + 1] for i in range(len(heatmap_cols))]),
        'colors': [
            'xkcd:{}'.format(_FULL_HEATMAP_COLORS[col])
            for col in heatmap_cols
        ],
        'labels': None,
        'fontcolors': None,
        'height_percent': '2%'
    }
    masked_double_heatmap(
        merge_df, tcga_master_cols, fig_file, colorbar=colorbar_dict,
        other_cbar_label='SRA cell type prevalence',
        size=mm2inch(63.5, 114.3), bottom_pad=-3.5, xlabel_fontsize=5,
        cbar_fraction=0.1, cbar_pad=0.077, cbarticks=[.01, .1, .5],
        cbar_font_adder=3
    )
    logging.info('saving figure at: {}'.format(fig_file))
    return