def sorted_stripplot(prepped_df, out_path, gray, now):
    """Plots waterfall plot/sorted stripplot of per-sample junction counts.

    Input:
    prepped_df: df prepared by function prep_sorted_stripplot (pandas df);
        the columns read here are 'color' (cancer abbreviation), 'x_val'
        and 'neojx_count'
    out_path: path for storing output files (string)
    gray: collection of cancer abbreviations whose x-range gets a gray
        background band; None or an empty collection disables shading
    now: tag inserted into the output file name

    Saves fig1B_ncnjx_count_sorted_stripplot_<now>.pdf in out_path.

    Returns none
    """
    plt.rcParams.update({'figure.autolayout': True})
    plt.rcParams['figure.figsize'] = mm2inch((178, 69.8))
    sns.set_context("paper")
    sns.set_style("whitegrid")

    # A missing `gray` means "shade nothing"; without this guard the
    # membership test below raises TypeError on None.
    shaded = () if gray is None else gray

    cancer_abbrs = prepped_df.color.unique()
    x_labels = cancer_abbrs.tolist()
    color_set = []
    x_locs = []
    gray_ranges = []
    for abbr in cancer_abbrs:
        color_set.append(_CANCER_COLORS[abbr])
        can_xrange = prepped_df[prepped_df.color == abbr]['x_val']
        # Tick label sits at the mean x position of the cancer's samples.
        x_locs.append(can_xrange.mean())
        if abbr in shaded:
            gray_ranges.append((can_xrange.min(), can_xrange.max()))

    ax = sns.scatterplot(x='x_val',
                         y='neojx_count',
                         hue='color',
                         data=prepped_df,
                         legend=False,
                         palette=sns.xkcd_palette(color_set),
                         edgecolor='None',
                         s=4)
    for span_start, span_end in gray_ranges:
        plt.axvspan(span_start, span_end, color='gray', alpha=0.1)

    fig_name = 'fig1B_ncnjx_count_sorted_stripplot_{}.pdf'.format(now)

    plt.xlabel('')
    plt.xticks(x_locs, x_labels)
    plt.setp(ax.get_xticklabels(), rotation=90, fontsize=6)
    plt.yscale('log')
    ax.xaxis.grid(False)
    ax.yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda y, _: '{:,g}'.format(y)))
    plt.ylabel('junctions/sample (#)', fontsize=7)
    plt.setp(ax.yaxis.get_majorticklabels(), fontsize=5, color='black')
    fig = plt.gcf()
    fig_file = os.path.join(out_path, fig_name)
    fig.savefig(fig_file, dpi=300)
    plt.close()
# Code example #2
# 0
def cluster_tcga_jxs(db_jx_df, out_path, now):
    """Cluster junctions by per-cancer prevalence and plot the TCGA heatmap.

    Rows are hierarchically clustered with seaborn's clustermap (columns are
    not clustered), put into dendrogram order, restricted to a fixed column
    order and passed to masked_double_heatmap together with the output path
    fig2A_TCGA_heatmap_<now>.pdf.

    :param db_jx_df: pandas df with a 'jx' column plus one value column per
        cancer named '<cancer>_Sample_Percents'. NOTE: the value columns of
        the passed-in frame are renamed to cancer abbreviations in place.
    :param out_path: directory for the output figure
    :param now: tag inserted into the output file name
    :return: None
    """
    tcga_cols = db_jx_df.drop(['jx'], axis=1).columns.values.tolist()
    renames = {
        name: _ALL_ABBR[name.split('_Sample_Percents')[0]]
        for name in tcga_cols
    }
    db_jx_df.rename(renames, axis=1, inplace=True)

    # Keep junctions with a non-zero value in at least one cancer.  The 'jx'
    # identifier column is excluded from the test: including it would make
    # every row count as non-zero (assuming identifiers are never 0), so the
    # filter could not drop anything.
    has_signal = (db_jx_df.drop('jx', axis=1) != 0).any(axis=1)
    full_df = db_jx_df[has_signal].fillna(0)

    metric = 'seuclidean'
    method = 'ward'
    xlabelset = [
        'ESCA', 'STAD', 'OV', 'BLCA', 'LUSC', 'HNSC', 'CESC', 'THYM',
        'DLBC', 'ACC', 'PCPG', 'PAAD', 'LUAD', 'THCA', 'MESO', 'SARC',
        'UCS', 'UCEC', 'BRCA', 'TGCT', 'PRAD', 'LIHC', 'CHOL', 'KIRC',
        'KIRP', 'KICH', 'LAML', 'GBM', 'LGG', 'READ', 'COAD', 'UVM',
        'SKCM'
    ]

    full_cluster = sns.clustermap(
        full_df.drop('jx', axis=1), method=method, metric=metric,
        standard_scale=1, col_cluster=False
    )

    # reordered_ind holds row *positions*, so select by position.  A
    # label-based reindex picks the wrong rows (or all-NaN rows) whenever
    # the index is not exactly 0..n-1, e.g. after the filter above.
    row_order = full_cluster.dendrogram_row.reordered_ind
    tcga_cluster = full_df.iloc[row_order].reset_index(drop=True)

    merge_df = tcga_cluster.drop('jx', axis=1)[xlabelset]

    logging.info('total junction top_x is: {}'.format(len(merge_df)))

    fig_name = ('fig2A_TCGA_heatmap_{}.pdf'.format(now))
    fig_file = os.path.join(out_path, fig_name)

    # One color-bar segment [i, i + 1] per heatmap column.
    df_cols = merge_df.columns.values.tolist()
    label_ranges = np.array([[i, i + 1] for i in range(len(df_cols))])
    colors = [
        'xkcd:{}'.format(_CANCER_COLORS[x]) for x in df_cols
    ]

    colorbar_dict = {
        'ranges': label_ranges, 'colors': colors, 'labels': None,
        'fontcolors': None, 'height_percent': '2.0%'
    }

    fig_size = mm2inch(89, 84)
    masked_double_heatmap(
        merge_df, merge_df.columns.values.tolist(), fig_file, size=fig_size,
        colorbar=colorbar_dict, cbarticks=[0.001, .01, .1, .8],
        cbar_font_adder=2, xlabel_fontsize=4, bottom_pad=-3.5
    )
    logging.info('saving figure at {}'.format(fig_file))
def cluster_subtype_jxs(db_jx_df, out_path, now):
    """Cluster junctions and plot the TCGA cancer/subtype heatmap.

    Rows are hierarchically clustered with seaborn's clustermap (columns are
    not clustered), put into dendrogram order, restricted to a fixed column
    order (each parent cancer followed by its subtypes) and passed to
    masked_double_heatmap together with the output path
    fig2B_TCGA_subtype_heatmap_<now>.pdf.

    :param db_jx_df: pandas df with a 'jx' column plus one value column per
        cancer/subtype named '<name>_Sample_Percents'. NOTE: the value
        columns of the passed-in frame are renamed to abbreviations in place.
    :param out_path: directory for the output figure
    :param now: tag inserted into the output file name
    :return: None
    """
    tcga_master_cols = ['CESC', 'PCPG', 'SARC', 'LGG', 'ESCA']
    tcga_cols = db_jx_df.drop(['jx'], axis=1).columns.values.tolist()
    renames = {
        name: _ALL_ABBR[name.split('_Sample_Percents')[0]]
        for name in tcga_cols
    }
    db_jx_df.rename(renames, axis=1, inplace=True)

    # Keep junctions with a non-zero value in at least one column.  The 'jx'
    # identifier column is excluded from the test: including it would make
    # every row count as non-zero (assuming identifiers are never 0), so the
    # filter could not drop anything.
    has_signal = (db_jx_df.drop('jx', axis=1) != 0).any(axis=1)
    full_df = db_jx_df[has_signal].fillna(0)

    metric = 'cityblock'
    method = 'ward'
    xlabelset = [
        'CESC', 'CSC', 'ECAD', 'CASC', 'PCPG', 'PCHC', 'PGG', 'SARC', 'LMS',
        'UPLS', 'DT', 'SYNS', 'MFS', 'MPNT', 'LGG', 'AC', 'OAC', 'ODG', 'ESCA',
        'ESSC', 'ESAD'
    ]
    full_cluster = sns.clustermap(full_df.drop('jx', axis=1),
                                  method=method,
                                  metric=metric,
                                  standard_scale=1,
                                  col_cluster=False)

    # reordered_ind holds row *positions*, so select by position.  A
    # label-based reindex picks the wrong rows (or all-NaN rows) whenever
    # the index is not exactly 0..n-1, e.g. after the filter above.
    row_order = full_cluster.dendrogram_row.reordered_ind
    tcga_cluster = full_df.iloc[row_order].reset_index(drop=True)
    merge_df = tcga_cluster.drop('jx', axis=1)[xlabelset]
    logging.info('total junction top_x is: {}'.format(len(merge_df)))

    fig_name = ('fig2B_TCGA_subtype_heatmap_{}.pdf'.format(now))
    fig_file = os.path.join(out_path, fig_name)

    # Column spans of the five cancer groups in xlabelset; vline_pos marks
    # the interior boundaries between consecutive groups.
    label_ranges = np.array([[0, 4], [4, 7], [7, 14], [14, 18], [18, 21]])
    vline_pos = [4, 7, 14, 18]
    texts = ['CESC', 'PCPG', 'SARC', 'LGG', 'ESCA']
    colorbar_dict = {
        'ranges': label_ranges,
        'colors': ['xkcd:{}'.format(_CANCER_COLORS[x]) for x in texts],
        'labels': texts,
        'fontcolors': {text: _FONT_COLORS[text] for text in texts}
    }
    fig_size = mm2inch(89, 80)
    masked_double_heatmap(merge_df,
                          tcga_master_cols,
                          fig_file,
                          masked_cmap=cm.Greys,
                          other_cmap=cm.Blues,
                          colorbar=colorbar_dict,
                          size=fig_size,
                          vline_pos=vline_pos,
                          cbarticks=[.01, .1, 1],
                          other_cbar_label='cancer subtype prevalence',
                          cbar_font_adder=2,
                          xlabel_fontsize=5,
                          bottom_pad=-3.5,
                          cbar_fraction=0.1,
                          cbar_pad=0.01)
    logging.info('saving figure at: {}'.format(fig_file))
# Code example #4
# 0
def barplots_with_table(data_dict, plot_dict, out_path, now, figsize):
    """Draw three grouped, log-scale bar series with a colored label table.

    :param data_dict: dict with keys 'paired', 'gtex' and 'other' (bar
        heights, one value per group) and 'abbr' (one table label per group)
    :param plot_dict: dict whose 'light colors' entry supplies the colors of
        the three bar series, in the order paired, gtex, other
    :param out_path: directory for the output figure
    :param now: tag inserted into the output file name
    :param figsize: figure size forwarded to mm2inch (presumably millimetres)
    :return: None; saves fig1A_grouped_barplots_<now>.pdf in out_path
    """
    plt.rcParams.update({'figure.autolayout': True})
    plt.rcParams['figure.figsize'] = mm2inch(figsize)

    sns.set_context("paper")
    sns.set_style("whitegrid")
    sns.set(style="ticks")

    bot_data = data_dict['paired']
    mid_data = data_dict['gtex']
    top_data = data_dict['other']
    bot_color, mid_color, top_color = plot_dict['light colors'][:3]
    f, (ax, ax2) = plt.subplots(nrows=2,
                                ncols=1,
                                gridspec_kw={'height_ratios': [10000, 1]})
    plt.sca(ax)

    num_groups = len(data_dict['paired'])

    # Each group occupies three x units; its bars are spaced 0.85 apart.
    ind_l = [x - 0.15 for x in range(1, (3 * num_groups) + 1, 3)]
    ind_m = [x + 0.85 for x in ind_l]
    ind_r = [x + 0.85 for x in ind_m]

    barwidths = 0.75
    plt.bar(ind_l, bot_data, barwidths, color=bot_color, linewidth=0)
    plt.bar(ind_m, mid_data, barwidths, color=mid_color, linewidth=0)
    plt.bar(ind_r, top_data, barwidths, color=top_color, linewidth=0)

    ax.set_xticklabels([])
    ax.set_xticks([])
    ax.xaxis.grid(False)
    ax.yaxis.grid(True)

    edge_buffer = .7
    ax.set_xlim(left=ind_l[0] - edge_buffer, right=ind_r[-1] + edge_buffer)
    plt.yscale('log')
    plt.ylabel('relative abundance (%)', fontsize=7)
    ax.yaxis.set_major_formatter(
        ticker.FuncFormatter(lambda y, _: '{:g}'.format(y)))
    plt.setp(ax.yaxis.get_majorticklabels(), fontsize=5, color='black')
    columns = data_dict['abbr']

    # Add Table
    rows = ['']
    whitefont_cols = []
    col_cols = []
    for i, abbr in enumerate(columns):
        # Reset for every label: otherwise a label without any font-color
        # match would reuse the previous column's value (or hit an unbound
        # name on the first column).
        font_color = 'black'
        cell_color = None
        try:
            font_color = _FONT_COLORS[abbr]
            cell_color = 'xkcd:{}'.format(_CANCER_COLORS[abbr])
        except KeyError:
            # No exact entry: fall back to a known abbreviation contained
            # in the label (last match for the font, first for the fill).
            for can_abbr in _FONT_COLORS:
                if can_abbr in abbr:
                    font_color = _FONT_COLORS[can_abbr]
            for can_abbr in _CANCER_COLORS:
                if can_abbr in abbr:
                    cell_color = 'xkcd:{}'.format(_CANCER_COLORS[can_abbr])
                    break
        if cell_color is None:
            # Keep col_cols aligned with columns; a shorter list would make
            # the table call below fail on the length mismatch.
            logging.warning(
                'no color found for label {}; using white'.format(abbr))
            cell_color = 'white'
        col_cols.append(cell_color)
        if font_color == 'white':
            whitefont_cols.append(i)

    table_vals = [['' for _ in columns]]
    row_label_cols = ['white']
    the_table = ax.table(cellText=table_vals,
                         rowLabels=rows,
                         colLabels=columns,
                         loc='bottom',
                         cellLoc='center',
                         colColours=col_cols,
                         rowColours=row_label_cols,
                         bbox=[0, -0.09, 1, .075])
    the_table.auto_set_font_size(False)
    the_table.set_fontsize(5)

    # Only the header row (row 0) stays visible; the data row and the
    # row-label column (col -1) are collapsed to zero size.
    for (row, col), cell in the_table.get_celld().items():
        if row == 0:
            cell.set_text_props(
                fontproperties=FontProperties(weight='bold', size=3.75))
            if len(columns[0]) > 5:
                cell.set_height(cell.get_height() * 1.5)
            if col in whitefont_cols:
                cell.get_text().set_color('white')
        else:
            cell.set_height(0)
        if col == -1:
            cell.set_width(0)
            cell.set_height(0)

    # The second axes is an invisible spacer below the table.
    ax2.yaxis.grid(False)
    for side in ('left', 'bottom', 'right', 'top'):
        ax2.spines[side].set_visible(False)
    ax2.patch.set_alpha(0)

    plt.sca(ax2)
    plt.plot([])
    plt.xticks([], [])
    plt.yticks([], [])

    fig = plt.gcf()
    fig_name = 'fig1A_grouped_barplots_{}.pdf'.format(now)
    fig_file = os.path.join(out_path, fig_name)
    logging.info('saving figure at {}'.format(fig_file))
    fig.savefig(fig_file, dpi=300)
    plt.close()
# Code example #5
# 0
def antisense_boxplot(jx_dir, out_path, now):
    """Tabulate and plot antisense junction percentages per junction category.

    For every cancer in _TCGA_ABBR that has a junction file, junctions are
    split into five categories (core normals, other adult non-cancer,
    developmental, stem cell, unexplained) and the percentage of antisense
    junctions is computed per category.  The per-cancer counts and
    percentages are written to antisense_table_S5.csv in out_path, and the
    percentages are passed to grouped_boxplots_with_table together with the
    output path fig3B_antisense_ratios_boxplot_<now>.pdf.

    :param jx_dir: directory searched for per-cancer junction files
    :param out_path: directory for the CSV table and the figure
    :param now: tag inserted into the figure file name
    :return: None
    """
    main_header = 'median junction count\nacross cancer types'
    # Single source of truth for category order: the plotted data, the
    # table entries and the legend / row labels below all follow it.
    categories = ['gtex', 'adult', 'dev', 'sc', 'un']
    counts = {cat: [] for cat in categories}
    ratios = {cat: [] for cat in categories}
    # Cancers that actually contributed a row; cancers without a junction
    # file are skipped and must not appear in the output table.
    processed_cancers = []

    for cancer in list(_TCGA_ABBR.keys()):
        logging.info('starting cancer {}'.format(cancer))
        jx_file, _, prev_glob = get_jx_prev_filename(jx_dir, cancer)
        if not jx_file:
            continue

        jx_df = jx_df_from_file(
            jx_file, 0, 1, True, glob_form=prev_glob, sample=False,
            top_x=False, drop_ann=False, cancer=cancer
        )
        gtex = jx_df[(jx_df.gtex == 1)]
        adult = jx_df[(jx_df.gtex == 0) & (jx_df.sra_adult == 1)]
        un = jx_df[
            (jx_df.gtex == 0) &
            (jx_df.sra_stemcells == 0) &
            (jx_df.sra_adult == 0) &
            (jx_df.sra_developmental == 0)
        ]

        # Developmental / stem-cell junctions are those not already
        # assigned to one of the three categories above.
        unassigned = ~(
            jx_df.jx.isin(un.jx) |
            jx_df.jx.isin(adult.jx) |
            jx_df.jx.isin(gtex.jx)
        )
        dev = jx_df[unassigned & (jx_df.sra_developmental == 1)]
        sc = jx_df[unassigned & (jx_df.sra_stemcells == 1)]

        subsets = {'gtex': gtex, 'adult': adult, 'dev': dev, 'sc': sc,
                   'un': un}
        # NOTE(review): an empty category raises ZeroDivisionError here.
        cancer_ratios = {
            cat: 100 * len(sub[sub.antisense == 1]) / len(sub)
            for cat, sub in subsets.items()
        }

        processed_cancers.append(cancer)
        for cat in categories:
            counts[cat].append(len(subsets[cat]))
            ratios[cat].append(cancer_ratios[cat])

    data_dict = {main_header: {'data': [ratios[cat] for cat in categories]}}

    print_df = pd.DataFrame({
        # Was the full cancer list, whose length differs from the other
        # columns as soon as one cancer is skipped.
        'Cancer type': processed_cancers,
        'Core normals jx top_x': counts['gtex'],
        'Core normals anti %': ratios['gtex'],
        'Other adult non-cancer jx top_x': counts['adult'],
        'Other adult non-cancer anti %': ratios['adult'],
        'Developmental jx top_x': counts['dev'],
        'Developmental anti %': ratios['dev'],
        'Stem cell jc': counts['sc'],
        'Stem cell ar': ratios['sc'],
        'Unexplained jc': counts['un'],
        'Unexplained ar': ratios['un']
    })
    supp_table = os.path.join(out_path, 'antisense_table_S5.csv')
    with open(supp_table, 'w') as output:
        print_df.to_csv(output, index=False)

    # Assemble boxplot
    plt.rcParams.update({'figure.autolayout': True})
    sns.set_context("paper")
    sns.set_style("whitegrid")

    counts_df = pd.DataFrame(counts)

    # Built in `categories` order so each entry lines up with its data
    # series and row label (previously 'sc' and 'dev' were swapped here).
    table_data = []
    for cat in categories:
        cat_counts = counts_df[cat]
        table_data.append(
            '{:,}\n(IQR: {:,}-{:,})'.format(
                int(round(cat_counts.median())),
                int(cat_counts.quantile(0.25)),
                int(cat_counts.quantile(0.75))
            )
        )

    data_dict[main_header]['table_data'] = table_data

    plot_info_dict = {}
    plot_info_dict['light colors'] = [
        'xkcd:kermit green',
        'xkcd:light teal', 'xkcd:pale purple', 'xkcd:apricot',
        'xkcd:light red'
    ]

    plot_info_dict['dark colors'] = plot_info_dict['light colors']
    plot_info_dict['legend'] = [
        'Core normals',
        'Other adult non-cancer',
        'Developmental',
        'Stem cell',
        'Unexplained'
    ]

    plot_info_dict['row colors'] = plot_info_dict['light colors']
    plot_info_dict['row font color'] = [
        'black', 'black', 'black', 'black', 'black'
    ]
    plot_info_dict['row labels'] = plot_info_dict['legend']
    fig_name = 'fig3B_antisense_ratios_boxplot_{}.pdf'.format(now)
    fig_file = os.path.join(out_path, fig_name)
    logging.info('saving figure at {}'.format(fig_file))
    fig_size = mm2inch(60, 70)
    grouped_boxplots_with_table(
        data_dict, plot_info_dict, fig_file, logscale=False,
        y_label='antisense junctions (%)', percent=False, right_lim_shift=3,
        fig_size=fig_size, intab_fontsize=7, tabrow_fontsize=7,
        tabcol_fontsize=7, expand_rows=1.4
    )
# Code example #6
# 0
def cluster_jxs_together(db_jx_df, expt_jx_dict, out_path, count_dict,
                         sra_threshold, now):
    """Plot TCGA junction prevalence next to matched SRA cell-type prevalence.

    Junctions shared between the TCGA table and the SRA experiments are
    collected, per-experiment prevalence is computed, sparse SRA columns are
    dropped, rows are ordered by cluster_by_tcga, and a fixed column
    selection is passed to masked_double_heatmap together with the output
    path fig2C_TCGA_cell_of_origin_heatmap_<now>.pdf.

    :param db_jx_df: pandas df with a 'jx' column plus one value column per
        cancer named '<cancer>_Sample_Percents'. NOTE: the value columns of
        the passed-in frame are renamed to cancer abbreviations in place.
    :param expt_jx_dict: dict mapping an SRA experiment key to a dict of
        junction -> count
    :param out_path: directory for the output figure
    :param count_dict: dict mapping an SRA experiment key to the divisor
        applied to that experiment's junction counts
    :param sra_threshold: fraction of rows that must be non-zero for an SRA
        column to be kept
    :param now: tag inserted into the output file name
    :return: None
    """
    metric = 'seuclidean'
    method = 'ward'

    tcga_cols = db_jx_df.drop(['jx'], axis=1).columns.values.tolist()
    renames = {
        name: _ALL_ABBR[name.split('_Sample_Percents')[0]]
        for name in tcga_cols
    }
    db_jx_df.rename(renames, axis=1, inplace=True)

    # Keep junctions with a non-zero value in at least one cancer.  The 'jx'
    # identifier column is excluded from the test: including it would make
    # every row count as non-zero (assuming identifiers are never 0), so the
    # filter could not drop anything.
    has_signal = (db_jx_df.drop('jx', axis=1) != 0).any(axis=1)
    db_jx_df = db_jx_df[has_signal]

    all_sra_jxs = set()
    for key, expt_jxs in expt_jx_dict.items():
        jxs = collect_mutual_jxs(db_jx_df, expt_jxs, key, ret_type='jxs')
        all_sra_jxs = all_sra_jxs.union(jxs)

    full_df = db_jx_df.fillna(0)
    full_df = full_df[full_df['jx'].isin(all_sra_jxs)]
    full_df = full_df.reset_index(drop=True)
    tcga_master_cols = full_df.drop('jx', axis=1).columns.values.tolist()

    sra_master = pd.DataFrame(full_df['jx'])

    for key in expt_jx_dict:
        try:
            name = _SRA_ABBR[key]
        except KeyError:
            # Experiments without an abbreviation are left out of the plot.
            logging.info('key error: {}'.format(key))
            continue
        sra_master[name] = sra_master.jx.apply(
            lambda x: expt_jx_dict[key].get(x, 0) / count_dict[key])

    # Zeros are turned into NaN so dropna can drop columns that have fewer
    # than null_thresh non-zero entries; the remaining gaps go back to 0.
    null_thresh = floor(sra_threshold * len(sra_master))
    sra_master = sra_master.replace(to_replace=0, value=np.nan)
    sra_master = sra_master.dropna(thresh=null_thresh, axis=1).fillna(0)

    xlabelset = [
        'PAAD',
        'pancreatic islet primary cell',
        'THYM',
        'thymus primary cell',
        'thymus tissue',
        'LIHC',
        'hepatocyte primary cell',
        'epithelial primary cell',
        'SKCM',
        'melanocyte cell line',
        'melanocyte primary cell',
    ]
    full_merge = cluster_by_tcga(sra_master, full_df, metric, method)
    merge_df = full_merge[xlabelset]

    logging.info('total junction top_x is: {}'.format(len(merge_df)))
    fig_name = ('fig2C_TCGA_cell_of_origin_heatmap_{}.pdf'.format(now))
    fig_file = os.path.join(out_path, fig_name)

    # One color-bar segment [i, i + 1] per heatmap column.
    df_cols = merge_df.columns.values.tolist()
    label_ranges = np.array([[i, i + 1] for i in range(len(df_cols))])
    colors = ['xkcd:{}'.format(_FULL_HEATMAP_COLORS[x]) for x in df_cols]
    colorbar_dict = {
        'ranges': label_ranges,
        'colors': colors,
        'labels': None,
        'fontcolors': None,
        'height_percent': '2%'
    }
    fig_size = mm2inch(63.5, 114.3)
    masked_double_heatmap(merge_df,
                          tcga_master_cols,
                          fig_file,
                          colorbar=colorbar_dict,
                          other_cbar_label='SRA cell type prevalence',
                          size=fig_size,
                          bottom_pad=-3.5,
                          xlabel_fontsize=5,
                          cbar_fraction=0.1,
                          cbar_pad=0.077,
                          cbarticks=[.01, .1, .5],
                          cbar_font_adder=3)
    logging.info('saving figure at: {}'.format(fig_file))