Example #1
0
def gene_to_ens(genes):
    """
    Look up Ensembl gene IDs for one or more gene symbols, then apply the module's manual overrides.
    Symbols the reference lookup could not resolve are filled from MANUALLY_CURATED where an entry exists;
    any symbol listed in UNRELIABLE has its result blanked, whatever the lookup returned.
    :param genes: A single gene symbol (str) or an iterable of gene symbols
    :return: The object returned by reference_genomes.gene_symbol_to_ensembl (a pd.Series indexed by gene symbol),
    modified in place with the overrides described above
    """
    symbols = [genes] if isinstance(genes, str) else genes
    ens_ids = reference_genomes.gene_symbol_to_ensembl(symbols)

    # The set of unresolved symbols is fixed before any value is overwritten
    unresolved = ens_ids.index[ens_ids.isna()]
    for symbol in unresolved:
        if symbol in MANUALLY_CURATED:
            ens_ids.loc[symbol] = MANUALLY_CURATED[symbol]

    # `in` on a Series tests index membership, i.e. whether the symbol was requested
    for symbol in UNRELIABLE:
        if symbol in ens_ids:
            ens_ids.loc[symbol] = None

    return ens_ids
def tabulate_de_counts_by_direction(de_res, pids=consts.PIDS, **gene_lists):
    """
    Given one or more named gene lists (e.g. genes associated with a hypo and a hyper group), tabulate the number of
    matching genes in the supplied DE results for each patient. Counts are split by the sign of the DE logFC.
    :param de_res: Mapping keyed by patient ID. Each value must support `.index.intersection(...)` and selection of
    a 'logFC' column (presumably a pd.DataFrame of DE results indexed by Ensembl gene ID - confirm against caller).
    :param pids: Patient IDs to tabulate; used as the row index of both returned tables.
    :param gene_lists: Keyword arguments of the form name=gene_symbols. Each name contributes a '<name> up' and a
    '<name> down' column. Symbols are converted with reference_genomes.gene_symbol_to_ensembl and unresolved
    symbols are dropped. May be empty, in which case only the 'Total up' / 'Total down' counts are produced.
    :return: Raw counts table (gene list columns plus 'Total up' and 'Total down'), Table expressing the gene list
    counts as a % of the total in that direction
    """
    directions = ('up', 'down')

    # Built with a comprehension rather than reduce(): reduce() with no initial value raises on an empty
    # sequence, so calling this function without any gene lists used to fail.
    cols = [
        "%s %s" % (name, direction)
        for name in gene_lists
        for direction in directions
    ]

    # The symbol -> Ensembl conversion does not depend on the patient, so do it once per gene list.
    ens_by_list = {
        name: reference_genomes.gene_symbol_to_ensembl(symbols).dropna().values
        for name, symbols in gene_lists.items()
    }

    # table of DE counts (in DMR-linked context)
    de_count_table = pd.DataFrame(
        0,
        index=pids,
        columns=cols + ['Total up', 'Total down']
    )

    for pid in pids:
        this_res = de_res[pid]
        this_logfc = this_res['logFC']
        de_count_table.loc[pid, 'Total up'] = (this_logfc > 0).sum()
        de_count_table.loc[pid, 'Total down'] = (this_logfc < 0).sum()
        for name, ens_ids in ens_by_list.items():
            # restrict to genes present in this patient's DE results
            ix = this_res.index.intersection(ens_ids)
            matched_logfc = this_res.loc[ix, 'logFC']
            de_count_table.loc[pid, '%s up' % name] = (matched_logfc > 0).sum()
            de_count_table.loc[pid, '%s down' % name] = (matched_logfc < 0).sum()

    # express this as a pct of the total up/down
    de_count_table_pct = pd.DataFrame(
        index=pids,
        columns=cols
    )
    for name in gene_lists:
        for direction in directions:
            col = "%s %s" % (name, direction)
            total = de_count_table.loc[pids, 'Total %s' % direction].astype(float)
            de_count_table_pct.loc[pids, col] = de_count_table.loc[pids, col] / total * 100.

    return de_count_table, de_count_table_pct
Example #3
0
    median_count = dat_n.median(axis=1).sort_values()
    keep_idx = median_count.loc[median_count != 0].index

    dat = dat.loc[keep_idx]
    dat_n = dat_n.loc[keep_idx]
    median_count = median_count.loc[keep_idx]

    # remove any genes that are (mostly) absent in NSC
    nsc_missing = dat.loc[:, ref] < 10.
    dat = dat.loc[~nsc_missing]
    dat_n = dat_n.loc[~nsc_missing]
    median_count = median_count.loc[~nsc_missing]

    # levels of HKG across GBM and GIBCO
    hkg = ['ATP5B', 'GAPDH', 'ACTB']
    hkg_ens = gene_symbol_to_ensembl(hkg)

    label_symbols = hkg + ['BMI1']
    label_ens = gene_symbol_to_ensembl(label_symbols)

    hkg_dat = dat_n.loc[hkg_ens, sorted(dat_n.columns)]
    hkg_dat.index = pd.Index(hkg, name='')
    hkg_dat_rel = hkg_dat.divide(hkg_dat.loc[:, ref], axis=0)
    ax = hkg_dat_rel.transpose().plot.bar()
    ax.set_ylim([0, 3.4])
    plt.tight_layout()
    ax.figure.savefig(os.path.join(outdir, 'housekeeping_levels.png'), dpi=200)

    # identifying stable HKG
    ranked_count = pd.Series(rankdata(median_count, method='ordinal'),
                             index=median_count.index)
    ),
                     axis=0)

    data = data.loc[data.index.str.contains('ENSG')]

    # remove rRNA
    data = data.loc[~data.index.isin(rrna_ensg)]

    # remove MT RNA
    data = data.loc[~data.index.isin(mt_ensg)]

    # normalise by read counts: per million
    data_n = data.divide(data.sum(axis=0), axis=1)

    # extract genes of interest
    genes = reference_genomes.gene_symbol_to_ensembl(['SLC1A3', 'PDGFRA'])

    this_fpkm = data_n.loc[genes] * 1e6
    this_fpkm.index = genes.index

    # normalise by gene length: per million per kilobase
    for g in genes.index:
        this_fpkm.loc[g] = this_fpkm.loc[g] / gene_lengths[g] * 1e3

    ax = this_fpkm.transpose().plot.bar()
    ax.set_ylabel('FPKM')
    ax.figure.tight_layout()
    ax.figure.savefig(os.path.join(OUTDIR, 'all_markers_fpkm.png'), dpi=200)

    # now just our own samples and the reference
    this_fpkm_lim = this_fpkm.loc[:, this_fpkm.columns.str.contains('NSC')]
Example #5
0
    # for plotting
    groups = [(pid, dat_s1.columns[dat_s1.columns.str.contains(pid)])
              for pid in pids]

    for name, p_arr in poi.items():
        this_ipa = dict([(pid, ipa_de_res[pid].reindex(list(p_arr)))
                         for pid in pids])

        # get all genes involved
        union_genes = set()
        for v in this_ipa.values():
            for arr in v.dropna().genes.str.split(',').values:
                union_genes.update(arr)

        union_genes_ens = reference_genomes.gene_symbol_to_ensembl(union_genes)
        union_genes_comp = union_genes_ens.reset_index().set_index(
            'Ensembl Gene ID').squeeze()

        # plot TPM
        this_cpm = cpm.loc[union_genes_ens]

        # sort by mean logFC across patients
        this_de = de_res_wide.loc[union_genes_ens]
        this_logfc = this_de.loc[:, this_de.columns.str.contains('logFC')]
        this_logfc.columns = pids
        sort_ix = (this_logfc.sum(axis=1) /
                   float(len(pids))).sort_values(ascending=False).index
        this_logfc = this_logfc.loc[sort_ix]
        this_cpm = this_cpm.loc[sort_ix]
                                   method='single')
# Draw two dendrograms of the same data / column colours with the correlation metric:
# first with method='average', then with method='single'.
clustering.dendrogram_with_colours(data,
                                   col_colours,
                                   legend_labels=legend_labels,
                                   metric='correlation',
                                   method='average')
clustering.dendrogram_with_colours(data,
                                   col_colours,
                                   legend_labels=legend_labels,
                                   metric='correlation',
                                   method='single')

# concatenate meta and data for plotting
# variables in columns

# Look up the Ensembl identifiers of the marker genes; these are used below as the
# y variable names when plotting from the combined table.
ens_pdgfra = reference_genomes.gene_symbol_to_ensembl('PDGFRA')
ens_nf1 = reference_genomes.gene_symbol_to_ensembl('NF1')
ens_cdkn2a = reference_genomes.gene_symbol_to_ensembl('CDKN2A')

# Transpose datan so that its row labels become columns, then join meta alongside it column-wise
# (presumably rows are then samples - confirm against how datan / meta are built).
aa = pd.concat((datan.transpose(), meta), axis=1)

# PDGFRA values grouped by the 'subgroup' column; saved as both PNG and PDF
ax = sns.boxplot(y=ens_pdgfra, x='subgroup', data=aa)
ax.set_xlabel('Methylation subgroup')
ax.set_ylabel('PDGFRa (proportion of reads)')
plt.setp(ax.xaxis.get_ticklabels(), rotation=90)
plt.tight_layout()
ax.figure.savefig(os.path.join(outdir, "pdgfra_boxplot.png"), dpi=200)
ax.figure.savefig(os.path.join(outdir, "pdgfra_boxplot.pdf"))

# Same plot for NF1
ax = sns.boxplot(y=ens_nf1, x='subgroup', data=aa)
ax.set_xlabel('Methylation subgroup')
        obj61794.meta,
        objwtchg_all.meta,
        objpollard.meta
    ), axis=0)

    data = data.loc[data.index.str.contains('ENSG')]

    # compare with qPCR: comparing markers in the NSC samples and paired astrocytes
    astro_markers1 = [
        'S100B',
        'CD44',
        'ALDH1L1',
        'NFIA',
    ]
    data_markers = data.loc[
        reference_genomes.gene_symbol_to_ensembl(astro_markers1),
        data.columns.str.contains('DURA')
    ]
    data_markers.index = astro_markers1
    series = [data_markers.iloc[i] for i in range(data_markers.shape[0])]
    colours = [
        '#0000FF',
        '#FF0000',
        '#00C000',
        '#AD07E3',
    ]

    # plot absolute counts

    fig = plt.figure(figsize=(8, 7.5))
    ax = fig.add_subplot(111)
        # 'encode_roadmap/ENCSR291IZK': {'batch': 'ENCODE Ecker', 'strandedness': 'r'},
        # 'encode_roadmap/ENCSR572EET': {'batch': 'ENCODE Ecker', 'strandedness': 'r'},
        # 'encode_roadmap/ENCSR977XUX': {'batch': 'ENCODE Ecker', 'strandedness': 'r'},
    }

    to_aggr_nsc = [
        (r'H9_NSC_[12]', 'H9 NSC'),
        # (r'Pollard NSC [12]', 'Fetal NSC'),
    ]

    # Ruiz 9 gene signature - should distinguish ESC and iPSC
    gene_sign = [
        'PTPRT', 'TMEM132C', 'TMEM132D', 'TCERG1L', 'DPP6', 'FAM19A5',
        'RBFOX1', 'CSMD1', 'C22orf34'
    ]
    gene_sign_ens = reference_genomes.gene_symbol_to_ensembl(gene_sign)

    load_kwds = {'source': source, 'alignment_subdir': SetMe}
    if source == 'salmon':
        units = 'tpm'
        load_kwds['units'] = 'tpm'
    if source == 'star':
        # set strandedness as a cue to import for each
        load_kwds['strandedness'] = SetMe

    # restrict samples manually to avoid changes going forwards
    our_samples = consts.S1_RNASEQ_SAMPLES_INSC + consts.S1_RNASEQ_SAMPLES_IPSC + consts.S1_RNASEQ_SAMPLES_FB + [
        'GIBCO_NSC_P4'
    ]

    # our data (everything)
from utils import output
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from rnaseq import loader
from scripts.hgic_final import consts
from hgic_consts import NH_ID_TO_PATIENT_ID_MAP
from utils import reference_genomes

# Script entry point: set up output, choose genes / patients of interest and load the TCGA GBM FPKM data.
# NOTE(review): `Decimal`, `os` and `DATA_DIR` are used below but are not among the imports visible
# above - confirm they are imported elsewhere in the file.
if __name__ == '__main__':
    outdir = output.unique_output_dir()
    # the same 0.1 offset, held once as a float and once as a Decimal
    eps = 0.1
    seps = Decimal('0.1')
    # genes of interest, as symbols and as Ensembl IDs
    gois = ['PTGER4']
    ens_gois = reference_genomes.gene_symbol_to_ensembl(gois)

    # pids = consts.PIDS
    pids = ['018']

    # TCGA GBM primary tumour inputs: per-sample metadata and the FPKM matrix
    meta_fn = os.path.join(DATA_DIR, 'rnaseq', 'tcga_gbm',
                           'primary_tumour/htseq-count_fpkm/sources.csv')
    dat_fn = os.path.join(DATA_DIR, 'rnaseq', 'tcga_gbm',
                          'primary_tumour/htseq-count_fpkm/fpkm.csv')
    tcga_meta = pd.read_csv(meta_fn, header=0, index_col=0)
    tcga_dat = pd.read_csv(dat_fn, header=0, index_col=0)

    # filter: keep only samples whose idh1_status is 'WT' (originally annotated as "primary GBM only"),
    # then keep the data columns matching the retained metadata rows
    ix = (tcga_meta.idh1_status == 'WT')
    tcga_meta = tcga_meta[ix]
    tcga_dat = tcga_dat[tcga_meta.index]
Example #10
0
    tax_id = 9606

    if isinstance(cell_types, str):
        cell_types = [cell_types]

    if unit == 'tpm':
        dat = rnaseq_data.load_salmon_by_patient_id(pids,
                                                    include_control=False,
                                                    type=type)
        dat = general.ensembl_transcript_quant_to_gene(dat, tax_id=tax_id)
    elif unit == 'cpm':
        obj = rnaseq_data.load_by_patient(pids,
                                          type=type,
                                          source='star',
                                          annotate_by='Ensembl Gene ID',
                                          include_control=False)
        dat = obj.data
        dat = dat.divide(dat.sum(axis=0), axis=1) * 1e6
    else:
        raise NotImplementedError()

    if cell_types is not None:
        idx = reduce(
            lambda x, y: x | y,
            [dat.columns.str.contains(t) for t in cell_types],
        )
        dat = dat.loc[:, idx]

    lookup = reference_genomes.gene_symbol_to_ensembl(gois, tax_id=tax_id)

    vals = dat.loc[lookup]
from plotting import common
from rnaseq import loader
from scripts.hgic_final import consts, two_strategies_grouped_dispersion as tsgd
from settings import HGIC_LOCAL_DIR, INTERMEDIATE_DIR
from stats import nht
from utils import output, setops, reference_genomes

if __name__ == "__main__":
    outdir = output.unique_output_dir()
    pids = consts.PIDS
    DE_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'de')

    eps = .1  # offset for log transform

    target_gene = 'CD274'
    target_ens = reference_genomes.gene_symbol_to_ensembl(target_gene)

    # load Salmon data

    obj_cc = loader.load_by_patient(pids, source='salmon')
    ix = obj_cc.meta.index.isin(consts.S1_RNASEQ_SAMPLES)
    obj_cc.filter_samples(ix)

    # add PID to cell culture metadata
    obj_cc.meta.insert(
        0, 'pid',
        obj_cc.meta.index.str.replace(r'(GBM|DURA)(?P<pid>[0-9]{3}).*',
                                      '\g<pid>'))

    obj_ff = loader.load_by_patient(pids, source='salmon', type='ffpe')
    obj_ff.filter_by_sample_name(consts.FFPE_RNASEQ_SAMPLES)
    # is there much overlap in the gene sets between the two groups?
    fig = plt.figure(figsize=(5, 3))
    ax = fig.add_subplot(111)
    venn.venn_diagram(genes_from_dmr_groups['Hyper'],
                      genes_from_dmr_groups['Hypo'],
                      set_labels=['Hyper', 'Hypo'],
                      ax=ax)
    fig.tight_layout()
    fig.savefig(os.path.join(outdir, "genes_from_dmr_groups_venn.png"),
                dpi=200)

    # no, but if we look at the intersection genes, are they in different directions (DE) between the two groups?
    groups_inv = dictionary.complement_dictionary_of_iterables(groups,
                                                               squeeze=True)
    in_both = setops.reduce_intersection(*genes_from_dmr_groups.values())
    in_both_ens = reference_genomes.gene_symbol_to_ensembl(in_both)

    # some of these will have no DE results
    tmp = {}
    for pid in pids:
        tmp[pid] = de_res_s1[pid].reindex(in_both_ens).logFC.dropna()
    in_both_ens = in_both_ens[in_both_ens.isin(pd.DataFrame(tmp).index)]

    fig = plt.figure(figsize=(10.5, 4.))
    ax = fig.add_subplot(111)
    for pid in pids:
        this_grp = groups_inv[pid].lower()
        this_de = de_res_s1[pid].reindex(in_both_ens).logFC
        x = np.arange(
            len(in_both_ens)) + 0.1 * (-1 if this_grp == 'hypo' else 1)
        ax.scatter(x,
Example #13
0
def scatter_plot(pid,
                 de_res_s1,
                 de_res_s1_full,
                 de_res_s2,
                 ipa_df,
                 pw_colours,
                 ms=30,
                 s1_colour='darkblue',
                 s2_colour='lightskyblue',
                 external_ref_labels=('GIBCO', 'H9')):
    """
    Scatter plot of logFC against -log10(FDR) for one patient, with three overlays: patient-specific genes,
    syngeneic-only genes and the genes belonging to each requested pathway.
    NOTE: this function reads the module-level variable `pids`; it is not a parameter.
    :param pid: Patient ID to plot
    :param de_res_s1: Mapping patient ID -> DE results; only the `.index` of each entry is used, to find the genes
    unique to each patient
    :param de_res_s1_full: Mapping patient ID -> DE results with `logFC` and `FDR` attributes; supplies every
    plotted point
    :param de_res_s2: Mapping (pid, comparator) -> DE results, where comparator is `pid` itself or one of
    `external_ref_labels`; only the `.index` is used
    :param ipa_df: Table indexed by pathway whose `genes` entry is a comma-separated string of gene symbols
    :param pw_colours: Mapping pathway -> edge colour. Its keys decide which pathways are drawn.
    :param ms: Marker size; pathway rings are drawn at ms + 2
    :param s1_colour: Fill colour of the patient-specific markers
    :param s2_colour: Fill colour of the syngeneic-only markers
    :param external_ref_labels: Labels of the external comparators in `de_res_s2`
    :return: fig, ax
    """
    external_ref_labels = list(external_ref_labels)

    full_res = de_res_s1_full[pid]
    dat_all = pd.DataFrame.from_dict({
        'logFC': full_res.logFC,
        'pFDR': -np.log10(full_res.FDR)
    })

    # --- patient specific: genes DE in exactly one patient ---
    venn_sets, _ = setops.venn_from_arrays(*[de_res_s1[p].index for p in pids])
    single_member_keys = list(setops.binary_combinations_sum_eq(len(pids), 1))

    # remember that sets are defined in reverse!
    patient_specific = {
        p: venn_sets[single_member_keys[-1 - i]] for i, p in enumerate(pids)
    }
    dat_patient_specific = dat_all.loc[patient_specific[pid]]

    # --- syngeneic only ---
    comparators = [pid] + external_ref_labels
    venn_sets, _ = setops.venn_from_arrays(
        *[de_res_s2[(pid, comp)].index for comp in comparators]
    )
    # presumably '100' denotes membership of the first (syngeneic) array only - confirm against setops
    dat_syngen_only = dat_all.loc[venn_sets['100']]

    # --- pathway(s) ---
    pw_dat = {}
    for pw in pw_colours:
        symbols = ipa_df.loc[pw].genes.split(',')
        ens_ids = reference_genomes.gene_symbol_to_ensembl(
            symbols).dropna().tolist()
        # resolve '/' weirdness: additionally look up each part of a 'A/B' style entry
        for symbol in symbols:
            if '/' not in symbol:
                continue
            ens_ids += reference_genomes.gene_symbol_to_ensembl(
                symbol.split('/')).dropna().tolist()
        # drop any genes not found in the data
        in_this_pathway = dat_all.index.intersection(ens_ids)
        pw_dat[pw] = dat_all.loc[in_this_pathway]

    marker_full = markers.MarkerStyle(marker='o', fillstyle='full')
    marker_s1 = markers.MarkerStyle(marker='o', fillstyle='left')
    marker_s2 = markers.MarkerStyle(marker='o', fillstyle='right')

    fig = plt.figure(figsize=(7.1, 6.3))
    ax = fig.add_subplot(111)

    def _half_filled_with_outline(frame, half_marker, colour, z, label):
        # A half-filled coloured marker, then an unfilled black outline (no legend entry) on top of it
        ax.scatter(
            frame['logFC'],
            frame['pFDR'],
            marker=half_marker,
            c=colour,
            s=ms,
            zorder=z,
            label=label)
        ax.scatter(
            frame['logFC'],
            frame['pFDR'],
            marker=marker_full,
            edgecolor='k',
            linewidths=.5,
            c='none',
            s=ms,
            zorder=z,
            label=None,
        )

    # background: every DE gene
    ax.scatter(dat_all['logFC'],
               dat_all['pFDR'],
               c='gray',
               s=ms,
               zorder=1,
               alpha=0.3,
               label='All DE')
    # S1: in front as there are fewer
    _half_filled_with_outline(dat_patient_specific, marker_s1, s1_colour, 3,
                              'Patient specific')
    # S2: behind S1 as there are more
    _half_filled_with_outline(dat_syngen_only, marker_s2, s2_colour, 2,
                              'Syngeneic only')

    # pathway members: unfilled rings in the pathway colour, drawn on top of everything
    for pw, ring_colour in pw_colours.items():
        this_dat = pw_dat[pw]
        ax.scatter(this_dat['logFC'],
                   this_dat['pFDR'],
                   c='none',
                   edgecolor=ring_colour,
                   linewidths=1.5,
                   s=ms + 2,
                   zorder=4,
                   label=pw)

    ax.legend(loc='upper right', frameon=True, facecolor='w', framealpha=0.7)
    ax.set_xlabel('logFC')
    ax.set_ylabel('-log10(FDR)')
    fig.tight_layout()

    return fig, ax