def _gbm_vs_reference_de(rnaseq_obj, gbm_idx, ref_pattern, ref_type, lfc, fdr, method):
    """
    Run one DE comparison between an already-selected set of GBM samples and a single reference line.
    :param rnaseq_obj: Object exposing `data` (one column per sample) and `meta` (indexed by sample name, with a
    'type' column).
    :param gbm_idx: Boolean mask over the rows of `rnaseq_obj.meta` selecting the GBM samples.
    :param ref_pattern: Pattern matched against the sample names (via `str.contains`) to select the reference samples.
    :param ref_type: Value that the 'type' column must hold for a sample to count as reference.
    :param lfc: Forwarded to `run_one_de`.
    :param fdr: Forwarded to `run_one_de`.
    :param method: Forwarded to `run_one_de`.
    :return: The result of `run_one_de` for the comparison ['GBM', ref_type].
    """
    meta = rnaseq_obj.meta
    ref_idx = meta.index.str.contains(ref_pattern) & (meta.loc[:, 'type'] == ref_type)
    the_idx = gbm_idx | ref_idx

    # low-expression filtering is applied per comparison, only on the samples involved
    the_data = filter.filter_by_cpm(rnaseq_obj.data.loc[:, the_idx], min_n_samples=1)
    the_groups = meta.loc[the_idx, 'type'].values

    return run_one_de(the_data,
                      the_groups,
                      ['GBM', ref_type],
                      lfc=lfc,
                      fdr=fdr,
                      method=method)


def compute_cross_de(rnaseq_obj,
                     main_pids,
                     additional_pids,
                     external_references=(('GIBCO', 'NSC'), ),
                     lfc=1,
                     fdr=0.01,
                     method='QLGLM'):
    """
    Compute DE between every patient GBM sample and every _other_ healthy patient sample, in addition to paired DE.
    We can also include one or more external references (e.g. Gibco, the default).
    :param rnaseq_obj: Object exposing `data` (one column per sample) and `meta` (indexed by sample name, with a
    'type' column).
    :param main_pids: The PIDs of the patients in the study
    :param additional_pids: The PIDs of the additional iNSC lines to include. These will be treated as additional
    reference lines.
    :param external_references: Iterable of (name, type) pairs. Samples whose name contains `name` and whose 'type'
    equals `type` are used as an extra reference for every main PID.
    :param lfc: Forwarded to `run_one_de`.
    :param fdr: Forwarded to `run_one_de`.
    :param method: One of 'QLGLM', 'GLM' or 'exact'; forwarded to `run_one_de`.
    :return: Dictionary keyed by (pid, reference), where reference is a PID or an external reference name. Values are
    the corresponding `run_one_de` results.
    :raises NotImplementedError: If `method` is not one of the supported values.
    """
    if method not in {'QLGLM', 'GLM', 'exact'}:
        raise NotImplementedError("Unsupported method.")
    de = {}

    # materialise once so that one-shot iterables are not exhausted while we are still looping over them
    main_pids = list(main_pids)
    insc_pids = main_pids + list(additional_pids)

    for pid in main_pids:
        meta = rnaseq_obj.meta
        # the GBM selection depends only on pid, so compute it once per patient
        gbm_idx = meta.index.str.contains(pid) & (meta.loc[:, 'type'] == 'GBM')

        # cross comparison (includes the paired case pid2 == pid)
        for pid2 in insc_pids:
            de[(pid, pid2)] = _gbm_vs_reference_de(rnaseq_obj, gbm_idx, pid2, 'iNSC', lfc, fdr, method)

        # external reference comparison
        for er, er_type in external_references:
            de[(pid, er)] = _gbm_vs_reference_de(rnaseq_obj, gbm_idx, er, er_type, lfc, fdr, method)

    return de
    rna_ff_dat = rna_ff_obj.data.copy()
    rna_cc_dat = rna_cc_obj.data.copy()

    # update labels
    rna_ff_dat.columns = ["GBM%s" % t for t in p_id]

    cols = []
    for t in rna_cc_dat.columns:
        p, q = t.replace('GBM', 'GIC').split('_')
        q = q.replace('n', ' & P')
        cols.append("%s (%s)" % (p, q))
    rna_cc_dat.columns = cols

    # filter low expression out
    rna_ff_dat = filter.filter_by_cpm(rna_ff_dat,
                                      min_cpm=min_tpm,
                                      min_n_samples=2)
    rna_cc_dat = filter.filter_by_cpm(rna_cc_dat,
                                      min_cpm=min_tpm,
                                      min_n_samples=2)

    # reduce to matching probes
    probes = rna_cc_dat.index.intersection(rna_ff_dat.index)

    if remove_mt:
        probes = probes[~probes.isin(mt_ens)]

    rna_ff_dat = np.log2(rna_ff_dat.loc[probes] + eps)
    rna_cc_dat = np.log2(rna_cc_dat.loc[probes] + eps)

    # QN
                                                         polya_nsc_meta.index]

    assigned_sum = polya_nsc_data.sum()
    unassigned_sum = polya_nsc_unassigned.drop('N_unmapped').sum()

    polya_pct_assigned = assigned_sum / (assigned_sum + unassigned_sum) * 100.

    print "Poly(A) samples % assigned"
    print polya_pct_assigned

    # combine data then eliminate genes that are not expressed
    ss2_data = ss2_obj.data
    ss2_data.columns = ["%s_smartseq" % t for t in ss2_data.columns]
    polya_nsc_data = polya_obj.data
    data = pd.concat((ss2_data, polya_nsc_data), axis=1)
    data = filter.filter_by_cpm(data, min_cpm=min_cpm, min_n_samples=1)

    # TMM normed version
    data_n = transformations.edger_tmm_normalisation_cpm(data)

    # plot the CDF of the log2(CPM) values
    # for this purpose, we need to filter the CPM values for each col separately
    x_cdf = np.linspace(-5, 15, 500)
    log_cpm_ecdf = {}

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ss_nsc_lbl = False
    ss_opc_lbl = False
    pa_lbl = False
Esempio n. 4
0
if __name__ == '__main__':
    lfc = 1
    fdr = 0.01
    outdir = output.unique_output_dir("paired_rnaseq")
    # RTK II samples
    # pids = ['017', '050', '054', '061']
    # all n=2 samples
    pids = ['018', '044', '049', '050', '052', '054', '061']
    # all samples
    # pids = [t for t in rnaseq_data.PATIENT_LOOKUP_STAR if t != 'GIBCO']
    obj = rnaseq_data.load_by_patient(pids, annotate_by='Ensembl Gene ID')
    # discard unmapped, etc
    obj.data = obj.data.loc[obj.data.index.str.contains('ENSG')]

    dat_filt = filter_by_cpm(obj.data, min_n_samples=2)

    de = {}
    de_up = {}
    de_down = {}
    de_counts = {}
    de_counts_up = {}
    de_counts_down = {}
    de_matched = {}
    de_gibco = {}

    # all gene lists combined in one file (one sheet per list)
    xl_writer = pd.ExcelWriter(
        os.path.join(outdir, "individual_gene_lists.xlsx"))

    for pid in pids:
    return cor, pval


if __name__ == "__main__":
    # Script: load reference samples, keep the mouse / iMGL ones, filter and log-transform the values, then
    # compute pairwise Spearman correlation between samples.
    min_tpm = 1.
    # offset added before taking log2 so that zero values stay finite
    eps = 0.01
    outdir = output.unique_output_dir()
    # load just 1st lane
    obj = loader.load_references('wtchg_p180443/180911_K00150_0372_AHWV7TBBXX',
                                 tax_id=10090,
                                 source='salmon')
    # keep samples annotated as mouse, plus any sample whose name contains 'iMGL' or 'IMGL'
    ix = (obj.meta.species
          == 'mouse') | (obj.meta.index.str.contains(r'[iI]MGL'))
    dat = obj.data.loc[:, ix]
    dat_filt = filter.filter_by_cpm(dat, min_cpm=min_tpm, unless_cpm_gt=10.)
    log_dat_filt = np.log2(dat_filt + eps)

    # correlation clustermap
    # one colour code per sample: default 'g', then overridden below.
    # The order of the assignments matters: the broad 'mDURA' match is applied first and the more specific
    # patterns afterwards overwrite it with 'y'.
    row_colours = pd.DataFrame('g',
                               index=log_dat_filt.columns,
                               columns=['Sample type'])
    row_colours.loc[row_colours.index.str.contains('mDURA')] = 'k'
    # NOTE(review): str.contains treats its pattern as a regex by default, so the '.' here matches any
    # character rather than only a literal dot - confirm this is acceptable
    row_colours.loc[row_colours.index.str.contains(
        'mDURA5_NSCmus_N3BE50.2')] = 'y'
    row_colours.loc[row_colours.index.str.contains('mDURA6_NSCmus')] = 'y'

    # Spearman distance

    # the data are transposed before the call, so samples become rows of the input
    cor, pval = pairwise_correlation(log_dat_filt.transpose(),
                                     method='spearman')
    dat_pt = pd.read_csv(dat_fn, header=0, index_col=0)
    dat_hn = pd.read_csv(dat_hn_fn, header=0, index_col=0)

    dat_pt.columns = [t[:12] for t in dat_pt.columns]
    dat_pt = dat_pt.loc[:, ~dat_pt.columns.duplicated()]
    meta = meta.loc[dat_pt.columns]
    meta = meta.loc[~meta.index.duplicated()]
    dat_hn.columns = ["%s_normal" % t[:12] for t in dat_hn.columns]

    # filter samples
    keep = (meta['gcimp_methylation'] == 'non-G-CIMP') & (meta['idh1_status']
                                                          == 'WT')
    dat_pt = dat_pt.loc[:, keep]

    all_dat = pd.concat((dat_pt, dat_hn), axis=1)
    all_dat = filter.filter_by_cpm(all_dat, min_n_samples=3, min_cpm=1)

    groups = ['GBM'] * dat_pt.shape[1] + ['control'] * dat_hn.shape[1]

    de_res = differential_expression.run_one_de(all_dat, groups,
                                                ('GBM', 'control'),
                                                **de_params)

    # de_res.to_excel(os.path.join(outdir, 'tcga_primary_tumour_vs_normal_solid.xlsx'))

    de_res_full = differential_expression.run_one_de(all_dat,
                                                     groups,
                                                     ('GBM', 'control'),
                                                     return_full=True,
                                                     **de_params)
    de_res_full.to_excel(
        0, 'patient_id',
        obj.meta.index.str.replace(r'(GBM|DURA)(?P<pid>[0-9]{3}).*',
                                   '\g<pid>'))

    cmap = common.get_best_cmap(len(consts.PIDS))
    scatter_colours = dict(zip(consts.PIDS, cmap))

    scatter_markers = {'GBM': 's', 'iNSC': 'o'}

    # scaling parameter applied during SVD
    scale_preserved = 0.05

    sample_colours = obj.meta.patient_id.map(scatter_colours.get).to_dict()
    sample_markers = obj.meta.type.map(scatter_markers.get).to_dict()

    dat = filter.filter_by_cpm(obj.data, min_n_samples=2)
    # TODO: include VST or similar here
    dat = np.log(dat + eps)
    # copy of dat with gene symbols
    dat_with_gs = dat.copy()
    general.add_gene_symbols_to_ensembl_data(dat_with_gs)
    # fill back in with ENS where no gene symbol is available
    dat_with_gs.loc[dat_with_gs['Gene Symbol'].isnull(),
                    'Gene Symbol'] = dat_with_gs.index[
                        dat_with_gs['Gene Symbol'].isnull()]

    # recreate Sven's original (unweighted) plot
    # we're not going to use this, it's just for validation / reproducibility
    fig, ax, _ = plot_biplot(dat,
                             obj.meta, (0, 1),
                             scatter_colours,
Esempio n. 8
0
from utils import output
import os
import numpy as np

if __name__ == "__main__":
    # Script: export pseudocount-adjusted CPM values for a fixed panel of FFPE samples to an Excel file.
    min_cpm = 1

    # hand-picked panel of samples to export
    samples = [
        'NH15_1661DEF2C',
        'NH15_1877_SP1C',
        'NH15_2101_DEF1A',
        'NH16_270_DEF1Ereplacement',
        'NH16_616DEF1B',
        'NH16_677_SP1A',
        'NH16_1574DEF1A',
        'NH16_1976_DEF1Areplacement',
        'NH16_2063_DEF1Areplacement',
        'NH16_2214DEF1A',
        'NH16_2255DEF1B2',
        'NH16_2806DEF3A1',
    ]
    obj = loader.load_by_patient('all', type='ffpe')

    # restrict to the panel above (original note: "remove duplicates"), then drop low-expression rows
    dat = filter.filter_by_cpm(obj.data.loc[:, samples],
                               min_cpm=min_cpm,
                               min_n_samples=1)

    # counts per million with a pseudocount of 1 on every value; the per-sample total also gets +1
    pseudo_counts = dat + 1
    library_sizes = dat.sum() + 1
    cpm = pseudo_counts.divide(library_sizes, axis=1) * 1e6
    general.add_gene_symbols_to_ensembl_data(cpm)

    outdir = output.unique_output_dir("ffpe_logcpm_values")
    out_fn = os.path.join(outdir, 'cpm_ffpe_filtered.xlsx')
    cpm.to_excel(out_fn)
                                        source='salmon',
                                        tax_id=10090)

    # dump to file for sharing
    dat = obj_salmon.data.copy()
    general.add_gene_symbols_to_ensembl_data(dat, tax_id=10090)
    dat.to_excel(os.path.join(outdir, 'salmon_tpm_all_data.xlsx'))
    dat = obj_star.data.copy()
    general.add_gene_symbols_to_ensembl_data(dat, tax_id=10090)
    dat.to_excel(os.path.join(outdir, 'star_counts_all_data.xlsx'))

    # load Bowman data

    # plots with only our samples
    dat = filter.filter_by_cpm(obj_salmon.data,
                               min_cpm=min_cpm,
                               min_n_samples=2)
    log_dat = np.log10(obj_salmon.data + eps)

    # ECDF
    ax = rnaseq.log_cpm_ecdf_plot(dat, units='tpm', min_cpm=min_cpm)
    ax.figure.set_size_inches(6, 4)
    ax.figure.tight_layout()
    ax.figure.savefig(os.path.join(outdir, "cdf_our_samples.png"), dpi=200)

    # PCA
    colour_subgroups = obj_salmon.meta.treatment

    cmap = collections.OrderedDict(
        zip(
            colour_subgroups,
Esempio n. 10
0
    data = pd.concat((loader_hgic.data.loc[genes], h9_data), axis=1)
    meta = pd.concat((loader_hgic.meta, pd.DataFrame(h9_meta).transpose()),
                     axis=0)

    matches = ('018', '019', '024', '026', '030', '031', '044', 'GIBCO', 'H9')

    col_colours = clustering.generate_colour_map_dict(meta,
                                                      'sample',
                                                      matches,
                                                      label='Patient',
                                                      non_matching='gray')

    # filter
    data = filter.filter_by_cpm(data,
                                min_cpm=1,
                                min_n_samples=3,
                                unless_cpm_gt=10)
    # data = filter.filter_by_cpm(data, min_cpm=1, min_n_samples=3, unless_cpm_gt=None)

    # normalise by read count
    cpm = (data + 1).divide((data + 1).sum(axis=0), axis=1) * 1e6

    # transform
    log_data = np.log2(cpm)
    vst_data = variance_stabilizing_transform(cpm)
    mad_log_srt = median_absolute_deviation(log_data).sort_values(
        ascending=False)
    mad_vst_srt = median_absolute_deviation(vst_data).sort_values(
        ascending=False)

    for NGENE in [500, 1000, 1500, 2000, 2500]:
Esempio n. 11
0
import numpy as np
from utils import output
import os
import pandas as pd
from matplotlib import pyplot as plt


if __name__ == "__main__":
    # Script: load STAR and salmon data for two cell culture lines and save a dendrogram of the log2 TPM values,
    # annotated with one colour row for batch and one for cell line.

    # offset added before taking log2 so that zero values stay finite
    eps = 1e-2

    outdir = output.unique_output_dir("export_sb_data")
    obj_star = loader.load_by_patient(['ICb1299', '3021'], source='star', type='cell_culture', include_control=False)
    obj_salmon = loader.load_by_patient(['ICb1299', '3021'], source='salmon', type='cell_culture', include_control=False)

    # cluster plot
    tpm = filter.filter_by_cpm(obj_salmon.data, min_cpm=1, min_n_samples=4)

    batch_colours = common.COLOUR_BREWERS[len(obj_salmon.meta.batch.unique())]
    line_colours = common.COLOUR_BREWERS[2]
    # every cell starts as line_colours[0]; 'Batch' is overwritten per batch below and 'Cell line' only for 3021
    cc = pd.DataFrame(line_colours[0], index=tpm.columns, columns=['Batch', 'Cell line'])

    # aa holds one integer code per sample (0 .. n_batches - 1), bb the corresponding unique batch labels
    # NOTE(review): assumes the rows of obj_salmon.meta are in the same order as tpm.columns - confirm
    aa, bb = obj_salmon.meta.batch.factorize()
    # codes run up to and including aa.max(); the previous range(aa.max()) skipped the last batch, which
    # therefore kept the placeholder colour instead of its own batch colour
    for i in range(aa.max() + 1):
        cc.loc[aa == i, 'Batch'] = batch_colours[i]
    cc.loc[cc.index.str.contains('3021'), 'Cell line'] = line_colours[1]

    cg = clustering.dendrogram_with_colours(
        np.log2(tpm + eps),
        cc,
    )
    cg['fig'].savefig(os.path.join(outdir, "dendrogram_pearson_log_tpm_all_genes.png"), dpi=200)