Ejemplo n.º 1
0
def add_genes_and_save(dat, fn, type='excel'):
    """
    Annotate ``dat`` with gene symbols, then write it to ``fn``.

    The requested output format is validated before anything else happens,
    so an unsupported ``type`` raises without ``dat`` having been annotated.

    :param dat: Object handed to ``general.add_gene_symbols_to_ensembl_data``
        and then saved through its ``to_excel`` / ``to_csv`` method
        (presumably a pandas DataFrame indexed by Ensembl ID -- confirm
        against callers).
    :param fn: Destination passed unchanged to the writer method.
    :param type: Output format, either 'excel' (default) or 'csv'. The name
        shadows the builtin but is kept so keyword callers keep working.
    :raises NotImplementedError: If ``type`` is neither 'excel' nor 'csv'.
    """
    # Resolve the writer first: fail fast, before the in-place annotation
    # below has a chance to modify the caller's data.
    if type == 'excel':
        writer_name = 'to_excel'
    elif type == 'csv':
        writer_name = 'to_csv'
    else:
        raise NotImplementedError("Unsupported type %s." % type)

    # The helper's return value is not used; the annotated ``dat`` is what
    # gets written out.
    general.add_gene_symbols_to_ensembl_data(dat)
    getattr(dat, writer_name)(fn)
        # insert spacing columns
        po_dat.insert(spacing1, '', np.nan)
        po_dat.insert(spacing2, ' ', np.nan)

        fig = plt.figure(figsize=(7, 3 + 10 / 50. * len(po_ref_diff)))
        ax = fig.add_subplot(111)
        ax = sns.heatmap(po_dat, cmap=cmap, ax=ax)
        plt.setp(ax.xaxis.get_ticklabels(), rotation=90)
        plt.setp(ax.yaxis.get_ticklabels(), rotation=0)
        fig.tight_layout()
        fig.savefig(os.path.join(outdir, "consistently_in_pair_only_%s.png" % c), dpi=200)

        # export those same genes to a file, adding gene symbols
        for_export = rnaseq_obj.data.loc[po_ref_diff, the_cols]
        general.add_gene_symbols_to_ensembl_data(for_export)
        for_export.to_excel(os.path.join(outdir, 'consistently_in_pair_only_%s.xlsx' % c))

        # export the pair_only genes (GBM vs iNSC paired -- GBM vs ref) to a file
        print "Combination type: %s" % c
        po_de_export = {}
        for pid in pids:
            # get PO DE genes
            the_idx = pair_only.loc[pid, c]
            # subtract the correction genes
            the_corr = po_ref_diff
            print "Subtracting %d correction genes from %d PO DE genes to leave %d PO DE genes." % (
                len(the_corr),
                len(the_idx),
                len(set(the_idx).difference(the_corr))
            )
    patient = matched_data.columns.str.replace('_.*',
                                               '').str.replace('DURA', '')
    library_prep = [
        'SS' if 'smartseq' in t else 'polyA' for t in matched_data.columns
    ]

    fit, design = differential_expression.edger_fit_glm(
        matched_data,
        de_method,
        '~patient + library_prep',
        patient=patient,
        library_prep=library_prep,
    )
    de_res_ss = differential_expression.edger_test(fit, design,
                                                   "library_prepSS")
    general.add_gene_symbols_to_ensembl_data(de_res_ss)

    # 2) separate comparisons SS vs polyA
    # restrict this to matching passage / subclone
    # use grouped dispersion approach (SS vs PolyA) and GLM method to avoid error when estimating DOF
    de_method = 'GLM'

    sample_pairs = {
        '019': ('DURA019_NSC_N8C_P2', 'DURA019_NSC_N8C_smartseq'),
        '031': ('DURA031_NSC_N44B_P2', 'DURA031_NSC_N44B_smartseq'),
        '049': ('DURA049_NSC_N19_P4', 'DURA049_NSC_N19_P4_smartseq'),
        '052': ('DURA052_NSC_N4_P3', 'DURA052_NSC_N4_P3_smartseq')
    }
    all_snames = []
    [all_snames.extend(sample_pairs[p]) for p in pids]
    this_matched = matched_data.loc[:, all_snames]
Ejemplo n.º 4
0
# Add the null set manually from the full DE results: every entry of the
# first patient's full-result index that is absent from the union of all
# existing Venn sets.
de_genes_all = setops.reduce_union(*venn_set.values())
# The null-set key is a string of '0' characters, one per entry in pids.
k_null = ''.join(['0'] * len(pids))
venn_set[k_null] = list(de_res_full_s1[pids[0]].index.difference(de_genes_all))
# Keep the count table in step with the set table for the new key.
venn_ct[k_null] = len(venn_set[k_null])

# Build the combined table from the per-patient results and the Venn sets
# (presumably one wide DataFrame with logFC / FDR columns per patient --
# confirm against setops.venn_set_to_wide_dataframe).
de_data = setops.venn_set_to_wide_dataframe(de_res_s1,
                                            venn_set,
                                            pids,
                                            full_data=de_res_full_s1,
                                            cols_to_include=['logFC', 'FDR'],
                                            consistency_check_col='logFC',
                                            consistency_check_method='sign')
# Add gene symbols back in, then export the table to the output directory.
general.add_gene_symbols_to_ensembl_data(de_data)
de_data.to_excel(os.path.join(outdir, 'full_de.xlsx'))

# Load methylation data for the same patients, once per sample type.
# NOTE: `anno` is rebound by the second call, so only the value returned by
# the cell-culture load is kept.
me_ff_obj, anno = tsgd.load_methylation(pids,
                                        type='ffpe',
                                        norm_method=norm_method_s1)
me_cc_obj, anno = tsgd.load_methylation(pids,
                                        type='cell_culture',
                                        norm_method=norm_method_s1)
# Filter the cell-culture object to the samples listed in
# consts.S1_METHYL_SAMPLES_INSC (iNSC only).
me_cc_obj.filter_samples(
    me_cc_obj.meta.index.isin(consts.S1_METHYL_SAMPLES_INSC))
# FIXME: this is a bug in the loader?
# batch_id is overwritten by hand from the `batch` column of the metadata.
me_cc_obj.batch_id = me_cc_obj.meta.batch
Ejemplo n.º 5
0
    for cmp in comparisons:
        lbl = "%s_vs_%s" % cmp
        jobs[lbl] = pool.apply_async(run_one_de,
                                     args=(dat, groups, cmp),
                                     kwds=de_params)
        # res[lbl] = run_one_de(dat, groups, cmp, **de_params)
        # print "%d DE genes\n" % (res[lbl].FDR <= de_params['fdr']).sum()

    for lbl in jobs:
        res[lbl] = jobs[lbl].get(1e6)
        print lbl
        print "%d DE genes\n" % (res[lbl].FDR <= de_params['fdr']).sum()

    for k, v in res.items():
        general.add_gene_symbols_to_ensembl_data(v, tax_id=10090)
        res_sign[k] = v.loc[v.FDR <= de_params['fdr']]

    excel.pandas_to_excel(res, os.path.join(outdir,
                                            "mouse_GBM_NSC_DE_all.xlsx"))
    excel.pandas_to_excel(
        res_sign, os.path.join(outdir, "mouse_GBM_NSC_DE_significant.xlsx"))

    # finally, re-run with a lfc of zero
    # disabled for now to speed things up
    if False:
        de_params['lfc'] = 0

        jobs2 = {}
        print "No logFC requirement"
    scatter_colours = dict(zip(consts.PIDS, cmap))

    scatter_markers = {'GBM': 's', 'iNSC': 'o'}

    # scaling parameter applied during SVD
    scale_preserved = 0.05

    sample_colours = obj.meta.patient_id.map(scatter_colours.get).to_dict()
    sample_markers = obj.meta.type.map(scatter_markers.get).to_dict()

    dat = filter.filter_by_cpm(obj.data, min_n_samples=2)
    # TODO: include VST or similar here
    dat = np.log(dat + eps)
    # copy of dat with gene symbols
    dat_with_gs = dat.copy()
    general.add_gene_symbols_to_ensembl_data(dat_with_gs)
    # fill back in with ENS where no gene symbol is available
    dat_with_gs.loc[dat_with_gs['Gene Symbol'].isnull(),
                    'Gene Symbol'] = dat_with_gs.index[
                        dat_with_gs['Gene Symbol'].isnull()]

    # recreate Sven's original (unweighted) plot
    # we're not going to use this, it's just for validation / reproducibility
    fig, ax, _ = plot_biplot(dat,
                             obj.meta, (0, 1),
                             scatter_colours,
                             scatter_markers,
                             annotate_features_radius=0.4,
                             include_weighting=False,
                             scale=10.)
    fig.savefig(os.path.join(
Ejemplo n.º 7
0
def ens_index_to_gene_symbol(df):
    """
    Return the rows of ``df`` that have a gene symbol, indexed by that symbol.

    ``df`` is first annotated via ``general.add_gene_symbols_to_ensembl_data``.
    Rows whose 'Gene Symbol' entry is missing are left out of the result, and
    the 'Gene Symbol' column becomes the index of the returned frame.

    :param df: Table to annotate (presumably a pandas DataFrame indexed by
        Ensembl ID -- confirm against callers).
    :return: A new frame indexed by 'Gene Symbol'.
    """
    general.add_gene_symbols_to_ensembl_data(df)
    # Index labels of the rows for which a symbol is available.
    annotated_ids = df['Gene Symbol'].dropna().index
    return df.loc[annotated_ids].set_index('Gene Symbol')
Ejemplo n.º 8
0
from utils import output
import os
import numpy as np

if __name__ == "__main__":
    # Script entry point: exports CPM values (with gene symbols) for a fixed
    # list of FFPE samples, then dumps salmon / STAR reference data to Excel.
    min_cpm = 1
    obj = loader.load_by_patient('all', type='ffpe')
    # Explicit list of sample columns to keep from the loaded data.
    samples = [
        'NH15_1661DEF2C',
        'NH15_1877_SP1C',
        'NH15_2101_DEF1A',
        'NH16_270_DEF1Ereplacement',
        'NH16_616DEF1B',
        'NH16_677_SP1A',
        'NH16_1574DEF1A',
        'NH16_1976_DEF1Areplacement',
        'NH16_2063_DEF1Areplacement',
        'NH16_2214DEF1A',
        'NH16_2255DEF1B2',
        'NH16_2806DEF3A1',
    ]

    # Remove duplicates by restricting the data to the sample columns listed
    # above, then apply the CPM filter.
    dat = obj.data.loc[:, samples]
    dat = filter.filter_by_cpm(dat, min_cpm=min_cpm, min_n_samples=1)
    # Counts per million with a pseudocount: 1 is added to every value and to
    # every per-sample total before scaling by 1e6 (assumes `dat` is a pandas
    # DataFrame with one column per sample -- confirm).
    cpm = (dat + 1).divide(dat.sum() + 1, axis=1) * 1e6
    general.add_gene_symbols_to_ensembl_data(cpm)

    # NOTE(review): the directory name says "logcpm" but no log transform is
    # applied to `cpm` in the visible code -- confirm which is intended.
    outdir = output.unique_output_dir("ffpe_logcpm_values")

    cpm.to_excel(os.path.join(outdir, 'cpm_ffpe_filtered.xlsx'))
    # Colour lookup keyed by treatment label; not used in the visible code.
    treatment_colour = {'WT': '0.8', 'Rheb KO': '0.2'}

    # `outdir` is rebound here, so everything below is written to a different
    # output directory from the CPM export above.
    outdir = output.unique_output_dir()

    # load our data
    # Same reference set loaded twice: once with the loader's default source
    # and once with source='salmon'; both use tax_id 10090.
    obj_star = loader.load_references('wtchg_p190202',
                                      alignment_subdir='mouse',
                                      tax_id=10090)
    obj_salmon = loader.load_references('wtchg_p190202',
                                        alignment_subdir='mouse',
                                        source='salmon',
                                        tax_id=10090)

    # dump to file for sharing
    # Copies are annotated so the loader objects' own data is not modified.
    dat = obj_salmon.data.copy()
    general.add_gene_symbols_to_ensembl_data(dat, tax_id=10090)
    dat.to_excel(os.path.join(outdir, 'salmon_tpm_all_data.xlsx'))
    dat = obj_star.data.copy()
    general.add_gene_symbols_to_ensembl_data(dat, tax_id=10090)
    dat.to_excel(os.path.join(outdir, 'star_counts_all_data.xlsx'))

    # load Bowman data
    # NOTE(review): no loading code follows this heading in the visible source.

    # plots with only our samples
    dat = filter.filter_by_cpm(obj_salmon.data,
                               min_cpm=min_cpm,
                               min_n_samples=2)
    # NOTE(review): log_dat is computed from the unfiltered obj_salmon.data,
    # not from the filtered `dat` above -- confirm this is intended. `eps` is
    # not defined anywhere in the visible code.
    log_dat = np.log10(obj_salmon.data + eps)

    # ECDF
    ax = rnaseq.log_cpm_ecdf_plot(dat, units='tpm', min_cpm=min_cpm)
Ejemplo n.º 10
0
            y[aa == a, 1],
            facecolor=batch_colours[a],
            edgecolor='k',
            s=30,
            label=b
        )
    ax.set_xlabel("PC1 (%.1f %%)" % (p.explained_variance_ratio_[0] * 100))
    ax.set_ylabel("PC2 (%.1f %%)" % (p.explained_variance_ratio_[1] * 100))
    ax.legend(loc='lower right', frameon=True, facecolor='w', framealpha=0.7)
    fig.tight_layout()
    fig.savefig(os.path.join(outdir, "pca_log_tpm.png"), dpi=200)

    # this looks almost the same!
    # log_tpm_qn = transformations.quantile_normalisation(np.log2(tpm + eps))
    # cg = clustering.dendrogram_with_colours(
    #     log_tpm_qn,
    #     cc,
    # )

    # export data
    cpm = obj_star.data.divide(obj_star.data.sum(), axis=1) * 1e6
    counts = obj_star.data.copy()
    tpm = obj_salmon.data.copy()
    general.add_gene_symbols_to_ensembl_data(counts, tax_id=9606)
    general.add_gene_symbols_to_ensembl_data(cpm, tax_id=9606)
    general.add_gene_symbols_to_ensembl_data(tpm, tax_id=9606)

    counts.to_excel(os.path.join(outdir, "ICb1299_CRL3021.counts.xlsx"))
    cpm.to_excel(os.path.join(outdir, "ICb1299_CRL3021.cpm.xlsx"))
    tpm.to_excel(os.path.join(outdir, "ICb1299_CRL3021.tpm.xlsx"))
    obj_star.meta.to_excel(os.path.join(outdir, "ICb1299_CRL3021.meta.xlsx"))