def reannotate(this_res):
    """
    Swap the Ensembl IDs listed in each result table for gene symbols, for easier interpretability.
    :param this_res: Dict of DataFrames. Each must have a `study_items` column whose entries are either null or a
    string of gene IDs separated by ', '.
    :return: New dict with the same keys. Each value is a copy of the input table with `study_items` removed and a
    final `genes_in_term` column holding the ','-joined gene symbols (empty string where `study_items` was null).
    """
    # pool the gene IDs from every table so that the conversion is carried out only once
    pooled_ids = set()
    for tbl in this_res.values():
        for id_list in tbl.study_items.str.split(', ').dropna():
            pooled_ids.update(id_list)
    id_to_symbol = reference_genomes.ensembl_to_gene_symbol(sorted(pooled_ids))

    def _symbols_for(entry):
        # null entries have no genes to report; IDs without a symbol are dropped from the joined string
        if pd.isnull(entry):
            return ''
        return ','.join(id_to_symbol.loc[entry.split(', ')].dropna().values)

    annotated = {}
    for key in this_res.keys():
        out = this_res[key].copy()
        symbol_col = [_symbols_for(entry) for entry in out.study_items]
        out.drop('study_items', axis=1, inplace=True)
        # append the new column after all existing ones
        out.insert(out.shape[1], 'genes_in_term', symbol_col)
        annotated[key] = out
    return annotated
Beispiel #2
0
def top_genes(
    data,
    n=100,
    convert_to_symbols=True,
    tax_id=9606,
):
    """
    Retrieve the top n genes (largest values) from each column of the data
    :param data: pd.DataFrame indexed by Ensembl ID
    :param n: Number of top-ranked genes to retain per column
    :param convert_to_symbols: If True, report gene symbols rather than Ensembl IDs. Any ID without a usable symbol
    (no translation, or the symbol is already assigned to an earlier ID) is reported as the Ensembl ID itself.
    :param tax_id: Passed on to reference_genomes.ensembl_to_gene_symbol()
    :return: Dict keyed by column name, each value being the set of top genes for that column
    """
    if convert_to_symbols:
        # get gene symbols and drop all NaN
        gs = reference_genomes.ensembl_to_gene_symbol(data.index,
                                                      tax_id=tax_id).dropna()
        # keep a single symbol per Ensembl ID and a single Ensembl ID per symbol
        gs = gs.loc[~gs.index.duplicated()]
        gs = gs.loc[~gs.duplicated()]
    res = {}
    for col in data.columns:
        t = data.loc[:, col].sort_values(ascending=False).iloc[:n]
        if convert_to_symbols:
            # Use reindex() rather than .loc: IDs that were filtered out of gs above must come back as NaN
            # (recent pandas raises a KeyError from .loc when labels are missing). The index of gs is unique at
            # this point, which reindex() requires.
            new_idx = gs.reindex(t.index)
            missing = new_idx.isnull().values
            # fall back to the Ensembl ID wherever no symbol is available
            new_idx.loc[missing] = t.index[missing]
            t.index = new_idx
        res[col] = set(t.index)
    return res
def load_rnaseq_htseq_count_data(by_gene=False):
    """
    Load in HTSeq counting data from pre-existing ht-seq run.
    :param by_gene: If True, translate the raw Ensembl codes to gene symbol. Discard any that do not translate, except
    the special counters listed in RNA_COUNT_FIELDS (presumably _ambiguous, _no_feature, _unmapped), which are
    retained under their original labels.
    :return: pd.DataFrame with one column per entry in `infiles`
    """
    infiles = {
        'XZ1': 'xz1_exon_counts_gr37_reverse.dill',
    }
    res = pd.DataFrame()

    for tag, fn in infiles.items():
        ff = os.path.join(RNASEQ_GENE_COUNTS_DIR, fn)
        # NOTE: unpickling can execute arbitrary code - only ever point this at trusted, locally generated files
        with open(ff, 'rb') as f:
            t = pickle.load(f)
        if by_gene:
            trans = reference_genomes.ensembl_to_gene_symbol(t.index)
            # keep only the non-null entries
            trans = trans.loc[~trans.isnull()]
            # Select the rows with an explicitly ordered list of labels. Index.union() may sort its result, in
            # which case the positional relabelling below would attach symbols to the wrong rows.
            count_fields = list(RNA_COUNT_FIELDS)
            t = t.loc[list(trans.index) + count_fields]
            # relabel: gene symbols first, then the special counters, matching the row order selected above
            t.index = list(trans.values) + count_fields

        res[tag] = t

    return res
Beispiel #4
0
def add_gene_symbols(df):
    """
    Insert a 'Gene Symbol' column at the front of df, which is indexed by Ensembl IDs.
    The DataFrame is modified in place and nothing is returned.
    """
    symbols = reference_genomes.ensembl_to_gene_symbol(df.index)
    # an Ensembl ID may be listed more than once: arbitrarily keep the first entry (these should be rare)
    first_hit = ~symbols.index.duplicated()
    df.insert(0, 'Gene Symbol', symbols.loc[first_hit])
def prepare_gct_files(outdir=None):
    """
    Write out the GCT files required to perform classification:
    - Our GBM FFPE and cell culture samples
    - TCGA RNA-Seq cohort
    - Both combined
    In all cases, use FPKM units and gene symbols, as these are used by Wang
    :param outdir: Directory to write into. If None, one is obtained from unique_output_dir().
    """
    if outdir is None:
        outdir = unique_output_dir("gct_files_for_wang")

    # 1) Our data: FFPE samples (columns relabelled by reference ID) alongside GBM cell culture samples
    ffpe_obj = rnaseq_data.load_by_patient('all', type='ffpe')
    ffpe_fpkm = ffpe_obj.get_fpkm()
    ffpe_fpkm.columns = ['%s_FFPE' % ref_id for ref_id in ffpe_obj.meta.reference_id]

    cc_obj = rnaseq_data.load_by_patient(patient_ids='all')
    cc_fpkm = cc_obj.get_fpkm()
    cc_fpkm = cc_fpkm.loc[:, cc_obj.meta.type == 'GBM']

    ours = pd.concat((cc_fpkm, ffpe_fpkm), axis=1)
    # switch the index to gene symbols, discarding rows that have no symbol
    ours_symbols = reference_genomes.ensembl_to_gene_symbol(ours.index).dropna()
    ours = ours.loc[ours_symbols.index]
    ours.index = ours_symbols
    ours_fn = os.path.join(outdir, "gbm_ffpe_cc_fpkm.gct")
    gsea.data_to_gct(ours, ours_fn)

    # 2) TCGA (IDH1 WT only)
    tcga_dat, tcga_meta = rnaseq_data.tcga_primary_gbm(units='fpkm')
    tcga_dat = tcga_dat.loc[:, tcga_meta.idh1_status == 'WT']
    tcga_symbols = reference_genomes.ensembl_to_gene_symbol(tcga_dat.index).dropna()
    # here, only the first symbol listed for each Ensembl ID is retained
    tcga_symbols = tcga_symbols.loc[~tcga_symbols.index.duplicated()]
    tcga_dat = tcga_dat.loc[tcga_symbols.index]
    tcga_dat.index = tcga_symbols
    tcga_fn = os.path.join(outdir, "tcga_idh1_wt_fpkm.gct")
    gsea.data_to_gct(tcga_dat, tcga_fn)

    # 3) Combined: merge the two files written above and write the result
    combined = gsea.combine_gct_files(ours_fn, tcga_fn)
    combined_fn = os.path.join(outdir, "tcga_idh1_wt_and_gbm_ffpe_cc_fpkm.gct")
    gsea.data_to_gct(combined, combined_fn)
                                                       design=design)

    de_res_separate = {}
    for p in pids:
        de_res_separate[p] = differential_expression.edger_test(
            fit, design, "groupSmartSeq%s - groupPolyA%s" % (p, p))
        general.add_gene_symbols_to_ensembl_data(de_res_separate[p])
        print "Patient %s: %d DE genes in SmartSeq2 - PolyA (%d up, %d down)." % (
            p,
            de_res_separate[p].shape[0],
            (de_res_separate[p].logFC > 0).sum(),
            (de_res_separate[p].logFC < 0).sum(),
        )

    de_in_all = reference_genomes.ensembl_to_gene_symbol(
        setops.reduce_intersection(
            *[t.index for t in de_res_separate.values()]))
    # sort this by the avg logFC
    logfc_in_all = pd.DataFrame.from_dict(
        dict([(p, v.loc[de_in_all.index, 'logFC'])
              for p, v in de_res_separate.items()]))
    logfc_in_all = logfc_in_all.loc[logfc_in_all.mean(
        axis=1).abs().sort_values(ascending=False).index]
    general.add_gene_symbols_to_ensembl_data(logfc_in_all)

    fig = plt.figure(figsize=(5, 5))
    ax = fig.add_subplot(111, facecolor='w')
    venn.venn_diagram(set_labels=de_res_separate.keys(),
                      *[t.index for t in de_res_separate.values()],
                      ax=ax)
    fig.tight_layout()
# Where several entries of `aa` share the same value, pick one in each case.
# duplicated() doesn't mark the first entry by default, so the first occurrence is the one retained.
aa = aa[~pd.Index(aa.values).duplicated()]

# restrict the data to columns that are listed in the index of `aa`, then look up the matching `aa` values
dat2 = dat.loc[:, aa.index.intersection(dat.columns)]
cols = aa.loc[dat2.columns].values

# translate those values via `cases_mani` (presumably a manifest mapping to case IDs - TODO confirm)
bb = cases_mani.loc[cols]
# these should be unique...
if pd.Index(bb.values).duplicated().any():
    raise AttributeError("Some case IDs are duplicated in the final dataset")
# relabel the data columns with the translated values
dat2.columns = bb.values

# finally, only keep those cases that are also in the Brennan table
not_in_meta = pd.Index(bb.values).difference(meta.index)

# report (but do not fail on) any cases that have no metadata entry
if len(not_in_meta):
    this = bb.loc[pd.Index(bb.values).isin(not_in_meta)]
    print "Cases not in meta: \n%s" % this.to_string()

# align metadata rows and data columns on the shared cases
meta = meta.loc[meta.index.intersection(bb.values)]
dat2 = dat2.loc[:, meta.index]

# add gene symbols
gs = reference_genomes.ensembl_to_gene_symbol(dat2.index)
# NOTE(review): this adds a non-sample column next to the sample columns, and dat2 is itself the result of a
# .loc selection, so pandas may emit a SettingWithCopyWarning here - confirm
dat2.loc[:, 'Approved Symbol'] = gs

# export both tables to `indir`
meta.to_csv(os.path.join(indir, 'sources.csv'))
dat2.to_csv(os.path.join(indir, 'counts.csv'))
        for g in cl.genes:
            gene_to_dm_cluster.setdefault(g[0], set()).add(c)

    ens_to_dm_cluster = {}
    for c, cl in dmr_res_s1.clusters.items():
        for g in cl.genes:
            if g[0] in all_dm_ens:
                e = all_dm_ens[g[0]]
                ens_to_dm_cluster.setdefault(e, set()).add(c)

    all_de_ens_with_dm = sorted(
        setops.reduce_union(*[
            de_res_full_s1[pid].reindex(all_dm_ens).dropna().index
            for pid in pids
        ]))
    ens_to_gs = reference_genomes.ensembl_to_gene_symbol(all_de_ens_with_dm)

    # single mega table
    single_de_dm_df = []
    de_fdr = {}
    dm_fdr = {}

    # plus separate DGIdb dump
    dgi_db_df = []

    # use to determine which relations are relevant
    all_relations = sorted(
        setops.reduce_union(*[[t[1] for t in cl.genes]
                              for cl in dmr_res_s1.clusters.values()]))

    for e in all_de_ens_with_dm:
def plot_biplot(dat,
                meta,
                dims,
                scatter_colours,
                scatter_markers,
                annotate_features_radius=None,
                annotate_features_quantile=None,
                adjust_annotation=True,
                adjust_annotation_kwargs=None,
                **kwargs):
    """
    Draw a PCA biplot via pca.biplot(), join up the samples belonging to the same patient, add a custom legend and
    optionally label selected features with their gene symbol.
    :param dat: Data passed to pca.biplot(). Its index is used to look up gene symbols for annotated features.
    :param meta: pd.DataFrame, must have columns entitled `type` and `patient_id`
    :param dims: Passed to pca.biplot() as `plot_dims`
    :param scatter_colours: Dict-like giving the colour for each patient ID. Must also contain every entry of
    consts.PIDS, as these are used to build the legend.
    :param scatter_markers: Dict-like giving the marker for each value found in meta.type
    :param annotate_features_radius: If supplied, this is the biplot radius outside of which we annotate genes (by
    symbol).
    :param annotate_features_quantile: If supplied (strictly between 0 and 1), annotate the features whose radius
    is at or above the value found at this fraction of the sorted radii. Cannot be combined with
    annotate_features_radius.
    :param adjust_annotation: If True, the text labels are passed to adjuster.adjust_text_radial_plus_repulsion()
    :param adjust_annotation_kwargs: Optional dict of keyword arguments for that call
    :param **kwargs: Passed to pca.biplot()
    :return: (fig, ax, res), where res is the dict returned by pca.biplot() with `legend_dict` added
    """
    use_radius = annotate_features_radius is not None
    use_quantile = annotate_features_quantile is not None

    if use_radius and use_quantile:
        raise AttributeError(
            "Supply EITHER annotate_features_radius OR annotate_features_quantile."
        )

    if use_quantile:
        assert 0 < annotate_features_quantile < 1, "annotate_features_quantile must be between 0 and 1 (not inclusive)."

    if adjust_annotation_kwargs is None:
        adjust_annotation_kwargs = {}

    # per-sample colour (by patient) and marker (by type)
    colour_by_sample = meta.patient_id.map(scatter_colours.get).to_dict()
    marker_by_sample = meta.type.map(scatter_markers.get).to_dict()

    res = pca.biplot(dat,
                     plot_dims=dims,
                     sample_colours=colour_by_sample,
                     sample_markers=marker_by_sample,
                     **kwargs)

    sample_x, sample_y = res['sample_data']
    feat_x, feat_y = res['feature_data']
    ax = res['ax']
    fig = res['fig']

    # unique values of meta.type, in order of first appearance
    typ = meta.type.factorize()[1]

    # connect patients
    for pid in meta.patient_id.unique():
        is_patient = meta.patient_id == pid
        for t0, t1 in itertools.combinations(typ, 2):
            # draw all possible connections between these two cell types (in one direction only)
            first_group = meta.index[is_patient & (meta.type == t0)]
            second_group = meta.index[is_patient & (meta.type == t1)]

            for a, b in itertools.product(first_group, second_group):
                at_a = meta.index == a
                at_b = meta.index == b
                ax.plot([sample_x[at_a][0], sample_x[at_b][0]],
                        [sample_y[at_a][0], sample_y[at_b][0]],
                        lw=1.5,
                        color=scatter_colours[pid],
                        zorder=9)

    # custom legend outside of plot
    line_kwargs = {
        'class': 'line',
        'markerfacecolor': 'none',
        'markeredgecolor': 'k',
        'markeredgewidth': 1.0,
        'linestyle': 'none'
    }
    patch_kwargs = {'class': 'patch', 'edgecolor': 'k', 'linewidth': 1.}
    # one filled patch per patient and one unfilled marker per cell type
    patient_entries = collections.OrderedDict(
        (pid, dict(patch_kwargs, facecolor=scatter_colours[pid]))
        for pid in consts.PIDS)
    type_entries = collections.OrderedDict(
        (t, dict(line_kwargs, marker=scatter_markers[t])) for t in typ)
    legend_dict = {
        'Patient': patient_entries,
        'Cell type': type_entries
    }

    res['legend_dict'] = legend_dict

    common.add_custom_legend(ax, legend_dict, loc_outside=True)
    fig.tight_layout()
    # leave room on the right hand side for the legend
    fig.subplots_adjust(right=0.8)

    selected = None

    if use_radius:
        # annotate most influential genes
        selected = pca.highlight_biplot_features(feat_x, feat_y,
                                                 annotate_features_radius, ax)
    elif use_quantile:
        radius = (feat_x ** 2 + feat_y ** 2) ** .5
        threshold = sorted(radius)[int(len(radius) * annotate_features_quantile)]
        selected = radius >= threshold

    if selected is None:
        # no feature annotation requested
        return fig, ax, res

    symbols_selected = reference_genomes.ensembl_to_gene_symbol(
        dat.index[selected])

    # add gene symbol annotations, skipping any feature without a symbol
    text_handles = [
        ax.text(feat_x[ix], feat_y[ix], gs, zorder=10)
        for ix, gs in zip(np.where(selected)[0], symbols_selected)
        if not pd.isnull(gs)
    ]
    # rearrange them to avoid overlaps
    if adjust_annotation:
        adjuster.adjust_text_radial_plus_repulsion(
            text_handles, **adjust_annotation_kwargs)

    return fig, ax, res
Beispiel #10
0
    de_res = differential_expression.compute_cross_de(
        rnaseq_obj, pids, external_references=external_refs, **de_params)
    # this is useful for volcano plots, but otherwise not worth computing?
    # de_res_full = differential_expression.compute_cross_de(rnaseq_obj, pids, external_references=external_refs, return_full=True, **de_params)

    cc_dict = cross_comparison.compute_cross_comparison_correction(
        dict([(k, v.index) for k, v in de_res.items()]),
        pids,
        external_ref_labels,
        set_type='pair_only')
    po_specific_to_all_refs = sorted(cc_dict['specific_to_all_refs'])
    pair_only = cc_dict['venn_set']

    # get the genes that consistently differ in the pair comparison only and NOT in Gibco (across all patients)
    # these will have an expression pattern in Gibco similar to GBM, so that they do NOT appear
    po_specific_to_all_refs_gs = reference_genomes.ensembl_to_gene_symbol(
        po_specific_to_all_refs)
    po_specific_to_all_refs_gs = po_specific_to_all_refs_gs.where(
        ~po_specific_to_all_refs_gs.isnull(), po_specific_to_all_refs)

    po_dat = rnaseq_obj.data.loc[po_specific_to_all_refs]
    po_dat.index = po_specific_to_all_refs_gs
    po_dat = np.log2(po_dat + 1)

    # rearrange columns
    the_cols = (po_dat.columns[po_dat.columns.str.contains('GBM')].tolist() +
                ref_samples +
                po_dat.columns[po_dat.columns.str.contains('DURA')].tolist())
    spacing1 = po_dat.columns.str.contains('GBM').sum()
    spacing2 = spacing1 + len(
        ref_samples
    ) + 1  # +1 required as we will already have added a space to the left of this
Beispiel #11
0
        ssgsea_rnaseq_data, xcell_tcga, corr_metric=corr_metric)

    # heatmap showing correlation between pathways and cell types

    # precursor: check for cases where there is a substantial overlap in genes in pathways and cell type signatures
    # load xCell signatures
    # NOTE(review): `index_row` is not a documented pd.read_excel() argument (the index option is `index_col`).
    # Recent pandas will presumably reject it with a TypeError. The loop below reads `Celltype_Source_ID` as an
    # ordinary column and skips two leading columns, which suggests the intent is NOT to set an index, i.e. the
    # argument should probably just be removed - confirm before changing.
    xcell_s = pd.read_excel(XCELL_SIGNATURE_FN, header=0, index_row=0)
    # one set per signature, keyed by Celltype_Source_ID; all non-null values from the third column onwards are
    # collected (presumably the signature genes)
    xcell_signatures = {}
    for i, row in xcell_s.iterrows():
        xcell_signatures[row.Celltype_Source_ID] = set(
            row.iloc[2:].dropna().values)

    # convert IPA pathway Ensembl IDs to symbols for compatibility
    # (IDs with no symbol are dropped)
    ipa_signatures_symb = {}
    for k, v in ipa_signatures.items():
        ipa_signatures_symb[k] = reference_genomes.ensembl_to_gene_symbol(
            v).dropna()

    # compute overlap between cell type signatures and IPA signatures
    pct_shared = analyse_xcell_results.compute_cell_type_pathway_overlap(
        ipa_signatures_symb,
        xcell_signatures,
    )

    # aggregate: reduce each column name to the text before its first underscore, then take the maximum over all
    # columns that share the reduced name
    # NOTE(review): the pattern must be interpreted as a regex; newer pandas defaults str.replace() to
    # regex=False, and groupby(axis=1) is deprecated there - confirm against the pandas version in use
    cc = pct_shared.columns.str.replace(r'(?P<ct>[^_]*)_.*', r'\g<ct>')
    pct_shared_aggr = pct_shared.groupby(cc, axis=1).max()

    # set of pathways with any significance
    # (`co` is defined outside of the visible code)
    logger.info(
        "%d pathways enriched in at least one patient and retained after correlation analysis"
        % co.shape[1])
Beispiel #12
0
    if remove_idh1:
        # filter IDH1 mutants: keep only samples with a non-null IDH1 status equal to 'WT'
        idh1_wt = (~rnaseq_meta.idh1_status.isnull()) & (
            rnaseq_meta.idh1_status == 'WT')

        # restrict the metadata first, then select the matching data columns
        rnaseq_meta = rnaseq_meta.loc[idh1_wt]
        rnaseq_dat = rnaseq_dat_raw.loc[:, rnaseq_meta.index]
    else:
        # keep every column whose name contains 'TCGA'
        # NOTE(review): rnaseq_meta is not filtered in this branch, so it is not guaranteed to line up with the
        # columns of rnaseq_dat - confirm this is intended
        rnaseq_dat = rnaseq_dat_raw.loc[:,
                                        rnaseq_dat_raw.columns.str.
                                        contains('TCGA')]

    if rnaseq_type != 'gliovis':
        # add gene symbols for gene signature scoring?
        # rows whose ID has no symbol are discarded and the index is replaced by the symbols
        gs = reference_genomes.ensembl_to_gene_symbol(
            rnaseq_dat.index).dropna()
        rnaseq_dat = rnaseq_dat.loc[gs.index]
        rnaseq_dat.index = gs.values

    if rnaseq_type == 'counts':
        # convert to CPM: divide each column by its own total, then scale by one million
        rnaseq_dat = rnaseq_dat.divide(rnaseq_dat.sum(axis=0), axis=1) * 1e6

    # Attach the Wang classification results to the metadata. Each column is inserted at position 0, so the final
    # leading column order is the reverse of the insertion order below.
    rnaseq_meta.insert(0, 'wang_classification_simplicity',
                       wang_classes.loc[rnaseq_meta.index, 'Simplicity score'])
    rnaseq_meta.insert(
        0, 'wang_classification_num_matches',
        wang_classes.loc[rnaseq_meta.index, 'Number of matches'])
    rnaseq_meta.insert(0, 'wang_classification',
                       wang_classes.loc[rnaseq_meta.index, 'Wang subclass'])
from rnaseq import general, gsea
from utils import reference_genomes
from utils.output import unique_output_dir

if __name__ == '__main__':
    outdir = unique_output_dir("mouse_gsea_files", reuse_empty=True)
    dat = rnaseq_data.mouse_nsc_salmon()
    # aggregate transcript-level quantification to gene level (10090 is passed as the taxonomy ID throughout)
    dat = general.ensembl_transcript_quant_to_gene(dat, tax_id=10090)
    # only retain samples whose name matches one of these two patterns
    idx = dat.columns.str.contains(r'eNSC[0-9]med') | dat.columns.str.contains(
        r'mDura[0-9AN]*human')
    dat = dat.loc[:, idx]
    # group label per sample: 'iNSC' if the name contains 'mDura', otherwise 'eNSC'
    the_groups = pd.Series('eNSC', index=dat.columns)
    the_groups[dat.columns.str.contains('mDura')] = 'iNSC'

    # now switch from Ensembl to gene symbol and capitalize (why?)
    gs = reference_genomes.ensembl_to_gene_symbol(dat.index, tax_id=10090)
    gs = gs.str.upper()
    # keep the first symbol listed for each Ensembl ID, then discard IDs without any symbol
    gs = gs.loc[~gs.index.duplicated()]
    gs = gs.dropna()

    dat = dat.loc[gs.index]
    dat.index = gs

    # this leaves some duplicate values
    # we'll take the average
    dupe_idx = dat.index[dat.index.duplicated()]
    dupe_map = dat.index.isin(dupe_idx)
    dupes = dat.loc[dupe_map]
    dat = dat.loc[~dupe_map]
    dupes_mean = dupes.groupby(dupes.index).mean()
    # pd.concat() rather than DataFrame.append(), which has been removed from recent pandas releases; the averaged
    # rows still follow the rows that were unique to begin with
    dat = pd.concat([dat, dupes_mean])
Beispiel #14
0
        'ENSG00000135679',
        'ENSG00000198625',
        'ENSG00000141510',
        'ENSG00000100393',
        'ENSG00000149311',
        'ENSG00000012048',
        'ENSG00000139618',
        'ENSG00000116062',
    ]

    # kde for one gene
    counts = np.arange(8000)

    example_col = X.columns[0]
    example_ens = X.index[0]
    example_gene = reference_genomes.ensembl_to_gene_symbol(example_ens)

    x1 = X.loc[example_ens]
    n = float(len(x1))
    p = X.shape[0]
    fr1 = reduce(operator.add, (stats.poisson.pmf(counts, t + r) for t in x1))

    Fr1 = fr1.cumsum() / n

    # run for all genes (rows)
    pool = mp.Pool()
    jobs = {}
    for ei, xi in X.iterrows():
        jobs[ei] = pool.apply_async(eval_one_kde_poisson, args=(xi,))

    pool.close()
Beispiel #15
0
    # row-wise mean of the per-patient logFC columns, ignoring NaN entries
    mean_logfc = pd.Series(np.nanmean(de_res[["%s_logFC" % p for p in pids]], axis=1), index=de_res.index)
    # rows in which every patient is NaN have no mean: drop them
    mean_logfc.dropna(inplace=True)

    # restrict to genes that also have feature coordinates
    ix = feat_dat.index.intersection(mean_logfc.index)
    mean_logfc = mean_logfc.loc[ix]

    # order by decreasing absolute mean logFC
    mean_logfc = mean_logfc.loc[mean_logfc.abs().sort_values(ascending=False).index]

    # mark the 50 genes with the largest absolute mean logFC
    # (`ax` and `feat_dat` here come from code preceding this excerpt)
    ax.scatter(
        feat_dat.loc[mean_logfc.index[:50], 'x'],
        feat_dat.loc[mean_logfc.index[:50], 'y'],
        c='k',
        facecolor='k',
        marker='^',
    )
    # label each of them with its gene symbol, falling back to the original ID when no symbol is available
    gg = reference_genomes.ensembl_to_gene_symbol(mean_logfc.index[:50]).dropna()
    for k, v in feat_dat.loc[mean_logfc.index[:50]].iterrows():
        # `k in gg` tests membership of the Series index
        g = gg[k] if k in gg else k
        ax.text(v['x'], v['y'], g)

    dims = (2, 3)  # for copy paste convenience
    fig, ax, res = plot_biplot(
        dat,
        obj.meta,
        dims,
        scatter_colours,
        scatter_markers,
        scale=0.05
    )

    # feature coordinates for the new plot: one row per entry of dat.index
    # NOTE(review): no column names are assigned here, whereas the code above reads columns 'x' and 'y' - confirm
    feat_dat = pd.DataFrame(np.array(res['feature_data']).transpose(), index=dat.index)
Beispiel #16
0
    export_hypo = []
    export_hyper = []

    for tt, out_arr in zip([partial_hypo_recs, partial_hyper_recs],
                           [export_hypo, export_hyper]):
        for pid_arr, rec in tt:
            genes = set()
            gene_names = []
            for t in rec.INFO['ANN']:
                srch = re.search(r'(?P<g>ENSG[0-9]*)', t)
                if srch is not None:
                    genes.add(srch.group('g'))
            if len(genes) > 0:
                try:
                    gene_names = reference_genomes.ensembl_to_gene_symbol(
                        genes).dropna().unique()
                except KeyError:
                    gene_names = []

            out = collections.OrderedDict([
                ('id', rec.ID),
                ('chrom', rec.CHROM),
                ('start', rec.start),
                ('end', rec.end),
                ('ref', rec.REF),
                ('alt_seq', '|'.join([t.sequence for t in rec.ALT])),
                ('alt_type', '|'.join([t.type for t in rec.ALT])),
                ('gene_ens', ','.join(genes)),
                ('gene_symbol', ','.join(gene_names)),
            ])
            for p in pids:
Beispiel #17
0
        # now, find the union of genes that are PO when ANY of the external references is used
        tmp2 = reduce(unioner, pair_only.loc[pid, external_ref_labels])
        po_intersection_insc.loc[pid, 'any'] = tmp.difference(tmp2)

    # find DE genes
    po_specific_to_reference = [
        sorted(
            reduce(intersecter, po_diff.loc[~po_diff.index.str.contains(pid),
                                            pid])) for pid in cols
    ]
    po_specific_to_reference = pd.Series(po_specific_to_reference, index=cols)

    # get the genes that consistently differ in the pair comparison only and NOT in Gibco (across all patients)
    # these will have an expression pattern in Gibco similar to GBM, so that they do NOT appear
    po_gibco_diff = po_specific_to_reference.loc['GIBCO']
    po_gibco_diff_gs = reference_genomes.ensembl_to_gene_symbol(po_gibco_diff)
    po_gibco_diff_gs = po_gibco_diff_gs.where(~po_gibco_diff_gs.isnull(),
                                              po_gibco_diff)

    po_dat = rnaseq_obj.data.loc[po_gibco_diff]
    po_dat.index = po_gibco_diff_gs
    po_dat = np.log2(po_dat + 1)

    # po_dat = salmon_dat.loc[po_gibco_diff]
    # po_dat.index = po_gibco_diff_gs
    # # dropna() here loses one gene - LINC01090 / ENSG00000231689
    # # all others are present
    # po_dat = np.log2(po_dat.dropna() + 0.01)

    # rearrange columns
    the_cols = (po_dat.columns[po_dat.columns.str.contains('GBM')].tolist() +
Beispiel #18
0
               med_dev_nsc_rel.loc[rel_dev_candidates],
               c='g')
    ax.scatter(ranked_perc.loc[hkg_ens], med_dev_nsc_rel.loc[hkg_ens], c='r')
    for g, e in zip(hkg, hkg_ens):
        ax.text(ranked_perc.loc[e], med_dev_nsc_rel.loc[e], g)
    ax.set_ylim([0, 2])
    ax.set_xlabel("Abundance percentile")
    ax.set_ylabel("Relative median absolute difference from NSC")
    ax.set_title("%d genes meet relative NSC-MAD criteria" %
                 rel_dev_candidates.size)
    ax.figure.savefig(os.path.join(
        outdir, 'relative_median_absolute_deviation_from_nsc.png'),
                      dpi=200)

    final_candidates = ensembl_to_gene_symbol(
        mad_candidates.intersection(rel_dev_candidates).intersection(
            range_candidates))
    print "Identified %d candidates" % final_candidates.size
    print '\n'.join(final_candidates)

    # now re-plot the first figure but with a subset of these
    new_hkg = ['GAPDH', 'ATP5B', 'ACTB', 'PPIA', 'H3F3B']
    new_hkg_ens = gene_symbol_to_ensembl(new_hkg)

    hkg_dat = dat_n.loc[new_hkg_ens, sorted(dat_n.columns)]
    hkg_dat.index = pd.Index(new_hkg, name='Housekeeping gene')

    hkg_dat_rel = hkg_dat.divide(hkg_dat.loc[:, ref], axis=0)
    cols = [ref] + sorted(hkg_dat_rel.columns[hkg_dat_rel.columns != ref])
    hkg_dat_rel = hkg_dat_rel.loc[:, cols]
Beispiel #19
0
            mfc = np.sign(mfc) * 20
        log2_mfc.loc[g] = mfc

    t_values.dropna(inplace=True)
    log2_mfc.dropna(inplace=True)
    idx = t_values.index.intersection(log2_mfc.index)
    t_values = t_values.loc[idx]
    log2_mfc = log2_mfc.loc[idx]
    p_values = p_values.loc[idx]

    from statsmodels.sandbox.stats import multicomp
    tmp = multicomp.multipletests(p_values.values,
                                  method='fdr_bh',
                                  alpha=0.001)
    # get the genes responsible for the observed changes
    reference_genomes.ensembl_to_gene_symbol(the_insc.loc[tmp[0]].index,
                                             tax_id=10090)

    # compare within mice

    data = obj.data.loc[obj.data.index.str.contains('ENS')]
    meta = obj.meta

    # cpm = data.divide(meta.loc[:, 'read_count'].values, axis=1) * 1e6
    cpm = data.divide(data.sum(axis=0), axis=1) * 1e6
    keep = (cpm > .5).sum(axis=1) > 5

    the_dat_cv = np.log2(data.loc[keep] + 1)

    groups_by_mouse = [
        ['eNSC3med', 'eNSC3mouse', 'mDura3N1mouse', 'mDura3N1human'],
        ['eNSC5med', 'eNSC5mouse', 'mDura5N24Amouse', 'mDura5N24Ahuman'],
            tmp2 = pair_only.loc[pid, c]
            # we want anything in the first part that is NOT in the second part
            po_intersection_insc.loc[pid] = tmp.difference(tmp2)

        # find DE genes that are always unique to a given reference (regardless of the GBM)
        po_specific_to_reference = [
            sorted(
                reduce(intersecter, po_diff.loc[~po_diff.index.str.contains(pid), pid])
            ) for pid in cols
        ]
        po_specific_to_reference = pd.Series(po_specific_to_reference, index=cols)

        # get the genes that consistently differ in the pair comparison only and NOT in Gibco (across all patients)
        # these will have an expression pattern in Gibco similar to GBM, so that they do NOT appear
        po_ref_diff = po_specific_to_reference.loc[c]
        po_ref_diff_gs = reference_genomes.ensembl_to_gene_symbol(po_ref_diff)
        po_ref_diff_gs = po_ref_diff_gs.where(~po_ref_diff_gs.isnull(), po_ref_diff)

        po_dat = rnaseq_obj.data.loc[po_ref_diff]
        po_dat.index = po_ref_diff_gs
        po_dat = np.log2(po_dat + 1)

        # po_dat = salmon_dat.loc[po_gibco_diff]
        # po_dat.index = po_gibco_diff_gs
        # # dropna() here loses one gene - LINC01090 / ENSG00000231689
        # # all others are present
        # po_dat = np.log2(po_dat.dropna() + 0.01)

        # rearrange columns
        the_cols = (
            po_dat.columns[po_dat.columns.str.contains('GBM')].tolist() +
    col_order = plot_all_clustermaps(data_nsc, filestem, col_colors=col_colors)

    # for every sample, extract the top N by count and summarise

    topNs = [10, 50, 100]

    for topN in topNs:

        common_genes = set()
        top_dat = []
        for i in range(data_rr.shape[1]):
            t = data_rr_mt.iloc[:, i].sort_values(ascending=False)[:topN]
            common_genes.update(t.index)

        top_dat = data_rr_mt.loc[list(common_genes)].divide(data_rr.sum(), axis=1)
        symb = reference_genomes.ensembl_to_gene_symbol(top_dat.index)
        tidx = np.array(top_dat.index)
        tidx[~symb.isnull().values] = symb.loc[~symb.isnull()].values
        top_dat.index = tidx

        filestem = os.path.join(OUTDIR, 'clustermap_sub_rrna_mt_top_%d' % topN)
        col_order = plot_all_clustermaps(top_dat, filestem, col_colors=col_colors)

        filestem = os.path.join(OUTDIR, 'correlation_sub_rrna_mt_top_%d' % topN)
        plot_all_correlation_heatmaps(top_dat, filestem, col_order, vmin=0.5, vmax=1.)


    # bar charts of successive markers, used to characterise based on timeline
    # for this, only astrocytes and NSCs useful, so remove oligo and neuron
    astro_markers2 = [
        'NFIA',