def prepare_gct_files(outdir=None):
    """
    Write the GCT files needed to run the classification.

    Three files are written into ``outdir``:
    - gbm_ffpe_cc_fpkm.gct: our GBM cell culture and FFPE samples
    - tcga_idh1_wt_fpkm.gct: the TCGA RNA-Seq cohort, IDH1 WT samples only
    - tcga_idh1_wt_and_gbm_ffpe_cc_fpkm.gct: the two files above, combined
    Values are FPKM and rows are relabelled with gene symbols throughout, as
    these are used by Wang.

    :param outdir: Directory to write to. If None, one is obtained from
        unique_output_dir("gct_files_for_wang").
    """
    if outdir is None:
        outdir = unique_output_dir("gct_files_for_wang")

    # 1) Our data
    # FFPE samples: tag the column names so they can be told apart later
    ffpe_obj = rnaseq_data.load_by_patient('all', type='ffpe')
    ffpe_fpkm = ffpe_obj.get_fpkm()
    ffpe_fpkm.columns = ['%s_FFPE' % ref for ref in ffpe_obj.meta.reference_id]

    # Remaining samples: keep only the columns whose metadata type is GBM
    cc_obj = rnaseq_data.load_by_patient(patient_ids='all')
    cc_fpkm = cc_obj.get_fpkm()
    is_gbm = cc_obj.meta.type == 'GBM'
    cc_fpkm = cc_fpkm.loc[:, is_gbm]

    ours = pd.concat((cc_fpkm, ffpe_fpkm), axis=1)

    # Relabel rows by gene symbol; rows with a missing lookup result are dropped
    ours_symbols = reference_genomes.ensembl_to_gene_symbol(ours.index).dropna()
    ours = ours.loc[ours_symbols.index]
    ours.index = ours_symbols

    ours_fn = os.path.join(outdir, "gbm_ffpe_cc_fpkm.gct")
    gsea.data_to_gct(ours, ours_fn)

    # 2) TCGA (IDH1 WT only)
    tcga_fpkm, tcga_info = rnaseq_data.tcga_primary_gbm(units='fpkm')
    is_wt = tcga_info.idh1_status == 'WT'
    tcga_fpkm = tcga_fpkm.loc[:, is_wt]

    tcga_symbols = reference_genomes.ensembl_to_gene_symbol(tcga_fpkm.index).dropna()
    # Drop repeated index entries, keeping the first occurrence of each
    first_occurrence = ~tcga_symbols.index.duplicated()
    tcga_symbols = tcga_symbols.loc[first_occurrence]
    tcga_fpkm = tcga_fpkm.loc[tcga_symbols.index]
    tcga_fpkm.index = tcga_symbols

    tcga_fn = os.path.join(outdir, "tcga_idh1_wt_fpkm.gct")
    gsea.data_to_gct(tcga_fpkm, tcga_fn)

    # 3) Combined: built from the two files just written, in that order
    infiles = [ours_fn, tcga_fn]
    combined = gsea.combine_gct_files(*infiles)
    combined_fn = os.path.join(outdir, "tcga_idh1_wt_and_gbm_ffpe_cc_fpkm.gct")
    gsea.data_to_gct(combined, combined_fn)
    de_params = {
        'lfc': 1,
        'fdr': 0.01,
        'method': 'GLM'
    }

    subgroups = {
        'RTK I': ['019', '030', '031'],
        'RTK II': ['017', '050', '054'],
    }

    intersecter = lambda x, y: set(x).intersection(y)
    unioner = lambda x, y: set(x).union(y)

    # Load RNA-Seq from STAR
    rnaseq_obj = rnaseq_data.load_by_patient(pids, annotate_by='Ensembl Gene ID')

    # load additional references if required
    h9_obj = rnaseq_data.gse61794(annotate_by='Ensembl Gene ID')
    h1_obj = rnaseq_data.gse38993(annotate_by='Ensembl Gene ID')
    rnaseq_obj = rnaseq_data.MultipleBatchLoader([rnaseq_obj, h1_obj, h9_obj])

    # discard unmapped, etc
    rnaseq_obj.data = rnaseq_obj.data.loc[rnaseq_obj.data.index.str.contains('ENSG')]
    rnaseq_obj.meta = rnaseq_obj.meta.loc[~rnaseq_obj.meta.index.str.contains('PSC')]
    rnaseq_obj.meta = rnaseq_obj.meta.loc[~rnaseq_obj.meta.index.str.contains('fibroblast')]
    rnaseq_obj.data = rnaseq_obj.data.loc[:, rnaseq_obj.meta.index]

    # load RNA-Seq from Salmon (for normalised comparison)
    # disabled for now
    if False:
# --- Example no. 3 ---
# 0
    unit = 'cpm'
    type = 'cell_culture'
    tax_id = 9606

    if isinstance(cell_types, str):
        cell_types = [cell_types]

    if unit == 'tpm':
        dat = rnaseq_data.load_salmon_by_patient_id(pids,
                                                    include_control=False,
                                                    type=type)
        dat = general.ensembl_transcript_quant_to_gene(dat, tax_id=tax_id)
    elif unit == 'cpm':
        obj = rnaseq_data.load_by_patient(pids,
                                          type=type,
                                          source='star',
                                          annotate_by='Ensembl Gene ID',
                                          include_control=False)
        dat = obj.data
        dat = dat.divide(dat.sum(axis=0), axis=1) * 1e6
    else:
        raise NotImplementedError()

    if cell_types is not None:
        idx = reduce(
            lambda x, y: x | y,
            [dat.columns.str.contains(t) for t in cell_types],
        )
        dat = dat.loc[:, idx]

    lookup = reference_genomes.gene_symbol_to_ensembl(gois, tax_id=tax_id)