def prepare_gct_files(outdir=None):
    """
    Write the GCT files needed to run the classification.

    Three files are produced inside ``outdir``:

    - ``gbm_ffpe_cc_fpkm.gct``: our GBM cell culture and FFPE samples
    - ``tcga_idh1_wt_fpkm.gct``: the TCGA RNA-Seq cohort (IDH1 WT only)
    - ``tcga_idh1_wt_and_gbm_ffpe_cc_fpkm.gct``: both of the above combined

    All data are in FPKM units and indexed by gene symbol, as these are
    used by Wang.

    :param outdir: Directory to write into. If None, one is obtained from
        ``unique_output_dir("gct_files_for_wang")``.
    :return: None. The files on disk are the only output.
    """
    if outdir is None:
        outdir = unique_output_dir("gct_files_for_wang")

    def _index_by_gene_symbol(frame, drop_duplicate_ids=False):
        # Look up the current row index and discard identifiers for which the
        # lookup returned a missing value.
        symbols = reference_genomes.ensembl_to_gene_symbol(frame.index).dropna()
        if drop_duplicate_ids:
            # Keep only the first occurrence of each repeated identifier.
            symbols = symbols.loc[~symbols.index.duplicated()]
        relabelled = frame.loc[symbols.index]
        relabelled.index = symbols
        return relabelled

    # 1) In-house samples: FFPE first, then cell culture restricted to GBM.
    ffpe_obj = rnaseq_data.load_by_patient('all', type='ffpe')
    ffpe_fpkm = ffpe_obj.get_fpkm()
    # Suffix the FFPE column names so they can be told apart after merging.
    ffpe_fpkm.columns = ["%s_FFPE" % ref for ref in ffpe_obj.meta.reference_id]

    cc_obj = rnaseq_data.load_by_patient(patient_ids='all')
    cc_fpkm = cc_obj.get_fpkm()
    cc_fpkm = cc_fpkm.loc[:, cc_obj.meta.type == 'GBM']

    # NOTE(review): unlike the TCGA branch below, duplicated identifiers are
    # not removed here - confirm that this asymmetry is intentional.
    ours = _index_by_gene_symbol(pd.concat([cc_fpkm, ffpe_fpkm], axis=1))
    ours_path = os.path.join(outdir, "gbm_ffpe_cc_fpkm.gct")
    gsea.data_to_gct(ours, ours_path)

    # 2) TCGA cohort, keeping IDH1 wild-type samples only.
    tcga_fpkm, tcga_meta = rnaseq_data.tcga_primary_gbm(units='fpkm')
    tcga_fpkm = tcga_fpkm.loc[:, tcga_meta.idh1_status == 'WT']
    tcga_fpkm = _index_by_gene_symbol(tcga_fpkm, drop_duplicate_ids=True)
    tcga_path = os.path.join(outdir, "tcga_idh1_wt_fpkm.gct")
    gsea.data_to_gct(tcga_fpkm, tcga_path)

    # 3) Both datasets combined, built from the two files just written.
    combined = gsea.combine_gct_files(ours_path, tcga_path)
    combined_path = os.path.join(outdir, "tcga_idh1_wt_and_gbm_ffpe_cc_fpkm.gct")
    gsea.data_to_gct(combined, combined_path)
de_params = { 'lfc': 1, 'fdr': 0.01, 'method': 'GLM' } subgroups = { 'RTK I': ['019', '030', '031'], 'RTK II': ['017', '050', '054'], } intersecter = lambda x, y: set(x).intersection(y) unioner = lambda x, y: set(x).union(y) # Load RNA-Seq from STAR rnaseq_obj = rnaseq_data.load_by_patient(pids, annotate_by='Ensembl Gene ID') # load additional references if required h9_obj = rnaseq_data.gse61794(annotate_by='Ensembl Gene ID') h1_obj = rnaseq_data.gse38993(annotate_by='Ensembl Gene ID') rnaseq_obj = rnaseq_data.MultipleBatchLoader([rnaseq_obj, h1_obj, h9_obj]) # discard unmapped, etc rnaseq_obj.data = rnaseq_obj.data.loc[rnaseq_obj.data.index.str.contains('ENSG')] rnaseq_obj.meta = rnaseq_obj.meta.loc[~rnaseq_obj.meta.index.str.contains('PSC')] rnaseq_obj.meta = rnaseq_obj.meta.loc[~rnaseq_obj.meta.index.str.contains('fibroblast')] rnaseq_obj.data = rnaseq_obj.data.loc[:, rnaseq_obj.meta.index] # load RNA-Seq from Salmon (for normalised comparison) # disabled for now if False:
unit = 'cpm' type = 'cell_culture' tax_id = 9606 if isinstance(cell_types, str): cell_types = [cell_types] if unit == 'tpm': dat = rnaseq_data.load_salmon_by_patient_id(pids, include_control=False, type=type) dat = general.ensembl_transcript_quant_to_gene(dat, tax_id=tax_id) elif unit == 'cpm': obj = rnaseq_data.load_by_patient(pids, type=type, source='star', annotate_by='Ensembl Gene ID', include_control=False) dat = obj.data dat = dat.divide(dat.sum(axis=0), axis=1) * 1e6 else: raise NotImplementedError() if cell_types is not None: idx = reduce( lambda x, y: x | y, [dat.columns.str.contains(t) for t in cell_types], ) dat = dat.loc[:, idx] lookup = reference_genomes.gene_symbol_to_ensembl(gois, tax_id=tax_id)