def load_rnaseq(pids, ref_names, ref_name_filter='NSC', discard_filter='IPSC', strandedness=None):
    """
    Load patient RNA-Seq data plus a number of reference datasets and combine them in one loader object.

    :param pids: Patient IDs, passed unchanged to ``rnaseq_loader.load_by_patient``.
    :param ref_names: Sized iterable of reference dataset names; each one is passed to
        ``rnaseq_loader.load_references``.
    :param ref_name_filter: If not None, only the reference samples whose name (meta index) matches this pattern
        are kept. Matching uses ``Index.str.contains``.
    :param discard_filter: If not None, a single pattern or an iterable of patterns. Any sample in the combined
        object whose name matches one of the patterns is removed. A single string is always treated as ONE pattern.
    :param strandedness: Iterable of same length as ref_names giving the strandedness of each ref. If None, every
        reference is loaded with strandedness 'u'.
    :return: The ``loader.MultipleBatchLoader`` built from the patient object followed by the reference objects,
        with ``meta``, ``data`` and ``batch_id`` reduced to the retained samples.
    :raises ValueError: If ``strandedness`` is supplied and its length differs from that of ``ref_names``.
    """
    if strandedness is None:
        strandedness = ['u'] * len(ref_names)
    elif len(strandedness) != len(ref_names):
        raise ValueError("Supplied strandedness must be a list of the same length as the ref_names.")

    # Load our own RNA-Seq data (original note: "from STAR"; the source is whatever load_by_patient defaults to)
    obj = rnaseq_loader.load_by_patient(pids)

    # load additional references
    ref_objs = []
    for rn, strnd in zip(ref_names, strandedness):
        ref_obj = rnaseq_loader.load_references(rn, strandedness=strnd)
        if ref_name_filter is not None:
            # only keep relevant references
            ref_obj.meta = ref_obj.meta.loc[ref_obj.meta.index.str.contains(ref_name_filter)]
            ref_obj.data = ref_obj.data.loc[:, ref_obj.meta.index]
        ref_objs.append(ref_obj)

    obj = loader.MultipleBatchLoader([obj] + ref_objs)

    if discard_filter is not None:
        # A lone string must be wrapped rather than iterated. Checking for ``__iter__`` alone is only sufficient
        # on Python 2: on Python 3 ``str`` defines ``__iter__``, so 'IPSC' would be split into 'I', 'P', 'S', 'C'
        # and each character applied as a separate pattern.
        if isinstance(discard_filter, (str, bytes)) or not hasattr(discard_filter, '__iter__'):
            discard_filter = [discard_filter]
        for d in discard_filter:
            # keep meta, data and batch_id aligned after every pattern
            obj.meta = obj.meta.loc[~obj.meta.index.str.contains(d)]
            obj.data = obj.data.loc[:, obj.meta.index]
            obj.batch_id = obj.batch_id.loc[obj.meta.index]

    return obj
def prepare_gct_files_hgic(pids=consts.ALL_PIDS, outdir=None):
    """
    Prepare the GCT files required to perform classification of the hGIC samples:
    - hGIC FFPE
    - hGIC cell culture
    One file is written for every combination of sample type ('cell_culture', 'ffpe') and quantification source
    ('star', 'salmon', 'star/cufflinks'), i.e. six files in total. No combined FFPE + cell culture file is
    written here.
    Original author's note on units: FPKM (cufflinks), TPM (salmon) and CPM (STAR).
    Use gene symbols as these are contained in the signatures.

    :param pids: Patient IDs, passed unchanged to ``loader.load_by_patient``.
    :param outdir: Directory to write the .gct files to. If None, ``output.unique_output_dir()`` is used.
    :return: List of the paths of the files written, in the order they were created.
    """
    if outdir is None:
        outdir = output.unique_output_dir()

    infiles = []

    for typ in ('cell_culture', 'ffpe'):
        for src in ('star', 'salmon', 'star/cufflinks'):
            this_obj = loader.load_by_patient(pids, type=typ, source=src, include_control=False)
            # only the samples annotated as GBM are wanted
            this_obj.filter_samples(this_obj.meta.type == 'GBM')
            if typ == 'ffpe':
                # restrict to the 'best' versions (there are some duplicates where we tried twice)
                this_obj.filter_by_sample_name(consts.FFPE_RNASEQ_SAMPLES_ALL)

            # switch the row identifiers from Ensembl gene IDs to approved gene symbols
            this_dat = reference_genomes.translate_quantification_resolving_duplicates(
                this_obj.data,
                'Ensembl Gene ID',
                'Approved Symbol'
            )

            # file name is <source label>_<sample type>.gct, where the label comes from SRC_MAP
            fn = os.path.join(outdir, "%s_%s.gct" % (SRC_MAP[src], typ))
            gsea.data_to_gct(this_dat, fn)
            infiles.append(fn)

    return infiles
from rnaseq import loader, filter, general from utils import output import os import numpy as np if __name__ == "__main__": min_cpm = 1 obj = loader.load_by_patient('all', type='ffpe') samples = [ 'NH15_1661DEF2C', 'NH15_1877_SP1C', 'NH15_2101_DEF1A', 'NH16_270_DEF1Ereplacement', 'NH16_616DEF1B', 'NH16_677_SP1A', 'NH16_1574DEF1A', 'NH16_1976_DEF1Areplacement', 'NH16_2063_DEF1Areplacement', 'NH16_2214DEF1A', 'NH16_2255DEF1B2', 'NH16_2806DEF3A1', ] # remove duplicates dat = obj.data.loc[:, samples] dat = filter.filter_by_cpm(dat, min_cpm=min_cpm, min_n_samples=1) cpm = (dat + 1).divide(dat.sum() + 1, axis=1) * 1e6 general.add_gene_symbols_to_ensembl_data(cpm) outdir = output.unique_output_dir("ffpe_logcpm_values")
mtor_geneset = mtor_gs_dict[mtor_source] tam_genesets = tam_gs_dict[tam_signature_source] genesets = dict(tam_genesets) genesets['mTOR'] = mtor_geneset subgroups = consts.SUBGROUPS subgroups_lookup = {} for grp, arr in subgroups.items(): subgroups_lookup.update(dict([(t, grp) for t in arr])) outdir = output.unique_output_dir() obj = loader.load_by_patient(consts.PIDS, type='ffpe', source='salmon', include_control=False) obj.filter_by_sample_name(consts.FFPE_RNASEQ_SAMPLES) obj.meta.insert(0, 'patient_id', nh_id_to_patient_id(obj.meta.index)) obj.meta.insert(1, 'subgroup', [subgroups_lookup[pid] for pid in obj.meta.patient_id]) rnaseq_dat = obj.data.copy() # use gene symbol identifiers gs = reference_genomes.ensembl_to_gene_symbol(rnaseq_dat.index).dropna() rnaseq_dat = rnaseq_dat.loc[gs.index] rnaseq_dat.index = gs.values groups = obj.meta.subgroup group_list = groups.unique()
min_cpm_individual = 0.1 outdir = output.unique_output_dir("james_opc_smartseq2_vs_polya") ## 1) STAR CPM estimates ss2_obj = loader.load_references('wtchg_p180059', strandedness='u') assigned_sum = ss2_obj.data.sum() unassigned_sum = ss2_obj.data_unassigned.drop('N_unmapped').sum() ss2_pct_assigned = assigned_sum / (assigned_sum + unassigned_sum) * 100. print "SmartSeq2 samples % assigned" print ss2_pct_assigned polya_obj = loader.load_by_patient(pids) # restrict to relevant samples for first part of the analysis idx = (polya_obj.meta.type == 'iNSC') polya_nsc_meta = polya_obj.meta.loc[idx] polya_nsc_data = polya_obj.data.loc[:, polya_nsc_meta.index] polya_nsc_unassigned = polya_obj.data_unassigned.loc[:, polya_nsc_meta.index] assigned_sum = polya_nsc_data.sum() unassigned_sum = polya_nsc_unassigned.drop('N_unmapped').sum() polya_pct_assigned = assigned_sum / (assigned_sum + unassigned_sum) * 100. print "Poly(A) samples % assigned" print polya_pct_assigned
DE_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'de') de_params = consts.DE_PARAMS dmr_params = consts.DMR_PARAMS dmr_params['n_jobs'] = mp.cpu_count() norm_method_s1 = 'swan' min_cpm = 1. min_counts = 10000000 pids = consts.PIDS # load RNA-Seq data rna_ff_obj = rnaseq_loader.load_by_patient(pids, type='ffpe', source='star', include_control=False) # filter FFPE to include only the best samples (NB not actually good!) rna_ff_obj.filter_samples( rna_ff_obj.meta.index.isin(consts.FFPE_RNASEQ_SAMPLES)) rna_ff_obj.batch_id = rna_ff_obj.meta.batch # add FFPE PID nh_id = rna_ff_obj.meta.index.str.replace(r'(_?)(DEF|SP).*', '') p_id = [NH_ID_TO_PATIENT_ID_MAP[t.replace('_', '-')] for t in nh_id] rna_ff_obj.meta.insert(0, 'nh_id', nh_id) rna_ff_obj.meta.insert(0, 'patient_id', p_id) rna_ff_obj.batch_id = rna_ff_obj.meta.batch # reject samples with low counts
dmr_hash_dict) filename = 'dmr_results_paired_comparison.%d.pkl' % the_hash fn = os.path.join(DMR_LOAD_DIR, filename) if os.path.isfile(fn): logger.info("Reading S1 DMR results from %s", fn) dmr_res_s1 = dmr.DmrResultCollection.from_pickle(fn, anno=anno) else: raise AttributeError( "Unable to load pre-computed DMR results, expected at %s" % fn) # extract results dmr_res_full_s1 = dmr_res_s1.results dmr_res_sign_s1 = dmr_res_s1.results_significant rnaseq_obj = obj = rnaseq_loader.load_by_patient(pids) rnaseq_obj.filter_by_sample_name(consts.S1_RNASEQ_SAMPLES) # only keep the required syngeneic samples for this analysis dat_s1 = rnaseq_obj.data meta_s1 = rnaseq_obj.meta the_hash = tscdd.de_results_hash(meta_s1.index.tolist(), de_params) filename = 'de_results_paired_comparison.%d.pkl' % the_hash fn = os.path.join(DE_LOAD_DIR, filename) if os.path.isfile(fn): logger.info("Reading S1 DE results from %s", fn) with open(fn, 'rb') as f: de_res_full_s1 = pickle.load(f)
from settings import INTERMEDIATE_DIR import os import pickle import re import pandas as pd import statsmodels.formula.api as sm logger = log.get_console_logger() if __name__ == '__main__': outdir = output.unique_output_dir() DMR_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'dmr') DE_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'de') rnaseq_obj = loader.load_by_patient(consts.PIDS) rnaseq_obj.filter_by_sample_name(consts.S1_RNASEQ_SAMPLES) dat_s1 = rnaseq_obj.data meta_s1 = rnaseq_obj.meta.loc[dat_s1.columns] the_hash = tsgd.de_results_hash(meta_s1.index.tolist(), consts.DE_PARAMS) filename = 'de_results_paired_comparison.%d.pkl' % the_hash fn = os.path.join(DE_LOAD_DIR, filename) if os.path.isfile(fn): logger.info("Reading S1 DE results from %s", fn) with open(fn, 'rb') as f: de_res_full_s1 = pickle.load(f) else: raise NotImplementedError(
load_kwds = {'source': source, 'alignment_subdir': SetMe} if source == 'salmon': units = 'tpm' load_kwds['units'] = 'tpm' if source == 'star': # set strandedness as a cue to import for each load_kwds['strandedness'] = SetMe # restrict samples manually to avoid changes going forwards our_samples = consts.S1_RNASEQ_SAMPLES_INSC + consts.S1_RNASEQ_SAMPLES_IPSC + consts.S1_RNASEQ_SAMPLES_FB + [ 'GIBCO_NSC_P4' ] # our data (everything) obj = loader.load_by_patient(pids, source=source) # obj.filter_by_sample_name(our_samples) # HipSci data hip_obj = loader.hipsci_ipsc(aggregate_to_gene=True) hip_obj.meta.insert(3, 'batch', hip_obj.batch_id) # hip_obj.meta.insert(3, 'batch', 'HipSci') # reduce the number in a (repeatably) random fashion rs = np.random.RandomState( 42) # set the seed so we always get the same samples keep = np.zeros(hip_obj.meta.shape[0]).astype(bool) idx = hip_obj.meta.index.tolist() rs.shuffle(idx) idx = idx[:n_hipsci] hip_obj.meta = hip_obj.meta.loc[idx]
else: raise AttributeError( "Unable to load pre-computed DMR results, expected at %s" % fn) # extract results dmr_res_full_s1 = dmr_res_s1.results dmr_res_sign_s1 = dmr_res_s1.results_significant # get samples used in each comparison dmr_comparison_groups = collections.OrderedDict([(pid, {}) for pid in consts.PIDS]) gg = me_data.columns.groupby(zip(me_meta.patient_id, me_meta.type)) for (pid, typ), samples in gg.items(): dmr_comparison_groups[pid][typ] = samples rnaseq_obj = rnaseq_loader.load_by_patient( pids) # quicker than tscdd method that loads refs too # rnaseq_obj = tscdd.load_rnaseq( # pids, # external_ref_names_de, # strandedness=external_ref_strandedness_de, # ) rnaseq_obj.filter_by_sample_name(consts.ALL_RNASEQ_SAMPLES) # only keep the required syngeneic samples for this analysis dat_s1 = rnaseq_obj.data.loc[:, rnaseq_obj.meta.index.isin(consts. S1_RNASEQ_SAMPLES)] meta_s1 = rnaseq_obj.meta.loc[dat_s1.columns]
] row_per_fig = 5 # load TPM data cols_syn_gic = consts.S1_RNASEQ_SAMPLES_GIC cols_syn_insc = consts.S1_RNASEQ_SAMPLES_INSC cols_ref_nsc = [ 'GIBCO_NSC_P4', 'H9_NSC_1', 'H9_NSC_2' ] outdir = output.unique_output_dir() obj1 = loader.load_by_patient(pids, source='salmon', include_control=True) obj2 = loader.load_references('GSE61794', source='salmon') obj = loader.MultipleBatchLoader([obj1, obj2]) obj.filter_by_sample_name(consts.ALL_RNASEQ_SAMPLES) dat = obj.data.copy() general.add_gene_symbols_to_ensembl_data(dat) # check that all GOIs are present vc = dat['Gene Symbol'].value_counts() for g in gois: if g not in vc: raise KeyError("Gene %s was not found." % g) if vc[g] != 1: raise AttributeError("Gene %s has %d hits." % (g, vc[g]))
meta_fn = os.path.join(DATA_DIR, 'rnaseq', 'tcga_gbm', 'primary_tumour/htseq-count_fpkm/sources.csv') dat_fn = os.path.join(DATA_DIR, 'rnaseq', 'tcga_gbm', 'primary_tumour/htseq-count_fpkm/fpkm.csv') tcga_meta = pd.read_csv(meta_fn, header=0, index_col=0) tcga_dat = pd.read_csv(dat_fn, header=0, index_col=0) # filter: primary GBM only ix = (tcga_meta.idh1_status == 'WT') tcga_meta = tcga_meta[ix] tcga_dat = tcga_dat[tcga_meta.index] tcga_dat = tcga_dat.divide(tcga_dat.sum(axis=0), axis=1) * 1e6 # load our data gic_obj = loader.load_by_patient(consts.PIDS, source='salmon', include_control=False, type='cell_culture') ffpe_obj = loader.load_by_patient(consts.PIDS, source='salmon', include_control=False, type='ffpe') # add NH ID and patient ID to FFPE nh_id = ffpe_obj.meta.index.str.replace(r'(_?)(DEF|SP).*', '') p_id = [NH_ID_TO_PATIENT_ID_MAP[t.replace('_', '-')] for t in nh_id] ffpe_obj.meta.insert(0, 'nh_id', nh_id) ffpe_obj.meta.insert(0, 'patient_id', p_id) # ditto GIC gic_obj.meta.insert( 0, 'patient_id',
import os from plotting import clustering, common, pca from sklearn.decomposition.pca import PCA from stats import transformations import numpy as np from utils import output import os import pandas as pd from matplotlib import pyplot as plt if __name__ == "__main__": eps = 1e-2 outdir = output.unique_output_dir("export_sb_data") obj_star = loader.load_by_patient(['ICb1299', '3021'], source='star', type='cell_culture', include_control=False) obj_salmon = loader.load_by_patient(['ICb1299', '3021'], source='salmon', type='cell_culture', include_control=False) # cluster plot tpm = filter.filter_by_cpm(obj_salmon.data, min_cpm=1, min_n_samples=4) batch_colours = common.COLOUR_BREWERS[len(obj_salmon.meta.batch.unique())] line_colours = common.COLOUR_BREWERS[2] cc = pd.DataFrame(line_colours[0], index=tpm.columns, columns=['Batch', 'Cell line']) aa, bb = obj_salmon.meta.batch.factorize() for i in range(aa.max()): cc.loc[aa == i, 'Batch'] = batch_colours[i] cc.loc[cc.index.str.contains('3021'), 'Cell line'] = line_colours[1] cg = clustering.dendrogram_with_colours(
'017', '018', '019', '030', '031', '026', '044', '049', '050', '052', '054', '061' ] if units == 'tpm': min_val = 1 min_n = 4 eps = .01 elif units == 'estimated_counts': min_val = 10 min_n = 4 eps = .01 if remove_mt: mt_ensg = set(gtf_reader.get_mitochondrial()) patient_obj = loader.load_by_patient(pids, source='salmon', units=units) patient_data = patient_obj.data # discard GBM and unused 016 iNSC patient_data = patient_data.loc[:, ~patient_data.columns.str.contains('GBM')] patient_data = patient_data.drop( ['DURA061_NSC_N6_P4', 'DURA061_NSC_N1_P5'], axis=1) # discard mitochondrial genes if remove_mt: idx = ~patient_data.index.isin(mt_ensg) pdbg = patient_data.loc[idx] # renorm if units == 'tpm': pdbg = pdbg.divide(pdbg.sum(), axis=1) * 1e6
) dedm_indir = os.path.join( HGIC_LOCAL_DIR, 'current/core_pipeline/rnaseq_methylation_combined/s0_individual_patients_direct_comparison/ipa/pathways' ) DE_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'de') outdir = output.unique_output_dir() ####################################################### # DE ####################################################### # data rnaseq_obj = rnaseq_loader.load_by_patient(pids, include_control=False, source='star') rnaseq_obj.filter_by_sample_name(consts.S1_RNASEQ_SAMPLES) dat_s1 = rnaseq_obj.data meta_s1 = rnaseq_obj.meta cpm = dat_s1.divide(dat_s1.sum(axis=0), axis=1) * 1e6 # DE results the_hash = tsgd.de_results_hash(meta_s1.index.tolist(), de_params) filename = 'de_results_paired_comparison.%d.pkl' % the_hash fn = os.path.join(DE_LOAD_DIR, filename) if os.path.isfile(fn): logger.info("Reading S1 DE results from %s", fn)
# load all data pids = ['018', '019', '030', '031', '017', '050', '054', '061', '026', '052'] units = 'tpm' out_subdir = os.path.join(outdir, units) if not os.path.isdir(out_subdir): os.makedirs(out_subdir) print "Created output subdirectory %s" % out_subdir source_by_units = { 'tpm': 'salmon', 'counts': 'star', 'fpkm': 'star/cufflinks' } obj = loader.load_by_patient(pids, source=source_by_units[units], include_control=True) # set gibco aside dat_gibco = obj.data.loc[:, obj.data.columns.str.contains('GIBCO')] dat_gibco = ens_index_to_gene_symbol(dat_gibco) # drop any cell types other than GBM and iNSC ix = obj.meta['type'].isin(['GBM', 'iNSC']) # drop unneeded GBM061 samples ix = ix & (~obj.meta.index.isin(['DURA061_NSC_N1_P5', 'DURA061_NSC_N6_P4'])) obj.filter_samples(ix) # convert to gene symbols dat = ens_index_to_gene_symbol(obj.data) # load reference dataset(s)
elif type == 'csv': save_func = lambda x: x.to_csv else: raise NotImplementedError("Unsupported type %s." % type) save_func(dat)(fn) if __name__ == "__main__": outdir = output.unique_output_dir() keep_samples = consts.S1_RNASEQ_SAMPLES_FB + consts.S1_RNASEQ_SAMPLES_INSC + consts.S1_RNASEQ_SAMPLES_GIC + \ consts.S1_RNASEQ_SAMPLES_IPSC + consts.S1_RNASEQ_SAMPLES_IAPC star_cc_obj = loader.load_by_patient( consts.PIDS, type='cell_culture', source='star', include_control=True, ) ix = star_cc_obj.meta.index.isin(keep_samples) star_cc_obj.filter_samples(ix) salmon_cc_obj = loader.load_by_patient( consts.PIDS, type='cell_culture', source='salmon', include_control=True, ) ix = salmon_cc_obj.meta.index.isin(keep_samples) salmon_cc_obj.filter_samples(ix) star_ff_obj = loader.load_by_patient(
quantile = 0.99 # dimensions (components) to investigate dims = [0, 1, 2] selection_radii_for_plotting = {0: 0.6, 1: 0.30, 2: 0.25} # path to syngeneic DE results fn_de_res = os.path.join(HGIC_LOCAL_DIR, 'current/core_pipeline/rnaseq/', 'full_de_syngeneic_only.xlsx') # load DE results de_res = pd.read_excel(fn_de_res, index_col=0) # load data for iNSC and GBM (Salmon TPM) obj = loader.load_by_patient(consts.PIDS, source='salmon') obj.filter_by_sample_name(consts.S1_RNASEQ_SAMPLES_GIC + consts.S1_RNASEQ_SAMPLES_INSC) obj.meta.insert( 0, 'patient_id', obj.meta.index.str.replace(r'(GBM|DURA)(?P<pid>[0-9]{3}).*', '\g<pid>')) cmap = common.get_best_cmap(len(consts.PIDS)) scatter_colours = dict(zip(consts.PIDS, cmap)) scatter_markers = {'GBM': 's', 'iNSC': 'o'} # scaling parameter applied during SVD scale_preserved = 0.05
de_params = consts.DE_PARAMS dmr_params = consts.DMR_PARAMS dmr_params['n_jobs'] = mp.cpu_count() # file location DMR_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'dmr') DE_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'de') # boilerplate outdir = output.unique_output_dir() logger = log.get_console_logger() ## DE # load data (ours only, no references) rnaseq_obj = rnaseq_loader.load_by_patient(pids, include_control=False) rnaseq_obj.filter_by_sample_name(consts.S1_RNASEQ_SAMPLES) dat_s1 = rnaseq_obj.data meta_s1 = rnaseq_obj.meta the_hash = tsgd.de_results_hash(meta_s1.index.tolist(), de_params) filename = 'de_results_s1_cross_comparison.%d.pkl' % the_hash fn = os.path.join(DE_LOAD_DIR, filename) if os.path.isfile(fn): logger.info("Reading S1 cross-comparison DE results from %s", fn) with open(fn, 'rb') as f: de_res = pickle.load(f) else: groups = pd.Series(index=meta_s1.index)
import os import numpy as np import seaborn as sns from matplotlib import pyplot as plt import hgic_consts from plotting import clustering from rnaseq import loader from stats import transformations from utils import output, reference_genomes if __name__ == '__main__': outdir = output.unique_output_dir("cluster_gic_ffpe") pids = ['018', '019', '031', '017', '050', '054'] obj_ffpe = loader.load_by_patient(pids, type='ffpe', include_control=False) obj_gic = loader.load_by_patient(pids, type='cell_culture', include_control=False) obj = loader.loader.MultipleBatchLoader([obj_ffpe, obj_gic]) # drop iNSC, iPSC obj.meta = obj.meta.loc[~obj.meta.index.str.contains('DURA')] obj.data = obj.data.loc[:, obj.meta.index] # relabel the FFPE samples idx = obj.meta.index.tolist() for k, v in hgic_consts.NH_ID_TO_PATIENT_ID_MAP.items(): for i, t in enumerate(idx): if k.replace('-', '_') in t: idx[i] = "FFPE GBM%s" % v
# to_aggr_nsc = [ # (r'H9_NSC_[12]', 'H9 NSC'), # # (r'Pollard NSC [12]', 'Fetal NSC'), # ] outdir = output.unique_output_dir("assess_reprogramming_de") load_kwds = { 'source': 'star', 'alignment_subdir': SetMe, 'strandedness': SetMe, } # our data (everything) obj = loader.load_by_patient(pids, source='star') # HipSci data hip_obj = loader.hipsci_ipsc(aggregate_to_gene=True) hip_obj.meta.insert(3, 'batch', hip_obj.batch_id) # hip_obj.meta.insert(3, 'batch', 'HipSci') # reduce the number in a (repeatably) random fashion rs = np.random.RandomState( 42) # set the seed so we always get the same samples keep = np.zeros(hip_obj.meta.shape[0]).astype(bool) idx = hip_obj.meta.index.tolist() rs.shuffle(idx) idx = idx[:n_hipsci] hip_obj.meta = hip_obj.meta.loc[idx] hip_obj.data = hip_obj.data.loc[:, idx]
dat = dat + offset if len(dat.shape) == 2: cpm = dat.divide(dat.sum(), axis=1) * 1e6 else: cpm = dat.divide(dat.sum()) * 1e6 return np.log(cpm) / np.log(base) if __name__ == '__main__': min_cpm = 0.01 outdir = output.unique_output_dir("biological_technical_ecdf") # all our patient data (cell culture) our_patient_obj = loader.load_by_patient('all', source='star') # all our patient data (FFPE culture) ffpe_samples = [ 'NH15_1661DEF2C', 'NH15_1877_SP1C', 'NH15_2101_DEF1A', 'NH16_270_DEF1Ereplacement', 'NH16_616DEF1B', 'NH16_677_SP1A', 'NH16_1574DEF1A', 'NH16_1976_DEF1Areplacement', 'NH16_2063_DEF1Areplacement', 'NH16_2214DEF1A', 'NH16_2255DEF1B2', 'NH16_2806DEF3A1',
@property def kde_func(self): return eval_one_kde_gaussian def run_normalisation(self, njob=None): self.Fr = self.X.rank(axis=1, method='average') / float(self.n) self.z_ij = self.Fr.rank(axis=0, method='average') def plot_kde_one_gene(self, gene, n_annot=10, gene_ttl=None): raise NotImplementedError("TODO: refactor this part") if __name__ == "__main__": outdir = output.unique_output_dir() obj = loader.load_by_patient(['018', '019', '030', '031'], source='star') X = obj.data r = 0.5 # offset for Poisson kernels tau = 1. # weighting in KS statistic # arbitrary gene set: GBM signalling gk = [ 'ENSG00000077782', 'ENSG00000133056', 'ENSG00000136158', 'ENSG00000110092', 'ENSG00000169032', 'ENSG00000139687', 'ENSG00000171862', 'ENSG00000140443',
dmr_hash_dict['norm_method'] = norm_method_s1 # load DMR results the_hash = tsgd.dmr_results_hash(meth_obj.meta.index.tolist(), dmr_hash_dict) filename = 'dmr_results_paired_comparison.%d.pkl' % the_hash fn = os.path.join(DMR_LOAD_DIR, filename) if os.path.isfile(fn): logger.info("Loading pre-computed DMR results from %s", fn) dmr_res_s1 = dmr.DmrResultCollection.from_pickle(fn, anno=anno) else: raise Exception("Unable to locate pre-existing results.") # load DE results rnaseq_obj = rnaseq_loader.load_by_patient(consts.PIDS, include_control=False) rnaseq_obj.filter_by_sample_name(consts.S1_RNASEQ_SAMPLES) the_hash = tsgd.de_results_hash(rnaseq_obj.meta.index.tolist(), de_params) filename = 'de_results_paired_comparison.%d.pkl' % the_hash fn = os.path.join(DE_LOAD_DIR, filename) if os.path.isfile(fn): logger.info("Reading S1 DE results from %s", fn) with open(fn, 'rb') as f: de_res_s1 = pickle.load(f) else: raise Exception("Unable to locate pre-existing DE results.") the_hash = tsgd.dmr_results_hash(meth_obj.meta.index.tolist(), dmr_hash_dict)
'CAB39L', 'STRADA', 'RICTOR', 'EIF4E1B', 'TSC1' ] # which list should we use? list_name = 'S2' # list_name = 'S4' if list_name == 'S2': list_cols = ['MG', 'BMDM'] elif list_name == 'S4': list_cols = ['TAM MG', 'TAM BMDM', 'Core MG', 'Core BMDM'] else: raise NotImplementedError("Unrecognised list: %s" % list_name) # load FFPE RNA-Seq data obj_ff = loader.load_by_patient('all', source='salmon', type='ffpe', include_control=False) # add patient identifiers nh_id = obj_ff.meta.index.str.replace(r'(_?)(DEF|SP).*', '') p_id = [NH_ID_TO_PATIENT_ID_MAP[t.replace('_', '-')] for t in nh_id] obj_ff.meta.insert(0, 'nh_id', nh_id) obj_ff.meta.insert(0, 'patient_id', p_id) # switch to gene symbols gs = reference_genomes.ensembl_to_gene_symbol(obj_ff.data.index) gs = gs.loc[~gs.index.duplicated()] the_ix = np.array(obj_ff.data.index, copy=True) the_ix[~gs.isnull().values] = gs.values[~gs.isnull()] ffpe_dat = obj_ff.data.copy() ffpe_dat.index = the_ix
subgroup_set_colours = { 'RTK I full': '#0d680f', 'RTK II full': '#820505', 'MES full': '#7900ad', 'RTK I partial': '#6ecc70', 'RTK II partial': '#d67373', 'MES partial': '#cc88ea', 'mixed': '#4C72B0', 'specific': '#f4e842', } min_cpm = 1 outdir = output.unique_output_dir("compare_de_gene_counts_s1", reuse_empty=True) obj = loader.load_by_patient(pids, include_control=False) # remove IPSC and rejected 061 samples for good idx = ((~obj.meta.index.str.contains('IPSC')) & (~obj.meta.index.isin(['DURA061_NSC_N1_P5', 'DURA061_NSC_N6_P4']))) obj.meta = obj.meta.loc[idx] obj.data = obj.data.loc[:, idx] obj.batch_id = obj.batch_id.loc[idx] # we'll run everything with two different edgeR tests methods = ('GLM', 'QLGLM') res_1 = {} res_2 = {}
from stats import nht from utils import output, setops, reference_genomes if __name__ == "__main__": outdir = output.unique_output_dir() pids = consts.PIDS DE_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'de') eps = .1 # offset for log transform target_gene = 'CD274' target_ens = reference_genomes.gene_symbol_to_ensembl(target_gene) # load Salmon data obj_cc = loader.load_by_patient(pids, source='salmon') ix = obj_cc.meta.index.isin(consts.S1_RNASEQ_SAMPLES) obj_cc.filter_samples(ix) # add PID to cell culture metadata obj_cc.meta.insert( 0, 'pid', obj_cc.meta.index.str.replace(r'(GBM|DURA)(?P<pid>[0-9]{3}).*', '\g<pid>')) obj_ff = loader.load_by_patient(pids, source='salmon', type='ffpe') obj_ff.filter_by_sample_name(consts.FFPE_RNASEQ_SAMPLES) # add PID to FFPE metadata nh_id = obj_ff.meta.index.str.replace(r'(_?)(DEF|SP).*', '') p_id = [NH_ID_TO_PATIENT_ID_MAP[t.replace('_', '-')] for t in nh_id]