def load_methylation(pids, ref_names=None, norm_method='swan', ref_name_filter=None, units='beta'):
    """
    Load the Illumina methylation data for a set of patients, optionally bundled with reference data,
    and restrict both the data and the probe annotation to the probes they have in common.

    :param pids: Patient IDs, forwarded to ``loader.load_by_patient``.
    :param ref_names: If not None, forwarded to ``loader.load_reference``; the resulting reference object is
        combined with the patient object in a ``MultipleBatchLoader``.
    :param norm_method: Normalisation method forwarded to both loader calls.
    :param ref_name_filter: If not None (and ``ref_names`` is given), the reference object is filtered by
        sample name using exact matching.
    :param units: If 'm', the data are converted with ``process.m_from_beta``. Any other value leaves the
        data as loaded.
    :return: Tuple ``(obj, anno)``. ``obj`` is the loader object with its ``data`` attribute replaced by the
        prepared data; ``anno`` is the annotation restricted to the same probes.
    """
    # patient data first, then the array annotation
    patient_obj = loader.load_by_patient(pids, norm_method=norm_method)
    anno = loader.load_illumina_methylationepic_annotation()

    # optionally attach reference data
    if ref_names is None:
        combined = patient_obj
    else:
        ref_obj = loader.load_reference(ref_names, norm_method=norm_method)
        if ref_name_filter is not None:
            ref_obj.filter_by_sample_name(ref_name_filter, exact=True)
        combined = loader.loader.MultipleBatchLoader([patient_obj, ref_obj])

    # discard missing values before any unit conversion
    values = combined.data.dropna()
    if units == 'm':
        values = process.m_from_beta(values)

    # keep only the probes present in both the annotation and the data
    shared_probes = anno.index.intersection(values.index)
    combined.data = values.loc[shared_probes]

    return combined, anno.loc[shared_probes]
'DURA054_IPSC_N3C_P11', 'DURA054_FB_P5', 'DURA061_NSC_N4_P2', 'DURA061_NSC_N6_P4', 'DURA061_NSC_N1_P3n4', 'DURA026_NSC_N31D_P5', 'DURA052_NSC_N4_P3', 'DURA052_NSC_N5_P2', 'GIBCONSC_P4', # 'DURA052_NH16_2214_P6_14/04/2017', # 'DURA026_NH16_270_P8_15/05/2017', # 'DURA018_NH15_1877_P6_15/05/2017', ] patient_obj = loader.load_by_patient(pids, norm_method=norm_method, samples=our_samples) nazor_ldr = loader.load_reference('GSE31848', norm_method=norm_method) ix = nazor_ldr.meta.index.str.contains(r'(ES__WA)|(iPS__HDF)') ix = ix & (~nazor_ldr.meta.index.str.contains(r'HDF51IPS7') ) # this sample is an outlier, so remove it now nazor_ldr.filter_samples(ix) # Zhou et al.: lots of samples here, but we'll only keep 2 x ESC lines zhou_ldr = loader.load_reference('GSE92462_450K', norm_method=norm_method) ix = zhou_ldr.meta.index.str.contains(r'^H[19]ES') zhou_ldr.filter_samples(ix) hip_epic_ldr = loader.hipsci(norm_method=norm_method, n_sample=n_hipsci,
return {'axs': axs, 'fig': fig} if __name__ == "__main__": pids = consts.PIDS norm_method = 'swan' dmr_params = consts.DMR_PARAMS dmr_params['n_jobs'] = mp.cpu_count() outdir = output.unique_output_dir() DMR_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'dmr') # load our data our_obj = loader.load_by_patient(pids, norm_method=norm_method, samples=consts.S1_METHYL_SAMPLES) anno = loader.load_illumina_methylationepic_annotation() our_obj.meta.insert( 0, 'patient_id', our_obj.meta.index.str.replace(r'(GBM|DURA)(?P<pid>[0-9]{3}).*', '\g<pid>')) # load validation data val_obj = loader.load_reference('GSE92462_450k', norm_method=norm_method) # filter val_obj.filter_samples(val_obj.meta.type.isin(['GBM (GSC)', 'NSC'])) # TODO: upload to the classifier and run (toggle this so it's only run once) # combine and reduce probes
if __name__ == "__main__":
    # Script entry point: load the cell culture and FFPE methylation data for the study patients and
    # add identifying columns to the sample metadata.
    pids = consts.PIDS
    norm_method = 'swan'
    alpha = 0.05
    # significance threshold on the -log10 scale
    pk_alpha = -np.log10(alpha)

    # NOTE: no copy is made here, so dmr_params is the same object as consts.DMR_PARAMS and the
    # n_jobs assignment below is visible through consts.DMR_PARAMS too.
    dmr_params = consts.DMR_PARAMS
    dmr_params['n_jobs'] = mp.cpu_count()

    outdir = output.unique_output_dir()
    DMR_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'dmr')

    # load our data
    cc_obj = loader.load_by_patient(pids, norm_method=norm_method, samples=consts.S1_METHYL_SAMPLES)
    ffpe_obj = loader.load_by_patient(pids, norm_method=norm_method, type='ffpe')
    anno = loader.load_illumina_methylationepic_annotation()

    # add patient ID column to metadata
    # The sample name is reduced to the three digits following the GBM/DURA prefix. The replacement
    # template is a raw string: '\g' is not a valid escape sequence in a normal string literal.
    # NOTE(review): pandas >= 2.0 defaults str.replace to regex=False - pass regex=True explicitly if the
    # pandas version in use supports that argument.
    cc_obj.meta.insert(
        0,
        'patient_id',
        cc_obj.meta.index.str.replace(r'(GBM|DURA)(?P<pid>[0-9]{3}).*', r'\g<pid>')
    )
    # FFPE samples are indexed by a different ID, so the patient ID comes from a lookup table
    ffpe_obj.meta.insert(
        0,
        'patient_id',
        [hgic_consts.NH_ID_TO_PATIENT_ID_MAP[t] for t in ffpe_obj.meta.index]
    )
    ffpe_obj.meta.insert(1, 'type', 'ffpe')
from utils import output, log, setops
from scripts.hgic_final import consts, two_strategies_grouped_dispersion as tsgd
from methylation import loader as methylation_loader, dmr, process
from rnaseq import loader as rnaseq_loader
from settings import INTERMEDIATE_DIR
from plotting import genomics

logger = log.get_console_logger()

if __name__ == '__main__':
    # Script entry point: load the methylation data, keep the selected samples and set up the
    # parameters and directories used for the DMR / DE results.
    outdir = output.unique_output_dir()

    # load methylation and DMR data
    meth_obj = methylation_loader.load_by_patient(consts.PIDS, include_control=False)
    meth_obj.filter_by_sample_name(consts.S1_METHYL_SAMPLES_GIC + consts.S1_METHYL_SAMPLES_INSC)

    # Add a patient ID column: the sample name is reduced to the three digits following the GBM/DURA
    # prefix. The replacement template is a raw string: '\g' is not a valid escape sequence in a normal
    # string literal.
    # NOTE(review): pandas >= 2.0 defaults str.replace to regex=False - pass regex=True explicitly if the
    # pandas version in use supports that argument.
    meth_obj.meta.insert(
        0,
        'patient_id',
        meth_obj.meta.index.str.replace(r'(GBM|DURA)(?P<pid>[0-9]{3}).*', r'\g<pid>')
    )

    mdat = process.m_from_beta(meth_obj.data)

    norm_method_s1 = 'swan'
    dmr_params = consts.DMR_PARAMS
    de_params = consts.DE_PARAMS

    DMR_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'dmr')
    DE_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'de')
if __name__ == "__main__":
    """
    Here, we simply load the methylation data and export it in an efficient manner (restricting the floating point
    bit depth to save space).
    We also export the annotation and metadata separately.
    """
    norm_method = 'swan'

    # the float format is used when exporting to Excel - it reduces the file size by restricting the precision
    float_format = '%.2f'

    outdir = output.unique_output_dir()

    anno = loader.load_illumina_methylationepic_annotation()

    # both sample types are loaded with identical options
    load_kwargs = dict(norm_method=norm_method, reduce_to_common_probes=False)
    obj_cc = loader.load_by_patient(consts.PIDS, type='cell_culture', **load_kwargs)
    obj_ff = loader.load_by_patient(consts.PIDS, type='ffpe', **load_kwargs)

    # add useful patient ID column to metadata
    ffpe_patient_ids = [NH_ID_TO_PATIENT_ID_MAP[k] for k in obj_ff.meta.index]
    obj_ff.meta.insert(0, 'patient_id', ffpe_patient_ids)

    # export methylation data, one Excel workbook per sample type
    cc_xlsx_path = os.path.join(outdir, "methylation_beta_cell_culture.xlsx")
    obj_cc.data.to_excel(cc_xlsx_path, float_format=float_format)

    ffpe_xlsx_path = os.path.join(outdir, "methylation_beta_ffpe.xlsx")
    obj_ff.data.to_excel(ffpe_xlsx_path, float_format=float_format)
'core_min_sample_overlap': 3, # 3 / 4 samples must match 'd_max': 400, 'n_min': 6, 'delta_m_min': 1.4, 'fdr': 0.01, 'dmr_test_method': 'mwu', # 'mwu', 'mwu_permute' 'test_kwargs': {}, 'n_jobs': 4, } norm_method = 'swan' intersecter = lambda x, y: set(x).intersection(y) unioner = lambda x, y: set(x).union(y) # Load DNA Methylation me_obj = loader.load_by_patient(pids, norm_method=norm_method) me_meta = me_obj.meta # me_data, me_meta = methylation_array.load_by_patient(pids) # me_data.dropna(inplace=True) # me_data = process.m_from_beta(me_data) me_data = process.m_from_beta(me_obj.data) anno = loader.load_illumina_methylationepic_annotation() # anno = methylation_array.load_illumina_methylationepic_annotation() # reduce anno and data down to common probes common_probes = anno.index.intersection(me_data.index) anno = anno.loc[common_probes] me_data = me_data.loc[common_probes]
'n_min': 6, 'delta_m_min': 1.4, 'alpha': 0.01, 'dmr_test_method': 'mwu', # 'mwu', 'mwu_permute' 'test_kwargs': {}, } norm_method_s1 = 'swan' ############ # 1: FFPE # ############ # in this case, we want the median beta value over all probes that are associated with a given gene # we'll exclude those associated with gene body only ffpe_obj = loader.load_by_patient(pids, type='ffpe', norm_method=norm_method_s1) anno = loader.load_illumina_methylationepic_annotation(split_genes=False) # reduce anno to (probe ID, gene, relation) probe_tups = set() for i, row in anno.iterrows(): if pd.isnull(row.UCSC_RefGene_Name): continue genes = row.UCSC_RefGene_Name.split(';') rels = row.UCSC_RefGene_Group.split(';') for g, r in zip(genes, rels): probe_tups.add( (i, g, r) ) probe_tups = list(probe_tups)
n_probe_to_show = 2000 clustering_metric = 'euclidean' outdir = output.unique_output_dir() norm_method = 'swan' pdx_bulk_samples = ['SM18_108A_GBM019Luc_PDX1', 'SM18_119A_GBM019Luc_PDX2'] gic_late_samples = [ 'GBM019Luc_P12', 'GBM019Luc_P3_PDX1', 'GBM019Luc_P2_PDX2', ] # load all relevant data our_gic_obj = loader.load_by_patient(consts.PIDS, include_control=False, samples=consts.S1_METHYL_SAMPLES_GIC, norm_method=norm_method) our_ffpe_obj = loader.load_by_patient(consts.PIDS, type='ffpe', include_control=False, norm_method=norm_method) pdx_bulk = loader.load_reference('2018-12-14', norm_method=norm_method, samples=pdx_bulk_samples) gic_late = loader.load_reference('2018-12-06', norm_method=norm_method, samples=gic_late_samples) # add patient ID to samples our_gic_obj.meta.insert( 0, 'patient_id',