Esempio n. 1
0
def load_methylation(pids,
                     ref_names=None,
                     norm_method='swan',
                     ref_name_filter=None,
                     units='beta'):
    """
    Load Illumina methylation data for the given patients, optionally bundle
    it with reference data, and restrict data and annotation to shared probes.

    :param pids: Patient identifiers, forwarded to ``loader.load_by_patient``.
    :param ref_names: If not None, forwarded to ``loader.load_reference`` and
        the result is combined with the patient data in a
        ``MultipleBatchLoader``.
    :param norm_method: Normalisation method, forwarded to both loaders.
    :param ref_name_filter: If not None (and ``ref_names`` is given), passed
        to the reference loader's ``filter_by_sample_name`` with
        ``exact=True``.
    :param units: When equal to 'm' the data are converted with
        ``process.m_from_beta``; any other value leaves them unconverted.
    :return: Tuple ``(obj, anno)``: the loader object, whose ``data``
        attribute has been replaced by the reduced data, and the annotation
        reduced to the same probes.
    """
    # patient samples and the probe annotation
    batch = loader.load_by_patient(pids, norm_method=norm_method)
    annotation = loader.load_illumina_methylationepic_annotation()

    # optional reference samples, merged with the patient batch
    if ref_names is not None:
        reference = loader.load_reference(ref_names, norm_method=norm_method)
        if ref_name_filter is not None:
            reference.filter_by_sample_name(ref_name_filter, exact=True)
        batch = loader.loader.MultipleBatchLoader([batch, reference])

    # discard entries with missing values before any unit conversion
    values = batch.data.dropna()
    if units == 'm':
        values = process.m_from_beta(values)

    # keep only the probes present in both the annotation and the data
    shared = annotation.index.intersection(values.index)
    annotation = annotation.loc[shared]
    batch.data = values.loc[shared]

    return batch, annotation
        'DURA054_IPSC_N3C_P11',
        'DURA054_FB_P5',
        'DURA061_NSC_N4_P2',
        'DURA061_NSC_N6_P4',
        'DURA061_NSC_N1_P3n4',
        'DURA026_NSC_N31D_P5',
        'DURA052_NSC_N4_P3',
        'DURA052_NSC_N5_P2',
        'GIBCONSC_P4',
        # 'DURA052_NH16_2214_P6_14/04/2017',
        # 'DURA026_NH16_270_P8_15/05/2017',
        # 'DURA018_NH15_1877_P6_15/05/2017',
    ]

    patient_obj = loader.load_by_patient(pids,
                                         norm_method=norm_method,
                                         samples=our_samples)

    nazor_ldr = loader.load_reference('GSE31848', norm_method=norm_method)
    ix = nazor_ldr.meta.index.str.contains(r'(ES__WA)|(iPS__HDF)')
    ix = ix & (~nazor_ldr.meta.index.str.contains(r'HDF51IPS7')
               )  # this sample is an outlier, so remove it now
    nazor_ldr.filter_samples(ix)

    # Zhou et al.: lots of samples here, but we'll only keep 2 x ESC lines
    zhou_ldr = loader.load_reference('GSE92462_450K', norm_method=norm_method)
    ix = zhou_ldr.meta.index.str.contains(r'^H[19]ES')
    zhou_ldr.filter_samples(ix)

    hip_epic_ldr = loader.hipsci(norm_method=norm_method,
                                 n_sample=n_hipsci,
Esempio n. 3
0
    return {'axs': axs, 'fig': fig}


if __name__ == "__main__":
    # Script entry point: load our methylation data and a validation
    # reference dataset.
    pids = consts.PIDS
    norm_method = 'swan'
    # NOTE: dmr_params is the same object as consts.DMR_PARAMS (no copy is
    # made), so setting 'n_jobs' here also changes it on the consts module.
    dmr_params = consts.DMR_PARAMS
    dmr_params['n_jobs'] = mp.cpu_count()

    outdir = output.unique_output_dir()
    DMR_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'dmr')

    # load our data
    our_obj = loader.load_by_patient(pids,
                                     norm_method=norm_method,
                                     samples=consts.S1_METHYL_SAMPLES)
    anno = loader.load_illumina_methylationepic_annotation()
    # Add a patient ID column: the three digits following 'GBM' or 'DURA' in
    # the sample name. The replacement template is a raw string (a plain
    # '\g' is an invalid escape sequence) and regex=True is explicit because
    # pandas >= 2.0 treats the pattern as a literal string by default.
    our_obj.meta.insert(
        0, 'patient_id',
        our_obj.meta.index.str.replace(r'(GBM|DURA)(?P<pid>[0-9]{3}).*',
                                       r'\g<pid>',
                                       regex=True))

    # load validation data
    val_obj = loader.load_reference('GSE92462_450k', norm_method=norm_method)
    # filter: keep only samples whose metadata 'type' is one of these values
    val_obj.filter_samples(val_obj.meta.type.isin(['GBM (GSC)', 'NSC']))

    # TODO: upload to the classifier and run (toggle this so it's only run once)

    # combine and reduce probes
if __name__ == "__main__":
    # Script entry point: load cell culture and FFPE methylation data and
    # attach a patient ID column to the metadata of each.
    pids = consts.PIDS
    norm_method = 'swan'
    alpha = 0.05
    # the same significance threshold expressed on the -log10 scale
    pk_alpha = -np.log10(alpha)

    # NOTE: dmr_params is the same object as consts.DMR_PARAMS (no copy is
    # made), so setting 'n_jobs' here also changes it on the consts module.
    dmr_params = consts.DMR_PARAMS
    dmr_params['n_jobs'] = mp.cpu_count()

    outdir = output.unique_output_dir()
    DMR_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'dmr')

    # load our data
    cc_obj = loader.load_by_patient(pids,
                                    norm_method=norm_method,
                                    samples=consts.S1_METHYL_SAMPLES)
    ffpe_obj = loader.load_by_patient(pids,
                                      norm_method=norm_method,
                                      type='ffpe')

    anno = loader.load_illumina_methylationepic_annotation()
    # Add patient ID column to metadata: the three digits following 'GBM' or
    # 'DURA' in the sample name. The replacement template is a raw string (a
    # plain '\g' is an invalid escape sequence) and regex=True is explicit
    # because pandas >= 2.0 treats the pattern as a literal string by default.
    cc_obj.meta.insert(
        0, 'patient_id',
        cc_obj.meta.index.str.replace(r'(GBM|DURA)(?P<pid>[0-9]{3}).*',
                                      r'\g<pid>',
                                      regex=True))
    # FFPE sample names are translated to patient IDs through a lookup table;
    # a name missing from the table raises KeyError.
    ffpe_obj.meta.insert(
        0, 'patient_id',
        [hgic_consts.NH_ID_TO_PATIENT_ID_MAP[t] for t in ffpe_obj.meta.index])
    # constant 'type' column placed directly after the patient ID
    ffpe_obj.meta.insert(1, 'type', 'ffpe')
from utils import output, log, setops
from scripts.hgic_final import consts, two_strategies_grouped_dispersion as tsgd
from methylation import loader as methylation_loader, dmr, process
from rnaseq import loader as rnaseq_loader
from settings import INTERMEDIATE_DIR

from plotting import genomics

logger = log.get_console_logger()

if __name__ == '__main__':
    # Script entry point: load the methylation data, convert it with
    # process.m_from_beta, and set up the parameters and directories for the
    # DMR and DE results.
    outdir = output.unique_output_dir()

    # load methylation and DMR data
    meth_obj = methylation_loader.load_by_patient(consts.PIDS,
                                                  include_control=False)
    # restrict to the GIC and iNSC sample lists defined in consts
    meth_obj.filter_by_sample_name(consts.S1_METHYL_SAMPLES_GIC +
                                   consts.S1_METHYL_SAMPLES_INSC)
    # Add a patient ID column: the three digits following 'GBM' or 'DURA' in
    # the sample name. The replacement template is a raw string (a plain
    # '\g' is an invalid escape sequence) and regex=True is explicit because
    # pandas >= 2.0 treats the pattern as a literal string by default.
    meth_obj.meta.insert(
        0, 'patient_id',
        meth_obj.meta.index.str.replace(r'(GBM|DURA)(?P<pid>[0-9]{3}).*',
                                        r'\g<pid>',
                                        regex=True))

    mdat = process.m_from_beta(meth_obj.data)

    norm_method_s1 = 'swan'
    dmr_params = consts.DMR_PARAMS
    de_params = consts.DE_PARAMS

    # directories under INTERMEDIATE_DIR for the DMR and DE results
    DMR_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'dmr')
    DE_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'de')
Esempio n. 6
0

if __name__ == "__main__":
    """
    Here, we simply load the methylation data and export it in an efficient manner (restricting the floating point
    bit depth to save space).

    We also export the annotation and metadata separately.
    """
    norm_method = 'swan'
    # the float format is used when exporting to Excel - it reduces the file size by restricting the precision
    float_format = '%.2f'
    # output directory that receives the spreadsheets written below
    outdir = output.unique_output_dir()
    # probe annotation (not used again within the lines visible in this section)
    anno = loader.load_illumina_methylationepic_annotation()
    # load the cell culture and FFPE datasets separately, with the same normalisation method
    # NOTE(review): reduce_to_common_probes=False presumably keeps every probe in each dataset - confirm in loader
    obj_cc = loader.load_by_patient(consts.PIDS,
                                    type='cell_culture',
                                    norm_method=norm_method,
                                    reduce_to_common_probes=False)
    obj_ff = loader.load_by_patient(consts.PIDS,
                                    type='ffpe',
                                    norm_method=norm_method,
                                    reduce_to_common_probes=False)
    # add useful patient ID column to metadata
    # (each FFPE sample name is looked up in NH_ID_TO_PATIENT_ID_MAP; a missing name raises KeyError)
    obj_ff.meta.insert(0, 'patient_id',
                       [NH_ID_TO_PATIENT_ID_MAP[k] for k in obj_ff.meta.index])

    # export methylation data
    # one Excel workbook per dataset, written to outdir with values formatted using float_format
    obj_cc.data.to_excel(os.path.join(outdir,
                                      "methylation_beta_cell_culture.xlsx"),
                         float_format=float_format)
    obj_ff.data.to_excel(os.path.join(outdir, "methylation_beta_ffpe.xlsx"),
                         float_format=float_format)
Esempio n. 7
0
        'core_min_sample_overlap': 3,  # 3 / 4 samples must match
        'd_max': 400,
        'n_min': 6,
        'delta_m_min': 1.4,
        'fdr': 0.01,
        'dmr_test_method': 'mwu',  # 'mwu', 'mwu_permute'
        'test_kwargs': {},
        'n_jobs': 4,
    }
    norm_method = 'swan'

    intersecter = lambda x, y: set(x).intersection(y)
    unioner = lambda x, y: set(x).union(y)

    # Load DNA Methylation
    me_obj = loader.load_by_patient(pids, norm_method=norm_method)
    me_meta = me_obj.meta
    # me_data, me_meta = methylation_array.load_by_patient(pids)
    # me_data.dropna(inplace=True)
    # me_data = process.m_from_beta(me_data)

    me_data = process.m_from_beta(me_obj.data)

    anno = loader.load_illumina_methylationepic_annotation()
    # anno = methylation_array.load_illumina_methylationepic_annotation()

    # reduce anno and data down to common probes
    common_probes = anno.index.intersection(me_data.index)
    anno = anno.loc[common_probes]
    me_data = me_data.loc[common_probes]
Esempio n. 8
0
        'n_min': 6,
        'delta_m_min': 1.4,
        'alpha': 0.01,
        'dmr_test_method': 'mwu',  # 'mwu', 'mwu_permute'
        'test_kwargs': {},
    }
    norm_method_s1 = 'swan'

    ############
    # 1: FFPE  #
    ############

    # in this case, we want the median beta value over all probes that are associated with a given gene
    # we'll exclude those associated with gene body only

    ffpe_obj = loader.load_by_patient(pids, type='ffpe', norm_method=norm_method_s1)
    anno = loader.load_illumina_methylationepic_annotation(split_genes=False)

    # reduce anno to (probe ID, gene, relation)
    probe_tups = set()
    for i, row in anno.iterrows():
        if pd.isnull(row.UCSC_RefGene_Name):
            continue
        genes = row.UCSC_RefGene_Name.split(';')
        rels = row.UCSC_RefGene_Group.split(';')
        for g, r in zip(genes, rels):
            probe_tups.add(
                (i, g, r)
            )

    probe_tups = list(probe_tups)
Esempio n. 9
0
    n_probe_to_show = 2000
    clustering_metric = 'euclidean'

    outdir = output.unique_output_dir()

    norm_method = 'swan'
    pdx_bulk_samples = ['SM18_108A_GBM019Luc_PDX1', 'SM18_119A_GBM019Luc_PDX2']
    gic_late_samples = [
        'GBM019Luc_P12',
        'GBM019Luc_P3_PDX1',
        'GBM019Luc_P2_PDX2',
    ]

    # load all relevant data
    our_gic_obj = loader.load_by_patient(consts.PIDS,
                                         include_control=False,
                                         samples=consts.S1_METHYL_SAMPLES_GIC,
                                         norm_method=norm_method)
    our_ffpe_obj = loader.load_by_patient(consts.PIDS,
                                          type='ffpe',
                                          include_control=False,
                                          norm_method=norm_method)
    pdx_bulk = loader.load_reference('2018-12-14',
                                     norm_method=norm_method,
                                     samples=pdx_bulk_samples)
    gic_late = loader.load_reference('2018-12-06',
                                     norm_method=norm_method,
                                     samples=gic_late_samples)

    # add patient ID to samples
    our_gic_obj.meta.insert(
        0, 'patient_id',