)
    # Boolean mask that is True for every sample whose `cell_line` is NOT one
    # of the lines listed below.
    ix = ~e6194_obj.meta.cell_line.isin([
        'NA07057',
        'HCT116',
        'HEL46.11',
    ])
    # Apply the mask to the loader.
    # NOTE(review): presumably keeps only samples where the mask is True --
    # confirm against the loader's `filter_samples` implementation.
    e6194_obj.filter_samples(ix)
    # Add a constant 'array_type' column with value 'EPIC' at position 1
    # (assuming `meta` is a pandas DataFrame).
    e6194_obj.meta.insert(1, 'array_type', 'EPIC')

    # GSE110544 (Banovich et al.; iPSC lines) (EPIC)
    # banov_obj = loader.load_reference('gse110544', norm_method=norm_method)
    # banov_obj.meta.insert(1, 'array_type', 'EPIC')

    # HipSci data: request array_type='epic' with n_sample=12, passing the
    # shared `norm_method` through to the loader.
    # NOTE(review): n_sample presumably limits how many samples are loaded --
    # confirm against loader.hipsci.
    hip_epic_obj = loader.hipsci(norm_method=norm_method,
                                 array_type='epic',
                                 n_sample=12)

    # combine all data into a single multi-batch loader
    obj = loader.loader.MultipleBatchLoader(
        # previous batch list, including the currently disabled banov_obj:
        # [me_obj, encode_epic_obj, e6194_obj, hip_epic_obj, banov_obj]
        [me_obj, encode_epic_obj, e6194_obj, hip_epic_obj])
    meta = obj.meta
    # NOTE(review): presumably converts beta values to M values, going by the
    # helper's name -- confirm against process.m_from_beta.
    dat_m = process.m_from_beta(obj.data)

    # Restrict the annotation to the rows labelled by dat_m's index, then
    # compute DMR clusters from that subset using `dmr_params`.
    this_anno = anno.loc[dat_m.index]
    dmr_clusters = compute_dmr_clusters(this_anno, dmr_params)

    ipsc_ref_names_6194 = ['HEL139', 'HEL140', 'HEL141']
    ipsc_ref_names_6194_n1 = [
        'HEL139.2_p17', 'HEL139.5_p14', 'HEL139.8_p13', 'HEL140.1_p12',
                                         norm_method=norm_method,
                                         samples=our_samples)

    # GSE31848 reference: keep only the samples whose index label contains
    # 'ES__WA' or 'iPS__HDF'.
    nazor_ldr = loader.load_reference('GSE31848', norm_method=norm_method)
    # The alternation is written without capturing groups: pandas emits a
    # "pattern has match groups" UserWarning from str.contains when the regex
    # contains groups. The resulting mask is unchanged.
    ix = nazor_ldr.meta.index.str.contains(r'ES__WA|iPS__HDF')
    # this sample is an outlier, so remove it now
    ix = ix & ~nazor_ldr.meta.index.str.contains(r'HDF51IPS7')
    nazor_ldr.filter_samples(ix)

    # Zhou et al.: lots of samples here, but we'll only keep 2 x ESC lines
    zhou_ldr = loader.load_reference('GSE92462_450K', norm_method=norm_method)
    # Mask: index labels that start with 'H1ES' or 'H9ES'.
    ix = zhou_ldr.meta.index.str.contains(r'^H[19]ES')
    zhou_ldr.filter_samples(ix)

    # HipSci data: array_type='epic', with the sample count taken from
    # `n_hipsci`.
    hip_epic_ldr = loader.hipsci(norm_method=norm_method,
                                 n_sample=n_hipsci,
                                 array_type='epic')
    # FIXME: workaround for a bug. Unless the 'batch' column is dropped here,
    # the meta column gets renamed to 'batch_1' in all the other loaders.
    hip_epic_ldr.meta.drop('batch', axis=1, inplace=True)

    # Weltner et al. (E-MTAB-6194)
    e6194_ldr = loader.load_reference('E-MTAB-6194', norm_method=norm_method)
    # Mask: True for samples whose `cell_line` is NOT one of the listed lines.
    ix = ~e6194_ldr.meta.cell_line.isin(
        ['NA07057', 'HCT116', 'HEL46.11', 'CCD-1112Sk (CRL-2429)'])
    e6194_ldr.filter_samples(ix)

    refs = [('Kim et al.',
             loader.gse38216(
                 norm_method=norm_method,
                 samples=['H9 ESC 1', 'H9 ESC 2', 'H9 NPC 1', 'H9 NPC 2'])),
            ('Morey et al.',
Example #3
0
    #     r.batch_id = bid
    #     r.meta.index = ["%s_%s" % (t, bid) for t in r.meta.index]
    #     r.data.columns = r.meta.index
    # ref_obj = loader.loader.MultipleBatchLoader([t[1] for t in refs])
    # ref_meta = ref_obj.meta.loc[ref_obj.meta.index.str.contains('ESC')]
    # ref_data = ref_data.loc[:, ref_meta.index]

    # combine all loaders
    # this will reduce the probe list to the intersection (i.e. 450K)
    ref_obj = loader.loader.MultipleBatchLoader(refs)

    ref_meta = ref_obj.meta
    # Drop entries with missing values (with pandas defaults this removes any
    # row containing a NaN, assuming `data` is a DataFrame).
    ref_data = ref_obj.data.dropna()

    # HipSci data: array_type='epic' with n_sample=12; keep separate handles
    # on the loader's metadata and data.
    hip_epic_ldr = loader.hipsci(norm_method=norm_method, n_sample=12, array_type='epic')
    hip_epic_meta = hip_epic_ldr.meta
    hip_epic_data = hip_epic_ldr.data

    # hip_450k_meta, hip_450k_data = loader.hipsci(norm_method=norm_method, n_sample=30, array_type='450k')

    # HipSci data across all array types (array_type='all', n_sample=12).
    hip_ldr = loader.hipsci(norm_method=norm_method, n_sample=12, array_type='all')
    hip_meta = hip_ldr.meta
    hip_data = hip_ldr.data
    # Label each sample's batch by its array type: "HipSci (<array_type>)".
    # Item assignment is used rather than attribute assignment: if no 'batch'
    # column exists, `hip_meta.batch = ...` would only set a Python attribute
    # on the DataFrame (with a UserWarning) and create no column.
    hip_meta['batch'] = ["HipSci (%s)" % t for t in hip_meta.array_type]

    # clustering genome-wide
    # iPSC, FB, ESC

    # mix of HipSci samples by array_type
    meta, dat = combine_data_meta(