Esempio n. 1
0
def load_methylation(pids,
                     ref_names=None,
                     norm_method='swan',
                     ref_name_filter=None,
                     units='beta'):
    """
    Load and prepare the Illumina methylation data.

    Patient data are loaded for ``pids`` and, when ``ref_names`` is given,
    combined with reference data in a ``MultipleBatchLoader``. Rows with any
    missing value are dropped, and both the data and the EPIC annotation are
    restricted to the probes they have in common.

    :param pids: Patient identifiers, forwarded to ``loader.load_by_patient``.
    :param ref_names: Reference dataset name(s), forwarded to
        ``loader.load_reference``. If None, no reference data are loaded.
    :param norm_method: Normalisation method forwarded to both loaders.
    :param ref_name_filter: If not None, passed to
        ``filter_by_sample_name(..., exact=True)`` on the reference object.
        Ignored when ``ref_names`` is None.
    :param units: If 'm' (case-insensitive, matching ``combine_data_meta``),
        the data are converted with ``process.m_from_beta``. Any other value
        leaves the data as loaded.
    :return: Tuple ``(obj, anno)``. ``obj`` is the loader object, with its
        ``data`` attribute overwritten by the filtered (and possibly
        converted) table; ``anno`` is the annotation restricted to the common
        probes.
    """
    # patient data
    obj = loader.load_by_patient(pids, norm_method=norm_method)
    anno = loader.load_illumina_methylationepic_annotation()

    # reference data
    if ref_names is not None:
        ref_obj = loader.load_reference(ref_names, norm_method=norm_method)
        if ref_name_filter is not None:
            ref_obj.filter_by_sample_name(ref_name_filter, exact=True)
        obj = loader.loader.MultipleBatchLoader([obj, ref_obj])

    me_data = obj.data.dropna()
    # str() keeps the comparison safe for non-string values (e.g. None),
    # which the original equality test also tolerated.
    if str(units).lower() == 'm':
        me_data = process.m_from_beta(me_data)

    # reduce anno and data down to common probes
    common_probes = anno.index.intersection(me_data.index)

    anno = anno.loc[common_probes]
    # dmr.add_merged_probe_classes(anno)
    me_data = me_data.loc[common_probes]
    # NOTE: this overwrites the data held by the loader object in place.
    obj.data = me_data

    return obj, anno
def combine_data_meta(data_arr, meta_arr, units='beta'):
    """
    Combine several data tables and their matching metadata tables.

    :param data_arr: Sequence of pandas objects holding the data. They are
        concatenated column-wise (``axis=1``) with an inner join, so only row
        labels present in every entry are kept.
    :param meta_arr: Sequence of pandas objects holding the metadata, one per
        entry of ``data_arr``. They are concatenated row-wise (``axis=0``)
        with an outer join and ``sort=True``.
    :param units: If 'm' (case-insensitive), the combined data are converted
        with ``process.m_from_beta``. Any other value leaves them unchanged.
    :return: Tuple ``(meta, dat)`` -- metadata first, then data.
    :raises ValueError: If ``data_arr`` and ``meta_arr`` differ in length.
    """
    if len(data_arr) != len(meta_arr):
        raise ValueError("data_arr and meta_arr must have the same size")

    # inner join: keep only the rows shared by every input table
    dat = pd.concat(data_arr, axis=1, join='inner')
    meta = pd.concat(meta_arr, axis=0, join='outer', sort=True)

    if units.lower() == 'm':
        # convert to M values
        dat = process.m_from_beta(dat)

    # Flag rows holding at least one non-finite entry (should be very few).
    # np.isfinite is False for +/-inf and also for NaN, so those rows are
    # dropped as well.
    inft = (~np.isfinite(dat)).any(axis=1)

    if inft.any():
        dat = dat.loc[~inft]
        # Parenthesised single-argument form prints identically under
        # Python 2 and Python 3.
        print("Dropped %d probes with infinite M values" % inft.sum())

    return meta, dat
    # remove a few
    # ix = obj.meta.type != 'astrocyte'
    # obj.filter_samples(ix)
    #
    # ix = obj.meta.type != 'iAPC'
    # obj.filter_samples(ix)
    #
    # ix = ~obj.meta.index.str.contains('GBM')
    # obj.filter_samples(ix)

    # ix = obj.meta.index != 'H9 NPC (Encode EPIC)'
    # obj.filter_samples(ix)

    bdat = obj.data
    mdat = process.m_from_beta(bdat)

    if qn_method is not None:
        mdat = transformations.quantile_normalisation(mdat, method=qn_method)

    # tidy up batch IDs
    obj.meta.loc[obj.meta.batch.isnull(),
                 'batch'] = obj.meta.loc[obj.meta.batch.isnull(), 'batch_1']
    obj.meta.batch = obj.meta.batch.str.replace('2016-12-19_ucl_genomics',
                                                '2016-12-19')

    # the only batch names without letters are ours
    obj.meta.loc[~obj.meta.batch.str.contains(r'[A-Z]'),
                 'batch'] = 'This study'

    # PCA plot (by batch and cell type)
Esempio n. 4
0
    #     'p62_3_shB+C': 'shBMI1shCHD7',
    #     'p62_3_Scr': 'scramble',
    #
    # })
    # condition = condition.loc[meta.index]
    # meta.insert(0, 'condition', condition)
    #
    # cell_line = pd.Series('3021', index=meta.index)
    # cell_line[cell_line.index.str.contains('1299')] = 'ICb1299'
    # cell_line[cell_line.index.str.contains('p62')] = 'ICb1299'
    # meta.insert(0, 'cell_line', cell_line)

    anno = loader.load_illumina_methylationepic_annotation()

    me_data = obj.data.dropna()
    me_data = process.m_from_beta(me_data)

    # reduce anno and data down to common probes
    common_probes = anno.index.intersection(me_data.index)

    anno = anno.loc[common_probes]

    # plot PCA

    p = pca.PCA()
    pca_dat = p.fit_transform(me_data.transpose())

    fig = plt.figure()
    ax = fig.add_subplot(111)

    marker_groups = meta.cell_line
Esempio n. 5
0
    anno = loader.load_illumina_methylationepic_annotation()
    our_obj.meta.insert(
        0, 'patient_id',
        our_obj.meta.index.str.replace(r'(GBM|DURA)(?P<pid>[0-9]{3}).*',
                                       '\g<pid>'))

    # load validation data
    val_obj = loader.load_reference('GSE92462_450k', norm_method=norm_method)
    # filter
    val_obj.filter_samples(val_obj.meta.type.isin(['GBM (GSC)', 'NSC']))

    # TODO: upload to the classifier and run (toggle this so it's only run once)

    # combine and reduce probes
    obj = loader.loader.MultipleBatchLoader([our_obj, val_obj])
    dat = process.m_from_beta(obj.data)
    meta = obj.meta
    common_probes = anno.index.intersection(dat.index)
    dat = dat.reindex(common_probes)
    anno = anno.reindex(common_probes)

    dmr_hash_dict = dict(dmr_params)
    dmr_hash_dict['norm_method'] = norm_method

    the_hash = tsgd.dmr_results_hash(meta.sort_index().index.tolist(),
                                     dmr_hash_dict)
    filename = 'dmr_results_450k_validation.%d.pkl' % the_hash
    fn = os.path.join(DMR_LOAD_DIR, filename)

    if os.path.isfile(fn):
        logger.info("Loading pre-computed DMR results from %s", fn)
    ffpe_obj = loader.load_by_patient(pids,
                                      norm_method=norm_method,
                                      type='ffpe')

    anno = loader.load_illumina_methylationepic_annotation()
    # add patient ID column to metadata
    cc_obj.meta.insert(
        0, 'patient_id',
        cc_obj.meta.index.str.replace(r'(GBM|DURA)(?P<pid>[0-9]{3}).*',
                                      '\g<pid>'))
    ffpe_obj.meta.insert(
        0, 'patient_id',
        [hgic_consts.NH_ID_TO_PATIENT_ID_MAP[t] for t in ffpe_obj.meta.index])
    ffpe_obj.meta.insert(1, 'type', 'ffpe')

    dat_cc = process.m_from_beta(cc_obj.data).sort_index()
    # replace CC data with those normed differently (in R)
    # dat_cc = pd.read_csv('cell_culture_swan_one_norm.csv', header=0, index_col=0).sort_index()

    dat_ffpe = process.m_from_beta(ffpe_obj.data).sort_index()
    dat = pd.concat((dat_cc, dat_ffpe), axis=1, join='inner')
    meta = pd.concat((cc_obj.meta, ffpe_obj.meta),
                     axis=0,
                     join='outer',
                     sort=True)
    meta.loc[meta.batch.isnull(), 'batch'] = meta.loc[meta.batch.isnull(),
                                                      'batch_1']

    common_probes = anno.index.intersection(dat.index)
    dat = dat.reindex(common_probes)
    anno = anno.reindex(common_probes)
# Console logger used by the script below.
logger = log.get_console_logger()

if __name__ == '__main__':
    # Script: load the cached paired DMR results for the 'GIC-FB syn'
    # comparison, or compute and pickle them if loading raises IOError.
    outdir = output.unique_output_dir()

    # load methylation and DMR data
    meth_obj = methylation_loader.load_by_patient(consts.PIDS,
                                                  include_control=False)
    meth_obj.filter_by_sample_name(consts.S1_METHYL_SAMPLES_GIC +
                                   consts.S1_METHYL_SAMPLES_INSC)
    # Add a patient ID column: the three digits following the GBM/DURA prefix
    # of each sample name. The replacement is a raw string so the regex group
    # reference is not parsed as an (invalid) string escape sequence.
    # NOTE(review): pandas >= 2.0 defaults ``str.replace`` to regex=False --
    # confirm the pandas version this script targets.
    meth_obj.meta.insert(
        0, 'patient_id',
        meth_obj.meta.index.str.replace(r'(GBM|DURA)(?P<pid>[0-9]{3}).*',
                                        r'\g<pid>'))

    mdat = process.m_from_beta(meth_obj.data)

    norm_method_s1 = 'swan'
    dmr_params = consts.DMR_PARAMS
    de_params = consts.DE_PARAMS

    # NOTE(review): INTERMEDIATE_DIR is not assigned in the visible part of
    # this script -- presumably defined or imported elsewhere; confirm.
    DMR_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'dmr')
    DE_LOAD_DIR = os.path.join(INTERMEDIATE_DIR, 'de')

    anno = methylation_loader.load_illumina_methylationepic_annotation()

    # use a hash on the PIDs and parameters to ensure we're looking for the right results
    dmr_hash_dict = dict(dmr_params)
    dmr_hash_dict['norm_method'] = norm_method_s1

    # load DMR results
    k = 'GIC-FB syn'
    samples = consts.S1_METHYL_SAMPLES_GIC + consts.S1_METHYL_SAMPLES_FB
    this_pids = ['018', '019', '030', '031', '017', '050', '054', '026', '052']
    # NOTE(review): pids_included, data_loaded and all_results are written to
    # below but not created in the visible part of this script -- confirm
    # they are initialised elsewhere.
    pids_included[k] = this_pids

    try:
        dmr_res_fb_syn = load_dmr_results(anno, samples)
    except IOError:
        # Loading failed: compute the results and save them to the hashed
        # filename.
        logger.info("%s. Computing results.", k)
        # NOTE(review): `norm_method` is used here, but only `norm_method_s1`
        # is assigned in the visible part of this script -- confirm which
        # name is intended.
        fn = get_hashed_filename(samples,
                                 norm_method=norm_method,
                                 dmr_params=dmr_params)
        me_obj, this_anno = load_methylation_data(samples,
                                                  anno,
                                                  norm_method=norm_method)
        me_data = process.m_from_beta(me_obj.data)

        data_loaded[k] = me_obj
        dmr_res_fb_syn = tsgd.paired_dmr(me_data,
                                         me_obj.meta,
                                         this_anno,
                                         this_pids,
                                         dmr_params,
                                         type1='GBM',
                                         type2='FB')
        dmr_res_fb_syn.to_pickle(fn, include_annotation=False)
        logger.info("Saved DMR results to %s", fn)

    all_results[k] = dmr_res_fb_syn

    # GIC-iAPC (syngeneic)
                axs[j, i].scatter(dat.loc[idx, cols[0]], dat.loc[idx, cols[1]])
                axs[j, i].set_title(
                    "%s%s r=%.3f" %
                    (lbl, pid,
                     stats.linregress(dat.loc[idx, cols[0]],
                                      dat.loc[idx, cols[1]]).rvalue))
    fig.tight_layout()
    return fig, axs


if __name__ == '__main__':
    # Script: load methylation data for four patients, rank probes by median
    # absolute deviation (MAD), and save a MAD rank plot plus two sets of
    # scatter plots (all probes, and the top 50000 probes by MAD).
    outdir = output.unique_output_dir("methylation_replicates",
                                      reuse_empty=True)
    pids = ['017', '050', '054', '061']
    # load_by_patient returns a (data, metadata) pair here
    b, me_meta = methylation_array.load_by_patient(pids)
    m = process.m_from_beta(b)
    # per-probe MAD, largest first
    mad = transformations.median_absolute_deviation(m).sort_values(
        ascending=False)

    # MAD value against probe rank, with a marker line at rank 50000 (the
    # same cutoff passed as top_n_by_mad below)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(mad.values)
    ax.set_xlabel("Probe rank by MAD")
    ax.set_ylabel("MAD value")
    ax.axvline(50000, ls='--', c='r')
    fig.savefig(os.path.join(outdir, "MAD_sorted.png"), dpi=200)

    # scatter_plots is defined earlier in the file (its header is outside the
    # visible source); it returns a (figure, axes) pair.
    fig1, axs1 = scatter_plots(m, pids)
    fig1.savefig(os.path.join(outdir, "correlation_all_probes.png"), dpi=200)
    fig2, axs2 = scatter_plots(m, pids, mad=mad, top_n_by_mad=50000)
    fig2.savefig(os.path.join(outdir, "correlation_top_50000.png"), dpi=200)