def load_methylation(pids,
                     ref_names=None,
                     norm_method='swan',
                     ref_name_filter=None,
                     units='beta'):
    """
    Load and prepare the Illumina methylation data.

    Patient data are loaded for ``pids``. If ``ref_names`` is given, reference
    data are loaded as well and combined with the patient data in a single
    ``MultipleBatchLoader``. Rows containing missing values are then dropped
    and both the data and the EPIC annotation are reduced to the probes they
    have in common.

    :param pids: Patient identifiers, passed to ``loader.load_by_patient``.
    :param ref_names: Optional reference dataset names, passed to
        ``loader.load_reference``. If None, no reference data are loaded.
    :param norm_method: Normalisation method, passed to both loaders.
    :param ref_name_filter: Optional sample name filter applied to the
        reference data (exact matching). Ignored if ``ref_names`` is None.
    :param units: 'beta' to keep the loaded values as they are, or 'm' to
        convert them with ``process.m_from_beta``.
    :return: Tuple ``(obj, anno)``: the loader object, with its ``data``
        attribute replaced by the prepared data, and the annotation table
        restricted to the same probes.
    :raises ValueError: If ``units`` is not one of the supported values.
    """
    # Validate before any (slow) loading. Previously an unrecognised value
    # silently fell through and returned the unconverted data.
    supported_units = ('beta', 'm')
    if units not in supported_units:
        raise ValueError(
            "Unsupported units %r. Supported values are %s." % (
                units, ', '.join(repr(t) for t in supported_units)
            )
        )

    # patient data
    obj = loader.load_by_patient(pids, norm_method=norm_method)
    anno = loader.load_illumina_methylationepic_annotation()

    # reference data
    if ref_names is not None:
        ref_obj = loader.load_reference(ref_names, norm_method=norm_method)
        if ref_name_filter is not None:
            # return value is discarded, so this presumably filters ref_obj
            # in place - TODO confirm
            ref_obj.filter_by_sample_name(ref_name_filter, exact=True)
        obj = loader.loader.MultipleBatchLoader([obj, ref_obj])

    # discard any probe (row) with a missing value in at least one sample
    me_data = obj.data.dropna()
    if units == 'm':
        me_data = process.m_from_beta(me_data)

    # reduce anno and data down to common probes
    common_probes = anno.index.intersection(me_data.index)

    anno = anno.loc[common_probes]
    # dmr.add_merged_probe_classes(anno)
    me_data = me_data.loc[common_probes]
    obj.data = me_data

    return obj, anno
Exemple #2
0
    #     'ICb1299_shBMI1CHD7': 'shBMI1shCHD7',
    #     'p62_3_shBmi1': 'shBMI1',
    #     'p62_3_shChd7': 'shCHD7',
    #     'p62_3_shB+C': 'shBMI1shCHD7',
    #     'p62_3_Scr': 'scramble',
    #
    # })
    # condition = condition.loc[meta.index]
    # meta.insert(0, 'condition', condition)
    #
    # cell_line = pd.Series('3021', index=meta.index)
    # cell_line[cell_line.index.str.contains('1299')] = 'ICb1299'
    # cell_line[cell_line.index.str.contains('p62')] = 'ICb1299'
    # meta.insert(0, 'cell_line', cell_line)

    anno = loader.load_illumina_methylationepic_annotation()

    me_data = obj.data.dropna()
    me_data = process.m_from_beta(me_data)

    # reduce anno and data down to common probes
    common_probes = anno.index.intersection(me_data.index)

    anno = anno.loc[common_probes]

    # plot PCA

    p = pca.PCA()
    pca_dat = p.fit_transform(me_data.transpose())

    fig = plt.figure()
        df_row = df.loc[ix].tolist()
        if pd.isna(row.UCSC_RefGene_Name):
            gr = [('', '')]
        else:
            g = row.UCSC_RefGene_Name.split(';')
            r = row.UCSC_RefGene_Group.split(';')
            gr = sorted(set(zip(g, r)))
        for t in gr:
            this_res.append([ix] + df_row +
                            [row.CHR, row.Strand, row.MAPINFO, t[0], t[1]])
    return pd.DataFrame(this_res,
                        columns=['probe_id'] + df.columns.tolist() + anno_cols)


if __name__ == '__main__':
    anno = loader.load_illumina_methylationepic_annotation(split_genes=False)

    # 1. Annotate DMPs and re-export to Excel

    # dmp_fns = glob(os.path.join(GIT_LFS_DATA_DIR, 'mb_dmp', '*.xlsx'))
    dmp_fns = glob(os.path.join(os.path.expanduser('~/temp'), '*.xlsx'))

    print "Found %d relevant input (DMP) files: %s" % (len(dmp_fns),
                                                       ', '.join(dmp_fns))
    outdir = output.unique_output_dir("mb_dmps")
    res = {}

    for fn in dmp_fns:
        base = os.path.splitext(os.path.basename(fn))[0]
        res[base] = {}
        dat = pd.read_excel(fn, sheet_name=None)