Beispiel #1
0
def calc_stats(mfiles_pat, gff):
    """Print a CSV of per-region mean track values for every feature.

    ``mfiles_pat`` is a ``%``-style pattern with one placeholder; it is
    filled with the chromosome ids "1" through "5" and each resulting file
    is read as a flat float32 array.  ``gff`` is handed to ``Fat``.

    One row is written to stdout per feature whose ``seqid`` is one of the
    loaded chromosomes: the accession followed by one mean per column named
    in the header.  A column whose region is ``None`` is written as "na".
    """
    annot = Fat(gff)

    # One float32 track per chromosome, keyed by the id as a string.
    tracks = {}
    for chrom in map(str, range(1, 6)):
        path = mfiles_pat % chrom
        assert os.path.exists(path)
        tracks[chrom] = np.fromfile(path, dtype=np.float32)

    print("accn,gene,cds,intron,up10,up100,up1000,down10,down100,down1000")

    distances = (10, 100, 1000)
    for accn, feat in sorted(annot.iteritems()):
        if feat.seqid not in tracks:
            # C, G chrs
            continue
        track = tracks[feat.seqid]

        # Same order as the header columns: gene, cds, intron, up*, down*.
        regions = [[[feat.start, feat.end]],
                   getattr(feat, 'CDS', None),
                   annot.introns(feat)]
        regions += [annot.upstream(feat, dist, noncoding=True)
                    for dist in distances]
        regions += [annot.downstream(feat, dist, noncoding=True)
                    for dist in distances]

        row = [accn]
        for region in regions:
            if region is None:
                # occurs when there's no CDS.
                row.append("na")
                continue

            idx = pairs_to_slice(region)
            try:
                values = track[idx]
            except IndexError:
                # The annotation reaches past the end of the track (fasta
                # and features come from different versions): keep only
                # the positions that exist.
                idx = idx[idx < track.shape[0]]
                values = track[idx]

            row.append("%.5f" % values.mean())

        print(",".join(row))
Beispiel #2
0
def calc_stats(mfiles_pat, gff):
    """Write per-feature region means to stdout as comma-separated rows.

    For each chromosome id "1".."5" the file ``mfiles_pat % id`` is loaded
    as a float32 array.  Every feature from ``Fat(gff)`` located on one of
    those chromosomes produces a row holding its accession and, for each
    header column, the mean of the array over that region ("na" when the
    region is ``None``).
    """
    def read_track(chrom_id):
        # Load the flat float32 array for one chromosome.
        fname = mfiles_pat % chrom_id
        assert os.path.exists(fname)
        return np.fromfile(fname, dtype=np.float32)

    def cell(signal, locs):
        # Format the mean of `signal` over `locs` as one CSV cell.
        if locs is None:
            # occurs when there's no CDS.
            return "na"
        index = pairs_to_slice(locs)
        try:
            picked = signal[index]
        except IndexError:
            # difference between fasta and features due to version:
            # drop the positions beyond the end of the array.
            index = index[index < signal.shape[0]]
            picked = signal[index]
        return "%.5f" % picked.mean()

    features = Fat(gff)
    signals = dict((str(n), read_track(str(n))) for n in range(1, 6))

    header = "accn,gene,cds,intron,up10,up100,up1000,down10,down100,down1000"
    print(header)

    for accn, feature in sorted(features.iteritems()):
        signal = signals.get(feature.seqid)
        if signal is None:
            # C, G chrs
            continue

        # All regions are resolved up front, in header-column order.
        targets = (
            [[feature.start, feature.end]],
            getattr(feature, 'CDS', None),
            features.introns(feature),
            features.upstream(feature, 10, noncoding=True),
            features.upstream(feature, 100, noncoding=True),
            features.upstream(feature, 1000, noncoding=True),
            features.downstream(feature, 10, noncoding=True),
            features.downstream(feature, 100, noncoding=True),
            features.downstream(feature, 1000, noncoding=True),
        )
        print(",".join([accn] + [cell(signal, locs) for locs in targets]))
Beispiel #3
0
def _methylation_stats(values, in_context):
    """Return the five formatted CSV cells for one region in one context.

    ``values`` are the methylation values at the region's positions and
    ``in_context`` is the equally long boolean mask selecting the positions
    that belong to the context.  The cells are, in order: average over all
    context sites, average over methylated (> 0) context sites, number of
    methylated sites, number of context sites, and the proportion of
    context sites that are methylated.  A ratio whose denominator is zero
    is reported as "na" instead of dividing by zero.
    """
    # Methylation masked to the context: positions outside it become 0.
    masked = values * in_context
    n_sites = in_context.sum()
    n_methylated = (masked > 0).sum()
    total = masked.sum()

    if n_sites:
        avg = "%.5f" % (total / float(n_sites))
        proportion = "%.5f" % (float(n_methylated) / n_sites)
    else:
        # No site of this context in the region: both ratios are undefined.
        avg = proportion = "na"

    if n_methylated:
        avg_gt0 = "%.5f" % (total / float(n_methylated))
    else:
        avg_gt0 = "na"

    return [avg, avg_gt0, "%i" % n_methylated, "%i" % n_sites, proportion]


def calc_stats(mfiles_pat, gff):
    """Print per-feature, per-context methylation statistics as CSV.

    ``mfiles_pat`` is a ``%``-style pattern with one placeholder, filled
    with the chromosome ids "1" through "5".  Each resulting file is read
    as a float32 array of methylation values; the sibling file obtained by
    replacing ".methyl." with ".methyltype." in the path is read as a uint8
    array of context codes.  ``gff`` is handed to ``Fat``.

    For every feature on a loaded chromosome one row is written to stdout:
    the accession, then for each context (cg, chg, chh) and each region
    (gene, cds, intron, up/down 10/100/1000) the five statistics produced
    by ``_methylation_stats``.  A region that is ``None`` yields five "na"
    cells.  Note that the "%_methylated" columns hold a proportion in
    [0, 1], not a percentage.
    """
    fat = Fat(gff)

    # The header and the data rows are both generated from these tuples so
    # the column order cannot drift apart.
    context_names = ('cg', 'chg', 'chh')
    region_names = ('gene', 'cds', 'intron',
                    'up10', 'up100', 'up1000',
                    'down10', 'down100', 'down1000')
    stat_names = ('avg', 'avg_gt0', 'n_methylated', 'n', '%_methylated')

    methyl = {}
    contexts = {}
    for i in range(1, 6):
        i = str(i)
        mf = mfiles_pat % i
        assert os.path.exists(mf), "missing methylation file: %s" % mf
        methyl[i] = np.fromfile(mf, dtype=np.float32)
        mt = np.fromfile(mf.replace(".methyl.", ".methyltype."),
                         dtype=np.uint8)
        # Boolean masks used to restrict positions to a given context.
        # NOTE(review): each context matches two codes, presumably one per
        # strand -- confirm against the writer of the methyltype files.
        contexts[i] = {'cg': (mt == 1) | (mt == 4),
                       'chg': (mt == 2) | (mt == 5),
                       'chh': (mt == 3) | (mt == 6)}

    header = ["accn"]
    for ctx_name in context_names:
        for region in region_names:
            for stat in stat_names:
                header.append("_".join((region, ctx_name, stat)))
    print(",".join(header))

    for accn, f in sorted(fat.iteritems()):
        if f.seqid not in contexts:
            # C, G
            continue

        values = methyl[f.seqid]
        masks = contexts[f.seqid]

        # The regions do not depend on the context, so resolve them once
        # per feature.  Order must match `region_names`.
        regions = ([[f.start, f.end]],
                   getattr(f, 'CDS', None),
                   fat.introns(f),
                   fat.upstream(f, 10, noncoding=True),
                   fat.upstream(f, 100, noncoding=True),
                   fat.upstream(f, 1000, noncoding=True),
                   fat.downstream(f, 10, noncoding=True),
                   fat.downstream(f, 100, noncoding=True),
                   fat.downstream(f, 1000, noncoding=True))

        data = [accn]
        for ctx_name in context_names:
            mask = masks[ctx_name]
            for locs in regions:
                if locs is None:
                    # occurs when there's no CDS.
                    data.extend(["na"] * len(stat_names))
                    continue

                slicer = pairs_to_slice(locs)
                try:
                    in_context = mask[slicer]
                    region_values = values[slicer]
                except IndexError:
                    # difference between fasta and features due to version:
                    # keep only positions present in both arrays.
                    limit = min(mask.shape[0], values.shape[0])
                    slicer = slicer[slicer < limit]
                    in_context = mask[slicer]
                    region_values = values[slicer]

                data.extend(_methylation_stats(region_values, in_context))

        print(",".join(data))