Beispiel #1
def calc_stats(mfiles_pat, gff):
    """Print a CSV table of mean values per feature and region.

    One float32 array is loaded per sequence id "1".."5".  For every feature
    in ``Fat(gff)`` located on one of those sequences, a row is printed with
    the accession followed by the mean of that sequence's array over each of
    these regions: the feature span, its CDS, its introns, and the results
    of ``fat.upstream`` / ``fat.downstream`` at distances 10, 100 and 1000
    (``noncoding=True``).

    mfiles_pat -- ``%``-format pattern; formatted with the sequence id
                  string it must give the path of an existing binary file.
    gff        -- passed unchanged to ``Fat``.

    Output goes to stdout; nothing is returned.
    """
    fat = Fat(gff)

    hists = {}
    for i in range(1, 6):
        seqid = str(i)
        mf = mfiles_pat % seqid
        assert os.path.exists(mf), mf
        hists[seqid] = np.fromfile(mf, dtype=np.float32)

    header = "accn,gene,cds,intron,up10,up100,up1000,down10,down100,down1000"
    # Single-argument call form prints the same text on Python 2 and 3.
    print(header)

    for accn, f in sorted(fat.iteritems()):
        if f.seqid not in hists:
            continue  # C, G chrs
        hist = hists[f.seqid]

        # Same order as the columns named in the header.
        regions = (
            [[f.start, f.end]],
            getattr(f, 'CDS', None),
            fat.introns(f),
            fat.upstream(f, 10, noncoding=True),
            fat.upstream(f, 100, noncoding=True),
            fat.upstream(f, 1000, noncoding=True),
            fat.downstream(f, 10, noncoding=True),
            fat.downstream(f, 100, noncoding=True),
            fat.downstream(f, 1000, noncoding=True),
        )

        data = [accn]
        for locs in regions:
            if locs is None:
                # occurs when there's no CDS.
                # NOTE(review): the body of this branch was missing from the
                # source; a placeholder keeps the row aligned with the
                # header.  Confirm "na" is the marker downstream code expects.
                data.append("na")
                continue

            slicer = pairs_to_slice(locs)
            try:
                m = hist[slicer]
            except IndexError:
                # difference between fasta and features due to version:
                # drop the positions that lie beyond the end of the array.
                slicer = slicer[slicer < hist.shape[0]]
                m = hist[slicer]

            data.append("%.5f" % m.mean())

        print(",".join(data))
Beispiel #2
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator as a float, or NaN if denominator is 0."""
    if not denominator:
        return float("nan")
    return float(numerator) / float(denominator)


def calc_stats(mfiles_pat, gff):
    """Print a CSV table of per-context statistics per feature and region.

    For each sequence id "1".."5" two arrays are loaded: float32 values from
    ``mfiles_pat % seqid`` and uint8 type codes from the same path with
    ".methyl." replaced by ".methyltype.".  Type codes 1/4 are treated as
    context 'cg', 2/5 as 'chg' and 3/6 as 'chh'.

    For every feature in ``Fat(gff)`` on one of those sequences a row is
    printed: the accession, then for each context (sorted by name) and each
    region (feature span, CDS, upstream 10/100/1000, downstream 10/100/1000,
    all with ``noncoding=True``) five values:

      * mean value over all sites of the context,
      * mean value over the sites of the context whose value is > 0,
      * number of sites of the context whose value is > 0,
      * number of sites of the context,
      * the fraction of sites of the context whose value is > 0.

    Ratios with a zero denominator are reported as NaN.

    mfiles_pat -- ``%``-format pattern; formatted with the sequence id
                  string it must give the path of an existing binary file.
    gff        -- passed unchanged to ``Fat``.

    Output goes to stdout; nothing is returned.
    """
    fat = Fat(gff)

    methyl = {}
    contexts = {}
    for i in range(1, 6):
        seqid = str(i)
        mf = mfiles_pat % seqid
        assert os.path.exists(mf), mf
        methyl[seqid] = np.fromfile(mf, dtype=np.float32)
        mt = np.fromfile(mf.replace(".methyl.", ".methyltype."), dtype=np.uint8)
        # these can be used to mask to a given context.
        contexts[seqid] = {'cg': (mt == 1) | (mt == 4),
                           'chg': (mt == 2) | (mt == 5),
                           'chh': (mt == 3) | (mt == 6)}

    # NOTE(review): the original header-building lines were missing from the
    # source.  The column names below are reconstructed from the order in
    # which values are appended to each row; confirm against any consumer
    # that relies on specific column names.
    region_names = ("gene", "cds", "up10", "up100", "up1000",
                    "down10", "down100", "down1000")
    metric_names = ("avg", "avg_gt0", "n_methylated", "n_sites",
                    "p_methylated")
    header = ["accn"]
    # Alphabetical order matches sorted() over the context dict used below.
    for ctx_name in ('cg', 'chg', 'chh'):
        header.extend("%s_%s_%s" % (region, ctx_name, metric)
                      for region in region_names
                      for metric in metric_names)
    # Single-argument call form prints the same text on Python 2 and 3.
    print(",".join(header))

    for accn, f in sorted(fat.iteritems()):
        if f.seqid not in contexts:
            continue  # C, G
        values = methyl[f.seqid]

        # Same order as region_names.
        regions = (
            [[f.start, f.end]],
            getattr(f, 'CDS', None),
            fat.upstream(f, 10, noncoding=True),
            fat.upstream(f, 100, noncoding=True),
            fat.upstream(f, 1000, noncoding=True),
            fat.downstream(f, 10, noncoding=True),
            fat.downstream(f, 100, noncoding=True),
            fat.downstream(f, 1000, noncoding=True),
        )

        data = [accn]
        # Keys are unique, so sorting the items never compares the arrays.
        for mtype, context in sorted(contexts[f.seqid].items()):
            for locs in regions:
                if locs is None:
                    # occurs when there's no CDS.
                    # NOTE(review): the body of this branch was missing from
                    # the source; one placeholder per metric keeps the row
                    # aligned with the header.  Confirm "na" is acceptable.
                    data.extend(["na"] * len(metric_names))
                    continue

                slicer = pairs_to_slice(locs)
                try:
                    ctx = context[slicer]  # this context.
                except IndexError:
                    # difference between fasta and features due to version:
                    # drop the positions that lie beyond the end of the array.
                    slicer = slicer[slicer < context.shape[0]]
                    ctx = context[slicer]

                # values for this region masked to the current context
                me = values[slicer] * ctx
                # number of sites in this context.
                ctx_sites = ctx.sum()
                # number of sites in this context with a value > 0.
                m_ctx_sites = (me > 0).sum()
                total = me.sum()

                # proportion of sites in this context with a value > 0.
                p_methylated = _safe_ratio(m_ctx_sites, ctx_sites)
                # average over all sites in this context.
                avg_methyl = _safe_ratio(total, ctx_sites)
                # average over sites with a value > 0 (exclude zeros).
                avg_methyl_gt0 = _safe_ratio(total, m_ctx_sites)

                data.extend(["%.5f" % d for d in [avg_methyl, avg_methyl_gt0]])
                data.extend(["%i" % d for d in [m_ctx_sites, ctx_sites]])
                data.append("%.5f" % p_methylated)

        print(",".join(data))