Beispiel #1
0
def sj02bw(sj0, pathpre, genome, np=12):
    """Convert a junction table into one bigwig per strand.

    Per-chromosome wiggle files are produced in parallel by ``sj02wig``,
    concatenated per strand into ``pathpre+'.sj<s>.wig'`` (``<s>`` comes from
    ``STRANDMAP0``), converted to ``pathpre+'.sj<s>.bw'`` and then all
    intermediate wiggle files are removed.

    Args:
        sj0: junction dataframe with a 'chr' column; if it has no 'jcnt'
            column one is added IN PLACE as ucnt+mcnt
        pathpre: path prefix for intermediate and output files
        genome: genome name understood by the UT chromosome helpers
        np: number of worker processes handed to UT.process_mp

    """
    valid = set(UT.chroms(genome))
    # largest chromosomes first so the long jobs start early
    chromdf = UT.chromdf(genome).sort_values('size', ascending=False)
    chroms = [x for x in chromdf['chr'] if x in valid]
    chromdic = UT.df2dict(chromdf, 'chr', 'size')
    if 'jcnt' not in sj0:
        sj0['jcnt'] = sj0['ucnt']+sj0['mcnt']
    files = []
    args = []
    for c in chroms:
        # '{0}' is left in the name as a strand placeholder; it is filled in
        # below (and presumably by sj02wig when it writes the files)
        f = '{0}.{1}.{{0}}.wig'.format(pathpre, c)
        args.append((sj0[sj0['chr'] == c], c, chromdic[c], f))
        files.append(f)
    UT.process_mp(sj02wig, args, np=np, doreduce=False)
    chromsizes = UT.chromsizes(genome)
    rmfiles = []
    for strand in ['+', '-', '.']:
        s = STRANDMAP0[strand]
        wig = pathpre+'.sj{0}.wig'.format(s)
        bwpath = pathpre+'.sj{0}.bw'.format(s)
        with open(wig, 'w') as dst:
            for tmpl in files:
                f = tmpl.format(strand)
                with open(f, 'r') as src:
                    shutil.copyfileobj(src, dst)
                rmfiles.append(f)
        rmfiles.append(wig)
        wig2bw(wig, chromsizes, bwpath)
    # every per-chromosome and per-strand wig is already in rmfiles; the
    # previous extra unlink of the last wig always failed on a missing file
    for f in rmfiles:
        os.unlink(f)
    
Beispiel #2
0
def merge_bigwigs_mp(bwfiles, genome, dstpath, scale=None, np=7):
    """Merge bigwig files chromosome-by-chromosome in parallel.

    Each chromosome is handled by ``merge_bigwigs_chr`` which is given the
    target path ``dstpath+'.<chrom>.wig'``; its results are read as
    (chrom, wigpath) pairs. The per-chromosome wiggles are concatenated into
    ``dstpath+'.wig'``, converted to a bigwig at ``dstpath`` and the
    intermediate wiggles are deleted.

    Args:
        bwfiles: passed through to merge_bigwigs_chr
        genome: genome name understood by the UT chromosome helpers
        dstpath: destination bigwig path
        scale: passed through to merge_bigwigs_chr
        np: number of worker processes handed to UT.process_mp

    """
    names = UT.chroms(genome)
    chromfile = UT.chromsizes(genome)
    chromsizes = UT.df2dict(UT.chromdf(genome), 'chr', 'size')
    # biggest chromosomes go first so no large job is left running alone
    # at the end of the pool
    by_size = sorted(((chromsizes[c], c) for c in names), reverse=True)
    chroms = [name for _, name in by_size]
    partpaths = {c: dstpath + '.{0}.wig'.format(c) for c in chroms}
    args = [(bwfiles, c, chromsizes[c], partpaths[c], scale) for c in chroms]

    rslts = UT.process_mp(merge_bigwigs_chr, args, np, doreduce=False)
    produced = dict(rslts)

    LOG.debug('concatenating chromosomes...')
    wigpath = dstpath + '.wig'
    UT.makedirs(os.path.dirname(wigpath))
    with open(wigpath, 'wb') as dst:
        for c in chroms:
            with open(produced[c], 'rb') as src:
                shutil.copyfileobj(src, dst)

    LOG.debug('converting wiggle to bigwig')
    BT.wig2bw(wigpath, chromfile, dstpath)

    # remove whatever intermediate wiggles are still around
    leftovers = [partpaths[c] for c in chroms] + [wigpath]
    for path in leftovers:
        if os.path.exists(path):
            os.unlink(path)
Beispiel #3
0
def get_totbp_covbp_bw(bwfile, genome, chroms=None):
    """ Calculate total bp, covered bp, mean coverage, covered %.

    Args:
        bwfile: bigwig file
        genome: UCSC genome name
        chroms (list): of chromosomes (default: UT.chroms(genome))

    Returns:
        Pandas dataframe with one column per chromosome and rows
        'totbp' (sum of the coverage array), 'covbp' (number of positions
        with coverage > 0), 'acov' (totbp/covbp) and 'cov%'
        (covbp as a percentage of the chromosome size)

    """
    chromdf = UT.chromdf(genome).set_index('chr')['size']

    def one(chrom):
        # label-based lookup; Series.ix was removed in pandas 1.0
        csize = chromdf.loc[chrom]
        a = get_bigwig_as_array(bwfile, chrom, 0, csize)
        totbp = N.sum(a)
        covbp = N.sum(a > 0)
        acov = float(totbp) / covbp
        covp = (float(covbp) / csize) * 100.
        return {'totbp': totbp, 'covbp': covbp, 'acov': acov, 'cov%': covp}

    if chroms is None:
        chroms = UT.chroms(genome)
    df = PD.DataFrame({x: one(x) for x in chroms})
    return df
Beispiel #4
0
    def __call__(self):
        """Run filter_sj on every chromosome, then join the per-chromosome outputs.

        One (bwsjpre, statspath, chrom, chromsize, params) job per chromosome
        is dispatched through UT.process_mp. Afterwards the files
        bwsjpre+'.sjpath.<chrom>.filtered.bed.gz' are appended byte-for-byte,
        in chromosome order, into bwsjpre+'.sjpath.filtered.bed.gz'.
        """
        chroms = UT.chroms(self.genome)
        csizedic = UT.df2dict(UT.chromdf(self.genome), 'chr', 'size')
        jobs = [(self.bwsjpre, self.statspath, c, csizedic[c], self.params)
                for c in chroms]
        UT.process_mp(filter_sj, jobs, np=self.np, doreduce=False)

        merged = self.bwsjpre + '.sjpath.filtered.bed.gz'
        with open(merged, 'wb') as out:
            for c in chroms:
                part = self.bwsjpre + '.sjpath.{0}.filtered.bed.gz'.format(c)
                with open(part, 'rb') as inp:
                    shutil.copyfileobj(inp, out)
Beispiel #5
0
def estimatecovs(modelpre, bwpre, dstpre, genome, tcovth=1, np=6):
    """Estimate coverages bundle-by-bundle in parallel and concatenate the results.

    The model BED (modelpre+'.paths.withse.bed.gz') is split per chromosome
    into contiguous regions (UT.union_contiguous); regions are grouped in
    batches of about 1000 and each batch, padded by 100 bp on both sides and
    clipped to [0, chromosome size], becomes one ``bundle_estimator`` job.

    Args:
        modelpre: path prefix of the model files
        bwpre: passed through to bundle_estimator
        dstpre: destination prefix (also given to concatenate_bundles)
        genome: genome name used to look up chromosome sizes
        tcovth: passed through to bundle_estimator
        np: number of worker processes handed to UT.process_mp

    """
    bed = GGB.read_bed(modelpre + '.paths.withse.bed.gz')
    chroms = bed['chr'].unique()
    csizedic = UT.df2dict(UT.chromdf(genome), 'chr', 'size')
    bundles, args = [], []
    for chrom in chroms:
        onchrom = bed[(bed['chr'] == chrom)]
        regions = UT.union_contiguous(onchrom[['chr', 'st', 'ed']], returndf=True)
        # total about 30K => make batches of ~1000
        nregions = len(regions)
        nbatches = int(N.ceil(nregions / 1000.))
        for first in range(0, 1000 * nbatches, 1000):
            # NOTE(review): `last` is used as an inclusive index, so it equals
            # the next batch's `first` and adjacent batches share one region
            # -- confirm whether this overlap is intended
            last = min(first + 1000, nregions - 1)
            st = max(regions.iloc[first]['st'] - 100, 0)
            ed = min(regions.iloc[last]['ed'] + 100, csizedic[chrom])
            args.append([modelpre, bwpre, chrom, st, ed, dstpre, tcovth])
            bundles.append((chrom, st, ed))

    UT.process_mp(bundle_estimator, args, np=np, doreduce=False)

    concatenate_bundles(bundles, dstpre)
Beispiel #6
0
 def __call__(self):
     """Run the per-chromosome preparation tasks and their follow-up merges.

     For every chromosome three tasks are queued on a TQ.Server:
     prep_exwig_chr, prep_sjwig_chr and prep_sjpath_chr. Results are stored
     in self.exstatus / self.sjstatus / self.pastatus keyed by chromosome.
     Once a status dict holds one entry per chromosome the corresponding
     follow-up task (prep_exbw, prep_sjbw, prep_sjpath) is queued. The loop
     ends when all three follow-ups have reported back, or when
     server.check_error() returns a false value.
     """
     # exdf => ex.p, ex.n, ex.u
     # sjdf => sj.p, sj.n, sj.u
     # paths => sjpath.bed
     # divide into tasks (exdf,sjdf,paths) x chroms
     self.server = server = TQ.Server(name='PrepBWSJ', np=self.np)
     self.chroms = chroms = UT.chroms(self.genome)
     csizes = UT.df2dict(UT.chromdf(self.genome), 'chr', 'size')
     self.exstatus = exstatus = {}
     self.sjstatus = sjstatus = {}
     self.pastatus = pastatus = {}
     # (per-chrom task prefix, status dict, follow-up name, function, args)
     stages = [
         ('prep_exwig_chr.', exstatus, 'prep_exbw', prep_exbw,
          (self.dstpre, chroms, self.genome)),
         ('prep_sjwig_chr.', sjstatus, 'prep_sjbw', prep_sjbw,
          (self.dstpre, chroms, self.genome)),
         ('prep_sjpath_chr.', pastatus, 'prep_sjpath', prep_sjpath,
          (self.dstpre, chroms)),
     ]
     finished = {'prep_exbw': False, 'prep_sjbw': False,
                 'prep_sjpath': False}
     with server:
         for chrom in chroms:
             wigargs = (self.j2pres, self.libsizes, self.dstpre, chrom,
                        csizes[chrom])
             # exdf tasks
             server.add_task(TQ.Task('prep_exwig_chr.{0}'.format(chrom),
                                     prep_exwig_chr, wigargs))
             # sjdf tasks
             server.add_task(TQ.Task('prep_sjwig_chr.{0}'.format(chrom),
                                     prep_sjwig_chr, wigargs))
             # path tasks
             pathargs = (self.j2pres, self.libsizes, self.dstpre, chrom)
             server.add_task(TQ.Task('prep_sjpath_chr.{0}'.format(chrom),
                                     prep_sjpath_chr, pathargs))
         while server.check_error():
             try:
                 # wait up to 5 sec for a result
                 name, rslt = server.get_result(timeout=5)
             except TQ.Empty:
                 continue
             if name is None:
                 continue
             for prefix, status, tname, func, targs in stages:
                 if not name.startswith(prefix):
                     continue
                 # split only at the first '.', so chromosome names that
                 # themselves contain '.' are kept whole
                 chrom = name.split('.', 1)[1]
                 status[chrom] = rslt
                 if len(status) == len(chroms):  # all finished
                     print('$$$$$$$$ putting in {0} $$$$$$$$$$$'.format(
                         tname))
                     server.add_task(TQ.Task(tname, func, targs))
             if name in finished:
                 print('$$$$$$$$ {0} done $$$$$$$$$$$'.format(name))
                 finished[name] = True
             if all(finished.values()):
                 break
         print('Exit Loop')
     print('Done')
Beispiel #7
0
def _process_mapbed_chr(dstpre, chrom, genome, chromdir, stranded):
    """Make exon/junction coverage wiggles and a junction-path BED for one chromosome.

    Reads dstpre+'.<chrom>.bed' (7 tab-separated columns: chr, st, ed, name,
    mapq, strand, mapid). Every record adds 1 to the '.ex' coverage array;
    consecutive records sharing the same mapid are treated as segments of one
    spliced read, and the gap between neighbouring segments adds 1 to the
    '.sj' coverage array. Records whose name is listed in
    dstpre+'.dupitems.txt.gz' are left out of the '.uniq' arrays.

    Args:
        dstpre: path prefix of the input and output files
        chrom: chromosome name
        genome: genome name used to look up the chromosome size
        chromdir: passed to FA.GenomeFASTAChroms; the sequence is used to read
            the two bases at each end of a junction
        stranded: combined with each record's strand field as the key into
            STRANDMAP

    Outputs:
        dstpre+{'.ex','.sj'}+{'','.uniq'}+{'.p','.n','.u'}+'.<chrom>.wig'
        dstpre+'.<chrom>.sjpath.bed' (BED12; sc1: ucnt, sc2: jcnt)

    """
    bedpath = dstpre+'.{0}.bed'.format(chrom)
    # names of reads to exclude from the '.uniq' tracks
    dupids = UT.read_pandas(dstpre+'.dupitems.txt.gz', index_col=[0]).index
    gfc = FA.GenomeFASTAChroms(chromdir)
    chromsize = UT.df2dict(UT.chromdf(genome), 'chr', 'size')[chrom]

    # one full-length coverage array per kind/strand/uniqueness combination;
    # stale output files from a previous run are removed up front
    wigs = {}
    wigpaths = {}
    for kind in ['.ex','.sj']:
        wigs[kind] = {}
        wigpaths[kind] = {}
        for strand in ['.p','.n','.u']:
            wigs[kind][strand] = {}
            wigpaths[kind][strand] = {}
            for suf in ['','.uniq']:
                wigpath = dstpre+kind+suf+strand+'.{0}.wig'.format(chrom)
                if os.path.exists(wigpath):
                    os.unlink(wigpath)
                wigpaths[kind][strand][suf] = wigpath
                wigs[kind][strand][suf] = N.zeros(chromsize, dtype=float)

    sjs = [] # path: (chr, st, ed, pcode, ucnt, strand, acnt)
    # pcode = a(apos)d(dpos) = a(ed)d(st) if strand=='+' else a(st)d(ed)
    # ucnt = unique read counts
    # acnt = multi-read adjusted all counts (=ucnt+Sum(mcnt(i)/dup(i)))
    # delete previous
    sjbed12 = dstpre+'.{0}.sjpath.bed'.format(chrom)
    if os.path.exists(sjbed12):
        os.unlink(sjbed12)

    def _write_arrays():
        # dump every coverage array to its wiggle file
        for kind in ['.ex','.sj']:
            for strand in ['.p','.n','.u']:
                for suf in ['','.uniq']:
                    cybw.array2wiggle_chr64(wigs[kind][strand][suf], chrom,  wigpaths[kind][strand][suf], 'w')

    def _write_sj(sjs):
        # aggregate identical junction paths (same name) and write them as BED12
        # sjs = [(chr,st,ed,pathcode(name),ureads(sc1),strand,tst,ted,areads(sc2),cse),...]
        sjdf = PD.DataFrame(sjs, columns=GGB.BEDCOLS[:9]+['cse'])
        sjdfgr = sjdf.groupby('name')
        sj = sjdfgr.first()
        sj['sc1'] = sjdfgr['sc1'].sum().astype(int) # ucnt
        sj['sc2'] = sjdfgr['sc2'].sum().astype(int) # jcnt=ucnt+mcnt
        sj['st'] = sjdfgr['st'].min()
        sj['ed'] = sjdfgr['ed'].max()
        sj['#exons'] = sj['cse'].apply(len)+1
        # exon starts/ends relative to st, derived from the junction list
        sj['ests'] = [[0]+[z[1]-st for z in cse] for st,cse in sj[['st','cse']].values]
        sj['eeds'] = [[z[0]-st for z in cse]+[ed-st] for st,ed,cse in sj[['st','ed','cse']].values]
        esizes = [[u-v for u,v in zip(x,y)] for x,y in sj[['eeds','ests']].values]
        sj['estarts'] = ['{0},'.format(','.join([str(y) for y in x])) for x in sj['ests']]
        sj['esizes'] = ['{0},'.format(','.join([str(y) for y in x])) for x in esizes]
        sj['name'] = sj.index
        with open(sjbed12, 'w') as f:
            sj[GGB.BEDCOLS].to_csv(f, index=False, header=False, sep='\t', quoting=csv.QUOTE_NONE)

    def _append_sj(cse, css, csj, chrom,ureads,areads):
        # record the junction path of the read collected so far (if spliced)
        if (len(cse)>0): # spits out splice rec
            # chr,st,ed,pathcode,ureads,strand,tst,ted,areads
            tst = cse[0][0]
            ted = cse[-1][1]
            if len(css)>0:
                # majority vote over the strands inferred for each junction
                strand = Counter(css).most_common()[0][0]
            else:
                strand = '.'
            name = pathcode(cse, strand)
            st = int(csj[0][1]) # first segment start
            ed = int(csj[-1][2]) # last segment end
            sjs.append((chrom,st,ed,name,ureads,strand,tst,ted,areads,cse))

    def _add_to_ex_arrays(st,ed,dup,strand):
        kind='.ex'
        strand = STRANDMAP[(strand,stranded)]
        dic = wigs[kind][strand]
        dic[''][st:ed] += 1
        if not dup:
            dic['.uniq'][st:ed] += 1

    def _add_to_sj_arrays(sst,sed,dup,strand):
        kind='.sj'
        s = {'+':'.p','-':'.n','.':'.u'}[strand]
        dic = wigs[kind][s]
        # add to the arrays
        dic[''][sst:sed] += 1
        if not dup:
            dic['.uniq'][sst:sed] += 1
            ureads,areads = 1,1
        else:
            ureads,areads = 0,1
        return ureads,areads

    csj = [] # current collection of spliced reads
    css = [] # current strands
    cse = [] # current (sst,sed)
    ureads,areads = 1,1 # uniq, total reads it's either 1,1 or 0,1
    pmid = None # previous map id common to spliced segments
    # the context manager closes the BED file once it has been read
    with open(bedpath,'rb') as fp:
        for line in fp:
            rec = line.strip().split(b'\t')
            # 7 column bed: chr(0), st(1), ed(2), name(3), mapq(4), strand(5), mapid(6)
            st,ed = int(rec[1]),int(rec[2])
            # NOTE(review): rec[3] is bytes (file is read in binary mode); this
            # only matches if dupids holds bytes as well -- confirm
            dup = rec[3] in dupids
            estrand = rec[5]
            _add_to_ex_arrays(st,ed,dup,estrand)
            # process splice
            if pmid != rec[6]: # new map
                _append_sj(cse, css, csj, chrom, ureads, areads)
                csj,css,cse = [rec],[],[] # reset running params
            else: # add segments
                csj.append(rec)
                prec = csj[-2] # previous rec
                sst = int(prec[2]) # ed of previous segment
                sed = int(rec[1]) # st of current segment
                cse.append((sst,sed))
                # find strand from the 2 bases at each end of the gap
                sted = gfc.get(chrom,sst,sst+2)+gfc.get(chrom,sed-2,sed)
                strand = STED2STRAND.get(sted,'.')
                if strand != '.':
                    css.append(strand)
                ureads,areads = _add_to_sj_arrays(sst,sed,dup,strand)
            pmid = rec[6]

    # flush the last read
    _append_sj(cse, css, csj, chrom, ureads, areads)

    _write_arrays()
    _write_sj(sjs)
Beispiel #8
0
def process_mapbed(bedpath, dstpre, genome, chromdir, stranded='.', np=3):
    """Turn a mapped-read BED file into coverage bigwigs and a junction-path BED.

    The input is split per chromosome (splitbedgz), each chromosome is
    processed in parallel by _process_mapbed_chr, and the per-chromosome
    results are concatenated. Intermediate per-chromosome files and the
    merged wiggle files are deleted at the end.

    Args:
        bedpath: path to gzipped BED7 file (converted from BAM)
        dstpre: path prefix to destination
        genome: UCSC genome (mm10 etc.)
        chromdir: directory containing chromosome sequence in FASTA
        stranded: passed through to _process_mapbed_chr
        np: number of CPU to use

    Outputs:
        dstpre+K+U+S+'.bw' for every combination of
            K in ('.ex', '.sj'), U in ('', '.uniq'), S in ('.p', '.n', '.u'),
            e.g. dstpre+'.ex.p.bw', dstpre+'.sj.uniq.n.bw'
            (a bigwig is only made when the merged wiggle is non-empty)
        dstpre+'.sjpath.bed' BED12 (sc1:ucnt, sc2:jcnt=ucnt+mcnt), which is
            then handed to UT.compress
    """
    def _chrombed(c):
        # per-chromosome BED written by splitbedgz
        return dstpre+'.{0}.bed'.format(c)

    allchroms = UT.chroms(genome)
    chromdf = UT.chromdf(genome)
    chromsizes = UT.chromsizes(genome)

    # split into chroms
    UT.makedirs(dstpre)
    splitbedgz(bedpath, dstpre) # ~30sec
    # only chromosomes that actually received reads are processed
    chroms = [c for c in allchroms if os.path.exists(_chrombed(c))]
    _scan_make_map([_chrombed(c) for c in chroms], dstpre+'.dupitems.txt.gz')

    splitbeds = [_chrombed(c) for c in chromdf['chr'].values] # to be deleted
    jobs = [(dstpre, c, genome, chromdir, stranded) for c in chroms]
    # spread to CPUs
    UT.process_mp2(_process_mapbed_chr, jobs, np=np, doreduce=False)

    # concatenate chr files
    temps = []
    sjpath = dstpre+'.sjpath.bed'
    LOG.info('making {0}...'.format(sjpath))
    with open(sjpath, 'wb') as out:
        for c in chroms:
            part = dstpre+'.{0}.sjpath.bed'.format(c)
            temps.append(part)
            with open(part, 'rb') as inp:
                shutil.copyfileobj(inp, out)
    UT.compress(sjpath)

    prefixes = [dstpre+kind+suf+strand
                for kind in ['.ex','.sj']
                for strand in ['.p','.n','.u']
                for suf in ['','.uniq']]
    for pre in prefixes:
        wigpath = pre+'.wig'
        bwpath = pre+'.bw'
        with open(wigpath, 'wb') as out:
            for c in chroms:
                part = pre+'.{0}.wig'.format(c)
                temps.append(part)
                if not os.path.exists(part):
                    continue
                with open(part, 'rb') as inp:
                    shutil.copyfileobj(inp, out)
        LOG.info('making {0}...'.format(bwpath))
        if os.path.getsize(wigpath)>0:
            wig2bw(wigpath, chromsizes, bwpath)
        temps.append(wigpath)

    # clean up temp files
    LOG.info('deleting intermediate files...')
    for path in splitbeds+temps:
        if os.path.exists(path):
            LOG.debug('deleting {0}...'.format(path))
            os.unlink(path)