def sj02bw(sj0, pathpre, genome, np=12):
    """Write splice-junction bigwig files, one per strand.

    The junction table is split by chromosome and ``sj02wig`` is run on each
    piece in parallel.  The per-chromosome wiggle files are then concatenated
    per strand and converted with ``wig2bw``.  All intermediate wiggle files
    are removed at the end.

    Args:
        sj0: junction table indexable by column name.  Needs a 'chr' column
            and either 'jcnt' or both 'ucnt' and 'mcnt'.  NOTE: when 'jcnt'
            is missing it is added to ``sj0`` in place (ucnt + mcnt).
        pathpre: path prefix for all outputs and temporary files.
        genome: genome name handed to the ``UT`` chromosome helpers.
        np: number of worker processes.

    Outputs:
        pathpre + '.sj{s}.bw' for s = STRANDMAP0['+'], STRANDMAP0['-'],
        STRANDMAP0['.'].
    """
    chroms = UT.chroms(genome)
    # process large chromosomes first
    chromdf = UT.chromdf(genome).sort_values('size', ascending=False)
    chroms = [x for x in chromdf['chr'] if x in chroms]
    chromdic = UT.df2dict(chromdf, 'chr', 'size')
    if 'jcnt' not in sj0:
        sj0['jcnt'] = sj0['ucnt'] + sj0['mcnt']

    # One file-name template per chromosome; '{0}' is later filled with the
    # strand symbol.
    # NOTE(review): assumes sj02wig writes tmpl.format(strand) for each of
    # '+', '-', '.' -- confirm against sj02wig.
    templates = []
    args = []
    for c in chroms:
        tmpl = '{0}.{1}.{{0}}.wig'.format(pathpre, c)
        args.append((sj0[sj0['chr'] == c], c, chromdic[c], tmpl))
        templates.append(tmpl)
    UT.process_mp(sj02wig, args, np=np, doreduce=False)

    chromsizes = UT.chromsizes(genome)
    rmfiles = []
    for strand in ['+', '-', '.']:
        s = STRANDMAP0[strand]
        wig = pathpre + '.sj{0}.wig'.format(s)
        bwpath = pathpre + '.sj{0}.bw'.format(s)
        with open(wig, 'w') as dst:
            for tmpl in templates:
                f = tmpl.format(strand)
                with open(f, 'r') as src:
                    shutil.copyfileobj(src, dst)
                rmfiles.append(f)
        rmfiles.append(wig)
        wig2bw(wig, chromsizes, bwpath)

    # Every temporary file (including each merged wig) is listed exactly once
    # in rmfiles; the previous extra os.unlink(wig) deleted the last merged
    # wig a second time and raised FileNotFoundError.
    for f in rmfiles:
        if os.path.exists(f):
            os.unlink(f)
def merge_bigwigs_mp(bwfiles, genome, dstpath, scale=None, np=7):
    """Merge bigwig files into one bigwig, working chromosome by chromosome.

    ``merge_bigwigs_chr`` is run in parallel, one task per chromosome, each
    writing to dstpath + '.<chrom>.wig'.  The per-chromosome results are
    concatenated into dstpath + '.wig', converted to bigwig at ``dstpath``
    and the intermediate wiggle files are removed.

    Args:
        bwfiles: bigwig inputs, passed unchanged to ``merge_bigwigs_chr``.
        genome: genome name handed to the ``UT`` chromosome helpers.
        dstpath: path of the resulting bigwig.
        scale: passed unchanged to ``merge_bigwigs_chr``.
        np: number of worker processes.
    """
    all_chroms = UT.chroms(genome)
    chromfile = UT.chromsizes(genome)
    sizes = UT.df2dict(UT.chromdf(genome), 'chr', 'size')

    # Biggest chromosomes go first so that a large one (e.g. chrX) is not
    # left running alone at the end while the other workers sit idle.
    chroms = sorted(all_chroms, key=lambda c: (sizes[c], c), reverse=True)

    part_paths = {c: dstpath + '.{0}.wig'.format(c) for c in chroms}
    tasks = [(bwfiles, c, sizes[c], part_paths[c], scale) for c in chroms]
    results = UT.process_mp(merge_bigwigs_chr, tasks, np, doreduce=False)
    # each result is a pair; the first item is used as the chromosome key
    produced = dict(results)

    LOG.debug('concatenating chromosomes...')
    wigpath = dstpath + '.wig'
    UT.makedirs(os.path.dirname(wigpath))
    with open(wigpath, 'wb') as merged:
        for c in chroms:
            with open(produced[c], 'rb') as part:
                shutil.copyfileobj(part, merged)

    LOG.debug('converting wiggle to bigwig')
    BT.wig2bw(wigpath, chromfile, dstpath)

    # remove whatever temporary wiggle files are still around
    for tmp in [part_paths[c] for c in chroms] + [wigpath]:
        if os.path.exists(tmp):
            os.unlink(tmp)
def get_totbp_covbp_bw(bwfile, genome, chroms=None):
    """ Calculate total bp, covered bp, mean coverage, covered %.

    Args:
        bwfile: bigwig file
        genome: UCSC genome name
        chroms (list): of chromosomes (default: all of UT.chroms(genome))

    Returns:
        Pandas dataframe with one column per chromosome and the rows
        'totbp' (sum of the signal), 'covbp' (number of positions > 0),
        'acov' (totbp / covbp) and 'cov%' (covbp / chromosome size * 100).
        'acov' is not finite for a chromosome without any covered position.

    """
    chromsizes = UT.chromdf(genome).set_index('chr')['size']

    def one(chrom):
        # Label-based lookup; Series.ix was removed in pandas 1.0 and .loc is
        # its replacement for label access.
        csize = chromsizes.loc[chrom]
        a = get_bigwig_as_array(bwfile, chrom, 0, csize)
        totbp = N.sum(a)
        covbp = N.sum(a > 0)
        acov = float(totbp) / covbp
        covp = (float(covbp) / csize) * 100.
        return {'totbp': totbp, 'covbp': covbp, 'acov': acov, 'cov%': covp}

    if chroms is None:
        chroms = UT.chroms(genome)
    return PD.DataFrame({x: one(x) for x in chroms})
def __call__(self):
    """Run ``filter_sj`` on every chromosome and merge the filtered outputs.

    One task per chromosome is dispatched through ``UT.process_mp``.  After
    all tasks return, the per-chromosome files
    bwsjpre + '.sjpath.<chrom>.filtered.bed.gz' are concatenated byte for
    byte, in chromosome order, into bwsjpre + '.sjpath.filtered.bed.gz'.
    """
    chroms = UT.chroms(self.genome)
    sizes = UT.df2dict(UT.chromdf(self.genome), 'chr', 'size')
    tasks = [
        (self.bwsjpre, self.statspath, c, sizes[c], self.params)
        for c in chroms
    ]
    UT.process_mp(filter_sj, tasks, np=self.np, doreduce=False)

    merged_path = self.bwsjpre + '.sjpath.filtered.bed.gz'
    with open(merged_path, 'wb') as merged:
        for c in chroms:
            part_path = self.bwsjpre + '.sjpath.{0}.filtered.bed.gz'.format(c)
            with open(part_path, 'rb') as part:
                shutil.copyfileobj(part, merged)
def estimatecovs(modelpre, bwpre, dstpre, genome, tcovth=1, np=6):
    """Estimate coverages bundle by bundle in parallel and merge the results.

    Paths are read from modelpre + '.paths.withse.bed.gz'.  For every
    chromosome the contiguous regions of the paths are computed and grouped
    into batches of about 1000 regions.  Each batch becomes one genomic
    window (padded by 100 bp, clipped to the chromosome) that is handed to
    ``bundle_estimator``.  ``concatenate_bundles`` is called last with the
    list of windows.

    Args:
        modelpre: path prefix of the model files.
        bwpre: passed unchanged to ``bundle_estimator``.
        dstpre: path prefix of the outputs.
        genome: genome name handed to the ``UT`` chromosome helpers.
        tcovth: passed unchanged to ``bundle_estimator``.
        np: number of worker processes.
    """
    batch = 1000  # regions per window (about 30K regions in total)
    pad = 100     # bp added on both sides of a window

    bed = GGB.read_bed(modelpre + '.paths.withse.bed.gz')
    csizedic = UT.df2dict(UT.chromdf(genome), 'chr', 'size')

    bundles = []
    tasks = []
    for chrom in bed['chr'].unique():
        onchrom = bed[(bed['chr'] == chrom)]
        uc = UT.union_contiguous(onchrom[['chr', 'st', 'ed']], returndf=True)
        n = len(uc)
        nbatches = (n + batch - 1) // batch  # ceil(n / batch)
        for i in range(nbatches):
            first = batch * i
            # NOTE(review): the last index is inclusive, so consecutive
            # windows share one region -- confirm this overlap is intended.
            last = min(batch * (i + 1), n - 1)
            st = max(uc.iloc[first]['st'] - pad, 0)
            ed = min(uc.iloc[last]['ed'] + pad, csizedic[chrom])
            tasks.append([modelpre, bwpre, chrom, st, ed, dstpre, tcovth])
            bundles.append((chrom, st, ed))

    UT.process_mp(bundle_estimator, tasks, np=np, doreduce=False)
    concatenate_bundles(bundles, dstpre)
def __call__(self):
    """Run the per-chromosome preparation tasks and their merge steps.

    Three kinds of tasks are queued for every chromosome:

    * prep_exwig_chr  (exdf  => ex.p, ex.n, ex.u)
    * prep_sjwig_chr  (sjdf  => sj.p, sj.n, sj.u)
    * prep_sjpath_chr (paths => sjpath.bed)

    As soon as all chromosomes of one kind have reported back, the matching
    aggregation task (prep_exbw, prep_sjbw or prep_sjpath) is queued.  The
    loop ends when the three aggregation tasks are done, or when
    ``server.check_error()`` returns a false value.

    Per-chromosome results are stored in ``self.exstatus``,
    ``self.sjstatus`` and ``self.pastatus``, keyed by chromosome name.
    """
    self.server = server = TQ.Server(name='PrepBWSJ', np=self.np)
    self.chroms = chroms = UT.chroms(self.genome)
    csizes = UT.df2dict(UT.chromdf(self.genome), 'chr', 'size')
    self.exstatus = exstatus = {}
    self.sjstatus = sjstatus = {}
    self.pastatus = pastatus = {}
    exdone = False
    sjdone = False
    padone = False
    with server:
        for chrom in chroms:
            # exon coverage wiggle task
            tname = 'prep_exwig_chr.{0}'.format(chrom)
            args = (self.j2pres, self.libsizes, self.dstpre, chrom,
                    csizes[chrom])
            server.add_task(TQ.Task(tname, prep_exwig_chr, args))
            # junction coverage wiggle task
            tname = 'prep_sjwig_chr.{0}'.format(chrom)
            args = (self.j2pres, self.libsizes, self.dstpre, chrom,
                    csizes[chrom])
            server.add_task(TQ.Task(tname, prep_sjwig_chr, args))
            # junction path task (does not need the chromosome size)
            tname = 'prep_sjpath_chr.{0}'.format(chrom)
            args = (self.j2pres, self.libsizes, self.dstpre, chrom)
            server.add_task(TQ.Task(tname, prep_sjpath_chr, args))

        while server.check_error():
            try:
                # wait up to 5 seconds for the next finished task
                name, rslt = server.get_result(timeout=5)
            except TQ.Empty:
                name, rslt = None, None
            if name is None:
                # nothing finished, so no state changed; poll again
                continue

            # Task names are '<kind>.<chrom>'.  Split only at the first dot
            # so chromosome names that contain dots are kept whole;
            # truncated names could collide and the aggregation task would
            # then never be queued.
            if name.startswith('prep_exwig_chr.'):
                chrom = name.split('.', 1)[1]
                exstatus[chrom] = rslt
                if len(exstatus) == len(chroms):  # all finished
                    print('$$$$$$$$ putting in prep_exbw $$$$$$$$$$$')
                    args = (self.dstpre, chroms, self.genome)
                    server.add_task(TQ.Task('prep_exbw', prep_exbw, args))
            elif name.startswith('prep_sjwig_chr.'):
                chrom = name.split('.', 1)[1]
                sjstatus[chrom] = rslt
                if len(sjstatus) == len(chroms):  # all finished
                    print('$$$$$$$$ putting in prep_sjbw $$$$$$$$$$$')
                    args = (self.dstpre, chroms, self.genome)
                    server.add_task(TQ.Task('prep_sjbw', prep_sjbw, args))
            elif name.startswith('prep_sjpath_chr.'):
                chrom = name.split('.', 1)[1]
                pastatus[chrom] = rslt
                if len(pastatus) == len(chroms):  # all finished
                    print('$$$$$$$$ putting in prep_sjpath $$$$$$$$$$$')
                    args = (self.dstpre, chroms)
                    server.add_task(TQ.Task('prep_sjpath', prep_sjpath, args))
            elif name == 'prep_exbw':
                print('$$$$$$$$ prep_exbw done $$$$$$$$$$$')
                exdone = True
            elif name == 'prep_sjbw':
                print('$$$$$$$$ prep_sjbw done $$$$$$$$$$$')
                sjdone = True
            elif name == 'prep_sjpath':
                print('$$$$$$$$ prep_sjpath done $$$$$$$$$$$')
                padone = True

            if exdone and sjdone and padone:
                break
        print('Exit Loop')
    print('Done')
def _process_mapbed_chr(dstpre, chrom, genome, chromdir, stranded):
    """Build coverage wiggles and the junction-path BED for one chromosome.

    Reads dstpre + '.<chrom>.bed' (7 column BED: chr, st, ed, name, mapq,
    strand, mapid) line by line.  Consecutive lines that share a mapid are
    treated as segments of one spliced alignment; the gaps between the
    segments are the junctions.

    Args:
        dstpre: path prefix of the per-chromosome inputs and outputs.
        chrom: chromosome name.
        genome: genome name handed to the ``UT`` chromosome helpers.
        chromdir: directory handed to ``FA.GenomeFASTAChroms``; the sequence
            at both ends of a junction is used to call its strand.
        stranded: second half of the ``STRANDMAP`` key used to pick the
            exon array for a read.

    Outputs:
        dstpre + kind + suf + strand + '.<chrom>.wig' for kind in
        ('.ex', '.sj'), suf in ('', '.uniq'), strand in ('.p', '.n', '.u'),
        and dstpre + '.<chrom>.sjpath.bed' (BED12 plus extra columns as
        listed in GGB.BEDCOLS).
    """
    # 1st pass: calc dupdic
    bedpath = dstpre+'.{0}.bed'.format(chrom)
    dupids = UT.read_pandas(dstpre+'.dupitems.txt.gz', index_col=[0]).index
    # 2nd pass make wiggles
    gfc = FA.GenomeFASTAChroms(chromdir)
    chromsize = UT.df2dict(UT.chromdf(genome), 'chr', 'size')[chrom]

    # mqth MAPQ threshold there are ~6% <10
    # one coverage array per (kind, strand, suffix); stale wiggles of a
    # previous run are removed
    wigs = {}
    wigpaths = {}
    for kind in ['.ex','.sj']:
        wigs[kind] = {}
        wigpaths[kind] = {}
        for strand in ['.p','.n','.u']:
            wigs[kind][strand] = {}
            wigpaths[kind][strand] = {}
            for suf in ['','.uniq']:
                wigpath = dstpre+kind+suf+strand+'.{0}.wig'.format(chrom)
                if os.path.exists(wigpath):
                    os.unlink(wigpath)
                wigpaths[kind][strand][suf] = wigpath
                wigs[kind][strand][suf] = N.zeros(chromsize, dtype=float)

    sjs = [] # path: (chr, st, ed, pcode, ucnt, strand, acnt)
    # pcode = a(apos)d(dpos) = a(ed)d(st) if strand=='+' else a(st)d(ed)
    # ucnt = unique read counts
    # acnt = multi-read adjusted all counts (=ucnt+Sum(mcnt(i)/dup(i)))
    # delete previous
    sjbed12 = dstpre+'.{0}.sjpath.bed'.format(chrom)
    if os.path.exists(sjbed12):
        os.unlink(sjbed12)

    def _write_arrays():
        # dump every coverage array as a wiggle file
        for kind in ['.ex','.sj']:
            for strand in ['.p','.n','.u']:
                for suf in ['','.uniq']:
                    cybw.array2wiggle_chr64(wigs[kind][strand][suf], chrom,
                                            wigpaths[kind][strand][suf], 'w')

    def _write_sj(sjs):
        # sjs = [(chr,st,ed,pathcode(name),ureads(sc1),strand,tst,ted,areads(sc2),cse),...]
        # records with the same path code are collapsed into one row
        sjdf = PD.DataFrame(sjs, columns=GGB.BEDCOLS[:9]+['cse'])
        sjdfgr = sjdf.groupby('name')
        sj = sjdfgr.first()
        sj['sc1'] = sjdfgr['sc1'].sum().astype(int) # ucnt
        sj['sc2'] = sjdfgr['sc2'].sum().astype(int) # jcnt=ucnt+mcnt
        sj['st'] = sjdfgr['st'].min()
        sj['ed'] = sjdfgr['ed'].max()
        sj['#exons'] = sj['cse'].apply(len)+1
        sj['ests'] = [[0]+[z[1]-st for z in cse] for st,cse in sj[['st','cse']].values]
        sj['eeds'] = [[z[0]-st for z in cse]+[ed-st] for st,ed,cse in sj[['st','ed','cse']].values]
        esizes = [[u-v for u,v in zip(x,y)] for x,y in sj[['eeds','ests']].values]
        sj['estarts'] = ['{0},'.format(','.join([str(y) for y in x])) for x in sj['ests']]
        sj['esizes'] = ['{0},'.format(','.join([str(y) for y in x])) for x in esizes]
        sj['name'] = sj.index
        # sj = sj.reset_index()
        with open(sjbed12, 'w') as f:
            sj[GGB.BEDCOLS].to_csv(f, index=False, header=False, sep='\t', quoting=csv.QUOTE_NONE)

    def _append_sj(cse, css, csj, chrom,ureads,areads):
        # records the finished alignment if it had at least one junction
        if (len(cse)>0): # spits out splice rec
            # chr,st,ed,pathcode,ureads,strand,tst,ted,areads
            tst = cse[0][0]
            ted = cse[-1][1]
            if len(css)>0:
                # majority vote over the strands called at its junctions
                strand = Counter(css).most_common()[0][0]
            else:
                strand = '.'
            name = pathcode(cse, strand)
            st = int(csj[0][1]) # first segment start
            ed = int(csj[-1][2]) # last segment end
            sjs.append((chrom,st,ed,name,ureads,strand,tst,ted,areads,cse))

    def _add_to_ex_arrays(st,ed,dup,strand):
        kind='.ex'
        strand = STRANDMAP[(strand,stranded)]
        dic = wigs[kind][strand]
        dic[''][st:ed] += 1
        if not dup:
            dic['.uniq'][st:ed] += 1

    def _add_to_sj_arrays(sst,sed,dup,strand):
        kind='.sj'
        s = {'+':'.p','-':'.n','.':'.u'}[strand]
        dic = wigs[kind][s]
        # add to the arrays
        dic[''][sst:sed] += 1
        if not dup:
            dic['.uniq'][sst:sed] += 1
            ureads,areads = 1,1
        else:
            ureads,areads = 0,1
        return ureads,areads

    csj = [] # current collection of spliced reads
    css = [] # current strands
    cse = [] # current (sst,sed)
    ureads,areads = 1,1 # uniq, total reads it's either 1,1 or 0,1
    pmid = None # previous map id common to spliced segments
    # The input is read inside a context manager so that the handle is
    # closed even when a record cannot be parsed.
    with open(bedpath,'rb') as fp:
        for line in fp:
            rec = line.strip().split(b'\t')
            # 7 column bed: chr(0), st(1), ed(2), name(3), mapq(4), strand(5), mapid(6)
            st,ed = int(rec[1]),int(rec[2])
            # NOTE(review): rec[3] is bytes here; if dupids holds str labels
            # this test is always False on Python 3 -- confirm.
            dup = rec[3] in dupids #dic[rec[3]]
            estrand = rec[5]
            _add_to_ex_arrays(st,ed,dup,estrand)
            # process splice
            if pmid != rec[6]: # new map
                _append_sj(cse, css, csj, chrom, ureads, areads)
                csj,css,cse = [rec],[],[] # reset running params
            else: # add segments
                csj.append(rec)
                prec = csj[-2] # previous rec
                sst = int(prec[2]) # ed of previous segment
                sed = int(rec[1]) # st of current segment
                cse.append((sst,sed))
                # find strand
                sted = gfc.get(chrom,sst,sst+2)+gfc.get(chrom,sed-2,sed)
                strand = STED2STRAND.get(sted,'.')
                if strand != '.':
                    css.append(strand)
                ureads,areads = _add_to_sj_arrays(sst,sed,dup,strand)
            pmid = rec[6]
    # flush the last alignment
    _append_sj(cse, css, csj, chrom, ureads, areads)

    _write_arrays()
    _write_sj(sjs)
def process_mapbed(bedpath, dstpre, genome, chromdir, stranded='.', np=3):
    """Turn a BED7 file of mapped reads into coverage bigwigs and a junction
    path file.

    The input is split by chromosome, duplicated items are collected with
    ``_scan_make_map``, every chromosome is processed in parallel by
    ``_process_mapbed_chr`` and the per-chromosome results are concatenated.

    Args:
        bedpath: path to gzipped BED7 file (converted from BAM)
        dstpre: path prefix to destination
        genome: UCSC genome (mm10 etc.)
        chromdir: directory containing chromosome sequence in FASTA
        stranded: passed unchanged to ``_process_mapbed_chr``
        np: number of CPU to use

    Outputs:
        dstpre + kind + suf + strand + '.bw' for kind in ('.ex', '.sj'),
        strand in ('.p', '.n', '.u') and suf in ('', '.uniq'), i.e.
        '.ex.p.bw', '.ex.uniq.p.bw', ..., '.sj.u.bw', '.sj.uniq.u.bw'.
        A bigwig is only made when its concatenated wiggle is not empty.

        dstpre + '.sjpath.bed', which is then handed to ``UT.compress``.
        BED12 (sc1:ucnt, sc2:jcnt=ucnt+mcnt)

        dstpre + '.dupitems.txt.gz' is written along the way and is not
        removed by the clean up.
    """
    def chrbed(c):
        # per-chromosome piece of the input
        return dstpre+'.{0}.bed'.format(c)

    chroms = UT.chroms(genome)
    chromdf = UT.chromdf(genome)
    chromsizes = UT.chromsizes(genome)

    # split into chroms
    UT.makedirs(dstpre)
    splitbedgz(bedpath, dstpre) # ~30sec
    duppath = dstpre+'.dupitems.txt.gz'
    # only chromosomes that actually received reads
    chroms = [c for c in chroms if os.path.exists(chrbed(c))]
    _scan_make_map([chrbed(c) for c in chroms], duppath)

    # to be deleted
    splitfiles = [chrbed(c) for c in chromdf['chr'].values]

    # spread to CPUs
    tasks = [(dstpre, c, genome, chromdir, stranded) for c in chroms]
    UT.process_mp2(_process_mapbed_chr, tasks, np=np, doreduce=False)

    # concatenate chr files
    tmpfiles = []
    dstpath = dstpre+'.sjpath.bed'
    LOG.info('making {0}...'.format(dstpath))
    with open(dstpath, 'wb') as merged:
        for c in chroms:
            part = dstpre+'.{0}.sjpath.bed'.format(c)
            tmpfiles.append(part)
            with open(part, 'rb') as src:
                shutil.copyfileobj(src, merged)
    dstpath = UT.compress(dstpath)

    prefixes = [
        dstpre+kind+suf+strand
        for kind in ['.ex','.sj']
        for strand in ['.p','.n','.u']
        for suf in ['','.uniq']
    ]
    for pre in prefixes:
        wigpath = pre+'.wig'
        bwpath = pre+'.bw'
        with open(wigpath, 'wb') as merged:
            for c in chroms:
                part = pre+'.{0}.wig'.format(c)
                tmpfiles.append(part)
                if not os.path.exists(part):
                    continue
                with open(part,'rb') as src:
                    shutil.copyfileobj(src, merged)
        LOG.info('making {0}...'.format(bwpath))
        if os.path.getsize(wigpath)>0:
            wig2bw(wigpath, chromsizes, bwpath)
        tmpfiles.append(wigpath)

    # clean up temp files
    LOG.info('deleting intermediate files...')
    for path in splitfiles+tmpfiles:
        if not os.path.exists(path):
            continue
        LOG.debug('deleting {0}...'.format(path))
        os.unlink(path)