def test_chromsizes():
    """Check the chromosome lists for mm10 and dm3 and that size files exist."""
    # mm10: autosomes chr1..chr19 followed by the two sex chromosomes
    mm10_expected = ['chr{0}'.format(num) for num in range(1, 20)]
    mm10_expected.extend(['chrX', 'chrY'])
    assert UT.chroms('mm10') == mm10_expected

    dm3_expected = [
        'chr2L', 'chr2LHet', 'chr2R', 'chr2RHet',
        'chr3L', 'chr3LHet', 'chr3R', 'chr3RHet',
        'chr4', 'chrX', 'chrXHet', 'chrYHet',
        'chrU', 'chrUextra',
    ]
    assert UT.chroms('dm3') == dm3_expected

    # UT.chromsizes returns a path; it must point at an existing file
    for genome in ('mm10', 'dm3', 'hg19'):
        assert os.path.exists(UT.chromsizes(genome))
def sj02bw(sj0, pathpre, genome, np=12):
    """Write one junction bigwig per strand from a junction dataframe.

    Per-chromosome wiggle files are produced in parallel by ``sj02wig``,
    concatenated into one wiggle file per strand and converted with
    ``wig2bw``. All intermediate wiggle files are removed at the end.

    Args:
        sj0: junction dataframe with a 'chr' column and either a 'jcnt'
            column or 'ucnt' and 'mcnt' columns. NOTE: when 'jcnt' is
            missing it is added to ``sj0`` in place (ucnt + mcnt).
        pathpre: path prefix for the outputs; bigwigs are written to
            pathpre + '.sj{s}.bw' where s is STRANDMAP0[strand].
        genome: UCSC genome name (passed to UT.chroms / UT.chromdf /
            UT.chromsizes).
        np: number of processes handed to UT.process_mp.
    """
    chromdf = UT.chromdf(genome).sort_values('size', ascending=False)
    allowed = UT.chroms(genome)
    # process chromosomes in decreasing size order
    chroms = [c for c in chromdf['chr'] if c in allowed]
    chromdic = UT.df2dict(chromdf, 'chr', 'size')
    if 'jcnt' not in sj0:
        sj0['jcnt'] = sj0['ucnt'] + sj0['mcnt']

    files = []
    args = []
    for c in chroms:
        # '{{0}}' leaves a '{0}' placeholder that is later filled with the strand
        f = '{0}.{1}.{{0}}.wig'.format(pathpre, c)
        args.append((sj0[sj0['chr'] == c], c, chromdic[c], f))
        files.append(f)
    UT.process_mp(sj02wig, args, np=np, doreduce=False)

    rmfiles = []
    for strand in ['+', '-', '.']:
        s = STRANDMAP0[strand]
        wig = pathpre + '.sj{0}.wig'.format(s)
        bwpath = pathpre + '.sj{0}.bw'.format(s)
        with open(wig, 'w') as dst:
            for tmpl in files:
                f = tmpl.format(strand)
                with open(f, 'r') as src:
                    shutil.copyfileobj(src, dst)
                rmfiles.append(f)
        rmfiles.append(wig)
        wig2bw(wig, UT.chromsizes(genome), bwpath)

    # Every merged wig is already in rmfiles; remove each file exactly once.
    # (The previous trailing os.unlink(wig) deleted the last wig a second
    # time and raised FileNotFoundError.)
    for f in rmfiles:
        if os.path.exists(f):
            os.unlink(f)
def merge_bigwigs_mp(bwfiles, genome, dstpath, scale=None, np=7):
    """Merge several bigwig files into one, one chromosome per worker.

    Args:
        bwfiles: bigwig files to merge (passed to merge_bigwigs_chr).
        genome: UCSC genome name.
        dstpath: destination bigwig path; also used as the prefix of the
            temporary wiggle files.
        scale: passed through to merge_bigwigs_chr.
        np: number of processes handed to UT.process_mp.
    """
    genome_chroms = UT.chroms(genome)
    chromfile = UT.chromsizes(genome)
    chromsizes = UT.df2dict(UT.chromdf(genome), 'chr', 'size')

    def chrom_wig(chrom):
        # temporary per-chromosome wiggle path
        return dstpath + '.{0}.wig'.format(chrom)

    # largest chromosomes first, so that chrX doesn't get processed alone
    # at the end wasting MP time
    chroms = sorted(genome_chroms,
                    key=lambda chrom: (chromsizes[chrom], chrom),
                    reverse=True)
    tasks = [(bwfiles, chrom, chromsizes[chrom], chrom_wig(chrom), scale)
             for chrom in chroms]
    results = UT.process_mp(merge_bigwigs_chr, tasks, np, doreduce=False)
    wig_by_chrom = dict(results)

    LOG.debug('concatenating chromosomes...')
    wigpath = dstpath + '.wig'
    UT.makedirs(os.path.dirname(wigpath))
    with open(wigpath, 'wb') as merged:
        for chrom in chroms:
            with open(wig_by_chrom[chrom], 'rb') as part:
                shutil.copyfileobj(part, merged)

    LOG.debug('converting wiggle to bigwig')
    BT.wig2bw(wigpath, chromfile, dstpath)

    # clean up: per-chromosome wiggles first, then the merged wiggle
    leftovers = [chrom_wig(chrom) for chrom in chroms]
    leftovers.append(wigpath)
    for path in leftovers:
        if os.path.exists(path):
            os.unlink(path)
def get_totbp_covbp_bw(bwfile, genome, chroms=None):
    """ Calculate total bp, covered bp, mean coverage, covered %.

    Args:
        bwfile: bigwig file
        genome: UCSC genome name
        chroms (list): of chromosomes; defaults to UT.chroms(genome)

    Returns:
        Pandas dataframe with one column per chromosome and the rows
        'totbp', 'covbp', 'acov' and 'cov%'.

    """
    # Series mapping chromosome name -> size
    chromdf = UT.chromdf(genome).set_index('chr')['size']

    def one(chrom):
        # .loc replaces the .ix indexer, which was removed in pandas 1.0
        csize = chromdf.loc[chrom]
        a = get_bigwig_as_array(bwfile, chrom, 0, csize)
        totbp = N.sum(a)
        covbp = N.sum(a > 0)
        # NOTE(review): covbp is 0 when no position is covered, which makes
        # this a division by zero -- confirm callers tolerate the result.
        acov = float(totbp) / covbp
        covp = (float(covbp) / csize) * 100.
        return {'totbp': totbp, 'covbp': covbp, 'acov': acov, 'cov%': covp}

    if chroms is None:
        chroms = UT.chroms(genome)
    df = PD.DataFrame({x: one(x) for x in chroms})
    return df
def __call__(self):
    """Run filter_sj on every chromosome, then concatenate the outputs.

    The per-chromosome files self.bwsjpre + '.sjpath.{chrom}.filtered.bed.gz'
    are appended, in UT.chroms order, into
    self.bwsjpre + '.sjpath.filtered.bed.gz'.
    """
    chroms = UT.chroms(self.genome)
    size_of = UT.df2dict(UT.chromdf(self.genome), 'chr', 'size')
    tasks = [
        (self.bwsjpre, self.statspath, chrom, size_of[chrom], self.params)
        for chrom in chroms
    ]
    UT.process_mp(filter_sj, tasks, np=self.np, doreduce=False)

    merged_path = self.bwsjpre + '.sjpath.filtered.bed.gz'
    with open(merged_path, 'wb') as merged:
        for chrom in chroms:
            part_path = (self.bwsjpre +
                         '.sjpath.{0}.filtered.bed.gz'.format(chrom))
            # byte-wise append of each per-chromosome file
            with open(part_path, 'rb') as part:
                shutil.copyfileobj(part, merged)
def calc_flux_mp(self, beddf, np=10):
    """Run calc_flux_chr per chromosome in parallel and collect the results.

    Args:
        beddf: dataframe with at least 'chr' and '_id' columns; rows are
            grouped by chromosome and chromosomes without rows are skipped.
        np: number of processes handed to UT.process_mp2.

    Returns:
        Pandas dataframe with columns CALCFLUXCOLS, plus any COPYCOLS
        columns present in beddf (matched on '_id') and a 'len' column
        (ed - st).
    """
    chroms = UT.chroms(self.genome)
    args = []
    for c in chroms:
        bedc = beddf[beddf['chr'] == c]
        if len(bedc) > 0:
            args.append((bedc, self.bwpre))
    rslts = UT.process_mp2(calc_flux_chr, args, np=np, doreduce=True)
    df = PD.DataFrame(rslts, columns=CALCFLUXCOLS)
    # Re-order the input rows to match the result rows by '_id'.
    # .loc replaces the .ix indexer, which was removed in pandas 1.0.
    exdfi = beddf.set_index('_id').loc[df['_id'].values]
    for f in COPYCOLS:
        if f in exdfi:
            df[f] = exdfi[f].values
    df['len'] = df['ed'] - df['st']
    return df
def calc_params_mp(self, beddf, win=600, siz=10, direction='>', gapmode='53', np=10, covfactor=0):
    """Run calc_params_chr per chromosome in parallel and collect the results.

    Args:
        beddf: dataframe with at least 'chr' and '_id' columns; rows are
            grouped by chromosome and chromosomes without rows are skipped.
        win, siz, direction, gapmode, covfactor: passed through unchanged
            to calc_params_chr.
        np: number of processes handed to UT.process_mp2.

    Returns:
        Pandas dataframe with columns CALCPARAMCOLS, plus any COPYCOLS
        columns present in beddf (matched on '_id') and a 'len' column
        (ed - st).
    """
    chroms = UT.chroms(self.genome)
    args = []
    for c in chroms:
        bedc = beddf[beddf['chr'] == c]
        if len(bedc) > 0:
            args.append((bedc, self.bwpre, win, siz, direction, gapmode, covfactor))
    rslts = UT.process_mp2(calc_params_chr, args, np=np, doreduce=True)
    df = PD.DataFrame(rslts, columns=CALCPARAMCOLS)
    # Re-order the input rows to match the result rows by '_id'.
    # .loc replaces the .ix indexer, which was removed in pandas 1.0.
    exdfi = beddf.set_index('_id').loc[df['_id'].values]
    for f in COPYCOLS:
        if f in exdfi:
            df[f] = exdfi[f].values
    df['len'] = df['ed'] - df['st']
    return df
def __call__(self):
    """Schedule per-chromosome prep tasks and their three finalizing tasks.

    For every chromosome three tasks are queued on a TQ.Server:
    prep_exwig_chr, prep_sjwig_chr and prep_sjpath_chr. Once every
    chromosome of one kind has reported a result, the matching finalizing
    task (prep_exbw, prep_sjbw or prep_sjpath) is queued. The loop ends
    when all three finalizing tasks have reported, or when
    server.check_error() returns a false value.

    Per-chromosome results are stored in self.exstatus, self.sjstatus and
    self.pastatus, keyed by chromosome name.
    """
    self.server = server = TQ.Server(name='PrepBWSJ', np=self.np)
    self.chroms = chroms = UT.chroms(self.genome)
    csizes = UT.df2dict(UT.chromdf(self.genome), 'chr', 'size')
    self.exstatus = exstatus = {}
    self.sjstatus = sjstatus = {}
    self.pastatus = pastatus = {}

    # (per-chrom task-name prefix, status dict, finalizing task name,
    #  finalizing function, finalizing args)
    bwargs = (self.dstpre, chroms, self.genome)
    stages = (
        ('prep_exwig_chr.', exstatus, 'prep_exbw', prep_exbw, bwargs),
        ('prep_sjwig_chr.', sjstatus, 'prep_sjbw', prep_sjbw, bwargs),
        ('prep_sjpath_chr.', pastatus, 'prep_sjpath', prep_sjpath,
         (self.dstpre, chroms)),
    )
    # finalizing tasks that have not reported yet
    unfinished = set(stage[2] for stage in stages)

    with server:
        for chrom in chroms:
            wigargs = (self.j2pres, self.libsizes, self.dstpre, chrom,
                       csizes[chrom])
            # exon wiggle task
            tname = 'prep_exwig_chr.{0}'.format(chrom)
            server.add_task(TQ.Task(tname, prep_exwig_chr, wigargs))
            # junction wiggle task
            tname = 'prep_sjwig_chr.{0}'.format(chrom)
            server.add_task(TQ.Task(tname, prep_sjwig_chr, wigargs))
            # junction path task (takes no chromosome size)
            tname = 'prep_sjpath_chr.{0}'.format(chrom)
            pathargs = (self.j2pres, self.libsizes, self.dstpre, chrom)
            server.add_task(TQ.Task(tname, prep_sjpath_chr, pathargs))

        while server.check_error():
            try:
                # wait up to 5 seconds for the next result
                name, rslt = server.get_result(timeout=5)
            except TQ.Empty:
                continue
            if name is None:
                continue

            for prefix, status, final_name, final_func, final_args in stages:
                if not name.startswith(prefix):
                    continue
                # Everything after the prefix is the chromosome name. Taking
                # name.split('.')[1] truncated names that contain a dot, so
                # the status dict never reached len(chroms) and the
                # finalizing task was never queued.
                status[name[len(prefix):]] = rslt
                if len(status) == len(chroms):  # all finished
                    print('$$$$$$$$ putting in {0} $$$$$$$$$$$'.format(
                        final_name))
                    server.add_task(
                        TQ.Task(final_name, final_func, final_args))

            if name in unfinished:
                print('$$$$$$$$ {0} done $$$$$$$$$$$'.format(name))
                unfinished.discard(name)
            if not unfinished:
                break
        print('Exit Loop')
    print('Done')
def process_mapbed(bedpath, dstpre, genome, chromdir, stranded='.', np=3):
    """
    Split a mapped-read BED file by chromosome, process every chromosome in
    parallel, then merge the per-chromosome outputs and delete the
    intermediate files.

    Args:
        bedpath: path to gzipped BED7 file (converted from BAM)
        dstpre: path prefix to destination
        genome: UCSC genome (mm10 etc.)
        chromdir: directory containing chromosome sequence in FASTA
        stranded: passed through to _process_mapbed_chr
        np: number of CPU to use

    Outputs:
        * dstpre + kind + suf + strand + '.bw' for kind in ('.ex', '.sj'),
          suf in ('', '.uniq') and strand in ('.p', '.n', '.u'), e.g.
          dstpre+'.ex.p.bw' or dstpre+'.sj.uniq.n.bw'. A bigwig is only
          made when the merged wiggle file is not empty.
        * dstpre+'.sjpath.bed', which is then handed to UT.compress
          (BED12, sc1:ucnt, sc2:jcnt=ucnt+mcnt)
        * dstpre+'.dupitems.txt.gz' is the path given to _scan_make_map
    """
    genome_chroms = UT.chroms(genome)
    chromdf = UT.chromdf(genome)
    chromsizes = UT.chromsizes(genome)

    def chrom_bed(chrom):
        # per-chromosome BED path
        return dstpre + '.{0}.bed'.format(chrom)

    # split into chroms
    UT.makedirs(dstpre)
    splitbedgz(bedpath, dstpre)  # ~30sec
    duppath = dstpre + '.dupitems.txt.gz'
    # only keep chromosomes for which a split file exists
    chroms = [c for c in genome_chroms if os.path.exists(chrom_bed(c))]
    _scan_make_map([chrom_bed(c) for c in chroms], duppath)

    # split files to be deleted at the end
    split_files = [chrom_bed(c) for c in chromdf['chr'].values]

    # spread to CPUs
    tasks = [(dstpre, c, genome, chromdir, stranded) for c in chroms]
    UT.process_mp2(_process_mapbed_chr, tasks, np=np, doreduce=False)

    # concatenate chr files
    temp_files = []
    sjpath = dstpre + '.sjpath.bed'
    LOG.info('making {0}...'.format(sjpath))
    with open(sjpath, 'wb') as merged:
        for c in chroms:
            part_path = dstpre + '.{0}.sjpath.bed'.format(c)
            temp_files.append(part_path)
            with open(part_path, 'rb') as part:
                shutil.copyfileobj(part, merged)
    UT.compress(sjpath)

    for kind in ['.ex', '.sj']:
        for strand in ['.p', '.n', '.u']:
            for suf in ['', '.uniq']:
                pre = dstpre + kind + suf + strand
                wigpath = pre + '.wig'
                bwpath = pre + '.bw'
                with open(wigpath, 'wb') as merged:
                    for c in chroms:
                        part_path = pre + '.{0}.wig'.format(c)
                        temp_files.append(part_path)
                        # a chromosome may have no wiggle of this kind
                        if not os.path.exists(part_path):
                            continue
                        with open(part_path, 'rb') as part:
                            shutil.copyfileobj(part, merged)
                LOG.info('making {0}...'.format(bwpath))
                # skip the conversion when the merged wiggle is empty
                if os.path.getsize(wigpath) > 0:
                    wig2bw(wigpath, chromsizes, bwpath)
                temp_files.append(wigpath)

    # clean up temp files
    LOG.info('deleting intermediate files...')
    for path in split_files + temp_files:
        if not os.path.exists(path):
            continue
        LOG.debug('deleting {0}...'.format(path))
        os.unlink(path)