def merge_bigwigs_mp(bwfiles, genome, dstpath, scale=None, np=7):
    """Merge bigwig files into one, working chromosome by chromosome.

    Every chromosome is handed to ``merge_bigwigs_chr`` through
    ``UT.process_mp``.  The per-chromosome wiggle files it reports are then
    concatenated into ``dstpath + '.wig'``, converted with ``BT.wig2bw`` and
    finally the intermediate wiggle files are removed.

    Args:
        bwfiles: passed through unchanged to ``merge_bigwigs_chr``
        genome: genome identifier understood by the ``UT`` chromosome helpers
        dstpath: path of the bigwig to create
        scale: passed through unchanged to ``merge_bigwigs_chr``
        np: number of processes handed to ``UT.process_mp``
    """
    names = UT.chroms(genome)
    sizefile = UT.chromsizes(genome)
    size_of = UT.df2dict(UT.chromdf(genome), 'chr', 'size')

    # Largest chromosomes first, so that a big one (e.g. chrX) does not end
    # up being processed alone at the end while the other workers sit idle.
    ordered = sorted(names, key=lambda c: (size_of[c], c), reverse=True)

    # per-chromosome wiggle path, used both as worker argument and for cleanup
    part_of = {c: dstpath + '.{0}.wig'.format(c) for c in ordered}
    jobs = [(bwfiles, c, size_of[c], part_of[c], scale) for c in ordered]
    # workers report (chromosome, wiggle path) pairs
    produced = dict(UT.process_mp(merge_bigwigs_chr, jobs, np, doreduce=False))

    LOG.debug('concatenating chromosomes...')
    merged = dstpath + '.wig'
    UT.makedirs(os.path.dirname(merged))
    with open(merged, 'wb') as sink:
        for c in ordered:
            with open(produced[c], 'rb') as part:
                shutil.copyfileobj(part, sink)

    LOG.debug('converting wiggle to bigwig')
    BT.wig2bw(merged, sizefile, dstpath)

    # clean up: per-chromosome pieces first, then the concatenated wiggle
    for leftover in [part_of[c] for c in ordered] + [merged]:
        if os.path.exists(leftover):
            os.unlink(leftover)
def bw2bed(bwfile, bedfile, chroms, th, compress=True):
    """Transform BigWig coverage into a binary BED by thresholding.

    Args:
        bwfile: path to BigWig file
        bedfile: path of the BED file to write; a trailing '.gz' is stripped
            to obtain the uncompressed path that is actually written
        chroms: list of chromosome names
        th: coverage threshold
        compress: if true (default) the written BED is passed to UT.compress

    Returns:
        path to generated BED file: the value returned by UT.compress when
        ``compress`` is true, otherwise the uncompressed path.  When
        UT.notstale reports the '.gz' file as up to date with respect to
        ``bwfile``, that '.gz' path is returned without recomputing.

    Raises:
        RuntimeError: if ``bwfile`` does not exist
    """
    bedbase = bedfile[:-3] if bedfile.endswith('.gz') else bedfile
    gzpath = bedbase + '.gz'
    if UT.notstale(bwfile, gzpath):
        return gzpath
    # make sure bwfile exists
    if not os.path.exists(bwfile):
        raise RuntimeError('BigWig file {0} does not exist.'.format(bwfile))
    processor = apply_threshold(bwfile, th, chroms)
    UT.makedirs(os.path.dirname(bedfile))
    # The context manager closes the file even if the processor raises, and
    # writelines streams the rows instead of building one huge string.
    with open(bedbase, 'w') as out:
        # NOTE: do not append an extra '\n' after the rows; per the original
        # author's note a blank line ends up between chromosomes in
        # multiprocessing mode and makes bedtools stop at chr1.
        out.writelines('%s\t%i\t%i\n' % x for x in processor)
    if compress:
        return UT.compress(bedbase)
    return bedbase
def write_iso(self, fname, maxisonum):
    """Dump the rows produced by ``gen_iso_all`` as tab-separated text.

    Args:
        fname: destination path; a trailing '.gz' is dropped before writing
        maxisonum: forwarded to ``self.gen_iso_all`` as ``maxisonum``

    The plain-text file is always handed to ``UT.compress`` afterwards.
    """
    target = fname[:-3] if fname[-3:] == '.gz' else fname
    UT.makedirs(os.path.dirname(target))
    with open(target, 'w') as out:
        for row in self.gen_iso_all(maxisonum=maxisonum):
            fields = [str(value) for value in row]
            out.write('\t'.join(fields) + '\n')
    UT.compress(target)
def wig2bw(wigpath, chromsizes, bwpath):
    """Convert a WIGGLE file to bigwig by running Kent's wigToBigWig.

    Args:
        wigpath: path to the input wiggle file
        chromsizes: path to the chromosome sizes file
        bwpath: path of the bigwig to create (its directory is created first)

    Returns:
        the exit status reported by ``subprocess.call``
    """
    UT.makedirs(os.path.dirname(bwpath))
    return subprocess.call(['wigToBigWig', wigpath, chromsizes, bwpath])
def write_ggb(df, fname, cols, mode='w'):
    """Write selected dataframe columns as headerless tab-separated text.

    Args:
        df: dataframe holding 'st' and 'ed' plus the columns named in cols
        fname: destination path; a trailing '.gz' requests compression
        cols: columns to write, in output order
        mode: mode string passed to open() (default 'w')

    Returns:
        the result of UT.compress when fname ended in '.gz', otherwise the
        path that was written
    """
    compress = fname[-3:] == '.gz'
    if compress:
        fname = fname[:-3]

    # Coordinates are written as integers; conversion happens on a copy so
    # the caller's dataframe is left untouched.
    if any(df.dtypes[name] != int for name in ('st', 'ed')):
        LOG.warning('st,ed not integer: copy and converting')
        df = df.copy()
        for name in ('st', 'ed'):
            df[name] = df[name].astype(int)

    UT.makedirs(os.path.dirname(fname))
    with open(fname, mode) as out:
        df[cols].to_csv(out, index=False, header=False, sep='\t',
                        quoting=csv.QUOTE_NONE)
    return UT.compress(fname) if compress else fname
def save(self):
    """Write stats, ratio tables and detection percentages to output files.

    Three files are produced, all named through ``self.en2.fname2`` with the
    code '<en1.code>.<datacode>' and category 'output':
    'stats.json', 'ratios.txt.gz' and 'dp.txt.gz'.
    """
    code = '{0}.{1}'.format(self.en1.code, self.datacode)

    # Light-weight stats (auc, detected1, sigmoid, maxx, avgx, avgy, ...)
    # are kept in a dict so that others can use them => JSON.
    statspath = self.en2.fname2('stats.json', code, category='output')
    UT.makedirs(os.path.dirname(statspath))
    with open(statspath, 'w') as fobj:
        json.dump(self.stats, fobj)

    # [i,5,5b,3,3b,s,sb,j]: cov(x), ratio(y); [glc,ecc,jcc]: gcov(x), ratio(y)
    # => each table is tagged in place with its kind, then all are stacked
    #    into one dataframe.
    ratiopath = self.en2.fname2('ratios.txt.gz', code, category='output')
    for kind, table in self.ratios.items():
        table['kind'] = kind
    stacked = PD.concat(self.ratios.values(), ignore_index=True)
    UT.write_pandas(stacked, ratiopath, 'h')

    # detection percentages (DP)
    percentages = self.get_detection_percentages()
    dppath = self.en2.fname2('dp.txt.gz', code, category='output')
    UT.write_pandas(percentages, dppath, 'ih')
def wrap(*args, **kwargs):
    """Call ``func`` with a non-'.gz' output path, compressing afterwards.

    The output path is taken from keyword ``outname`` when present,
    otherwise from positional index ``pos``.  A trailing '.gz' is removed
    before ``func`` runs and the result is passed to UT.compress afterwards.

    Returns:
        the UT.compress result when '.gz' was requested, else the plain path

    Raises:
        RuntimeError: when ``func`` returns something other than ``noerr``
    """
    by_keyword = outname in kwargs
    if by_keyword:
        target = kwargs[outname]
    else:
        target = args[pos]
        args = list(args)  # tuple => list so the path can be replaced

    # check output '.gz'
    compress = target[-3:] == '.gz'
    if compress:
        target = target[:-3]
    UT.makedirs(os.path.dirname(target))

    # hand the stripped path back through the same channel it came from
    if by_keyword:
        kwargs[outname] = target
    else:
        args[pos] = target

    err = func(*args, **kwargs)
    if err != noerr:
        LOG.warning('bederror:{0}, err={1}'.format(func.__name__, err))
        raise RuntimeError(func.__name__)
    return UT.compress(target) if compress else target
def bam2bw(fpath, chromsizes, bpath, aligned=None):
    """Generate normalized coverage from BAM.

    Coverage is scaled by 1e6 / aligned, written by genomeCoverageBed to a
    temporary file ``bpath + '.wig'`` and converted with wigToBigWig.  The
    temporary file is removed afterwards, also when a step raises.

    Args:
        fpath (str): path to BAM
        chromsizes (str): path to chromsizes file
        bpath (str): path to BIGWIG
        aligned (int): number of aligned reads, if None it is obtained from
            the BAM with cnt_bam

    Raises:
        ZeroDivisionError: if ``aligned`` is 0

    Requires Bedtools (genomeCoverageBed) and Kent Tool (wigToBigWig).
    The exit codes of the two tools are not checked.
    """
    # count reads
    if aligned is None:
        aligned = cnt_bam(fpath)
    scale = 1000000. / float(aligned)

    tpath = bpath + '.wig'
    UT.makedirs(os.path.dirname(tpath))
    cmd1 = [
        'genomeCoverageBed', '-split', '-bg', '-ibam', fpath,
        '-g', chromsizes, '-scale', str(scale)
    ]
    cmd2 = ['wigToBigWig', tpath, chromsizes, bpath]
    try:
        # convert to wig; the context manager closes the handle even if the
        # tool cannot be started
        with open(tpath, 'wb') as tfobj:
            subprocess.call(cmd1, stdout=tfobj)
        # convert wig to bigwig
        subprocess.call(cmd2)
    finally:
        # remove temporary file (it may not exist if open() itself failed)
        if os.path.exists(tpath):
            os.remove(tpath)
def test_makedirs(tmpdir):
    """UT.makedirs creates nested directories, tolerates an existing
    directory and raises OSError when the path is an existing file."""
    path = os.path.join(str(tmpdir), 'a/b/c')
    UT.makedirs(path)
    assert os.path.exists(path)
    # should not raise
    UT.makedirs(path)
    # make a file (closed explicitly so the content is flushed and no
    # handle is leaked)
    path2 = os.path.join(str(tmpdir), 'a/b/c/d')
    with open(path2, 'w') as fobj:
        fobj.write('test\n')
    # should raise
    with pytest.raises(OSError):
        UT.makedirs(path2)
def process_mapbed(bedpath, dstpre, genome, chromdir, stranded='.', np=3):
    """Make coverage bigwigs and a junction-path BED from a mapped-read BED.

    The input is split per chromosome, each chromosome is processed by
    ``_process_mapbed_chr`` in parallel, and the per-chromosome results are
    concatenated.  Intermediate per-chromosome files are deleted at the end.

    Args:
        bedpath: path to gzipped BED7 file (converted from BAM)
        dstpre: path prefix to destination
        genome: UCSC genome (mm10 etc.)
        chromdir: directory containing chromosome sequence in FASTA
        stranded: passed through unchanged to ``_process_mapbed_chr``
        np: number of CPU to use

    Outputs:
        Bigwigs are named dstpre + kind + suf + strand + '.bw'
        (a bigwig is only made when its concatenated wiggle is non-empty):

        1. dstpre+'.ex.p.bw'
        2. dstpre+'.ex.n.bw'
        3. dstpre+'.ex.u.bw'
        4. dstpre+'.sj.p.bw'
        5. dstpre+'.sj.n.bw'
        6. dstpre+'.sj.u.bw'
        7. dstpre+'.ex.uniq.p.bw'
        8. dstpre+'.ex.uniq.n.bw'
        9. dstpre+'.ex.uniq.u.bw'
        10. dstpre+'.sj.uniq.p.bw'
        11. dstpre+'.sj.uniq.n.bw'
        12. dstpre+'.sj.uniq.u.bw'
        13. dstpre+'.sjpath.bed' BED12 (sc1:ucnt, sc2:jcnt=ucnt+mcnt),
            passed through UT.compress after concatenation
    """
    chroms = UT.chroms(genome)
    chromdf = UT.chromdf(genome)
    chromsizes = UT.chromsizes(genome)

    # split into chroms
    # NOTE(review): this creates a directory at dstpre itself rather than at
    # os.path.dirname(dstpre) -- confirm that this is intended.
    UT.makedirs(dstpre)
    splitbedgz(bedpath, dstpre)  # ~30sec
    duppath = dstpre + '.dupitems.txt.gz'
    # only chromosomes for which the split produced a BED file are processed
    chroms = [c for c in chroms if os.path.exists(dstpre + '.{0}.bed'.format(c))]
    files = [dstpre + '.{0}.bed'.format(c) for c in chroms]
    _scan_make_map(files, duppath)

    # per-chromosome BED files, to be deleted
    files0 = [dstpre + '.{0}.bed'.format(c) for c in chromdf['chr'].values]
    args = [(dstpre, x, genome, chromdir, stranded) for x in chroms]
    # spread to CPUs; the return value is not needed, only the
    # per-chromosome files that are read below
    UT.process_mp2(_process_mapbed_chr, args, np=np, doreduce=False)

    # concatenate chr files
    files1 = []  # intermediate files, to be deleted
    dstpath = dstpre + '.sjpath.bed'
    LOG.info('making {0}...'.format(dstpath))
    with open(dstpath, 'wb') as dst:
        for c in chroms:
            srcpath = dstpre + '.{0}.sjpath.bed'.format(c)
            files1.append(srcpath)
            with open(srcpath, 'rb') as src:
                shutil.copyfileobj(src, dst)
    dstpath = UT.compress(dstpath)

    for kind in ['.ex', '.sj']:
        for strand in ['.p', '.n', '.u']:
            for suf in ['', '.uniq']:
                pre = dstpre + kind + suf + strand
                wigpath = pre + '.wig'
                bwpath = pre + '.bw'
                with open(wigpath, 'wb') as dst:
                    for c in chroms:
                        srcpath = pre + '.{0}.wig'.format(c)
                        files1.append(srcpath)
                        # a chromosome without a wiggle file is skipped
                        if os.path.exists(srcpath):
                            with open(srcpath, 'rb') as src:
                                shutil.copyfileobj(src, dst)
                LOG.info('making {0}...'.format(bwpath))
                # an empty wiggle is not converted
                if os.path.getsize(wigpath) > 0:
                    wig2bw(wigpath, chromsizes, bwpath)
                files1.append(wigpath)

    # clean up temp files
    LOG.info('deleting intermediate files...')
    for x in files0 + files1:
        if os.path.exists(x):
            LOG.debug('deleting {0}...'.format(x))
            os.unlink(x)
def find_genes4(sj, ae, filepre, cachename=None, np=1, override=False,
                depth=500, separatese=True):
    """Group exons into genes and label exons and junctions with them.

    Connection: 1) by junctions, 2) by overlap in the same strand

    Side effects: adds '_gidx' and 'gname' columns to both ``ae`` and ``sj``
    (and, when missing, '_id', 'cat' and 'a_id' related columns through the
    ``UT.set_*`` helpers).

    Gene numbering: gene i (1-based) gets _gidx i, negated when its first
    member is on the '-' strand.  Its name is 'J' + strand letter ('P', 'N'
    or '') + 'S' for a single-member gene or 'G' otherwise + abs(_gidx).

    Args:
        sj: junction dataframe
        ae: exon dataframe
        filepre: passed through to ``mcore_allcomponents4``
        cachename: optional path of a pickle cache of the genes
        np: passed through to ``mcore_allcomponents4``
        override: if true, recompute even when the cache file exists
        depth: passed through to ``mcore_allcomponents4``
        separatese: if true, ``ae`` is split with ``UT.mese`` into (me, se);
            components are computed on ``me`` only and every ``se['_id']``
            becomes a gene of its own

    Returns:
        genes [set([_id,..]), ...]
    """
    if '_id' not in ae.columns:
        LOG.info('setting ex _id...')
        UT.set_ids(ae)
    if '_id' not in sj.columns:
        LOG.info('setting sj _id...')
        UT.set_ids(sj)
    if 'cat' not in ae.columns:
        UT.set_exon_category(sj, ae)
    if 'a_id' not in ae.columns:
        UT.set_ad_info(sj, ae)

    ### FIND GENES
    if cachename and os.path.exists(cachename) and not override:
        LOG.info('loading cached genes (connected components)...')
        # NOTE: unpickling can execute arbitrary code; cachename must be a
        # trusted, locally generated file.
        with open(cachename, 'rb') as fobj:
            genes = pickle.load(fobj)
    else:
        LOG.info('finding genes (connected components)...')
        _sttime = time.time()
        if separatese:
            me, se = UT.mese(ae)
            genes = mcore_allcomponents4(sj, me, filepre, np, depth=depth)
            # SE genes
            genes += [set([x]) for x in se['_id']]
        else:
            genes = mcore_allcomponents4(sj, ae, filepre, np, depth=depth)
        # version 4 graph: uses overlaps in addition to junctions to connect
        # genes = [set([_id's]),...]
        if cachename:
            UT.makedirs(os.path.dirname(cachename))
            # context manager guarantees the cache is flushed and closed
            with open(cachename, 'wb') as fobj:
                pickle.dump(genes, fobj)
        LOG.info(' time: {0:.3f}s'.format(time.time() - _sttime))

    ### WRITE EXONS W/ GENE number
    LOG.info('assigning gidx...')
    i2g = {}  # eid => _gidx
    i2gn = {}  # eid => gname
    g2gn = {}  # _gidx => gname
    i2s = dict(UT.izipcols(ae, ['_id', 'strand']))  # eid => strand
    s2n = {'+': 'P', '-': 'N', '.': '', '.+': '', '.-': ''}
    for gid, ids in enumerate(genes, 1):
        # strand of the gene is taken from its first member
        strand = s2n[i2s[next(iter(ids))]]
        cat = 'S' if len(ids) == 1 else 'G'
        if strand == 'N':  # negative strand
            gid = -gid
        gname = 'J{0}{1}{2}'.format(strand, cat, abs(gid))
        g2gn[gid] = gname
        for x in ids:
            i2g[x] = gid
            i2gn[x] = gname
    ae['_gidx'] = [i2g[x] for x in ae['_id']]
    ae['gname'] = [i2gn[x] for x in ae['_id']]

    ## set sj _gidx, use acceptor=>_gidx map (exon a_id, sj a_id),
    ## falling back to the donor map and finally to 0
    a2g = dict(UT.izipcols(ae, ['a_id', '_gidx']))
    d2g = dict(UT.izipcols(ae, ['d_id', '_gidx']))
    sj['_gidx'] = [
        a2g.get(x, d2g.get(y, 0)) for x, y in UT.izipcols(sj, ['a_id', 'd_id'])
    ]
    sj['gname'] = [g2gn.get(x, '') for x in sj['_gidx']]

    # This shouldn't happen
    nmissing = N.sum(ae['_gidx'] == 0)
    if nmissing > 0:
        LOG.warning(
            '###### WARNING!!!!!! exons with no gene assignment:{0}'.format(
                nmissing))
    return genes