def write_iso(self, fname, maxisonum):
    """Write all isoforms (up to *maxisonum*) as tab-separated lines and gzip the file.

    Args:
        fname: destination path; a trailing '.gz' is stripped, the file is
            written uncompressed and then gzipped via UT.compress
        maxisonum: passed through to self.gen_iso_all
    """
    target = fname[:-3] if fname.endswith('.gz') else fname
    UT.makedirs(os.path.dirname(target))
    with open(target, 'w') as out:
        rows = ('\t'.join(map(str, rec)) + '\n' for rec in self.gen_iso_all(maxisonum=maxisonum))
        out.writelines(rows)
    UT.compress(target)
def _bedtoolscatcherror(which, aname, bname, cname, **kwargs):
    """Run a three-file bedtools operation, retrying on uncompressed inputs.

    bedtools sometimes fails on gzipped inputs; on RuntimeError the operation
    is repeated with uncompressed copies of *aname*/*bname*.

    Args:
        which: bedtools subcommand passed to _runbedtools3
        aname: path to input file A (must exist)
        bname: path to input file B (must exist)
        cname: path to output file; a trailing '.gz' is stripped for the run
            and the result re-compressed afterwards
        **kwargs: forwarded to _runbedtools3

    Returns:
        path to the output file (gzipped if *cname* ended with '.gz')

    Raises:
        ValueError: if *aname* or *bname* does not exist
    """
    if not os.path.exists(aname):
        raise ValueError('{0} does not exist'.format(aname))
    if not os.path.exists(bname):
        raise ValueError('{0} does not exist'.format(bname))
    if cname.endswith('.gz'):
        cname = cname[:-3]
        compress = True
    else:
        compress = False
    try:
        ret = _runbedtools3(which, aname, bname, cname, **kwargs)
    except RuntimeError:
        LOG.warning('bedtool error: repeating on uncompressed a:{0},b:{1},c:{2}'.format(aname, bname, cname))
        aname2 = UT.uncompresscopy(aname)
        bname2 = UT.uncompresscopy(bname)
        try:
            ret = _runbedtools3(which, aname2, bname2, cname, **kwargs)
        finally:
            # remove the temporary uncompressed copies even if the retry fails
            if aname2 != aname:
                os.unlink(aname2)
            if bname2 != bname:
                os.unlink(bname2)
    if compress:
        return UT.compress(cname)
    return cname
def bw2bed_mp(bwfile, bedfile, chroms, th, np=4):
    """Multi-CPU version of bw2bed.

    Runs bw2bed per chromosome in parallel, concatenates the uncompressed
    per-chromosome results and gzips the final file once.

    Args:
        bwfile: path to BigWig file
        bedfile: path of the output BED file (with or without '.gz')
        chroms: list of chromosome names
        th: coverage threshold passed to bw2bed
        np: number of worker processes (default 4)

    Returns:
        path to the gzipped BED file
    """
    # compress=False for the workers: bedtools only reads the first member of a
    # concatenation of gzip streams, so concatenate plain text and gzip at the end
    args = [(bwfile, bedfile + '.{0}.bed.gz'.format(chrom), [chrom], th, False)
            for chrom in chroms]
    rslts = UT.process_mp(bw2bed, args, np=np, doreduce=False)
    bedbase = bedfile[:-3] if bedfile.endswith('.gz') else bedfile
    with open(bedbase, 'wb') as dst:
        for f in rslts:
            with open(f, 'rb') as src:
                shutil.copyfileobj(src, dst)
    bedfile = UT.compress(bedbase)
    # clean up per-chromosome temporary files
    for f in rslts:
        os.unlink(f)
    return bedfile
def bw2bed(bwfile, bedfile, chroms, th, compress=True):
    """Transform BigWig genomeCov to binary BED by thresholding at *th*.

    Args:
        bwfile: path to BigWig file
        bedfile: path of the output BED file (with or without '.gz')
        chroms: list of chromosome names
        th: coverage threshold (passed to apply_threshold)
        compress: whether to gzip the result (default True)

    Returns:
        path to generated BED file

    Raises:
        RuntimeError: if *bwfile* does not exist
    """
    bedbase = bedfile[:-3] if bedfile.endswith('.gz') else bedfile
    # skip work if the gzipped result is newer than the input
    if UT.notstale(bwfile, bedbase + '.gz'):
        return bedbase + '.gz'
    if not os.path.exists(bwfile):
        raise RuntimeError('BigWig file {0} does not exist.'.format(bwfile))
    processor = apply_threshold(bwfile, th, chroms)
    UT.makedirs(os.path.dirname(bedfile))
    with open(bedbase, 'w') as out:
        out.write(''.join(['%s\t%i\t%i\n' % x for x in processor]))
        # NOTE: no extra trailing newline -- a blank line between chromosome
        # chunks makes bedtools terminate after chr1 in multiprocessing mode
    if compress:
        return UT.compress(bedbase)
    return bedbase
def bed2gtf(fpath, compress=True):
    """Convert BED to GTF using bedToGenePred/genePredToGtf (UCSC Kent Tools).

    Args:
        fpath: path to BED file (may be gzipped)
        compress: whether to gzip the result (default True)

    Returns:
        path to the generated GTF file

    Raises:
        RuntimeError: if either Kent tool exits with a non-zero status
    """
    if fpath.endswith('.gz'):
        base = fpath[:-7]   # strip '.bed.gz'
    else:
        base = fpath[:-4]   # strip '.bed'
    gppath = base + '.genePred'
    gtfpath = base + '.gtf'
    # bedToGenePred requires an integer score column
    d = read_bed(fpath)
    d['sc1'] = d['sc1'].astype(int)
    write_bed(d, base + '.bed')
    cmd = ['bedToGenePred', base + '.bed', gppath]
    LOG.debug("converting to GenPred...{0}".format(base))
    ret = subprocess.call(cmd)
    if ret != 0:
        raise RuntimeError('bedToGenePred failed (code {0}) for {1}'.format(ret, base))
    cmd = ['genePredToGtf', '-source=.', 'file', gppath, gtfpath]
    LOG.debug("converting to GTF...{0}".format(base))
    ret = subprocess.call(cmd)
    if ret != 0:
        raise RuntimeError('genePredToGtf failed (code {0}) for {1}'.format(ret, base))
    os.unlink(gppath)
    if compress:
        LOG.debug("gzipping ... {0}".format(gtfpath))
        UT.compress(gtfpath)
        gtfpath = gtfpath + '.gz'
    if fpath.endswith('.gz'):
        # write_bed re-created the uncompressed input (base+'.bed'); re-gzip it
        LOG.debug("gzipping ... {0}".format(fpath[:-3]))
        subprocess.call(['gzip', fpath[:-3]])
    return gtfpath
def write_ggb(df, fname, cols, mode='w'):
    """Write columns *cols* of *df* as tab-separated text (gzipped if fname ends '.gz').

    Args:
        df: DataFrame with at least 'st' and 'ed' columns
        fname: destination path; trailing '.gz' triggers compression
        cols: ordered list of columns to write
        mode: file open mode (default 'w')

    Returns:
        path of the written file ('.gz' path when compressed)
    """
    compress = fname.endswith('.gz')
    if compress:
        fname = fname[:-3]
    # coordinates must be integers; convert on a copy to avoid mutating the caller's frame
    if (df.dtypes['st'] != int) or (df.dtypes['ed'] != int):
        LOG.warning('st,ed not integer: copy and converting')
        df = df.copy()
        df['st'] = df['st'].astype(int)
        df['ed'] = df['ed'].astype(int)
    UT.makedirs(os.path.dirname(fname))
    with open(fname, mode) as fobj:
        df[cols].to_csv(fobj, index=False, header=False, sep='\t', quoting=csv.QUOTE_NONE)
    return UT.compress(fname) if compress else fname
def gtf2bed12(fpath, compress=True):
    """Convert GTF to BED12 using gtfToGenePred/genePredToBed (UCSC Kent Tools).

    Args:
        fpath: path to GTF file (may be gzipped; it is expanded in place
            and re-gzipped at the end)
        compress: whether to gzip the result (default True)

    Returns:
        path to the generated BED12 file

    Raises:
        RuntimeError: if either Kent tool exits with a non-zero status
    """
    if fpath.endswith('.gz'):
        base = fpath[:-7]   # strip '.gtf.gz'
        LOG.debug("expanding compressed ...{0}".format(base))
        subprocess.call(['gunzip', fpath])
    else:
        base = fpath[:-4]   # strip '.gtf'
    cmd = ['gtfToGenePred', '-genePredExt', '-ignoreGroupsWithoutExons', base + '.gtf', base + '.gp']
    LOG.debug("converting to GenPred...{0}".format(base))
    ret = subprocess.call(cmd)
    if ret != 0:
        LOG.debug("error converting to GenPred...code {0}".format(ret))
        raise RuntimeError('gtfToGenePred failed (code {0}) for {1}'.format(ret, base))
    cmd = ['genePredToBed', base + '.gp', base + '.bed']
    LOG.debug("converting to Bed12... {0}".format(base))
    ret = subprocess.call(cmd)
    if ret != 0:
        LOG.debug("error converting to Bed12...code {0}".format(ret))
        raise RuntimeError('genePredToBed failed (code {0}) for {1}'.format(ret, base))
    os.unlink(base + '.gp')
    bdpath = base + '.bed'
    if compress:
        LOG.debug("gzipping ...{0}.bed".format(base))
        bdpath = UT.compress(bdpath)
    if fpath.endswith('.gz'):
        # restore the input to its original gzipped state
        LOG.debug("gzipping ...{0}".format(fpath[:-3]))
        p = subprocess.call(['gzip', fpath[:-3]])
        LOG.debug("subprocess result: {0} ".format(p))
    return bdpath
def wrap(*args, **kwargs):
    """Resolve the output path (handling '.gz'), call *func*, re-compress.

    Locates the output path either in kwargs (key *outname*) or at positional
    index *pos*, strips a trailing '.gz' before calling the wrapped function,
    and gzips the result afterwards.

    Raises:
        RuntimeError: when *func* returns an error code different from *noerr*
    """
    # fetch the output path from kwargs or positional args
    if outname in kwargs:
        opath = kwargs[outname]
    else:
        opath = args[pos]
    compress = opath.endswith('.gz')
    if compress:
        opath = opath[:-3]
    UT.makedirs(os.path.dirname(opath))
    # hand the (uncompressed) path back to the wrapped function
    args = list(args)
    if outname in kwargs:
        kwargs[outname] = opath
    else:
        args[pos] = opath
    err = func(*args, **kwargs)
    if err != noerr:
        LOG.warning('bederror:{0}, err={1}'.format(func.__name__, err))
        raise RuntimeError(func.__name__)
    return UT.compress(opath) if compress else opath
def process_mapbed(bedpath, dstpre, genome, chromdir, stranded='.', np=3):
    """
    Args:
        bedpath: path to gzipped BED7 file (converted from BAM)
        dstpre: path prefix to destination
        genome: UCSC genome (mm10 etc.)
        chromdir: directory containing chromosome sequence in FASTA
        stranded: strandedness flag forwarded to _process_mapbed_chr (default '.')
        np: number of CPU to use

    Outputs:
        1. dstpre+'.ex.p.bw'
        2. dstpre+'.ex.n.bw'
        3. dstpre+'.ex.u.bw'
        4. dstpre+'.sj.p.bw'
        5. dstpre+'.sj.n.bw'
        6. dstpre+'.sj.u.bw'
        7. dstpre+'.ex.p.uniq.bw'
        8. dstpre+'.ex.n.uniq.bw'
        9. dstpre+'.ex.u.uniq.bw'
        10. dstpre+'.sj.p.uniq.bw'
        11. dstpre+'.sj.n.uniq.bw'
        12. dstpre+'.sj.u.uniq.bw'
        13. dstpre+'.sjpath.bed' BED12 (sc1:ucnt, sc2:jcnt=ucnt+mcnt)
    """
    chroms = UT.chroms(genome)
    chromdf = UT.chromdf(genome)
    chromsizes = UT.chromsizes(genome)
    # split the genome-wide BED into one file per chromosome
    UT.makedirs(dstpre)
    splitbedgz(bedpath, dstpre) # ~30sec
    duppath = dstpre+'.dupitems.txt.gz'
    # keep only chromosomes that actually produced a split file
    chroms = [c for c in chroms if os.path.exists(dstpre+'.{0}.bed'.format(c))]
    files = [dstpre+'.{0}.bed'.format(c) for c in chroms]
    _scan_make_map(files, duppath)
    files0 = [dstpre+'.{0}.bed'.format(c) for c in chromdf['chr'].values] # to be deleted
    args = [(dstpre, x, genome, chromdir, stranded) for x in chroms]
    # spread to CPUs; NOTE(review): rslts is never read afterwards -- results
    # are apparently communicated via the per-chromosome files on disk
    rslts = UT.process_mp2(_process_mapbed_chr, args, np=np, doreduce=False)
    # concatenate per-chromosome sjpath files into one BED and gzip it
    files1 = []
    dstpath = dstpre+'.sjpath.bed'
    LOG.info('making {0}...'.format(dstpath))
    with open(dstpath, 'wb') as dst:
        for c in chroms:
            srcpath = dstpre+'.{0}.sjpath.bed'.format(c)
            files1.append(srcpath)
            with open(srcpath, 'rb') as src:
                shutil.copyfileobj(src, dst)
    dstpath = UT.compress(dstpath)
    # for each of the 12 kind/strand/uniq combinations, concatenate the
    # per-chromosome wiggle tracks and convert the result to BigWig
    for kind in ['.ex','.sj']:
        for strand in ['.p','.n','.u']:
            for suf in ['','.uniq']:
                pre = dstpre+kind+suf+strand
                wigpath = pre+'.wig'
                bwpath = pre+'.bw'
                with open(wigpath, 'wb') as dst:
                    for c in chroms:
                        srcpath = pre+'.{0}.wig'.format(c)
                        # recorded for deletion even if absent (unlink below is guarded)
                        files1.append(srcpath)
                        if os.path.exists(srcpath):
                            with open(srcpath,'rb') as src:
                                shutil.copyfileobj(src, dst)
                LOG.info('making {0}...'.format(bwpath))
                # skip BigWig conversion for empty tracks
                if os.path.getsize(wigpath)>0:
                    wig2bw(wigpath, chromsizes, bwpath)
                files1.append(wigpath)
    # clean up temp files
    LOG.info('deleting intermediate files...')
    for x in files0+files1:
        if os.path.exists(x):
            LOG.debug('deleting {0}...'.format(x))
            os.unlink(x)