Example #1
0
 def write_iso(self, fname, maxisonum):
     """Write all isoforms (up to maxisonum) as tab-separated lines, then gzip."""
     # drop a trailing '.gz': the file is written as plain text and compressed after
     if fname.endswith('.gz'):
         fname = fname[:-3]
     UT.makedirs(os.path.dirname(fname))
     with open(fname, 'w') as out:
         for row in self.gen_iso_all(maxisonum=maxisonum):
             out.write('\t'.join(str(v) for v in row) + '\n')
     UT.compress(fname)
Example #2
0
def _bedtoolscatcherror(which, aname, bname, cname, **kwargs):
    if not os.path.exists(aname):
        raise ValueError('{0} does not exists'.format(aname))
    if not os.path.exists(bname):
        raise ValueError('{0} does not exists'.format(bname))
        
    if cname.endswith('.gz'):
        cname = cname[:-3]
        compress=True
    else:
        compress=False
    try:
        ret = _runbedtools3(which,aname,bname,cname,**kwargs)
    except RuntimeError:
        LOG.warning('bedtool error: repeating on uncompressed a:{0},b:{1},c:{2}'.format(aname,bname,cname))
        aname2 = UT.uncompresscopy(aname)
        bname2 = UT.uncompresscopy(bname)
        ret = _runbedtools3(which,aname2,bname2,cname,**kwargs)
        if aname2 != aname:
            os.unlink(aname2)
        if bname2 != bname:
            os.unlink(bname2)
    if compress:
        return UT.compress(cname)
    return cname
Example #3
0
def bw2bed_mp(bwfile, bedfile, chroms, th, np=4):
    """ multi CPU version of bw2bed """

    # one (uncompressed) task per chromosome
    tasks = [(bwfile, bedfile + '.{0}.bed.gz'.format(c), [c], th, False)
             for c in chroms]

    chromfiles = UT.process_mp(bw2bed, tasks, np=np, doreduce=False)

    # !!! bedtool gzip problem again !!!
    # bedtools only processes the first member when gzipped files are
    # simply concatenated => concatenate the uncompressed parts and gzip
    # the whole thing at the end
    bedbase = bedfile[:-3] if bedfile.endswith('.gz') else bedfile
    with open(bedbase, 'wb') as dst:
        for part in chromfiles:
            with open(part, 'rb') as src:
                shutil.copyfileobj(src, dst)
    result = UT.compress(bedbase)

    # clean up per-chromosome temp files
    for part in chromfiles:
        os.unlink(part)

    return result
Example #4
0
def bw2bed(bwfile, bedfile, chroms, th, compress=True):
    """Transform BigWig genomeCov to binary BED by thresholding.

    Args:
        bwfile: path to BigWig file
        bedfile: path to output BED file (a '.gz' suffix is stripped;
            it is re-added when compress=True)
        chroms: list of chromosome names
        th: coverage threshold
        compress: whether to gzip the result (default True)

    Returns:
        path to generated BED file

    Raises:
        RuntimeError: if bwfile does not exist
    """
    bedbase = bedfile[:-3] if bedfile[-3:] == '.gz' else bedfile
    # skip the work when the compressed output is newer than the input
    if UT.notstale(bwfile, bedbase + '.gz'):
        return bedbase + '.gz'
    # make sure bwfile exists
    if not os.path.exists(bwfile):
        raise RuntimeError('BigWig file {0} does not exist.'.format(bwfile))
    processor = apply_threshold(bwfile, th, chroms)
    UT.makedirs(os.path.dirname(bedfile))
    # use a context manager so the handle is closed even if write raises;
    # do NOT write a blank line between chroms -- it terminates bedtools
    # at chr1 in the multiprocessing path
    with open(bedbase, 'w') as out:
        out.write(''.join(['%s\t%i\t%i\n' % x for x in processor]))
    if compress:
        return UT.compress(bedbase)
    return bedbase
Example #5
0
def bed2gtf(fpath, compress=True):
    """Convert BED to GTF. Uses bedToGenePred, genePredToGtf (UCSC Kent Tools)

    Args:
        fpath: path to BED file (may be gzipped)
        compress: whether to gzip the output (default True)

    Returns:
        path to the generated GTF file

    Raises:
        RuntimeError: if either Kent tool exits with a non-zero status
    """

    if fpath.endswith('.gz'):
        base = fpath[:-7]
    else:
        base = fpath[:-4]
    gppath = base + '.genePred'
    bdpath = base + '.gtf'

    # sc1 should be integer
    d = read_bed(fpath)
    d['sc1'] = d['sc1'].astype(int)
    write_bed(d, base + '.bed')

    cmd = ['bedToGenePred', base + '.bed', gppath]
    LOG.debug("converting to GenPred...{0}".format(base))
    ret = subprocess.call(cmd)
    if ret != 0:
        # fail loudly instead of producing a truncated pipeline downstream
        raise RuntimeError('bedToGenePred failed with code {0}'.format(ret))
    cmd = ['genePredToGtf', '-source=.', 'file', gppath, bdpath]
    LOG.debug("converting to GTF...{0}".format(base))
    ret = subprocess.call(cmd)
    if ret != 0:
        raise RuntimeError('genePredToGtf failed with code {0}'.format(ret))
    os.unlink(gppath)
    # gzip
    LOG.debug("gzipping ... {0}".format(bdpath))
    if compress:
        UT.compress(bdpath)
        bdpath = bdpath + '.gz'
    if fpath.endswith('.gz'):
        # re-compress the intermediate BED (fpath[:-3] == base+'.bed' here)
        LOG.debug("gzipping ... {0}".format(fpath[:-3]))
        subprocess.call(['gzip', fpath[:-3]])
    return bdpath
Example #6
0
def write_ggb(df, fname, cols, mode='w'):
    """Write the selected columns of df as a tab-separated file; gzip the
    result when fname ends with '.gz'. Returns the path actually written."""
    compress = fname[-3:] == '.gz'
    if compress:
        fname = fname[:-3]
    # st/ed must be integer columns; convert on a copy to avoid mutating df
    if (df.dtypes['st'] != int) or (df.dtypes['ed'] != int):
        LOG.warning('st,ed not integer: copy and converting')
        df = df.copy()
        df['st'] = df['st'].astype(int)
        df['ed'] = df['ed'].astype(int)
    UT.makedirs(os.path.dirname(fname))
    with open(fname, mode) as fobj:
        df[cols].to_csv(fobj, index=False, header=False, sep='\t', quoting=csv.QUOTE_NONE)
    return UT.compress(fname) if compress else fname
Example #7
0
def gtf2bed12(fpath, compress=True):
    """Convert GTF to BED12. Uses gtfToGenePred, genePredToBed (UCSC Kent Tools)

    Args:
        fpath: path to GTF file (may be gzipped; it is gunzipped in place
            and re-gzipped at the end)
        compress: whether to gzip the output (default True)

    Returns:
        path to the generated BED12 file

    Raises:
        RuntimeError: if either Kent tool exits with a non-zero status
    """
    if fpath.endswith('.gz'):
        base = fpath[:-7]
        cmd = ['gunzip', fpath]
        LOG.debug("expanding compressed ...{0}".format(base))
        subprocess.call(cmd)
    else:
        base = fpath[:-4]
    cmd = ['gtfToGenePred', '-genePredExt', '-ignoreGroupsWithoutExons', base + '.gtf', base + '.gp']
    LOG.debug("converting to GenPred...{0}".format(base))
    ret = subprocess.call(cmd)
    if ret != 0:
        LOG.debug("error converting to GenPred...code {0}".format(ret))
        # RuntimeError (an Exception subclass) keeps existing catchers working
        raise RuntimeError('gtfToGenePred failed with code {0}'.format(ret))
    cmd = ['genePredToBed', base + '.gp', base + '.bed']
    LOG.debug("converting to Bed12... {0}".format(base))
    ret = subprocess.call(cmd)
    if ret != 0:
        # was mislabeled "GenPred" -- the failing step here is Bed12 conversion
        LOG.debug("error converting to Bed12...code {0}".format(ret))
        raise RuntimeError('genePredToBed failed with code {0}'.format(ret))
    os.unlink(base + '.gp')
    # gzip
    LOG.debug("gzipping ...{0}.bed".format(base))
    bdpath = base + '.bed'
    if compress:
        bdpath = UT.compress(bdpath)
    if fpath.endswith('.gz'):
        # re-compress the GTF that was gunzipped above
        LOG.debug("gzipping ...{0}".format(fpath[:-3]))
        p = subprocess.call(['gzip', fpath[:-3]])
        LOG.debug("subprocess result: {0} ".format(p))
    return bdpath
Example #8
0
 def wrap(*args, **kwargs):
     """Closure: strip a '.gz' suffix from the output path argument, run
     func, check its error value, and gzip the result when requested."""
     # locate the output path either in kwargs (by name) or in args (by position)
     in_kwargs = outname in kwargs
     if in_kwargs:
         opath = kwargs[outname]
     else:
         args = list(args)
         opath = args[pos]
     compress = opath[-3:] == '.gz'
     if compress:
         opath = opath[:-3]
     UT.makedirs(os.path.dirname(opath))
     # write the stripped path back where it came from
     if in_kwargs:
         kwargs[outname] = opath
     else:
         args[pos] = opath
     err = func(*args, **kwargs)
     if err != noerr:
         LOG.warning('bederror:{0}, err={1}'.format(func.__name__, err))
         raise RuntimeError(func.__name__)
     return UT.compress(opath) if compress else opath
Example #9
0
def process_mapbed(bedpath, dstpre, genome, chromdir, stranded='.', np=3):
    """Split a genome-wide mapped-read BED by chromosome, process each
    chromosome in parallel, then merge the per-chromosome results into
    genome-wide BigWig / BED outputs.

    Args:
        bedpath: path to gzipped BED7 file (converted from BAM)
        dstpre: path prefix to destination
        genome: UCSC genome (mm10 etc.)
        chromdir: directory containing chromosome sequence in FASTA
        stranded: forwarded to _process_mapbed_chr (default '.';
            exact semantics defined by that helper -- verify there)
        np: number of CPU to use

    Outputs:
        1. dstpre+'.ex.p.bw'
        2. dstpre+'.ex.n.bw'
        3. dstpre+'.ex.u.bw'
        4. dstpre+'.sj.p.bw'
        5. dstpre+'.sj.n.bw'
        6. dstpre+'.sj.u.bw'
        7. dstpre+'.ex.p.uniq.bw'
        8. dstpre+'.ex.n.uniq.bw'
        9. dstpre+'.ex.u.uniq.bw'
        10. dstpre+'.sj.p.uniq.bw'
        11. dstpre+'.sj.n.uniq.bw'
        12. dstpre+'.sj.u.uniq.bw'
        13. dstpre+'.sjpath.bed' BED12 (sc1:ucnt, sc2:jcnt=ucnt+mcnt)
    """
    chroms = UT.chroms(genome)
    chromdf = UT.chromdf(genome)
    chromsizes = UT.chromsizes(genome)

    # split into chroms
    UT.makedirs(dstpre)
    splitbedgz(bedpath, dstpre) # ~30sec
    duppath = dstpre+'.dupitems.txt.gz'
    # keep only chromosomes that actually produced a split file
    chroms = [c for c in chroms if os.path.exists(dstpre+'.{0}.bed'.format(c))]
    files = [dstpre+'.{0}.bed'.format(c) for c in chroms]
    _scan_make_map(files, duppath)

    # per-chrom split files for ALL chromosomes in the genome (not just the
    # ones that exist) -- scheduled for deletion in the cleanup pass below
    files0 = [dstpre+'.{0}.bed'.format(c) for c  in chromdf['chr'].values] # to be deleted
    args = [(dstpre, x, genome, chromdir, stranded) for x in chroms]
    # spread to CPUs
    rslts = UT.process_mp2(_process_mapbed_chr, args, np=np, doreduce=False)
    # concatenate chr files
    files1 = []
    dstpath = dstpre+'.sjpath.bed'
    LOG.info('making {0}...'.format(dstpath))
    with open(dstpath, 'wb') as dst:
        for c in chroms:
            srcpath = dstpre+'.{0}.sjpath.bed'.format(c)
            files1.append(srcpath)
            with open(srcpath, 'rb') as src:
                shutil.copyfileobj(src, dst)
    dstpath = UT.compress(dstpath)

    # merge per-chrom wiggle tracks into one BigWig per
    # kind (exon/junction) x strand (+/-/unknown) x (all/unique) combination
    for kind in ['.ex','.sj']:
        for strand in ['.p','.n','.u']:
            for suf in ['','.uniq']:
                pre = dstpre+kind+suf+strand
                wigpath = pre+'.wig'
                bwpath = pre+'.bw'
                with open(wigpath, 'wb') as dst:
                    for c in chroms:
                        srcpath = pre+'.{0}.wig'.format(c)
                        files1.append(srcpath)
                        if os.path.exists(srcpath):
                            with open(srcpath,'rb') as src:
                                shutil.copyfileobj(src, dst)
                LOG.info('making {0}...'.format(bwpath))
                # wig2bw fails on empty input, so only convert non-empty wigs
                if os.path.getsize(wigpath)>0:
                    wig2bw(wigpath, chromsizes, bwpath)
                files1.append(wigpath)

    # clean up temp files
    LOG.info('deleting intermediate files...')
    for x in files0+files1:
        if os.path.exists(x):
            LOG.debug('deleting {0}...'.format(x))
            os.unlink(x)