Esempio n. 1
0
def test_chromsizes():
    """Check chromosome name lists and that chrom-size files exist on disk."""
    # mm10: chr1..chr19 followed by the two sex chromosomes.
    mm10_expected = ['chr{0}'.format(n) for n in range(1, 20)]
    mm10_expected.extend(['chrX', 'chrY'])
    assert UT.chroms('mm10') == mm10_expected

    dm3_expected = [
        'chr2L', 'chr2LHet', 'chr2R', 'chr2RHet', 'chr3L', 'chr3LHet', 'chr3R',
        'chr3RHet', 'chr4', 'chrX', 'chrXHet', 'chrYHet', 'chrU', 'chrUextra'
    ]
    assert UT.chroms('dm3') == dm3_expected

    # UT.chromsizes returns a path; each one must point at an existing file.
    for genome in ('mm10', 'dm3', 'hg19'):
        assert os.path.exists(UT.chromsizes(genome))
Esempio n. 2
0
def sj02bw(sj0, pathpre, genome, np=12):
    """Convert a splice-junction table into one bigwig file per strand.

    The table is split by chromosome, ``sj02wig`` is run on each piece via
    ``UT.process_mp``, and the per-chromosome wiggle files are concatenated
    into ``pathpre + '.sj<S>.wig'`` and converted to
    ``pathpre + '.sj<S>.bw'``, where ``<S>`` is ``STRANDMAP0[strand]`` for
    each strand in ``'+'``, ``'-'``, ``'.'``.  All intermediate wiggle files
    are deleted at the end.

    Args:
        sj0: junction dataframe with a 'chr' column and either a 'jcnt'
            column or both 'ucnt' and 'mcnt'.  NOTE: a 'jcnt' column is added
            to this dataframe in place when it is missing.
        pathpre: path prefix for all intermediate and output files.
        genome: genome name understood by ``UT.chroms`` / ``UT.chromdf`` /
            ``UT.chromsizes``.
        np: number of worker processes passed to ``UT.process_mp``.
    """
    wanted = set(UT.chroms(genome))
    # Largest chromosomes first, restricted to the ones UT.chroms reports.
    chromdf = UT.chromdf(genome).sort_values('size', ascending=False)
    chroms = [x for x in chromdf['chr'] if x in wanted]
    chromdic = UT.df2dict(chromdf, 'chr', 'size')
    if 'jcnt' not in sj0:
        sj0['jcnt'] = sj0['ucnt'] + sj0['mcnt']

    files = []
    args = []
    for c in chroms:
        # '{{0}}' survives this format() as '{0}'; the strand is substituted
        # into that placeholder below (and presumably by sj02wig as well).
        f = '{0}.{1}.{{0}}.wig'.format(pathpre, c)
        args.append((sj0[sj0['chr'] == c], c, chromdic[c], f))
        files.append(f)
    UT.process_mp(sj02wig, args, np=np, doreduce=False)

    rmfiles = []
    for strand in ['+', '-', '.']:
        s = STRANDMAP0[strand]
        wig = pathpre + '.sj{0}.wig'.format(s)
        bwpath = pathpre + '.sj{0}.bw'.format(s)
        with open(wig, 'w') as dst:
            for tmpl in files:
                f = tmpl.format(strand)
                with open(f, 'r') as src:
                    shutil.copyfileobj(src, dst)
                rmfiles.append(f)
        rmfiles.append(wig)
        wig2bw(wig, UT.chromsizes(genome), bwpath)

    # Every merged wig is already in rmfiles, so this loop is the only
    # cleanup needed; unlinking the last wig a second time would raise
    # FileNotFoundError.
    for f in rmfiles:
        os.unlink(f)
    
Esempio n. 3
0
def merge_bigwigs_mp(bwfiles, genome, dstpath, scale=None, np=7):
    """Merge several bigwig files into one, working chromosome by chromosome.

    ``merge_bigwigs_chr`` is run for each chromosome through
    ``UT.process_mp``; its results are treated as (chromosome, wig path)
    pairs.  The per-chromosome wiggle files are concatenated into
    ``dstpath + '.wig'``, converted to a bigwig at ``dstpath`` and then the
    intermediate wiggle files are removed.

    Args:
        bwfiles: bigwig inputs, forwarded unchanged to ``merge_bigwigs_chr``.
        genome: genome name understood by the ``UT`` chromosome helpers.
        dstpath: path of the merged bigwig; also the prefix of temp files.
        scale: forwarded unchanged to ``merge_bigwigs_chr``.
        np: number of worker processes.
    """
    names = UT.chroms(genome)
    chromfile = UT.chromsizes(genome)
    size_of = UT.df2dict(UT.chromdf(genome), 'chr', 'size')

    # Biggest chromosomes go first so that a large one is not left running
    # alone at the end while the other workers sit idle.
    by_size = sorted(((size_of[name], name) for name in names), reverse=True)
    chroms = [name for _, name in by_size]

    part_paths = [dstpath + '.{0}.wig'.format(c) for c in chroms]
    args = []
    for c, part in zip(chroms, part_paths):
        args.append((bwfiles, c, size_of[c], part, scale))

    rslts = UT.process_mp(merge_bigwigs_chr, args, np, doreduce=False)
    wig_of = dict(rslts)

    LOG.debug('concatenating chromosomes...')
    wigpath = dstpath + '.wig'
    UT.makedirs(os.path.dirname(wigpath))
    with open(wigpath, 'wb') as merged:
        for c in chroms:
            with open(wig_of[c], 'rb') as part:
                shutil.copyfileobj(part, merged)

    LOG.debug('converting wiggle to bigwig')
    BT.wig2bw(wigpath, chromfile, dstpath)

    # clean up: per-chromosome pieces first, then the merged wiggle
    for leftover in part_paths + [wigpath]:
        if os.path.exists(leftover):
            os.unlink(leftover)
Esempio n. 4
0
def get_totbp_covbp_bw(bwfile, genome, chroms=None):
    """Calculate total bp, covered bp, mean coverage and covered %.

    Args:
        bwfile: bigwig file
        genome: UCSC genome name
        chroms (list): chromosomes to process; defaults to
            ``UT.chroms(genome)`` when None.

    Returns:
        Pandas dataframe with one column per chromosome and one row for each
        of 'totbp' (sum of the signal), 'covbp' (number of positions with
        signal > 0), 'acov' (totbp / covbp) and 'cov%' (covbp as a
        percentage of the chromosome size).

    """
    # Series of chromosome sizes indexed by chromosome name.
    chromdf = UT.chromdf(genome).set_index('chr')['size']

    def one(chrom):
        # Label lookup: .loc replaces .ix, which was removed in pandas 1.0.
        csize = chromdf.loc[chrom]
        a = get_bigwig_as_array(bwfile, chrom, 0, csize)
        totbp = N.sum(a)
        covbp = N.sum(a > 0)
        acov = float(totbp) / covbp
        covp = (float(covbp) / csize) * 100.
        return {'totbp': totbp, 'covbp': covbp, 'acov': acov, 'cov%': covp}

    if chroms is None:
        chroms = UT.chroms(genome)
    df = PD.DataFrame({x: one(x) for x in chroms})
    return df
Esempio n. 5
0
    def __call__(self):
        """Run ``filter_sj`` per chromosome and concatenate its outputs.

        One task per chromosome is dispatched through ``UT.process_mp``.
        Afterwards the per-chromosome files
        ``<bwsjpre>.sjpath.<chrom>.filtered.bed.gz`` are appended, in
        chromosome order, into ``<bwsjpre>.sjpath.filtered.bed.gz``.
        """
        prefix = self.bwsjpre
        chroms = UT.chroms(self.genome)
        size_of = UT.df2dict(UT.chromdf(self.genome), 'chr', 'size')
        args = [(prefix, self.statspath, c, size_of[c], self.params)
                for c in chroms]
        rslts = UT.process_mp(filter_sj, args, np=self.np, doreduce=False)

        # Raw byte-level concatenation of the per-chromosome .gz files.
        merged_path = prefix + '.sjpath.filtered.bed.gz'
        with open(merged_path, 'wb') as merged:
            for c in chroms:
                part_path = prefix + '.sjpath.{0}.filtered.bed.gz'.format(c)
                with open(part_path, 'rb') as part:
                    shutil.copyfileobj(part, merged)
Esempio n. 6
0
 def calc_flux_mp(self, beddf, np=10):
     """Run ``calc_flux_chr`` per chromosome and collect one result table.

     Args:
         beddf: dataframe with at least 'chr' and '_id' columns; any column
             named in ``COPYCOLS`` is copied onto the result.
         np: number of worker processes passed to ``UT.process_mp2``.

     Returns:
         Pandas dataframe with columns ``CALCFLUXCOLS``, the available
         ``COPYCOLS`` columns from ``beddf`` and 'len' (= ed - st).
     """
     chroms = UT.chroms(self.genome)
     args = []
     for c in chroms:
         bedc = beddf[beddf['chr'] == c]
         # Chromosomes without any rows are not dispatched at all.
         if len(bedc) > 0:
             args.append((bedc, self.bwpre))
     rslts = UT.process_mp2(calc_flux_chr, args, np=np, doreduce=True)
     df = PD.DataFrame(rslts, columns=CALCFLUXCOLS)
     # Re-align the input rows to the result order by '_id' label.
     # .loc replaces .ix, which was removed in pandas 1.0.
     exdfi = beddf.set_index('_id').loc[df['_id'].values]
     for f in COPYCOLS:
         if f in exdfi:
             df[f] = exdfi[f].values
     df['len'] = df['ed'] - df['st']
     return df
Esempio n. 7
0
 def calc_params_mp(self,
                    beddf,
                    win=600,
                    siz=10,
                    direction='>',
                    gapmode='53',
                    np=10,
                    covfactor=0):
     """Run ``calc_params_chr`` per chromosome and collect one result table.

     Args:
         beddf: dataframe with at least 'chr' and '_id' columns; any column
             named in ``COPYCOLS`` is copied onto the result.
         win, siz, direction, gapmode, covfactor: forwarded unchanged to
             ``calc_params_chr``.
         np: number of worker processes passed to ``UT.process_mp2``.

     Returns:
         Pandas dataframe with columns ``CALCPARAMCOLS``, the available
         ``COPYCOLS`` columns from ``beddf`` and 'len' (= ed - st).
     """
     chroms = UT.chroms(self.genome)
     args = []
     for c in chroms:
         bedc = beddf[beddf['chr'] == c]
         # Chromosomes without any rows are not dispatched at all.
         if len(bedc) > 0:
             args.append((bedc, self.bwpre, win, siz, direction, gapmode,
                          covfactor))
     rslts = UT.process_mp2(calc_params_chr, args, np=np, doreduce=True)
     df = PD.DataFrame(rslts, columns=CALCPARAMCOLS)
     # Re-align the input rows to the result order by '_id' label.
     # .loc replaces .ix, which was removed in pandas 1.0.
     exdfi = beddf.set_index('_id').loc[df['_id'].values]
     for f in COPYCOLS:
         if f in exdfi:
             df[f] = exdfi[f].values
     df['len'] = df['ed'] - df['st']
     return df
Esempio n. 8
0
 def __call__(self):
     """Run the per-chromosome prep tasks, then the three merge tasks.

     For every chromosome three tasks are queued on a ``TQ.Server``:
     ``prep_exwig_chr``, ``prep_sjwig_chr`` and ``prep_sjpath_chr``.  Once
     all chromosomes of one kind have reported back, the matching merge
     task (``prep_exbw``, ``prep_sjbw`` or ``prep_sjpath``) is queued.  The
     loop ends when all three merge tasks are done, or when
     ``server.check_error()`` returns a false value.

     Per-chromosome results are kept in ``self.exstatus``,
     ``self.sjstatus`` and ``self.pastatus``, keyed by chromosome name.
     """
     # exdf => ex.p, ex.n, ex.u
     # sjdf => sj.p, sj.n, sj.u
     # paths => sjpath.bed
     # divide into tasks (exdf,sjdf,paths) x chroms
     self.server = server = TQ.Server(name='PrepBWSJ', np=self.np)
     self.chroms = chroms = UT.chroms(self.genome)
     csizes = UT.df2dict(UT.chromdf(self.genome), 'chr', 'size')
     self.exstatus = exstatus = {}
     self.sjstatus = sjstatus = {}
     self.pastatus = pastatus = {}
     exdone = False
     sjdone = False
     padone = False
     with server:
         for chrom in chroms:
             # exdf tasks
             tname = 'prep_exwig_chr.{0}'.format(chrom)
             args = (self.j2pres, self.libsizes, self.dstpre, chrom,
                     csizes[chrom])
             task = TQ.Task(tname, prep_exwig_chr, args)
             server.add_task(task)
             # sjdf tasks
             tname = 'prep_sjwig_chr.{0}'.format(chrom)
             args = (self.j2pres, self.libsizes, self.dstpre, chrom,
                     csizes[chrom])
             task = TQ.Task(tname, prep_sjwig_chr, args)
             server.add_task(task)
             # sjpath tasks (no chromosome size needed)
             tname = 'prep_sjpath_chr.{0}'.format(chrom)
             args = (self.j2pres, self.libsizes, self.dstpre, chrom)
             task = TQ.Task(tname, prep_sjpath_chr, args)
             server.add_task(task)
         while server.check_error():
             try:
                 # wait for the next result; TQ.Empty signals that none
                 # arrived within the timeout
                 name, rslt = server.get_result(timeout=5)
             except TQ.Empty:
                 name, rslt = None, None
             if name is not None:
                 # Task names are '<task>.<chrom>'.  Split only on the first
                 # dot so chromosome names that themselves contain a dot are
                 # kept whole; otherwise several chromosomes would share one
                 # key and the "all finished" counts could never be reached.
                 if name.startswith('prep_exwig_chr.'):
                     chrom = name.split('.', 1)[1]
                     exstatus[chrom] = rslt
                     if len(exstatus) == len(chroms):  # all finished
                         print('$$$$$$$$ putting in prep_exbw $$$$$$$$$$$')
                         tname = 'prep_exbw'
                         args = (self.dstpre, chroms, self.genome)
                         task = TQ.Task(tname, prep_exbw, args)
                         server.add_task(task)
                 if name.startswith('prep_sjwig_chr.'):
                     chrom = name.split('.', 1)[1]
                     sjstatus[chrom] = rslt
                     if len(sjstatus) == len(chroms):  # all finished
                         print('$$$$$$$$ putting in prep_sjbw $$$$$$$$$$$')
                         tname = 'prep_sjbw'
                         args = (self.dstpre, chroms, self.genome)
                         task = TQ.Task(tname, prep_sjbw, args)
                         server.add_task(task)
                 if name.startswith('prep_sjpath_chr.'):
                     chrom = name.split('.', 1)[1]
                     pastatus[chrom] = rslt
                     if len(pastatus) == len(chroms):  # all finished
                         print(
                             '$$$$$$$$ putting in prep_sjpath $$$$$$$$$$$')
                         tname = 'prep_sjpath'
                         args = (self.dstpre, chroms)
                         task = TQ.Task(tname, prep_sjpath, args)
                         server.add_task(task)
                 if name == 'prep_exbw':
                     print('$$$$$$$$ prep_exbw done $$$$$$$$$$$')
                     exdone = True
                 if name == 'prep_sjbw':
                     print('$$$$$$$$ prep_sjbw done $$$$$$$$$$$')
                     sjdone = True
                 if name == 'prep_sjpath':
                     print('$$$$$$$$ prep_sjpath done $$$$$$$$$$$')
                     padone = True
                 if exdone and sjdone and padone:
                     break
         print('Exit Loop')
     print('Done')
Esempio n. 9
0
def process_mapbed(bedpath, dstpre, genome, chromdir, stranded='.', np=3):
    """Split a mapped-read BED file by chromosome and build merged outputs.

    Args:
        bedpath: path to gzipped BED7 file (converted from BAM)
        dstpre: path prefix to destination
        genome: UCSC genome (mm10 etc.)
        chromdir: directory containing chromosome sequence in FASTA
        stranded: forwarded unchanged to ``_process_mapbed_chr``
        np: number of CPU to use

    Outputs:
        * dstpre + kind + suf + strand + '.bw' for every combination of
          kind in ('.ex', '.sj'), suf in ('', '.uniq') and
          strand in ('.p', '.n', '.u'), e.g. dstpre+'.ex.p.bw' and
          dstpre+'.ex.uniq.p.bw'.  A bigwig is only written when the merged
          wiggle file is non-empty.
        * dstpre+'.sjpath.bed' BED12 (sc1:ucnt, sc2:jcnt=ucnt+mcnt), which
          is then handed to ``UT.compress``.
        * dstpre+'.dupitems.txt.gz', passed to ``_scan_make_map``.

    Per-chromosome BED, sjpath and wiggle files are deleted at the end.
    """
    all_chroms = UT.chroms(genome)
    chromdf = UT.chromdf(genome)
    chromsizes = UT.chromsizes(genome)

    # split into chroms
    UT.makedirs(dstpre)
    splitbedgz(bedpath, dstpre)  # ~30sec
    duppath = dstpre + '.dupitems.txt.gz'

    # Keep only chromosomes for which the split produced a BED file.
    chroms = []
    files = []
    for c in all_chroms:
        chrom_bed = dstpre + '.{0}.bed'.format(c)
        if os.path.exists(chrom_bed):
            chroms.append(c)
            files.append(chrom_bed)
    _scan_make_map(files, duppath)

    # Deletion candidates: a BED path for every chromosome in chromdf.
    files0 = []
    for c in chromdf['chr'].values:
        files0.append(dstpre + '.{0}.bed'.format(c))

    # spread to CPUs
    args = [(dstpre, c, genome, chromdir, stranded) for c in chroms]
    rslts = UT.process_mp2(_process_mapbed_chr, args, np=np, doreduce=False)

    # concatenate chr files
    files1 = []
    dstpath = dstpre + '.sjpath.bed'
    LOG.info('making {0}...'.format(dstpath))
    with open(dstpath, 'wb') as merged:
        for c in chroms:
            part_path = dstpre + '.{0}.sjpath.bed'.format(c)
            files1.append(part_path)
            with open(part_path, 'rb') as part:
                shutil.copyfileobj(part, merged)
    dstpath = UT.compress(dstpath)

    for kind in ('.ex', '.sj'):
        for strand in ('.p', '.n', '.u'):
            for suf in ('', '.uniq'):
                pre = dstpre + kind + suf + strand
                wigpath = pre + '.wig'
                bwpath = pre + '.bw'
                with open(wigpath, 'wb') as merged:
                    for c in chroms:
                        part_path = pre + '.{0}.wig'.format(c)
                        files1.append(part_path)
                        # A missing per-chromosome wiggle is simply skipped.
                        if not os.path.exists(part_path):
                            continue
                        with open(part_path, 'rb') as part:
                            shutil.copyfileobj(part, merged)
                LOG.info('making {0}...'.format(bwpath))
                if os.path.getsize(wigpath) > 0:
                    wig2bw(wigpath, chromsizes, bwpath)
                files1.append(wigpath)

    # clean up temp files
    LOG.info('deleting intermediate files...')
    for tmp_path in files0 + files1:
        if not os.path.exists(tmp_path):
            continue
        LOG.debug('deleting {0}...'.format(tmp_path))
        os.unlink(tmp_path)