Esempio n. 1
0
def test_notstale(outdir):
    """Exercise ``UT.notstale`` with scratch files written in the order a, b, c.

    A fourth path, ``d``, is deliberately never created so that it can stand
    in for a missing cache file.

    Args:
        outdir: directory in which the scratch files are created
    """
    a = os.path.join(outdir, 'a')
    # Use context managers so every file is flushed and closed before the
    # next one is created, instead of relying on garbage collection.
    with open(a, 'w') as fobj:
        fobj.write('a')
    b = os.path.join(outdir, 'b')
    with open(b, 'w') as fobj:
        fobj.write('b')
    c = os.path.join(outdir, 'c')
    with open(c, 'w') as fobj:
        fobj.write('c')
    d = os.path.join(outdir, 'd')  # never written
    # Explicit comparisons are kept so that a non-boolean return value
    # (e.g. None) makes the test fail.
    # simple: a < b
    assert UT.notstale(a, b) == True
    # multiple: [a,b] < c
    assert UT.notstale([a, b], c) == True
    # non-existent cache
    assert UT.notstale(a, d) == False
Esempio n. 2
0
def bw2bed(bwfile, bedfile, chroms, th, compress=True):
    """Transform BigWig genomeCov to binary BED by thresholding.

    The intervals produced by ``apply_threshold(bwfile, th, chroms)`` are
    written as tab-separated ``chr, start, end`` lines to ``bedfile`` (with a
    trailing '.gz' stripped, if present). If ``UT.notstale`` reports that the
    '.gz' version of the output is up to date with respect to ``bwfile``,
    that path is returned without recomputing.

    Args:
        bwfile: path to BigWig file
        bedfile: path of the output BED file; a trailing '.gz' is ignored
            when choosing the name of the uncompressed file
        chroms: list of chromosome names
        th: coverage threshold
        compress: if True (default), the written file is handed to
            ``UT.compress`` and that call's return value is returned

    Returns:
        path to generated BED file: the cached '.gz' path, the return value
        of ``UT.compress`` (presumably the compressed path -- confirm against
        UT), or the uncompressed path when ``compress`` is False

    Raises:
        RuntimeError: if ``bwfile`` does not exist
    """
    bedbase = bedfile[:-3] if bedfile[-3:] == '.gz' else bedfile
    gzpath = bedbase + '.gz'
    if UT.notstale(bwfile, gzpath):
        return gzpath
    # make sure bwfile exists
    if not os.path.exists(bwfile):
        raise RuntimeError('BigWig file {0} does not exist.'.format(bwfile))
    processor = apply_threshold(bwfile, th, chroms)
    UT.makedirs(os.path.dirname(bedfile))
    # The context manager closes the file even if writing fails, and
    # writelines streams the records instead of building one big string.
    # Do NOT append an extra '\n': it introduces a blank line between chroms
    # in multiprocessing mode, which terminates bedtools at chr1.
    with open(bedbase, 'w') as out:
        out.writelines('%s\t%i\t%i\n' % x for x in processor)
    if compress:
        return UT.compress(bedbase)
    return bedbase
Esempio n. 3
0
def fillgap(binfile, gapfile, gap=50):
    """Run ``bedtoolmerge`` on ``binfile`` with ``d=gap``, reusing a fresh cached result.

    Args:
        binfile: path to the input BED file
        gapfile: path of the output file; a trailing '.gz' is stripped
            before the name is passed to ``bedtoolmerge``
        gap: value passed to ``bedtoolmerge`` as ``d`` (default 50)

    Returns:
        ``<gapfile without .gz> + '.gz'`` when ``UT.notstale`` reports it as
        up to date with respect to ``binfile``; otherwise whatever
        ``bedtoolmerge`` returns
    """
    base = gapfile[:-3] if gapfile[-3:] == '.gz' else gapfile
    cached = base + '.gz'
    if not UT.notstale(binfile, cached):
        return bedtoolmerge(binfile, base, d=gap)
    return cached
Esempio n. 4
0
def calc_ovlratio(aname, bname, tname, nacol, nbcol, idcol=['chr','st','ed'], returnbcols=False):
    """Calculate overlapped portion of b onto a.

    Will check existence of result file (tname) and uses it if newer than input files.
    Requirement: intervals within b must not overlap each other, so that the
    total overlap of an element of a is the sum of its overlaps with
    individual elements of b.

    Args:
        aname (str): bed file name 1
        bname (str): bed file name 2
        tname (str): result file name
        nacol (int): number of columns in file 1; 12 selects the split
            (per-exon) intersect mode and exon-size based lengths
        nbcol (int): number of columns in file 2

    Optional:
        idcol (list of str): columns which specify unique entry
        returnbcols (bool): if True, also keep the ``b_``-prefixed columns of
            the first matching b record in the result

    Returns:
        A Pandas DataFrame which contains overlap info: the columns of a plus
        ``len``, ``ovl``, ``ovlratio`` and ``notcovbp``
    """
    # cache?
    if UT.notstale([aname,bname], tname):
        return UT.read_pandas(tname)
    # calculate bedtools intersect into a temporary file next to aname
    tmpsuf = '.ovlbed.txt'
    cname = aname + tmpsuf
    if nacol == 12:
        cname = bedtoolintersect(aname, bname, cname, wao=True, split=True)
    else:
        cname = bedtoolintersect(aname, bname, cname, wao=True)
    # read tmp file
    acols = GGB.BEDCOLS[:nacol]
    bcols = ['b_'+x for x in GGB.BEDCOLS[:nbcol]]
    cols = acols + bcols + ['ovl']
    try:
        df = UT.read_pandas(cname, names=cols)
    finally:
        # clean up the temporary file even when reading it fails
        os.unlink(cname)
    dfg = df.groupby(idcol)
    if returnbcols:
        dfa = dfg.first().reset_index()[acols+bcols]
    else:
        dfa = dfg.first().reset_index()[acols]
    if nacol == 12:
        # Length is the sum of exon sizes. Use the builtin sum over a
        # generator: numpy's sum does not consume the iterator that map()
        # returns on Python 3. Empty fields are skipped so that lists with
        # and without a trailing comma both give the full total.
        dfa['len'] = [sum(int(s) for s in x.split(',') if s)
                      for x in dfa['esizes']]
    else:
        dfa['len'] = dfa['ed'] - dfa['st']
    # since b does not overlap by itself total overlap of an element of a to b is
    # sum of overlap to individual b
    dfa['ovl'] = dfg['ovl'].sum().values
    dfa['ovlratio'] = dfa['ovl'].astype(float)/dfa['len']
    dfa['notcovbp'] = dfa['len'] - dfa['ovl']
    # save
    UT.save_tsv_nidx_whead(dfa, tname)
    return dfa
Esempio n. 5
0
def chop_chrs_gtf(gtfname, chrs, outdir=None):
    """Split a GTF into one file per chromosome.

    Output files are named ``<base>-<chr>.gtf`` where ``<base>`` is the
    basename of ``gtfname`` with its last four characters removed. When
    ``UT.notstale`` reports every output as up to date with respect to
    ``gtfname``, nothing is rewritten.

    Args:
        gtfname: path to GTF
        chrs: list of chromosome names
        outdir: output directory, if None (default), then use same directory as input

    Returns:
        list of output paths, one per entry of ``chrs`` and in the same order
    """
    target_dir = os.path.dirname(gtfname) if outdir is None else outdir
    stem = os.path.basename(gtfname)[:-4]
    outnames = []
    for chrom in chrs:
        outnames.append(os.path.join(target_dir, stem+'-%s.gtf' % chrom))
    # every output is checked (no short-circuit) before deciding
    fresh = [UT.notstale(gtfname, path) for path in outnames]
    if all(fresh):
        # all files already exist and newer than gtfname
        return outnames
    gtf = read_gtf(gtfname, parseattrs=[]) # don't parse attrs
    for chrom, path in zip(chrs, outnames):
        LOG.debug( "writing %s to %s..." % (chrom, path))
        rows = gtf[gtf['chr']==chrom]
        write_gtf(rows, path, compress=False)
    return outnames
Esempio n. 6
0
def calc_gcov(expath, cipath, bwpath, dstprefix, override=False, np=4):
    """Calculate gene coverages.

    Coverage per chopped interval (ci) is loaded from a cache file or
    computed with ``calc_cov_mp``, then aggregated per gene (``_gidx``).

    Args:
        expath: merged ex
        cipath: chopped interval for ex
        bwpath: bigwig file (sample)
        dstprefix: prefix for outputs; it is concatenated directly with the
            file names below, so it must already end with any separator
            (e.g. '.') that is wanted
        override: passed as third argument to ``UT.notstale`` for the covci
            cache check (presumably forces recalculation -- confirm in UT)
        np: passed to ``calc_cov_mp`` as ``np`` (presumably the number of
            worker processes -- confirm against calc_cov_mp)

    Outputs:
        1. dstprefix+'covci.txt.gz' (path handed to ``calc_cov_mp``)
        2. dstprefix+'gcov.txt.gz' : DataFrame(cols: _gidx, gcov)

    Returns:
        DataFrame with columns ``_gidx`` and ``gcov``, where
        gcov = sum(val) / sum(len) over the unique (gene, ci) pairs.

    NOTE(review): the "2" variants (len2, gcov2, cids, st2, ed2, glen2), which
    use only ci's with val > 0, are computed and attached to the frame, but
    the final column selection keeps only ``_gidx`` and ``gcov``.
    """
    ex = UT.read_pandas(expath)
    covcipath = dstprefix + 'covci.txt.gz'
    gcovpath = dstprefix + 'gcov.txt.gz'

    # Reuse cached ci coverage when notstale accepts it; otherwise obtain
    # the chopped intervals (cached or freshly chopped) and compute coverage.
    if UT.notstale([expath, cipath], covcipath, override):
        cc = UT.read_pandas(covcipath)
    else:
        if UT.notstale(expath, cipath, False):
            ci = UT.read_pandas(cipath,
                                names=['chr', 'st', 'ed', 'name', 'id'])
        else:
            ci = UT.chopintervals(ex, cipath, idcol='_id')
        cc = calc_cov_mp(ci, bwpath, covcipath, np=np)

    # fall back to column 'sc1' when there is no 'id' column
    if 'id' not in cc.columns:
        cc['id'] = cc['sc1']
    # 'name' is a comma-joined string of integers; parse it into a list
    if 'eid' not in cc.columns:
        cc['eid'] = cc['name'].astype(str).apply(
            lambda x: [int(y) for y in x.split(',')])
    cc['len'] = cc['ed'] - cc['st']
    cc['val'] = cc['cov'] * cc['len']
    # presumably expands to one row per element of the 'eid' list -- confirm in UT.flattendf
    ccf = UT.flattendf(cc[['id', 'eid', 'len', 'val', 'st', 'ed']], 'eid')
    # map ex '_id' => '_gidx'
    e2g = dict(UT.izipcols(ex, ['_id', '_gidx']))
    ccf['_gidx'] = [e2g[x] for x in ccf['eid']]
    # for normal gcov: take unique combination of (gid, id) (id=cid)
    # for gcov2 : first select ccf with val>0
    ccf2 = ccf[ccf['val'] > 0].groupby(['_gidx', 'id']).first().reset_index()
    ccf2g = ccf2.groupby('_gidx')
    df2 = ccf2g[['len', 'val']].sum()
    df2['gcov2'] = df2['val'] / df2['len']
    df2['cids'] = ccf2g['id'].apply(lambda x: ','.join([str(y) for y in x]))
    df2['gst2'] = ccf2g['st'].min()
    df2['ged2'] = ccf2g['ed'].max()
    df2['glen2'] = df2['ged2'] - df2['gst2']

    df2 = df2.reset_index()

    # normal gcov: all ci's of the gene, each (gene, ci) pair counted once
    ccf1 = ccf.groupby(['_gidx', 'id']).first().reset_index()
    ccf1g = ccf1.groupby('_gidx')
    df = ccf1g[['len', 'val']].sum()
    df['gcov'] = df['val'] / df['len']
    df['st'] = ccf1g['st'].min()
    df['ed'] = ccf1g['ed'].max()
    df['glen'] = df['ed'] - df['st']
    df = df.reset_index()
    g2chr = dict(UT.izipcols(ex, ['_gidx', 'chr']))
    df['chr'] = [g2chr[x] for x in df['_gidx']]

    def _set_df2prop(src, tgt, default):
        # copy column `src` of df2 into df[tgt] by _gidx; genes that are
        # absent from df2 get `default`
        dic = dict(UT.izipcols(df2, ['_gidx', src]))
        df[tgt] = [dic.get(x, default) for x in df['_gidx']]

    _set_df2prop('gcov2', 'gcov2', 0)
    _set_df2prop('len', 'len2', 0)
    _set_df2prop('cids', 'cids', '')
    _set_df2prop('gst2', 'st2', -1)
    _set_df2prop('ged2', 'ed2', -1)
    _set_df2prop('glen2', 'glen2', 0)

    # NOTE(review): this full column list is immediately overwritten by the
    # two-column list below, so only '_gidx' and 'gcov' are saved and
    # returned -- confirm which of the two is intended.
    cols = [
        '_gidx', 'chr', 'st', 'ed', 'len', 'val', 'gcov', 'glen', 'len2',
        'gcov2', 'cids', 'st2', 'ed2', 'glen2'
    ]
    cols = ['_gidx', 'gcov']
    df = df[cols]
    UT.save_tsv_nidx_whead(df, gcovpath)
    return df
Esempio n. 7
0
def calc_ecov(expath,
              cipath,
              bwpath,
              dstprefix,
              blocksize=100,
              override=False,
              np=4):
    """Calculate exon coverages.

    Coverage per chopped interval (ci) is loaded from a cache file or
    computed with ``calc_cov_mp``; ``calc_ecov_mp`` then turns it into a
    mapping that is looked up by the ``_pid`` of each row of ex.

    Args:
        expath: merged ex
        cipath: chopped interval for ex
        bwpath: bigwig file (sample)
        dstprefix: prefix for outputs; it is concatenated directly with the
            file names below, so it must already end with any separator
            (e.g. '.') that is wanted
        blocksize: passed positionally to ``calc_ecov_mp``
        override: passed as third argument to ``UT.notstale`` for the covci
            cache check (presumably forces recalculation -- confirm in UT)
        np: passed to ``calc_cov_mp`` and ``calc_ecov_mp`` (presumably the
            number of worker processes -- confirm against those functions)

    Outputs:
        1. dstprefix+'covci.txt.gz': coverage for ci
        2. dstprefix+'ecov.txt.gz' : DataFrame(cols: eid, pid, ecov)

    Returns:
        DataFrame with columns ``eid``, ``pid`` and ``ecov``
    """
    covcipath = dstprefix + 'covci.txt.gz'
    ecovpath = dstprefix + 'ecov.txt.gz'
    ex = UT.read_pandas(expath)

    # Reuse cached ci coverage when notstale accepts it; otherwise obtain
    # the chopped intervals (cached or freshly chopped) and compute coverage.
    if UT.notstale([expath, cipath], covcipath, override):
        cc = UT.read_pandas(covcipath)
    else:
        if UT.notstale(expath, cipath,
                       False):  # you do not want to override ci
            ci = UT.read_pandas(cipath,
                                names=['chr', 'st', 'ed', 'name', 'id'])
        else:
            ci = UT.chopintervals(ex, cipath, idcol='_id')
        cc = calc_cov_mp(ci, bwpath, covcipath, np=np)

    # fall back to column 'sc1' when there is no 'id' column
    if 'id' not in cc.columns:
        cc['id'] = cc['sc1']
    # 'name' is a comma-joined string of integers; parse it into a list and
    # keep the same list under 'name1' as well
    if 'pid' not in cc.columns:
        cc['pid'] = cc['name'].astype(str).apply(
            lambda x: [int(y) for y in x.split(',')])
        cc['name1'] = cc['pid']
    # one output row per row of ex, identified by eid (_id) and pid (_pid)
    df = ex[['_id', '_pid']].rename(columns={'_id': 'eid', '_pid': 'pid'})
    e2cs = calc_ecov_mp(cc, None, np, blocksize)  # pid => cov
    # raises KeyError if a pid of ex is missing from the mapping
    df['ecov'] = [e2cs[x] for x in df['pid']]
    UT.save_tsv_nidx_whead(df[['eid', 'pid', 'ecov']], ecovpath)
    return df
Esempio n. 8
0
 def ex(self):
     """Return the ``ex`` table, reading it from disk when ``UT.notstale`` accepts the file.

     The path comes from the second element of ``self.sjexpaths()``. When the
     file is not accepted by ``UT.notstale``, the second element of
     ``self.sjex()`` is returned instead.
     """
     _, expath = self.sjexpaths()
     if not UT.notstale(expath):
         _, exdf = self.sjex()
         return exdf
     return UT.read_pandas(expath)