def test_notstale(outdir):
    """Exercise UT.notstale with one source, several sources and a missing cache.

    Files ``a``, ``b`` and ``c`` are created in that order inside ``outdir``;
    ``d`` is only a path and is never created.
    """
    a = os.path.join(outdir, 'a')
    b = os.path.join(outdir, 'b')
    c = os.path.join(outdir, 'c')
    d = os.path.join(outdir, 'd')
    for path, content in ((a, 'a'), (b, 'b'), (c, 'c')):
        # The context manager flushes and closes each file before the next one
        # is created, instead of relying on garbage collection of the handle.
        with open(path, 'w') as fobj:
            fobj.write(content)
    # simple: a < b
    assert UT.notstale(a, b) == True
    # multiple: [a,b] < c
    assert UT.notstale([a, b], c) == True
    # non-existent cache (explicit comparison so that a None return fails)
    assert UT.notstale(a, d) == False
def bw2bed(bwfile, bedfile, chroms, th, compress=True):
    """Transform BigWig genomeCov to binary BED by thresholding.

    Args:
        bwfile: path to BigWig file
        bedfile: path of the BED file to write; a trailing '.gz' is stripped
            to obtain the uncompressed path that is actually written
        chroms: list of chromosome names
        th: coverage threshold
        compress: if True (default), the written BED file is handed to
            UT.compress and that call's result is returned

    Returns:
        path to generated BED file: the existing '<bed>.gz' when UT.notstale
        reports it as up to date with respect to bwfile, otherwise the result
        of UT.compress (compress=True) or the uncompressed path.

    Raises:
        RuntimeError: if bwfile does not exist.
    """
    bedbase = bedfile[:-3] if bedfile[-3:] == '.gz' else bedfile
    gzpath = bedbase + '.gz'
    # NOTE: only the compressed file is consulted as a cache, also when
    # compress=False.
    if UT.notstale(bwfile, gzpath):
        return gzpath
    # make sure bwfile exists
    if not os.path.exists(bwfile):
        raise RuntimeError('BigWig file {0} does not exist.'.format(bwfile))
    processor = apply_threshold(bwfile, th, chroms)
    UT.makedirs(os.path.dirname(bedfile))
    # 'with' closes the handle even if the processor raises part-way through.
    with open(bedbase, 'w') as out:
        # Do not append an extra newline after the records: per the original
        # author's note it introduces empty lines between chromosomes in mp
        # mode, which terminates bedtools at chr1.
        out.writelines('%s\t%i\t%i\n' % x for x in processor)
    if compress:
        return UT.compress(bedbase)
    return bedbase
def fillgap(binfile, gapfile, gap=50):
    """Run bedtoolmerge on binfile with d=gap, reusing a fresh cached result.

    Args:
        binfile: input path handed to bedtoolmerge
        gapfile: output path; a trailing '.gz' is removed before use
        gap: value passed to bedtoolmerge as ``d`` (default 50)

    Returns:
        '<gapfile>.gz' when UT.notstale(binfile, '<gapfile>.gz') holds,
        otherwise whatever bedtoolmerge returns.
    """
    target = gapfile[:-3] if gapfile[-3:] == '.gz' else gapfile
    cached = target + '.gz'
    if not UT.notstale(binfile, cached):
        return bedtoolmerge(binfile, target, d=gap)
    return cached
def calc_ovlratio(aname, bname, tname, nacol, nbcol, idcol=['chr','st','ed'], returnbcols=False):
    """Calculate overlapped portion of b onto a.

    Will check existence of result file (tname) and uses it if newer than
    input files.

    Requirement: entries of b must not overlap each other, so that the total
    overlap of an element of a is the sum of its overlaps with individual b's.

    Args:
        aname (str): bed file name 1
        bname (str): bed file name 2
        tname (str): result file name
        nacol (int): number of columns in file 1 (12 switches on split mode
            and exon-size based lengths)
        nbcol (int): number of columns in file 2

    Optional:
        idcol (list of str): columns which specify unique entry
            (the default list is only read, never modified)
        returnbcols (bool): if True, also keep the 'b_'-prefixed columns of
            the first matching b entry

    Returns:
        A Pandas DataFrame which contains overlap info: the columns of a
        (plus b columns if requested) and 'len', 'ovl', 'ovlratio',
        'notcovbp'.
    """
    # cache?
    if UT.notstale([aname, bname], tname):
        return UT.read_pandas(tname)
    # calculate bedtools intersect into a temporary file next to aname
    cname = aname + '.ovlbed.txt'
    if nacol == 12:
        cname = bedtoolintersect(aname, bname, cname, wao=True, split=True)
    else:
        cname = bedtoolintersect(aname, bname, cname, wao=True)
    # read tmp file
    acols = GGB.BEDCOLS[:nacol]
    bcols = ['b_' + x for x in GGB.BEDCOLS[:nbcol]]
    cols = acols + bcols + ['ovl']
    df = UT.read_pandas(cname, names=cols)
    dfg = df.groupby(idcol)
    keep = acols + bcols if returnbcols else acols
    dfa = dfg.first().reset_index()[keep]
    if nacol == 12:
        # Sum of exon sizes. The builtin sum over a generator works on both
        # Python 2 and 3; numpy's sum does not add up a Python 3 map iterator.
        # Empty fields are skipped, so a trailing comma is optional.
        dfa['len'] = [sum(int(y) for y in x.split(',') if y)
                      for x in dfa['esizes']]
    else:
        dfa['len'] = dfa['ed'] - dfa['st']
    # since b does not overlap by itself total overlap of an element of a to b
    # is sum of overlap to individual b; first() and sum() come from the same
    # groupby object, so the rows line up positionally
    dfa['ovl'] = dfg['ovl'].sum().values
    dfa['ovlratio'] = dfa['ovl'].astype(float) / dfa['len']
    dfa['notcovbp'] = dfa['len'] - dfa['ovl']
    # clean up
    os.unlink(cname)
    # save
    UT.save_tsv_nidx_whead(dfa, tname)
    return dfa
def chop_chrs_gtf(gtfname, chrs, outdir=None):
    """Split a GTF into one file per chromosome.

    Output files are named '<base>-<chr>.gtf', where <base> is the input file
    name with its last four characters removed.

    Args:
        gtfname: path to GTF
        chrs: list of chromosome names
        outdir: output directory; None (default) means the directory of
            gtfname

    Returns:
        list of output paths, in the order of chrs
    """
    target_dir = os.path.dirname(gtfname) if outdir is None else outdir
    stem = os.path.basename(gtfname)[:-4]
    outnames = []
    for chrom in chrs:
        outnames.append(os.path.join(target_dir, stem + '-%s.gtf' % chrom))
    freshness = [UT.notstale(gtfname, path) for path in outnames]
    if all(freshness):
        # every per-chromosome file exists and is newer than gtfname
        return outnames
    # attributes are not needed for splitting, so skip parsing them
    gtf = read_gtf(gtfname, parseattrs=[])
    for chrom, path in zip(chrs, outnames):
        LOG.debug("writing %s to %s..." % (chrom, path))
        write_gtf(gtf[gtf['chr'] == chrom], path, compress=False)
    return outnames
def calc_gcov(expath, cipath, bwpath, dstprefix, override=False, np=4):
    """Calculate gene coverages.

    Args:
        expath: merged ex
        cipath: chopped interval for ex
        bwpath: bigwig file (sample)
        dstprefix: prefix for outputs (used verbatim, no '.' is inserted)
        override: forwarded to UT.notstale for the covci cache check
        np: forwarded to calc_cov_mp as ``np``

    Outputs:
        1. dstprefix+'covci.txt.gz'
        2. dstprefix+'gcov.txt.gz' : DataFrame(cols: _gidx, gcov)
           gcov = sum(val)/sum(len) over the unique (gene, chopped interval)
           pairs of the gene, where len = ed - st and val = cov * len

    Returns:
        The DataFrame written to the gcov file (columns '_gidx' and 'gcov').

    Note:
        Only '_gidx' and 'gcov' were ever written or returned; the extended
        per-gene statistics (len2, gcov2, cids, st/ed, ...) used to be
        computed and then dropped before saving, so they are no longer
        computed.
    """
    ex = UT.read_pandas(expath)
    covcipath = dstprefix + 'covci.txt.gz'
    gcovpath = dstprefix + 'gcov.txt.gz'

    if UT.notstale([expath, cipath], covcipath, override):
        cc = UT.read_pandas(covcipath)
    else:
        # never override the chopped intervals themselves
        if UT.notstale(expath, cipath, False):
            ci = UT.read_pandas(cipath, names=['chr', 'st', 'ed', 'name', 'id'])
        else:
            ci = UT.chopintervals(ex, cipath, idcol='_id')
        cc = calc_cov_mp(ci, bwpath, covcipath, np=np)

    if 'id' not in cc.columns:
        cc['id'] = cc['sc1']
    if 'eid' not in cc.columns:
        # 'name' holds comma separated exon ids
        cc['eid'] = cc['name'].astype(str).apply(
            lambda x: [int(y) for y in x.split(',')])
    cc['len'] = cc['ed'] - cc['st']
    cc['val'] = cc['cov'] * cc['len']

    # one row per (chopped interval, exon id), then map exon id => gene
    ccf = UT.flattendf(cc[['id', 'eid', 'len', 'val', 'st', 'ed']], 'eid')
    e2g = dict(UT.izipcols(ex, ['_id', '_gidx']))
    ccf['_gidx'] = [e2g[x] for x in ccf['eid']]

    # take unique combination of (gid, id) (id=cid) so that an interval shared
    # by several exons of a gene is counted once
    ccf1 = ccf.groupby(['_gidx', 'id']).first().reset_index()
    df = ccf1.groupby('_gidx')[['len', 'val']].sum()
    df['gcov'] = df['val'] / df['len']
    df = df.reset_index()[['_gidx', 'gcov']]

    UT.save_tsv_nidx_whead(df, gcovpath)
    return df
def calc_ecov(expath, cipath, bwpath, dstprefix, blocksize=100, override=False, np=4):
    """Calculate exon coverages.

    Args:
        expath: merged ex
        cipath: chopped interval for ex
        bwpath: bigwig file (sample)
        dstprefix: prefix for outputs (used verbatim, no '.' is inserted)
        blocksize: forwarded to calc_ecov_mp
        override: forwarded to UT.notstale for the covci cache check
        np: forwarded to calc_cov_mp and calc_ecov_mp

    Outputs:
        1. dstprefix+'covci.txt.gz': coverage for ci
        2. dstprefix+'ecov.txt.gz' : DataFrame(cols: eid, pid, ecov)

    Returns:
        DataFrame with columns eid, pid, ecov (one row per row of ex).
    """
    ecovpath = dstprefix + 'ecov.txt.gz'
    covcipath = dstprefix + 'covci.txt.gz'
    exons = UT.read_pandas(expath)

    if not UT.notstale([expath, cipath], covcipath, override):
        # the chopped intervals themselves are never overridden
        if UT.notstale(expath, cipath, False):
            intervals = UT.read_pandas(
                cipath, names=['chr', 'st', 'ed', 'name', 'id'])
        else:
            intervals = UT.chopintervals(exons, cipath, idcol='_id')
        cov = calc_cov_mp(intervals, bwpath, covcipath, np=np)
    else:
        cov = UT.read_pandas(covcipath)

    def _split_ids(text):
        # 'name' is a comma separated list of integer ids
        return [int(token) for token in text.split(',')]

    if 'id' not in cov.columns:
        cov['id'] = cov['sc1']
    if 'pid' not in cov.columns:
        cov['pid'] = cov['name'].astype(str).apply(_split_ids)
    cov['name1'] = cov['pid']

    renames = {'_id': 'eid', '_pid': 'pid'}
    df = exons[['_id', '_pid']].rename(columns=renames)
    pid2cov = calc_ecov_mp(cov, None, np, blocksize)  # pid => cov
    df['ecov'] = [pid2cov[pid] for pid in df['pid']]

    UT.save_tsv_nidx_whead(df[['eid', 'pid', 'ecov']], ecovpath)
    return df
def ex(self):
    """Return ex, read from its file when UT.notstale accepts that file.

    The path is the second element of self.sjexpaths(). When
    UT.notstale(expath) does not hold, the second element of self.sjex() is
    returned instead.
    """
    _, expath = self.sjexpaths()
    if not UT.notstale(expath):
        _, exons = self.sjex()
        return exons
    return UT.read_pandas(expath)