def calc_ovlratio(aname, bname, tname, nacol, nbcol, idcol=['chr','st','ed'], returnbcols=False):
    """Calculate overlapped portion of b onto a.

    Will check existence of result file (tname) and uses it if newer than
    input files.

    Args:
        aname (str): bed file name 1
        bname (str): bed file name 2
        tname (str): result file name
        nacol (int): number of columns in file 1
        nbcol (int): number of columns in file 2

    Optional:
        idcol (list of str): columns which specify unique entry
            (the default list is only read, never mutated)
        returnbcols (bool): if True, the 'b_'-prefixed columns of file 2
            (taken from ``groupby(idcol).first()``) are kept in the result

    Returns:
        A Pandas DataFrame which contains overlap info. On top of the
        columns of a, it has:
            len: length of the entry (sum of exon sizes when nacol==12,
                ed - st otherwise)
            ovl: total overlap with b
            ovlratio: ovl/len
            notcovbp: len - ovl

    Side effects:
        Writes the result to tname. A temporary bedtools output file is
        created and removed.
    """
    # requirement: no overlap within b
    # cache?
    if UT.notstale([aname, bname], tname):
        return UT.read_pandas(tname)

    # calculate bedtools intersect
    tmpsuf = '.ovlbed.txt'
    cname = aname + tmpsuf
    if nacol == 12:
        cname = bedtoolintersect(aname, bname, cname, wao=True, split=True)
    else:
        cname = bedtoolintersect(aname, bname, cname, wao=True)

    # read tmp file
    acols = GGB.BEDCOLS[:nacol]
    bcols = ['b_' + x for x in GGB.BEDCOLS[:nbcol]]
    cols = acols + bcols + ['ovl']
    try:
        df = UT.read_pandas(cname, names=cols)
    finally:
        # clean up the temporary file even if reading it fails
        os.unlink(cname)

    dfg = df.groupby(idcol)
    keep = acols + bcols if returnbcols else acols
    dfa = dfg.first().reset_index()[keep]

    if nacol == 12:
        # sum of exon sizes; the builtin sum is used because numpy's sum
        # does not consume a map iterator on Python 3. Empty tokens are
        # skipped so the value is right with or without a trailing comma.
        dfa['len'] = [
            sum(int(s) for s in str(x).split(',') if s)
            for x in dfa['esizes']
        ]
    else:
        dfa['len'] = dfa['ed'] - dfa['st']

    # since b does not overlap by itself total overlap of an element of a to b is
    # sum of overlap to individual b
    dfa['ovl'] = dfg['ovl'].sum().values
    dfa['ovlratio'] = dfa['ovl'].astype(float) / dfa['len']
    dfa['notcovbp'] = dfa['len'] - dfa['ovl']

    # save
    UT.save_tsv_nidx_whead(dfa, tname)
    return dfa
def trim_ex(expath, dstpath, dstcipath, length=1000, gidfld='_gidx', np=7):
    """Generate trimmed version of genes for calculating coverage to avoid length bias.

    Args:
        expath (str): path exon tsv
        dstpath (str): path to trimmed exon
        dstcipath (str): path to ci (chopped interval)
        length (pos int): length to trim from 3' end in base pair (default 1000 bp)
        gidfld (str): column name for gene id (default _gidx)
        np (pos int): number of CPU to use

    Generates:
        Two files (dstpath, dstcipath).

    Returns:
        a dataframe containing trimmed exons

    """
    ex = UT.read_pandas(expath)
    if 'len' not in ex.columns:
        ex['len'] = ex['ed'] - ex['st']

    if np == 1:
        recs = trim_ex_worker((ex, length, gidfld))
    else:
        # one work item per chromosome
        chroms = sorted(ex['chr'].unique())
        data = [(ex[ex['chr'] == c], length, gidfld) for c in chroms]
        recs = []
        # Create the pool before entering try: if construction fails there
        # is nothing to close, and the real error is not masked by a
        # NameError on `p` in the finally clause.
        p = multiprocessing.Pool(np)
        try:
            for v in p.map(trim_ex_worker, data):
                recs += v
        finally:
            p.close()

    cols = list(ex.columns.values)
    nex = PD.DataFrame(recs, columns=cols)
    nex['len'] = nex['ed'] - nex['st']
    # edge case: zero-length records are widened to 1 bp
    # NOTE(review): 'len' is computed before this adjustment, so these rows
    # keep len == 0 -- confirm that this is intended.
    nex.loc[nex['st'] == nex['ed'], 'ed'] = nex['st'] + 1
    UT.save_tsv_nidx_whead(nex, dstpath)
    UT.chopintervals(nex, dstcipath)
    return nex
def calc_ecov_mp(covci, fname, np, blocksize=100):
    """Calculate per-exon coverage from chopped-interval coverage, one chromosome at a time.

    WARNING: this assumes _id is assigned according to sorted (chr,st,ed)

    Args:
        covci: DataFrame of chopped intervals with coverage. Columns 'chr'
            and 'name' (comma separated ids) are read here; 'st' and 'ed'
            are read when fname is given. Helper columns 'name1', 'eidmax'
            and 'eidmin' are added IN PLACE when missing.
        fname: path for the result table, or None to skip writing
        np: number of processes (1 = run in this process)
        blocksize: passed through to calc_ecov_chrom

    Returns:
        If fname is None: the dict collected from calc_ecov_chrom
        (presumably id => coverage -- confirm against calc_ecov_chrom).
        Otherwise: a DataFrame with columns eid, ecov, chr, st, ed, which
        is also written to fname (columns eid, chr, st, ed, ecov).
    """
    LOG.debug('calc_ecov...')
    chroms = sorted(covci['chr'].unique())
    if 'name1' not in covci.columns:
        # parse "1,2,3" into [1, 2, 3]
        covci['name1'] = covci['name'].astype(str).apply(
            lambda x: [int(y) for y in x.split(',')])
    if 'eidmax' not in covci.columns:
        covci['eidmax'] = covci['name1'].apply(lambda x: max(x))
    if 'eidmin' not in covci.columns:
        covci['eidmin'] = covci['name1'].apply(lambda x: min(x))

    # one work item per chromosome
    args = [(covci[covci['chr'] == c].copy(), blocksize) for c in chroms]
    e2cs = {}
    if np == 1:
        for arg in args:
            e2cs.update(calc_ecov_chrom(*arg))
    else:
        # Create the pool before entering try so that a failure to create
        # it is not masked by a NameError on `p` in the finally clause.
        p = multiprocessing.Pool(np)
        try:
            rslts = p.map(mp_worker, zip(repeat(calc_ecov_chrom), args))
        finally:
            LOG.debug('closing pool')
            p.close()
        for x in rslts:
            e2cs.update(x)

    LOG.debug('writing rslts...')
    if fname is None:
        return e2cs

    # recover chr/st/ed of each id from the intervals it is made of
    ccf = UT.flattendf(covci, 'name1')
    ccfg = ccf.groupby('name1')
    e2chr = dict(
        UT.izipcols(ccfg['chr'].first().reset_index(), ['name1', 'chr']))
    e2st = dict(UT.izipcols(ccfg['st'].min().reset_index(), ['name1', 'st']))
    e2ed = dict(UT.izipcols(ccfg['ed'].max().reset_index(), ['name1', 'ed']))

    df = PD.DataFrame(e2cs, index=['ecov']).T
    df.index.name = 'eid'
    df = df.reset_index()
    df['chr'] = [e2chr[x] for x in df['eid']]
    df['st'] = [e2st[x] for x in df['eid']]
    df['ed'] = [e2ed[x] for x in df['eid']]
    UT.save_tsv_nidx_whead(df[['eid', 'chr', 'st', 'ed', 'ecov']], fname)
    return df
def calc_cov_mp(bed, bwname, fname, np, which='cov'):
    """Calculate a per-interval value from a BigWig, one chromosome at a time.

    Args:
        bed: DataFrame of intervals (needs a 'chr' column), or a path which
            is read with GGB.read_bed
        bwname: path to bigwig, passed to the worker
        fname: path the result table is written to
        np: number of processes (1 = run in this process)
        which: 'cov' (uses worker_cov) or 'max' (uses worker_max); also the
            name of the column appended to the result

    Returns:
        DataFrame with the columns of bed plus a `which` column, built from
        the records returned by the workers. It is also saved to fname.

    Raises:
        ValueError: if `which` is neither 'cov' nor 'max'
    """
    if which == 'cov':
        worker = worker_cov
    elif which == 'max':
        worker = worker_max
    else:
        # fail early instead of hitting an unbound `worker` further down
        raise ValueError(
            "which must be 'cov' or 'max', got {0!r}".format(which))

    if UT.isstring(bed):
        bed = GGB.read_bed(bed)
    cols = list(bed.columns) + [which]
    chroms = bed['chr'].unique()
    # directory of this module, handed to every worker
    cdir = os.path.dirname(__file__)
    # one work item per chromosome: (intervals, bigwig path, chrom, cdir)
    data = [(bed[bed['chr'] == c].copy(), bwname, c, cdir) for c in chroms]

    recs = []
    if np == 1:
        for arg in data:
            LOG.debug('cov calculation: processing {0}...'.format(arg[-2]))
            recs += worker(*arg)
    else:
        LOG.debug('{1} calculation: np={0}'.format(np, which))
        # Create the pool before entering try so that a failure to create
        # it is not masked by a NameError on `p` in the finally clause.
        p = multiprocessing.Pool(np)
        try:
            a = zip(repeat(worker), data)
            rslts = p.map(mp_worker, a)
            for v in rslts:
                recs += v
            LOG.debug('done {1} calculation: np={0}'.format(np, which))
        finally:
            LOG.debug('closing pool')
            p.close()

    LOG.debug('writing rslts...')
    df = PD.DataFrame(recs, columns=cols)
    UT.save_tsv_nidx_whead(df, fname)
    return df
def calc_cov_ovl_mp(srcname, bwname, dstname, np=1, covciname=None, ciname=None, colname='cov', override=False):
    """Calculate coverage (from BigWig) over intervals (from srcname).

    Intervals are chopped into non-overlapping pieces (ci), coverage is
    calculated for each piece (covci), and the coverage of an exon is the
    length-weighted mean over its pieces:
    exon cov = sum(cicov*cilen)/totlen

    Args:
        srcname: path to exons tsv, or an exons DataFrame (needs '_id')
        bwname: path to bigwig
        dstname: path for result
        np: number of processors
        covciname: path to covci (coverage for chopped interval dataframe);
            default is srcname[:-7] + '.covci.txt.gz'
        ciname: path to ci (chopped interval dataframe);
            default is srcname[:-7] + '.ci.txt.gz'
        colname: name for column which contain calculated coverages
        override: if True, recalculate ci and covci even if the files exist

    Returns:
        source dataframe with column (colname) added

    Raises:
        TypeError: if covciname or ciname has to be derived from srcname
            but srcname is not a path

    SideEffects:
        Result is written to dstname (the source tsv is only overwritten
        when dstname is the same path). When a DataFrame is passed as
        srcname, the column is added to it in place. ci/covci files are
        written when they are (re)calculated.
    """
    if UT.isstring(srcname):
        exons = UT.read_pandas(srcname)
    else:
        exons = srcname

    # cache file names
    if covciname is None or ciname is None:
        # explicit check rather than assert, which is stripped under -O
        if not UT.isstring(srcname):
            raise TypeError(
                'covciname and ciname are required when srcname is not a path')
        # drop the last 7 characters (presumably a '.txt.gz' suffix)
        prefix = srcname[:-7]
        if covciname is None:
            covciname = prefix + '.covci.txt.gz'
        if ciname is None:
            ciname = prefix + '.ci.txt.gz'

    if override or (not os.path.exists(covciname)):
        LOG.debug('calculating covci...')
        _sttime = time.time()
        if override or not (os.path.exists(ciname)):
            ci = UT.chopintervals(exons, ciname)
        else:
            ci = UT.read_pandas(ciname, names=['chr', 'st', 'ed', 'name', 'id'])
            ci['name'] = ci['name'].astype(str)
        covci = calc_cov_mp(ci, bwname, covciname, np)
        LOG.debug(' time: {0:.3f}s'.format(time.time() - _sttime))
    else:
        LOG.debug('loading cached covci...')
        covci = UT.read_pandas(covciname)
        covci['name'] = covci['name'].astype(str)

    # covci: chopped interval's cov => reverse
    # ci => exon id  ====> reverse: exon => ci indices
    LOG.debug('calculating exon cov...')
    if 'id' not in covci.columns:
        covci['id'] = covci['sc1']
    e2c = {}
    for i, name in covci[['id', 'name']].values:
        for eid in name.split(','):
            e2c.setdefault(int(eid), []).append(i)
    covci['len'] = covci['ed'] - covci['st']
    covci['val'] = covci['cov'] * covci['len']

    def _gen():
        for eid in exons['_id']:
            for cid in e2c[eid]:
                yield (cid, eid)

    # unique (chopped interval, exon) pairs
    tmp = PD.DataFrame(list(set(_gen())), columns=['cid', 'eid'])
    c2len = dict(covci[['id', 'len']].values)
    c2val = dict(covci[['id', 'val']].values)
    tmp['val'] = [c2val[x] for x in tmp['cid']]
    tmp['len'] = [c2len[x] for x in tmp['cid']]
    tmpg = tmp.groupby('eid')[['val', 'len']].sum().reset_index()
    tmpg['cov'] = tmpg['val'] / tmpg['len']
    e2cov = dict(tmpg[['eid', 'cov']].values)
    exons[colname] = [e2cov[x] for x in exons['_id']]

    UT.save_tsv_nidx_whead(exons, dstname)
    return exons
def calc_gcov(expath, cipath, bwpath, dstprefix, override=False, np=4):
    """Calculate gene coverages.

    Gene coverage is the length-weighted mean coverage over the unique
    chopped intervals (ci) belonging to the gene: gcov = sum(val)/sum(len)
    where val = cov*len of each ci.

    Args:
        expath: merged ex (needs columns _id and _gidx)
        cipath: chopped interval for ex
        bwpath: bigwig file (sample)
        dstprefix: prefix for outputs
        override: passed to UT.notstale when deciding whether the cached
            covci file can be reused
        np: number of processes for the coverage calculation

    Outputs:
        1. dstprefix+'covci.txt.gz' : coverage for ci (reused when not stale)
        2. dstprefix+'gcov.txt.gz' : DataFrame(cols: _gidx, gcov)
        The ci file (cipath) is regenerated when UT.notstale reports it
        stale relative to expath.

    Returns:
        DataFrame with columns _gidx, gcov
    """
    ex = UT.read_pandas(expath)
    covcipath = dstprefix + 'covci.txt.gz'
    gcovpath = dstprefix + 'gcov.txt.gz'

    if UT.notstale([expath, cipath], covcipath, override):
        cc = UT.read_pandas(covcipath)
    else:
        if UT.notstale(expath, cipath, False):  # do not override ci
            ci = UT.read_pandas(cipath, names=['chr', 'st', 'ed', 'name', 'id'])
        else:
            ci = UT.chopintervals(ex, cipath, idcol='_id')
        cc = calc_cov_mp(ci, bwpath, covcipath, np=np)

    if 'id' not in cc.columns:
        cc['id'] = cc['sc1']
    if 'eid' not in cc.columns:
        # parse "1,2,3" into [1, 2, 3]
        cc['eid'] = cc['name'].astype(str).apply(
            lambda x: [int(y) for y in x.split(',')])
    cc['len'] = cc['ed'] - cc['st']
    cc['val'] = cc['cov'] * cc['len']

    # one row per (ci, exon), then map exon => gene
    ccf = UT.flattendf(cc[['id', 'eid', 'len', 'val']], 'eid')
    e2g = dict(UT.izipcols(ex, ['_id', '_gidx']))
    ccf['_gidx'] = [e2g[x] for x in ccf['eid']]

    # take unique combination of (gid, id) (id=cid) so that a ci shared by
    # several exons of the same gene is counted once
    ccf1 = ccf.groupby(['_gidx', 'id']).first().reset_index()
    df = ccf1.groupby('_gidx')[['len', 'val']].sum()
    df['gcov'] = df['val'] / df['len']
    df = df.reset_index()[['_gidx', 'gcov']]

    UT.save_tsv_nidx_whead(df, gcovpath)
    return df
def calc_ecov(expath, cipath, bwpath, dstprefix, blocksize=100, override=False, np=4):
    """Calculate exon coverages.

    Args:
        expath: merged ex (columns _id and _pid are used)
        cipath: chopped interval for ex
        bwpath: bigwig file (sample)
        dstprefix: prefix for outputs
        blocksize: passed through to calc_ecov_mp
        override: passed to UT.notstale when deciding whether the cached
            covci file can be reused
        np: number of processes

    Outputs:
        1. dstprefix+'covci.txt.gz': coverage for ci
        2. dstprefix+'ecov.txt.gz' : DataFrame(cols: eid, pid, ecov)

    Returns:
        DataFrame with columns eid, pid, ecov
    """
    covci_file = dstprefix + 'covci.txt.gz'
    ecov_file = dstprefix + 'ecov.txt.gz'
    exons = UT.read_pandas(expath)

    if not UT.notstale([expath, cipath], covci_file, override):
        # covci has to be (re)calculated; ci itself is never force-overridden
        if UT.notstale(expath, cipath, False):
            chopped = UT.read_pandas(
                cipath, names=['chr', 'st', 'ed', 'name', 'id'])
        else:
            chopped = UT.chopintervals(exons, cipath, idcol='_id')
        covci = calc_cov_mp(chopped, bwpath, covci_file, np=np)
    else:
        covci = UT.read_pandas(covci_file)

    if 'id' not in covci.columns:
        covci['id'] = covci['sc1']
    if 'pid' not in covci.columns:
        # parse "1,2,3" into [1, 2, 3]
        id_strings = covci['name'].astype(str)
        covci['pid'] = id_strings.apply(
            lambda s: [int(tok) for tok in s.split(',')])
        covci['name1'] = covci['pid']

    result = exons[['_id', '_pid']].rename(
        columns={'_id': 'eid', '_pid': 'pid'})
    # pid => cov
    pid2cov = calc_ecov_mp(covci, None, np, blocksize)
    result['ecov'] = [pid2cov[p] for p in result['pid']]

    UT.save_tsv_nidx_whead(result[['eid', 'pid', 'ecov']], ecov_file)
    return result