def merge_bigwigs_mp(bwfiles, genome, dstpath, scale=None, np=7):
    """Merge bigwig files chromosome by chromosome using multiple processes.

    Every chromosome is handed to ``merge_bigwigs_chr`` through
    ``UT.process_mp``.  The per-chromosome results are concatenated into
    ``dstpath + '.wig'``, converted to a bigwig at ``dstpath`` with
    ``BT.wig2bw`` and the intermediate wiggle files are deleted.

    Args:
        bwfiles: passed unchanged to ``merge_bigwigs_chr``.
        genome: genome identifier understood by the ``UT`` chromosome helpers.
        dstpath: path of the bigwig to write; also the prefix of temp files.
        scale: passed unchanged to ``merge_bigwigs_chr``.
        np: number of processes handed to ``UT.process_mp``.
    """
    chroms = UT.chroms(genome)
    chromfile = UT.chromsizes(genome)
    sizes = UT.df2dict(UT.chromdf(genome), 'chr', 'size')

    # Biggest chromosomes first, so that a large one (e.g. chrX) does not end
    # up being processed alone at the end while the other workers are idle.
    by_size = sorted(((sizes[c], c) for c in chroms), reverse=True)
    chroms = [name for _, name in by_size]

    args = []
    for c in chroms:
        chrwig = dstpath + '.{0}.wig'.format(c)
        args.append((bwfiles, c, sizes[c], chrwig, scale))
    rslts = UT.process_mp(merge_bigwigs_chr, args, np, doreduce=False)
    # The results are (key, value) pairs; the value looked up per chromosome
    # is the file that gets concatenated below.
    chr2path = dict(rslts)

    LOG.debug('concatenating chromosomes...')
    wigpath = dstpath + '.wig'
    UT.makedirs(os.path.dirname(wigpath))
    with open(wigpath, 'wb') as dst:
        for c in chroms:
            with open(chr2path[c], 'rb') as src:
                shutil.copyfileobj(src, dst)

    LOG.debug('converting wiggle to bigwig')
    BT.wig2bw(wigpath, chromfile, dstpath)

    # clean up: per-chromosome wiggles first, then the concatenated one
    leftovers = [dstpath + '.{0}.wig'.format(c) for c in chroms] + [wigpath]
    for path in leftovers:
        if os.path.exists(path):
            os.unlink(path)
def make_dm(self, targetlevel):
    """Calculate two distance matrices (logdiff and minmin) at ``targetlevel``.

    The result is stored in ``self.dms[targetlevel]`` as a dict with keys
    ``ts``, ``g2t``, ``t2g``, ``dm``, ``mm`` and ``v``; nothing is returned.
    """
    sampleinfo = self.si
    covlevel = self.gcovlevel
    cov = self.gcov

    # mapping between the gcov level and the target level
    ts = sampleinfo.groupby(targetlevel, sort=False).first().index.values
    g2t = UT.df2dict(sampleinfo, covlevel, targetlevel)
    t2g = make_dict(sampleinfo, targetlevel, covlevel)

    # mean log2(cov + 1) per target-level group
    logcov = N.log2(cov + 1)
    grouped = logcov.groupby(g2t, axis=1).mean()
    rowmax = grouped.max(axis=1)
    expressed = rowmax > N.log2(self.maxeth + 1)
    gids = rowmax[expressed].index.values
    v = grouped.ix[gids][ts]  # restrict to expressed rows, columns in ts order

    # normalized logdiff DM: |v_i - v_j| divided by the per-row maximum
    mat = v.values
    absdiff = N.abs(mat[:, :, N.newaxis] - mat[:, N.newaxis, :])
    rowmaxdiff = absdiff.max(axis=2).max(axis=1)
    scaled = absdiff / rowmaxdiff[:, N.newaxis, N.newaxis]
    dm = PD.Panel(scaled, v.index, ts, ts)

    # minmin DM: min(gmin_i, gmin_j) of the per-group minimum coverage
    gmin = cov.ix[gids].groupby(g2t, axis=1).min()[ts].values
    pairmin = N.minimum(gmin[:, :, N.newaxis], gmin[:, N.newaxis, :])
    mm = PD.Panel(pairmin, v.index, ts, ts)

    self.dms[targetlevel] = {
        'ts': ts,
        'g2t': g2t,
        't2g': t2g,
        'dm': dm,
        'mm': mm,
        'v': v,
    }
def sj02bw(sj0, pathpre, genome, np=12):
    """Write junction counts to one bigwig per strand.

    Each chromosome is converted to per-strand wiggle files by ``sj02wig``
    (run through ``UT.process_mp``).  For each strand the chromosome files
    are concatenated into ``pathpre + '.sj<S>.wig'`` and converted to
    ``pathpre + '.sj<S>.bw'`` with ``wig2bw``, where ``<S>`` is
    ``STRANDMAP0[strand]``.  All intermediate wiggle files are removed.

    Args:
        sj0: junction DataFrame with a ``chr`` column; a ``jcnt`` column is
            added in place (``ucnt + mcnt``) when missing.
        pathpre: prefix of every file written.
        genome: genome identifier understood by the ``UT`` chromosome helpers.
        np: number of processes handed to ``UT.process_mp``.
    """
    known = set(UT.chroms(genome))
    # largest chromosomes first
    chromdf = UT.chromdf(genome).sort_values('size', ascending=False)
    chroms = [x for x in chromdf['chr'] if x in known]
    chromdic = UT.df2dict(chromdf, 'chr', 'size')
    if 'jcnt' not in sj0:
        sj0['jcnt'] = sj0['ucnt'] + sj0['mcnt']

    files = []
    args = []
    for c in chroms:
        # '{0}' is left in the name as a placeholder for the strand
        f = '{0}.{1}.{{0}}.wig'.format(pathpre, c)
        args.append((sj0[sj0['chr'] == c], c, chromdic[c], f))
        files.append(f)
    rslts = UT.process_mp(sj02wig, args, np=np, doreduce=False)

    chromsizes = UT.chromsizes(genome)  # loop invariant
    rmfiles = []
    for strand in ['+', '-', '.']:
        s = STRANDMAP0[strand]
        wig = pathpre + '.sj{0}.wig'.format(s)
        bwpath = pathpre + '.sj{0}.bw'.format(s)
        with open(wig, 'w') as dst:
            for tmpl in files:
                f = tmpl.format(strand)
                with open(f, 'r') as src:
                    shutil.copyfileobj(src, dst)
                rmfiles.append(f)
        rmfiles.append(wig)
        wig2bw(wig, chromsizes, bwpath)

    # Every temporary file, including the last merged wiggle, is already in
    # rmfiles.  The previous extra os.unlink(wig) after this loop removed the
    # same file a second time and raised FileNotFoundError.
    for f in rmfiles:
        os.unlink(f)
def filter_sj(bwsjpre, statspath, chrom, csize, params):
    """Filter the junction paths of one chromosome and write them as BED12.

    Reads junction statistics from ``statspath`` and junction paths from
    ``bwsjpre + '.sjpath.<chrom>.bed.gz'`` (falling back to the genome-wide
    ``bwsjpre + '.sjpath.bed.gz'``), then keeps paths that

    * are on the ``+`` or ``-`` strand,
    * have a minimum over their junctions of ``detected``, ``maxcnt`` and
      ``maxoverhang`` above ``params['th_<field>']``,
    * have first and last exon sizes above ``params['th_minedgeexon']``,
    * have ``sjratio2`` (``sc1`` divided by the mean sj+ex coverage over
      ``tst:ted``) above ``params['th_sjratio2']``.

    The result is written to ``bwsjpre + '.sjpath.<chrom>.filtered.bed.gz'``.

    Args:
        bwsjpre: path prefix of the sjpath BED files and the sj/ex bigwigs.
        statspath: path of the junction statistics table.
        chrom: chromosome to process.
        csize: chromosome size; coverage is fetched for ``0:csize``.
        params: dict holding the ``th_*`` thresholds listed above.
    """
    # read in junction stats
    stats = UT.read_pandas(statspath)
    if 'chr' not in stats:
        stats['chr'] = [x.split(':')[0] for x in stats['locus']]
    if '#detected' in stats:
        stats.rename(columns={'#detected': 'detected'}, inplace=True)
    stats = stats[stats['chr'] == chrom].copy()
    if 'pc' not in stats:
        stats['pc'] = [locus2pc(x) for x in stats['locus']]
    flds = ['detected', 'maxcnt', 'maxoverhang']
    dics = {f: UT.df2dict(stats, 'pc', f) for f in flds}

    # read sjpath
    fpath_chr = bwsjpre + '.sjpath.{0}.bed.gz'.format(chrom)
    dstpath = bwsjpre + '.sjpath.{0}.filtered.bed.gz'.format(chrom)
    if os.path.exists(fpath_chr):
        sj = GGB.read_bed(fpath_chr)
    else:
        fpath = bwsjpre + '.sjpath.bed.gz'
        sj = GGB.read_bed(fpath)
        sj = sj[sj['chr'] == chrom].copy()

    # A chromosome without any junction path used to crash on sj.iloc[0];
    # only peek at the first name when there is one.
    if len(sj) > 0:
        name0 = sj.iloc[0]['name']
        if len(name0.split('|')) < len(name0.split(',')):  # exons attached?
            # drop the first and last comma-separated items of each name
            sj['name'] = [','.join(x.split(',')[1:-1]) for x in sj['name']]

    # filter unstranded
    sj = sj[sj['strand'].isin(['+', '-'])].copy()

    # filter with stats: a path is scored by its weakest junction
    for f in flds:
        sj[f] = [
            N.min([dics[f].get(pc, 0) for pc in name.split(',')])
            for name in sj['name']
        ]
        sj = sj[sj[f] > params['th_' + f]].copy()

    # edge exon size (esizes has a trailing comma, hence [-2] for the last)
    sj['eflen'] = [int(x.split(',')[0]) for x in sj['esizes']]
    sj['ellen'] = [int(x.split(',')[-2]) for x in sj['esizes']]
    eth = params['th_minedgeexon']
    sj = sj[(sj['eflen'] > eth) & (sj['ellen'] > eth)].copy()

    # calculate sjratio2
    # Create the column up front so it exists even when a strand (or the
    # whole chromosome) has no surviving path.
    sj['sjratio2'] = N.nan
    sjexbw = A2.SjExBigWigs(bwsjpre, mixunstranded=False)
    for strand in ['+', '-']:
        idx = sj['strand'] == strand
        if not idx.any():
            continue
        with sjexbw:
            sa = sjexbw.bws['sj'][strand].get(chrom, 0, csize)
            ea = sjexbw.bws['ex'][strand].get(chrom, 0, csize)
        cov = sa + ea
        sj.loc[idx, 'sjratio2'] = [
            cnt / N.mean(cov[int(tst):int(ted)])
            for cnt, tst, ted in sj[idx][['sc1', 'tst', 'ted']].values
        ]
    sj = sj[sj['sjratio2'] > params['th_sjratio2']]
    # NOTE(review): for an empty table this relies on GGB.write_bed accepting
    # a zero-row frame - confirm.
    GGB.write_bed(sj, dstpath, ncols=12)
def count_repeats_mp(beddf, genomefastaobj, col='#repbp', returnseq=False,
                     seqcol='seq', idfld='_id', np=4):
    """Multi-CPU version of ``count_repeats``.

    ``beddf`` is cut into ``np`` consecutive row slices, each slice is handed
    to ``count_repeats`` through ``UT.process_mp`` and the per-slice results
    are mapped back onto ``beddf`` by ``idfld``.

    Args:
        beddf: DataFrame to annotate; modified in place and returned.
        genomefastaobj: passed unchanged to ``count_repeats``.
        col: column receiving the value of ``col`` from the results.
        returnseq: when true, also copy ``seqcol`` from the results.
        seqcol: name of the sequence column.
        idfld: row id column; created as ``0..len-1`` when missing.
        np: number of slices and of processes.

    Returns:
        ``beddf`` with ``col`` (and optionally ``seqcol``) filled in.
    """
    if idfld not in beddf:
        beddf[idfld] = N.arange(len(beddf))

    # rows per CPU, rounded up so that np slices cover every row
    chunk = int(N.ceil(len(beddf) / float(np)))
    args = []
    for i in range(np):
        part = beddf.iloc[i * chunk:(i + 1) * chunk]
        args.append((part, genomefastaobj, col, returnseq, seqcol))
    rslts = UT.process_mp(count_repeats, args, np=np, doreduce=False)
    merged = PD.concat(rslts, ignore_index=True)

    id2cnt = UT.df2dict(merged, idfld, col)
    beddf[col] = [id2cnt[i] for i in beddf[idfld]]
    if returnseq:
        id2seq = UT.df2dict(merged, idfld, seqcol)
        beddf[seqcol] = [id2seq[i] for i in beddf[idfld]]
    return beddf
def assign_tcode_sj(self):
    """Tag target junctions with codes relative to the reference.

    Loads the ``'sj'`` model of the target and reference into
    ``self.sj_tgt`` / ``self.sj_ref`` and adds two columns to the target
    junctions, both suffixed with the reference code:

    * ``etcode_<code>``: ``'k.me'`` when the junction locus is present in the
      reference junctions, ``'u.me'`` otherwise;
    * ``gtcode_<code>``: the ``gtcode_<code>`` value that ``self.ex_tgt``
      holds for the junction's ``_gidx``, ``'u.me'`` when there is none.
    """
    stgt = self.cn_tgt.model('sj')
    self.sj_tgt = stgt
    sref = self.cn_ref.model('sj')
    self.sj_ref = sref

    for frame in (stgt, sref):
        if 'locus' not in frame.columns:
            frame['locus'] = UT.calc_locus_strand(frame)

    rcode = self.cn_ref.code
    setfld = 'etcode_' + rcode
    sgtfld = 'gtcode_' + rcode

    refloci = set(sref['locus'])
    stgt[setfld] = ['k.me' if x in refloci else 'u.me' for x in stgt['locus']]

    g2c = UT.df2dict(self.ex_tgt, '_gidx', sgtfld)
    stgt[sgtfld] = [g2c.get(g, 'u.me') for g in stgt['_gidx']]
def __call__(self):
    """Run ``filter_sj`` on every chromosome and concatenate the outputs.

    One ``filter_sj`` job per chromosome is run through ``UT.process_mp``;
    the per-chromosome ``.sjpath.<chrom>.filtered.bed.gz`` files are then
    appended, in chromosome order, to
    ``self.bwsjpre + '.sjpath.filtered.bed.gz'``.
    """
    chroms = UT.chroms(self.genome)
    csizedic = UT.df2dict(UT.chromdf(self.genome), 'chr', 'size')
    args = [(self.bwsjpre, self.statspath, c, csizedic[c], self.params)
            for c in chroms]
    rslts = UT.process_mp(filter_sj, args, np=self.np, doreduce=False)

    pieces = [self.bwsjpre + '.sjpath.{0}.filtered.bed.gz'.format(c)
              for c in chroms]
    dstpath = self.bwsjpre + '.sjpath.filtered.bed.gz'
    # byte-wise concatenation of the per-chromosome files
    with open(dstpath, 'wb') as dst:
        for srcpath in pieces:
            with open(srcpath, 'rb') as src:
                shutil.copyfileobj(src, dst)
def estimatecovs(modelpre, bwpre, dstpre, genome, tcovth=1, np=6):
    """Estimate coverages bundle by bundle in parallel and merge the results.

    The paths in ``modelpre + '.paths.withse.bed.gz'`` are reduced, per
    chromosome, to contiguous regions (``UT.union_contiguous``).  Those
    regions are grouped into batches of about 1000, each batch becoming one
    ``(chrom, st, ed)`` bundle padded by 100 bases on both sides and clipped
    to the chromosome.  Every bundle is processed by ``bundle_estimator``
    through ``UT.process_mp``; ``concatenate_bundles`` then combines them.

    Args:
        modelpre: path prefix of the model files.
        bwpre: passed unchanged to ``bundle_estimator``.
        dstpre: output prefix, passed to ``bundle_estimator`` and
            ``concatenate_bundles``.
        genome: genome identifier used to look up chromosome sizes.
        tcovth: passed unchanged to ``bundle_estimator``.
        np: number of processes handed to ``UT.process_mp``.
    """
    batch = 1000  # regions per bundle
    pad = 100     # bases added on each side of a bundle

    bed = GGB.read_bed(modelpre + '.paths.withse.bed.gz')
    csizedic = UT.df2dict(UT.chromdf(genome), 'chr', 'size')
    bundles = []
    args = []
    for chrom in bed['chr'].unique():
        onchr = bed[(bed['chr'] == chrom)]
        uc = UT.union_contiguous(onchr[['chr', 'st', 'ed']], returndf=True)
        last = len(uc) - 1
        nbatch = int(N.ceil(len(uc) / 1000.))
        for i in range(nbatch):
            first_i = batch * i
            # NOTE(review): the end index is batch*(i+1), which is also the
            # first index of the next batch, so consecutive bundles share one
            # region - confirm this overlap is intended.
            last_i = min(batch * (i + 1), last)
            st = max(uc.iloc[first_i]['st'] - pad, 0)
            ed = min(uc.iloc[last_i]['ed'] + pad, csizedic[chrom])
            args.append([modelpre, bwpre, chrom, st, ed, dstpre, tcovth])
            bundles.append((chrom, st, ed))
    rslts = UT.process_mp(bundle_estimator, args, np=np, doreduce=False)
    concatenate_bundles(bundles, dstpre)
def calc_many_specific(self, targetlevel, key2names, scoreth=None,
                       rdratioth=0.6):
    """Run ``calc_one_specific`` for several groups and stack the results.

    Args:
        targetlevel: name or cg1
        key2names: dict groupname (key) to names in targetlevel
        scoreth: when not None, keep only rows with ``score`` above it.
        rdratioth: when not None, keep rows where ``rd`` is above it (if
            ``gcov > gcov2``) or ``1 - rd`` is above it (otherwise).

    Returns:
        The concatenated table with ``region``, ``key`` and ``id`` columns
        in front of the ``calc_one_specific`` columns, after
        ``self.annotate`` has been applied.
    """
    tables = []
    for key, names in key2names.items():
        print('{0}...'.format(key))
        df = self.calc_one_specific(targetlevel, names)
        cols = list(df.columns)  # columns of the last table order the output
        if scoreth is not None:
            passed = df['score'] > scoreth
            df = df[passed].copy()
            print('scoreth{0}:{1}'.format(scoreth, len(df)))
        if rdratioth is not None:
            higher = df['gcov'] > df['gcov2']
            keep_hi = higher & (df['rd'] > rdratioth)
            keep_lo = (df['gcov'] <= df['gcov2']) & (
                (1 - df['rd']) > rdratioth)
            df = df[keep_hi | keep_lo].copy()
            print('rdratioth{0}:{1}'.format(rdratioth, len(df)))
        df['key'] = key
        df = df.sort_values('score', ascending=False)
        # rank within the group, best score first; id is "<key>.<rank>"
        df['rank'] = N.arange(len(df))
        df['id'] = df['key'] + '.' + df['rank'].astype(str)
        tables.append(df)

    merged = PD.concat(tables, ignore_index=True)
    g2cg1 = UT.df2dict(self.si, 'group', 'cg1')
    # keys without a cg1 entry are used as their own region
    merged['region'] = [g2cg1.get(k, k) for k in merged['key']]
    merged = merged[['region', 'key', 'id'] + cols]
    return self.annotate(merged)
def gtf_from_bed12(modelpre, dstpath=None, source='.'):
    """Convert the BED12 paths of a model into a GTF of exon records.

    Reads ``modelpre + '.paths.withse.bed.gz'`` and ``modelpre + '.ex.txt.gz'``,
    assigns each path the gene name of the exon whose ``chr:name`` equals the
    path's ``chr`` plus the first ``|``-separated item of its name, numbers
    the transcripts of each gene (``<gname>.<n>``, starting at 1) and emits
    one GTF ``exon`` row per BED12 block.  Also writes an id map to
    ``modelpre + '.idmap.txt.gz'``.

    Args:
        modelpre: path prefix of the model files.
        dstpath: GTF path to write; defaults to
            ``modelpre + '.paths.withse.gtf.gz'``.
        source: value of the GTF source column.

    Returns:
        The GTF DataFrame (columns ``GGB.GTFCOLS``).
    """
    bedpath = modelpre + '.paths.withse.bed.gz'
    paths = GGB.read_bed(bedpath)
    ex = UT.read_pandas(modelpre + '.ex.txt.gz')
    # The same st,ed (hence name) may occur on different chromosomes, so the
    # lookup key includes the chromosome.
    ex['id'] = ex['chr'] + ':' + ex['name']
    n2gn = UT.df2dict(ex, 'id', 'gname')
    paths['id'] = paths['chr'] + ':' + paths['name']
    paths['id0'] = paths['chr'] + ':' + paths['name'].str.split('|').str[0]
    paths['gname'] = [n2gn[x] for x in paths['id0']]

    # transcript names: running 1-based counter per gene
    g2cnt = {}
    tnames = []
    for gname in paths['gname']:
        i = g2cnt.get(gname, 1)
        tnames.append('{0}.{1}'.format(gname, i))
        g2cnt[gname] = i + 1
    paths['tname'] = tnames

    txt = 'gene_id "{0}"; transcript_id "{1}"; exon_number "{2}";'

    def _gen():
        cols = ['chr', 'st', 'ed', 'gname', 'tname', 'esizes', 'estarts',
                'strand']
        for c, s, e, gn, tn, esi, est, strand in paths[cols].values:
            # both lists end with a trailing comma, hence [:-1]
            esizes = [int(x) for x in esi.split(',')[:-1]]
            estarts = [int(x) for x in est.split(',')[:-1]]
            for i, (size, offset) in enumerate(zip(esizes, estarts)):
                exst = s + offset
                exed = exst + size
                extra = txt.format(gn, tn, i + 1)
                # start is written as exst + 1, end as exed
                yield (c, source, 'exon', exst + 1, exed, '.', strand, '.',
                       extra)

    df = PD.DataFrame(list(_gen()), columns=GGB.GTFCOLS)
    if dstpath is None:
        # The previous default was derived from an undefined name (bedpath)
        # and raised NameError; build it from the BED path read above.
        dstpath = modelpre + '.paths.withse.gtf.gz'
    GGB.write_gtf(df, dstpath)

    idf = paths[['id', 'chr', 'name', 'tname', 'gname']]
    UT.write_pandas(idf, modelpre + '.idmap.txt.gz', 'h')
    return df
def prep_sjex(self, en, np=1, savesjex=True, calccovs=True):
    """ Assign ecov, gcov, jcnt

    Loads the 'sj' and 'ex' models of ``en`` for ``self.datacode`` and fills
    in, when missing: ``len`` on both tables, the exon/gene coverage columns
    on ``ex`` and the unique/multi/junction count columns on ``sj``.

    Args:
        en: object providing ``model``, ``modelpath``, ``fname2``, ``code``
            and ``savemodel``.
        np: passed to ``CC.calc_ecov`` / ``CC.calc_gcov``.
        savesjex: when true, tables that were changed are saved through
            ``en.savemodel(..., category='output')``.
        calccovs: when true, coverages are calculated with ``CC``; otherwise
            missing coverage columns are filled with 0.
    """
    dcode = self.datacode
    sj = en.model('sj', dcode)
    ex = en.model('ex', dcode)
    # set to True whenever the corresponding table gets a new column
    savesj = False
    saveex = False
    # check support
    # keep only junctions whose acceptor and donor ids both occur in ex
    if len(sj) > 0:
        dids = set(ex['d_id'].values)
        aids = set(ex['a_id'].values)
        idx = sj['a_id'].isin(aids) & sj['d_id'].isin(dids)
        sj = sj[idx].copy()
        en.sj = sj
    if '_id' not in ex.columns:  # edge case (len(sj)==0)
        ex['_id'] = N.arange(len(ex))
    if '_gidx' not in ex.columns:  # edge case (len(sj)==0)
        ex['_gidx'] = N.arange(len(ex))
    # length
    if 'len' not in sj.columns:
        sj['len'] = sj['ed'] - sj['st']
        savesj = True
    if 'len' not in ex.columns:
        ex['len'] = ex['ed'] - ex['st']
        saveex = True
    # ecov
    if calccovs:
        print('calccov for {0}'.format(en.code))
        ecovname = self.colname('ecov')
        if ecovname not in ex.columns:
            ecov = CC.calc_ecov(
                expath=en.modelpath('ex'),
                cipath=en.modelpath('ci'),
                bwpath=self.bigwig,
                dstprefix=en.fname2(
                    '', self.datacode),  # cov is data dependent
                override=False,  # override previous?
                np=np)
            # align the result to ex by exon id
            ex[ecovname] = ecov.set_index('eid').ix[
                ex['_id'].values]['ecov'].values
            saveex = True
        # gcov, glen
        gcovname = self.colname('gcov')
        if gcovname not in ex.columns:
            gcov = CC.calc_gcov(
                expath=en.modelpath('ex'),
                cipath=en.modelpath('ci'),
                bwpath=self.bigwig,
                dstprefix=en.fname2('', self.datacode),
                override=False,  # reuse covci from ecov calc
                np=np)
            # align the result to ex by gene index
            tmp = gcov.set_index('_gidx').ix[ex['_gidx'].values]
            ex[gcovname] = tmp['gcov'].values
            if 'glen' in tmp:
                ex['glen'] = tmp[
                    'glen'].values  # glen is only dependent on model not data
            saveex = True
    else:
        # coverage calculation skipped: missing columns become 0
        # (saveex is not set in this branch)
        ecovname = self.colname('ecov')
        if ecovname not in ex.columns:
            ex[ecovname] = 0
        gcovname = self.colname('gcov')
        if gcovname not in ex.columns:
            ex[gcovname] = 0
    # sjcnt
    ucntname = self.colname('ucnt')
    mcntname = self.colname('mcnt')
    jcntname = self.colname('jcnt')
    sjfile = self.sjfile
    if ucntname not in sj.columns:
        if sjfile.endswith('.bed') or sjfile.endswith(
                '.bed.gz'):  # no header
            dsj = UT.read_pandas(sjfile, names=[
                'chr', 'st', 'ed', 'name', 'ucnt', 'strand', 'mcnt'
            ])
        else:  # assume txt file with header
            dsj = UT.read_pandas(sjfile)
        # locus based matching
        dsj['locus'] = UT.calc_locus_strand(dsj)
        sj['locus'] = UT.calc_locus_strand(sj)
        l2u = UT.df2dict(dsj, 'locus', 'ucnt')
        l2m = UT.df2dict(dsj, 'locus', 'mcnt')
        # loci absent from the data file count as 0
        sj[ucntname] = [l2u.get(x, 0) for x in sj['locus']]
        sj[mcntname] = [l2m.get(x, 0) for x in sj['locus']]
        # jcnt is ucnt when that is non-zero, otherwise mcnt (not the sum)
        sj[jcntname] = [x or y for x, y in sj[[ucntname, mcntname]].values]
        savesj = True
    if saveex and savesjex:
        en.savemodel('ex', dcode, category='output')
    if savesj and savesjex:
        en.savemodel('sj', dcode, category='output')
def calc_stats(self):
    """Compute detection statistics for each exon category and for junctions.

    For every key in ``['i', '5', '3', 's', 'j', '5b', '3b', 'sb']`` the
    matched elements in ``self.e[key]`` are counted against the two
    populations and a per-element detection value ``y`` is related to a
    log2 abundance ``x`` (coverage for exons, junction count for 'j').

    Results are stored in:
        ``self.closest[key]``: closest match per ``_id`` (exon keys only).
        ``self.ratios[key]``: DataFrame with columns ``x``, ``y``, ``name``.
        ``self.stats[key]``: dict with ``detected1``, ``matched``,
            ``detected2``, ``p1``, ``p2``, ``auc``, ``maxx``, ``avgy``, ``xth``.
    """
    ecovname = self.colname('ecov')
    jcntname = self.colname('jcnt')
    jhitname = self.colname2('jhit', self.en2.code)

    def _findclosest(e, which):
        # per _id keep the match whose length differs least (adds the
        # columns dlen and ratio to e in place)
        e['dlen'] = N.abs(e['len'] - e['b_len'].astype(float))
        e['ratio'] = e['b_len'].astype(float) / e['len']
        e = e.sort_values(['_id', 'dlen'], ascending=True)
        f = e.groupby('_id', sort=False).first().reset_index()
        self.closest[which] = f
        return f

    def _count(dw, da1, da2, which):
        # Only elements with reads count for population 1 and the matches;
        # population 2 is taken as is.
        if which != 'j':
            da1 = da1[da1[ecovname] > 0]
            dw = dw[dw[ecovname] > 0]
        else:
            da1 = da1[da1[jcntname] > 0]
            dw = dw[dw[jcntname] > 0]
        pop = set(da1['_id'].values)
        hit = set(dw['_id'].values)
        pop2 = set(da2['_id'].values)
        if len(pop) == 0:
            LOG.warning('no elements in {0} for population1'.format(
                self.abbr[which]))
        if len(pop2) == 0:
            LOG.warning('no elements in {0} for population2'.format(
                self.abbr[which]))
        if len(hit) == 0:
            LOG.warning('no elements in {0} for match'.format(
                self.abbr[which]))
        np1, nh, np2 = len(pop), len(hit), len(pop2)
        r1 = float(nh) / max(1, np1)
        r2 = float(nh) / max(1, np2)
        LOG.info(
            '[{5}] detected1:{0},\tmatched:{1},\t(detected2:{2}),\tratio:{3:.2f},\t(ratio2:{4:.2f})'
            .format(np1, nh, np2, r1, r2, which))
        return nh, np1, np2

    for which in ['i', '5', '3', 's', 'j', '5b', '3b', 'sb']:
        LOG.debug(which + '=' * 10)
        cn = 'hit{0}'.format(which)
        if which != 'j':
            e1, e2 = self.e1, self.e2
            # reference exons of this category (first letter of the key)
            ea1 = e1[(e1['cat'] == which[0])][['_id', ecovname,
                                               'name']].copy()
            if len(which) == 1:
                ea2 = e2[(e2['cat'] == which[0])]
            else:  # 'b' keys: all exons allowed
                ea2 = e2
            ew = self.e[which]  # matched exons
            hit, pop, pop2 = _count(ew, ea1, ea2, which)
            ew2 = _findclosest(ew, which)
            # length ratio of the closest match, 0 when unmatched
            i2r = UT.df2dict(ew2, '_id', 'ratio')
            ea1[cn] = [i2r.get(x, 0) for x in ea1['_id']]
            ea1 = ea1.set_index('_id')
            x = N.log2(ea1[ecovname] + 1)  # log coverage
            y = ea1[cn]
            ns = ea1['name']
        else:
            sa = self.s1
            hit, pop, pop2 = _count(self.e['j'], sa, self.s2, which)
            # comparison is False for NaN, so NaN becomes 0
            sa[cn] = [1 if x > 0 else 0 for x in sa[jhitname]]
            sa = sa.set_index('_id')
            x = N.log2(sa[jcntname] + 1)
            y = sa[cn]
            ns = sa['name']

        # only consider ones detected in the reference (en1)
        idx2 = x > 0
        x2 = x[idx2].values
        # binary detection indicator (ratio>0)
        y4 = N.array(y[idx2] > 0, dtype=int)
        try:
            _, _, xth = UT.fit_sigmoid(x2, y4, (0, 5), 0.99)
        except Exception:
            # A failed fit is not fatal: record "no threshold".  Catching
            # Exception (not a bare except) lets Ctrl-C / SystemExit through.
            xth = N.nan
        auc4, maxx4, avgy4, x4, y4 = self._calc_binned(
            x2, y4, self.binsize)
        p1 = float(hit) / pop if pop > 0 else 0.
        p2 = float(hit) / pop2 if pop2 > 0 else 0.
        self.ratios[which] = PD.DataFrame({'x': x, 'y': y, 'name': ns})
        self.stats[which] = {
            'detected1': pop,  # int
            'matched': hit,  # int
            'detected2': pop2,  # int
            'p1': p1,  # float
            'p2': p2,  # float
            'auc': auc4,  # float
            'maxx': list(maxx4),  # list
            'avgy': list(avgy4),  # list
            'xth': xth,  # float
        }
def __call__(self):
    """Run the per-chromosome preparation tasks and their merge steps.

    For every chromosome three tasks are queued on a ``TQ.Server``:
    ``prep_exwig_chr``, ``prep_sjwig_chr`` and ``prep_sjpath_chr``.  When
    all chromosomes of one kind have reported back, the matching merge task
    (``prep_exbw``, ``prep_sjbw`` or ``prep_sjpath``) is queued.  The loop
    ends when all three merge tasks are done, or when
    ``server.check_error()`` returns a false value.

    Per-chromosome results are kept in ``self.exstatus``, ``self.sjstatus``
    and ``self.pastatus``, keyed by chromosome.
    """
    # exdf => ex.p, ex.n, ex.u
    # sjdf => sj.p, sj.n, sj.u
    # paths => sjpath.bed
    # divide into tasks (exdf,sjdf,paths) x chroms
    self.server = server = TQ.Server(name='PrepBWSJ', np=self.np)
    self.chroms = chroms = UT.chroms(self.genome)
    csizes = UT.df2dict(UT.chromdf(self.genome), 'chr', 'size')
    self.exstatus = exstatus = {}
    self.sjstatus = sjstatus = {}
    self.pastatus = pastatus = {}
    exdone = False
    sjdone = False
    padone = False
    with server:
        for chrom in chroms:
            # exon wiggle task
            tname = 'prep_exwig_chr.{0}'.format(chrom)
            args = (self.j2pres, self.libsizes, self.dstpre, chrom,
                    csizes[chrom])
            server.add_task(TQ.Task(tname, prep_exwig_chr, args))
            # junction wiggle task
            tname = 'prep_sjwig_chr.{0}'.format(chrom)
            args = (self.j2pres, self.libsizes, self.dstpre, chrom,
                    csizes[chrom])
            server.add_task(TQ.Task(tname, prep_sjwig_chr, args))
            # junction path task
            tname = 'prep_sjpath_chr.{0}'.format(chrom)
            args = (self.j2pres, self.libsizes, self.dstpre, chrom)
            server.add_task(TQ.Task(tname, prep_sjpath_chr, args))
        while server.check_error():
            try:
                # wait up to 5 seconds for the next result
                name, rslt = server.get_result(timeout=5)
            except TQ.Empty:
                name, rslt = None, None
            if name is not None:
                # Task names are '<prefix>.<chrom>'.  Split only on the first
                # dot so that chromosome names which themselves contain a dot
                # are recovered in full; truncated names could collide and
                # the "all finished" counts below would never be reached.
                if name.startswith('prep_exwig_chr.'):
                    chrom = name.split('.', 1)[1]
                    exstatus[chrom] = rslt
                    if len(exstatus) == len(chroms):  # all finished
                        print('$$$$$$$$ putting in prep_exbw $$$$$$$$$$$')
                        tname = 'prep_exbw'
                        args = (self.dstpre, chroms, self.genome)
                        server.add_task(TQ.Task(tname, prep_exbw, args))
                if name.startswith('prep_sjwig_chr.'):
                    chrom = name.split('.', 1)[1]
                    sjstatus[chrom] = rslt
                    if len(sjstatus) == len(chroms):  # all finished
                        print('$$$$$$$$ putting in prep_sjbw $$$$$$$$$$$')
                        tname = 'prep_sjbw'
                        args = (self.dstpre, chroms, self.genome)
                        server.add_task(TQ.Task(tname, prep_sjbw, args))
                if name.startswith('prep_sjpath_chr.'):
                    chrom = name.split('.', 1)[1]
                    pastatus[chrom] = rslt
                    if len(pastatus) == len(chroms):  # all finished
                        print('$$$$$$$$ putting in prep_sjpath $$$$$$$$$$$')
                        tname = 'prep_sjpath'
                        args = (self.dstpre, chroms)
                        server.add_task(TQ.Task(tname, prep_sjpath, args))
                if name == 'prep_exbw':
                    print('$$$$$$$$ prep_exbw done $$$$$$$$$$$')
                    exdone = True
                if name == 'prep_sjbw':
                    print('$$$$$$$$ prep_sjbw done $$$$$$$$$$$')
                    sjdone = True
                if name == 'prep_sjpath':
                    print('$$$$$$$$ prep_sjpath done $$$$$$$$$$$')
                    padone = True
            if exdone and sjdone and padone:
                break
        print('Exit Loop')
    print('Done')
def calc_completeness(self):
    """Completeness measures how much of the reference gene structure is recovered.

    1. GLC: gene length completeness = max(ratio of gene length covered by overlapping target gene)
    2. ECC: exon count completeness = max(ratio of overlapping exon counts)
    3. JCC: junction count completeness = max(ratio of overlapping junction counts)

    For each measure a DataFrame with columns ``x`` (log2(gene coverage + 1))
    and ``y`` (the ratio) is stored in ``self.ratios[<key>]`` and a summary
    dict (``p1``, ``auc``, ``maxx``, ``avgy``, ``xth``) in
    ``self.stats[<key>]``, with keys 'glc', 'ecc' and 'jcc'.

    Side effects: adds a ``b__gidx`` column to ``self.s1`` and, when missing,
    a ``_gidx`` column to ``self.s2``.
    """
    ov = self.ov  # all
    if self.exclude_se_from_completeness:
        ov = ov[ov['cat'] != 's']
    # actual overlap with correct strand
    ov2 = ov[(ov['b__gidx'] != '.') & (
        (ov['strand'] == ov['b_strand']) | (ov['b_strand'] == '.'))]
    if self.exclude_se_from_completeness:
        ov2 = ov2[ov2['b_cat'] != 's']
    gcovname = self.colname('gcov')
    g2gcov = UT.df2dict(self.e1, '_gidx', gcovname)
    xlim = [0, 6]  # passed to UT.fit_sigmoid for all three measures
    # GLC
    g1 = ov.groupby('_gidx')
    # span (max ed - min st) per gene
    glc = (g1['ed'].max() - g1['st'].min()).to_frame('glen')
    g2 = ov2.groupby(['_gidx', 'b__gidx'])
    # same span per (gene, overlapping gene) pair, then the best pair per gene
    gl2 = (g2['ed'].max() - g2['st'].min()).to_frame('b_glen').reset_index()
    gl2 = gl2.groupby('_gidx')['b_glen'].max()
    g2gl2 = UT.series2dict(gl2)
    glc['b_glen'] = [g2gl2.get(x, 0) for x in glc.index]
    glc['y'] = glc['b_glen'] / glc['glen']
    glc['x'] = N.log2(N.array([g2gcov[x] for x in glc.index]) + 1.)
    self.ratios['glc'] = glc[['x', 'y']]
    x, y = glc['x'].values, glc['y'].values
    x2, y2, xth = UT.fit_sigmoid(x, y, xlim, 0.99)
    auc, maxx, avgy, x, y = self._calc_binned(x, y, self.binsize)
    self.stats['glc'] = {
        'p1': N.sum(glc['b_glen'] > 0) /
        float(len(glc)),  # float ratio detected
        'auc': auc,  # float
        'maxx': list(maxx),  # list
        'avgy': list(avgy),  # list
        'xth': xth,  # float
    }
    # ECC
    # number of distinct exon ids per gene
    ecc = ov.groupby([
        '_gidx', '_id'
    ]).first().reset_index().groupby('_gidx').size().to_frame('#exons')
    # distinct exon ids per (gene, overlapping gene) pair, best pair per gene
    ec2 = ov2.groupby(['_gidx', 'b__gidx', '_id']).first().reset_index()
    ec2 = ec2.groupby(['_gidx',
                       'b__gidx']).size().to_frame('ec').reset_index()
    ec2 = ec2.groupby('_gidx')['ec'].max()
    g2ec2 = UT.series2dict(ec2)
    ecc['b_#exons'] = [g2ec2.get(x, 0) for x in ecc.index]
    ecc['y'] = ecc['b_#exons'] / ecc['#exons']
    ecc['x'] = N.log2(N.array([g2gcov[x] for x in ecc.index]) + 1.)
    self.ratios['ecc'] = ecc[['x', 'y']]
    x, y = ecc['x'].values, ecc['y'].values
    x2, y2, xth = UT.fit_sigmoid(x, y, xlim, 0.99)
    auc, maxx, avgy, x, y = self._calc_binned(x, y, self.binsize)
    self.stats['ecc'] = {
        'p1': N.sum(ecc['b_#exons'] > 0) / float(len(ecc)),
        'auc': auc,
        'maxx': list(maxx),
        'avgy': list(avgy),
        'xth': xth
    }
    # JCC
    s1 = self.s1
    jcc = s1.groupby('_gidx').size().to_frame('jc')
    if '_gidx' not in self.s2:
        # adapt to old version where sj.txt.gz did not contain _gidx
        # (gene index looked up by acceptor id, then donor id, else 0)
        a2g = UT.df2dict(self.e2, 'a_id', '_gidx')
        d2g = UT.df2dict(self.e2, 'd_id', '_gidx')
        self.s2['_gidx'] = [
            a2g.get(x, d2g.get(y, 0))
            for x, y in self.s2[['a_id', 'd_id']].values
        ]
    # junctions are matched by identical locus
    l2g2 = UT.df2dict(self.s2, 'locus', '_gidx')
    s1['b__gidx'] = [l2g2.get(x, '.') for x in s1['locus'].values]
    s1o = s1[s1['b__gidx'] != '.']  # overlapping
    jc2 = s1o.groupby(['_gidx',
                       'b__gidx']).size().to_frame('jc2').reset_index()
    jc2 = jc2.groupby('_gidx')['jc2'].max()
    g2jc2 = UT.series2dict(jc2)
    jcc['b_jc'] = [g2jc2.get(x, 0) for x in jcc.index]
    jcc['y'] = jcc['b_jc'] / jcc['jc']
    jcc['x'] = N.log2(N.array([g2gcov[x] for x in jcc.index]) + 1.)
    self.ratios['jcc'] = jcc[['x', 'y']]
    x, y = jcc['x'].values, jcc['y'].values
    x2, y2, xth = UT.fit_sigmoid(x, y, xlim, 0.99)
    auc, maxx, avgy, x, y = self._calc_binned(x, y, self.binsize)
    self.stats['jcc'] = {
        'p1': N.sum(jcc['b_jc'] > 0) / float(len(jcc)),
        'auc': auc,
        'maxx': list(maxx),
        'avgy': list(avgy),
        'xth': xth
    }
def find_match(self):
    """Find exons and junctions of ``en1`` that are matched in ``en2``.

    Exons of both models are written to temporary BED files and intersected
    with ``BT.bedtoolintersect(..., wao=True)``.  The overlap table is kept
    in ``self.ov`` and split into match tables per category:

    * ``ei``: internal exons, same chr/strand/st/ed and category;
    * ``e5`` / ``e3``: 5' / 3' exons sharing the donor / acceptor side and
      the category;
    * ``es``: single exons overlapping a single exon;
    * ``e5b`` / ``e3b`` / ``esb``: as above without the category requirement.

    Junctions of ``en1`` (``self.s1``) receive the count of the junction with
    the same locus in ``en2``; those with a positive count form ``self.sj``.
    All tables are also collected in ``self.e``.
    """
    en1 = self.en1
    en2 = self.en2
    # en1 may be shared as a reference between parallel comparisons, so its
    # temporary file names carry the en2 code to stay unique
    path1 = en1.fname2('emtmp.ex.bed.gz', en2.code)
    path2 = en2.fname('emtmp.ex.bed.gz')
    pathov = en1.fname2('emtmp.ex.ovl.txt.gz', en2.code)

    self.e1 = e1 = en1.model('ex')
    self.e2 = e2 = en2.model('ex')
    ecovname = self.colname('ecov')
    cols = [
        'chr', 'st', 'ed', 'cat', '_id', ecovname, '_gidx', 'len', 'strand'
    ]
    path1 = UT.write_pandas(e1[cols], path1, '')
    path2 = UT.write_pandas(e2[cols], path2, '')
    pathov = BT.bedtoolintersect(path1, path2, pathov, wao=True)
    ocols = cols + ['b_' + x for x in cols] + ['ovl']
    self.ov = ov = UT.read_pandas(pathov, names=ocols)  # overlaps of exons

    # elementary comparison masks
    idxchr = ov['chr'] == ov['b_chr']
    idxstrand = ov['strand'] == ov['b_strand']
    idxp = (ov['strand'] == '+') & idxstrand
    idxn = (ov['strand'] == '-') & idxstrand
    idxst = ov['st'] == ov['b_st']
    idxed = ov['ed'] == ov['b_ed']
    idxcat = ov['cat'] == ov['b_cat']
    idxcov = ov[ecovname] > 0  # exons with reads

    LOG.debug(
        '=' * 10 +
        'calculating match between {0} and {1}'.format(en1.code, en2.code))
    LOG.debug('len(ov):{0}'.format(len(ov)))
    named_masks = [
        ('idxchr', idxchr), ('idxstrand', idxstrand), ('idxp', idxp),
        ('idxn', idxn), ('idxst', idxst), ('idxed', idxed),
        ('idxcat', idxcat), ('idxcov', idxcov),
    ]
    for label, mask in named_masks:
        LOG.debug('#{0}:{1}'.format(label, N.sum(mask)))

    # donor side: (+, ed) or (-, st); acceptor side: (-, ed) or (+, st)
    donor = (idxp & idxed) | (idxn & idxst)
    acceptor = (idxn & idxed) | (idxp & idxst)
    is_i = ov['cat'] == 'i'
    is_5 = ov['cat'] == '5'
    is_3 = ov['cat'] == '3'
    is_s = ov['cat'] == 's'

    # same category required
    self.ei = ei = ov[idxchr & idxstrand & idxst & idxed & idxcat
                      & is_i].copy()
    self.e5 = e5 = ov[idxchr & donor & idxcat & is_5].copy()
    self.e3 = e3 = ov[idxchr & acceptor & idxcat & is_3].copy()
    self.es = es = ov[idxchr & is_s & idxcat].copy()
    # overlap with other categories allowed
    self.e5b = e5b = ov[idxchr & donor & is_5].copy()
    self.e3b = e3b = ov[idxchr & acceptor & is_3].copy()
    self.esb = esb = ov[idxchr & is_s].copy()

    # splice junction
    self.s1 = s1 = en1.model('sj')
    self.s2 = s2 = en2.model('sj')
    jcntname = self.colname('jcnt')
    jhitname = self.colname2('jhit', en2.code)
    locus2cnt = UT.df2dict(s2, 'locus', jcntname)
    # count of the s2 junction at the same locus, 0 when there is none
    s1[jhitname] = [locus2cnt.get(x, 0) for x in s1['locus']]
    self.sj = sj = s1[s1[jhitname] > 0].copy()  # only s2 count > 0

    # for batch processing
    self.e = {
        'i': ei,
        '5': e5,
        '3': e3,
        's': es,
        'j': sj,
        '5b': e5b,
        '3b': e3b,
        'sb': esb
    }
def _process_mapbed_chr(dstpre, chrom, genome, chromdir, stranded):
    """Turn the mapped-read BED of one chromosome into wiggles and sj paths.

    Reads ``dstpre + '.<chrom>.bed'`` (7 columns: chr, st, ed, name, mapq,
    strand, mapid) and writes

    * exon ('.ex') and junction ('.sj') coverage wiggles, per strand
      ('.p', '.n', '.u'), for all reads ('') and non-duplicated reads
      ('.uniq'): ``dstpre + kind + suf + strand + '.<chrom>.wig'``;
    * the junction paths as BED12: ``dstpre + '.<chrom>.sjpath.bed'``.

    Consecutive records with the same mapid are segments of one spliced
    read; the gap between two segments is a junction whose strand is looked
    up in ``STED2STRAND`` from the two bases at each end of the gap.

    Args:
        dstpre: path prefix of the input BED, the dupitems table and outputs.
        chrom: chromosome to process.
        genome: genome identifier used to look up the chromosome size.
        chromdir: directory handed to ``FA.GenomeFASTAChroms``.
        stranded: second element of the ``STRANDMAP`` key for exon coverage.
    """
    bedpath = dstpre+'.{0}.bed'.format(chrom)
    # names of reads listed in the dupitems table
    dupids = UT.read_pandas(dstpre+'.dupitems.txt.gz', index_col=[0]).index
    gfc = FA.GenomeFASTAChroms(chromdir)
    chromsize = UT.df2dict(UT.chromdf(genome), 'chr', 'size')[chrom]

    # one coverage array per kind/strand/suffix; previous outputs are removed
    wigs = {}
    wigpaths = {}
    for kind in ['.ex','.sj']:
        wigs[kind] = {}
        wigpaths[kind] = {}
        for strand in ['.p','.n','.u']:
            wigs[kind][strand] = {}
            wigpaths[kind][strand] = {}
            for suf in ['','.uniq']:
                wigpath = dstpre+kind+suf+strand+'.{0}.wig'.format(chrom)
                if os.path.exists(wigpath):
                    os.unlink(wigpath)
                wigpaths[kind][strand][suf] = wigpath
                wigs[kind][strand][suf] = N.zeros(chromsize, dtype=float)

    sjs = []
    # path: (chr, st, ed, pcode, ucnt, strand, acnt)
    # pcode = a(apos)d(dpos) = a(ed)d(st) if strand=='+' else a(st)d(ed)
    # ucnt = unique read counts
    # acnt = multi-read adjusted all counts (=ucnt+Sum(mcnt(i)/dup(i)))
    # delete previous
    sjbed12 = dstpre+'.{0}.sjpath.bed'.format(chrom)
    if os.path.exists(sjbed12):
        os.unlink(sjbed12)

    def _write_arrays():
        # dump every coverage array to its wiggle file
        for kind in ['.ex','.sj']:
            for strand in ['.p','.n','.u']:
                for suf in ['','.uniq']:
                    cybw.array2wiggle_chr64(wigs[kind][strand][suf], chrom,
                                            wigpaths[kind][strand][suf], 'w')

    def _write_sj(sjs):
        # collapse identical paths (same name) and write them as BED12
        # sjs = [(chr,st,ed,pathcode(name),ureads(sc1),strand,tst,ted,areads(sc2),cse),...]
        sjdf = PD.DataFrame(sjs, columns=GGB.BEDCOLS[:9]+['cse'])
        sjdfgr = sjdf.groupby('name')
        sj = sjdfgr.first()
        sj['sc1'] = sjdfgr['sc1'].sum().astype(int)  # ucnt
        sj['sc2'] = sjdfgr['sc2'].sum().astype(int)  # jcnt=ucnt+mcnt
        sj['st'] = sjdfgr['st'].min()
        sj['ed'] = sjdfgr['ed'].max()
        sj['#exons'] = sj['cse'].apply(len)+1
        # block starts/ends relative to st, derived from the junction list
        sj['ests'] = [[0]+[z[1]-st for z in cse]
                      for st,cse in sj[['st','cse']].values]
        sj['eeds'] = [[z[0]-st for z in cse]+[ed-st]
                      for st,ed,cse in sj[['st','ed','cse']].values]
        esizes = [[u-v for u,v in zip(x,y)]
                  for x,y in sj[['eeds','ests']].values]
        sj['estarts'] = ['{0},'.format(','.join([str(y) for y in x]))
                         for x in sj['ests']]
        sj['esizes'] = ['{0},'.format(','.join([str(y) for y in x]))
                        for x in esizes]
        sj['name'] = sj.index
        with open(sjbed12, 'w') as f:
            sj[GGB.BEDCOLS].to_csv(f, index=False, header=False, sep='\t',
                                   quoting=csv.QUOTE_NONE)

    def _append_sj(cse, css, csj, chrom, ureads, areads):
        # record the finished read if it had at least one junction
        if (len(cse)>0):
            # chr,st,ed,pathcode,ureads,strand,tst,ted,areads
            tst = cse[0][0]
            ted = cse[-1][1]
            if len(css)>0:
                # majority vote over the stranded junctions of the read
                strand = Counter(css).most_common()[0][0]
            else:
                strand = '.'
            name = pathcode(cse, strand)
            st = int(csj[0][1])  # first segment start
            ed = int(csj[-1][2])  # last segment end
            sjs.append((chrom,st,ed,name,ureads,strand,tst,ted,areads,cse))

    def _add_to_ex_arrays(st, ed, dup, strand):
        kind='.ex'
        strand = STRANDMAP[(strand,stranded)]
        dic = wigs[kind][strand]
        dic[''][st:ed] += 1
        if not dup:
            dic['.uniq'][st:ed] += 1

    def _add_to_sj_arrays(sst, sed, dup, strand):
        kind='.sj'
        s = {'+':'.p','-':'.n','.':'.u'}[strand]
        dic = wigs[kind][s]
        # add to the arrays
        dic[''][sst:sed] += 1
        if not dup:
            dic['.uniq'][sst:sed] += 1
            ureads,areads = 1,1
        else:
            ureads,areads = 0,1
        return ureads,areads

    csj = []  # current collection of spliced reads
    css = []  # current strands
    cse = []  # current (sst,sed)
    ureads,areads = 1,1  # uniq, total reads it's either 1,1 or 0,1
    pmid = None  # previous map id common to spliced segments
    # The file used to be opened without ever being closed; the context
    # manager releases the handle even when parsing fails.
    with open(bedpath, 'rb') as fp:
        for line in fp:
            rec = line.strip().split(b'\t')
            # 7 column bed: chr(0), st(1), ed(2), name(3), mapq(4), strand(5), mapid(6)
            st,ed = int(rec[1]),int(rec[2])
            # NOTE(review): rec[3] and rec[5] are bytes here; confirm that
            # dupids and the STRANDMAP keys hold bytes too, otherwise dup is
            # always False.
            dup = rec[3] in dupids
            estrand = rec[5]
            _add_to_ex_arrays(st,ed,dup,estrand)
            # process splice
            if pmid != rec[6]:  # new map
                _append_sj(cse, css, csj, chrom, ureads, areads)
                csj,css,cse = [rec],[],[]  # reset running params
            else:  # add segments
                csj.append(rec)
                prec = csj[-2]  # previous rec
                sst = int(prec[2])  # ed of previous segment
                sed = int(rec[1])  # st of current segment
                cse.append((sst,sed))
                # find strand
                sted = gfc.get(chrom,sst,sst+2)+gfc.get(chrom,sed-2,sed)
                strand = STED2STRAND.get(sted,'.')
                if strand != '.':
                    css.append(strand)
                ureads,areads = _add_to_sj_arrays(sst,sed,dup,strand)
            pmid = rec[6]
    # flush the last read, then write everything out
    _append_sj(cse, css, csj, chrom, ureads, areads)
    _write_arrays()
    _write_sj(sjs)
def count_repeats_viz_mp(beddf, rmskvizpath, idcol='_id', np=3, prefix=None,
                         expand=0, col='repnames'):
    """Use rmsk-viz track and check each (unioned) exon overlaps with repeats
    and report repeat name(s). Uses Bedtools and calculates chromosome-wise.

    Args:
        beddf: Pandas DataFrame with chr,st,ed cols, when calculating repeats
            bp for genes, unioned bed should be used (use utils.make_unionex)
        rmskvizpath: path to repeat masker viz BED7 file (created using
            rmskviz2bed7)
        idcol: colname for unique row id (default _id)
        np: number of CPU to use
        prefix: path prefix for temp file, if not None temp files are kept.
            (default None)
        expand: how many bases to expand exon region in each side (default 0)
        col: column name to put in overlapping repeat names (if multiple
            comma separated)

    Outputs:
        are put into beddf columns with colname col (default repnames);
        beddf is also returned.
    """
    cleanup = prefix is None
    if cleanup:
        # unique temp prefix next to the rmsk file
        prefix = os.path.join(os.path.dirname(rmskvizpath),
                              str(uuid.uuid4()) + '_')

    # chrom-wise
    chroms = sorted(beddf['chr'].unique())

    def _rmskpath(chrom):
        # per-chromosome rmsk file, reused across calls
        return rmskvizpath + '.{0}.bed.gz'.format(chrom)

    # the rmsk file has to be split when any per-chromosome file is missing
    splitrmsk = any(not os.path.exists(_rmskpath(c)) for c in chroms)
    if splitrmsk:
        rmsk = GGB.read_bed(rmskvizpath)

    args = []
    bfiles = []
    ofiles = []
    for chrom in chroms:
        bpath = prefix + 'tgt.{0}.bed'.format(chrom)  # don't compress
        rpath = _rmskpath(chrom)
        bchr = beddf[beddf['chr'] == chrom]
        if expand > 0:
            bchr = bchr.copy()
            bchr['st'] = bchr['st'] - expand
            bchr['ed'] = bchr['ed'] + expand
            bchr.loc[bchr['st'] < 0, 'st'] = 0  # clip at chromosome start
        UT.write_pandas(bchr[['chr', 'st', 'ed', idcol]], bpath, '')
        bfiles.append(bpath)
        if splitrmsk:
            rchr = rmsk[rmsk['chr'] == chrom]
            UT.write_pandas(rchr[['chr', 'st', 'ed', 'name', 'strand']],
                            rpath, '')
        opath = prefix + 'out.{0}.bed'.format(chrom)
        ofiles.append(opath)
        args.append([bpath, rpath, opath])
    rslts = UT.process_mp(count_repeats_viz_chr, args, np=np, doreduce=False)

    # gather outputs
    outcols = ['name', 'repnames']
    merged = PD.concat([UT.read_pandas(f, names=outcols) for f in ofiles],
                       ignore_index=True)
    # ids are compared as strings on both sides
    merged['name'] = merged['name'].astype(str)
    id2names = UT.df2dict(merged, 'name', 'repnames')
    beddf[col] = [id2names[str(i)] for i in beddf[idcol]]

    # cleanup (only when the temp prefix was generated here)
    if cleanup:
        for f in bfiles + ofiles:
            os.unlink(f)
    return beddf