def extract_nonovl_exons(self):
    """Collect exons that overlap no other exon and are not covered by a junction.

    Exons/junctions with ``st >= ed`` are ignored.  The remaining exons and
    junctions are written to ``self.refpre + '.ex.bed.gz'`` /
    ``'.sj.bed.gz'`` and intersected through ``BT.bedtoolintersect``.

    Sets:
        self.ne_i0: exons whose only exon overlap is with themselves, sorted
            by (chr, st, ed), with a 'len' column.
        self.nov_ex: the subset of those exons whose largest junction overlap
            ratio is below 1, sorted the same way, with a 'len' column.
        self.ne_i, self.ne_5, self.ne_3, self.ne_s: rows of ``nov_ex`` whose
            'cat' is 'i', '5', '3' and 's' respectively.
    """
    ex = self.ex
    sj = self.sj
    ex = ex[ex['st'] < ex['ed']]
    sj = sj[sj['st'] < sj['ed']]
    # nonovl exons
    cols0 = ['chr', 'st', 'ed', '_id']
    a = self.refpre + '.ex.bed.gz'
    a = UT.write_pandas(ex[cols0], a, '')
    b = self.refpre + '.sj.bed.gz'
    b = UT.write_pandas(sj[cols0], b, '')
    c1 = self.refpre + '.ex-ovl-sj.txt.gz'
    c2 = self.refpre + '.ex-ovl-ex.txt.gz'
    c1 = BT.bedtoolintersect(a, b, c1, wao=True)
    c2 = BT.bedtoolintersect(a, a, c2, wo=True)

    cols = cols0 + ['b_' + x for x in cols0] + ['ovl']
    sov = UT.read_pandas(c1, names=cols)
    sov['len'] = sov['ed'] - sov['st']
    sov['ovlratio'] = sov['ovl'] / sov['len']
    sovg = sov.groupby('_id')['ovlratio'].max()
    snonov = sovg[sovg < 1.]  # not completely covered by junction
    eov = UT.read_pandas(c2, names=cols)
    eovsize = eov.groupby('_id').size()
    enonov = eovsize[eovsize == 1]  # only overlaps with self

    # `.ix` was removed from pandas; `_id` is a label lookup, so use `.loc`.
    ex_by_id = ex.set_index('_id')
    self.ne_i0 = ne_i0 = ex_by_id.loc[enonov.index].sort_values(
        ['chr', 'st', 'ed']).reset_index()
    self.ne_i0['len'] = ne_i0['ed'] - ne_i0['st']
    LOG.info('#non-ovl-ex0={0}'.format(len(enonov)))
    LOG.info('#non-ex-ovl-ex={0}, #non-sj-ovl-ex={1}'.format(
        len(enonov), len(snonov)))

    ids = set(enonov.index).intersection(snonov.index)
    LOG.info('#non-ovl-ex={0}'.format(len(ids)))
    # `.loc` does not accept a set as an indexer, so pass a list.
    self.nov_ex = novex = ex_by_id.loc[list(ids)].sort_values(
        ['chr', 'st', 'ed']).reset_index()
    novex['len'] = novex['ed'] - novex['st']
    self.ne_i = novex[novex['cat'] == 'i']
    self.ne_5 = novex[novex['cat'] == '5']
    self.ne_3 = novex[novex['cat'] == '3']
    self.ne_s = novex[novex['cat'] == 's']
def count_repeats_viz_chr(bedpath, rmskpath, outpath):
    """Intersect one BED file with a repeat track and list repeats per name.

    The intersection is produced by ``BT.bedtoolintersect(..., wao=True)``.
    For every value of the 'name' column the overlapping repeats are joined
    as ``repname(strand)`` separated by commas, and the resulting two-column
    table is written to ``outpath`` with ``UT.write_pandas(..., 'h')``.
    """
    ovlpath = BT.bedtoolintersect(bedpath, rmskpath, outpath, wao=True)
    a_cols = ['chr', 'st', 'ed', 'name']
    b_cols = ['b_chr', 'b_st', 'b_ed', 'b_name', 'strand']
    ovl = UT.read_pandas(ovlpath, names=a_cols + b_cols + ['ovl'])
    ovl['rn'] = ovl['b_name'] + '(' + ovl['strand'] + ')'

    def _concat(repnames):
        # group and concat repname
        return ','.join(list(repnames))

    per_name = ovl.groupby('name')['rn'].apply(_concat).reset_index()
    UT.write_pandas(per_name, outpath, 'h')
def as3exsj(dstpre, minelen=150, np=7):
    """Build exon/junction tables from the exdf/sjdf/sedf files and drop tiny genes.

    Args:
        dstpre (str): path prefix; reads ``dstpre + '.exdf.txt.gz'``,
            ``'.sjdf.txt.gz'`` and ``'.sedf.txt.gz'``.
        minelen (int): genes (``_gidx`` groups) whose summed exon length is
            below this value are removed (default 150).
        np (int): passed through to ``GP.find_genes3``.

    Generates:
        ``dstpre + '.ex.txt.gz'``, ``dstpre + '.sj.txt.gz'`` and
        ``dstpre + '.ci.txt.gz'``.

    Returns:
        (sj2, ex2): the filtered junction and exon dataframes.
    """
    ex = UT.read_pandas(dstpre + '.exdf.txt.gz', names=A3.EXDFCOLS)
    sj = UT.read_pandas(dstpre + '.sjdf.txt.gz', names=A3.SJDFCOLS)
    se = UT.read_pandas(dstpre + '.sedf.txt.gz', names=A3.EXDFCOLS)
    sj['st'] = sj['st'] + 1
    cols = A3.EXDFCOLS
    ex = PD.concat([ex[cols], se[cols]], ignore_index=True)
    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components) set '_gidx'
    # don't use exon overlap as connection
    GP.find_genes3(sj, ex, np=np, override=False)
    ex.loc[ex['kind'] == '3', 'cat'] = '3'
    ex.loc[ex['kind'] == '5', 'cat'] = '5'

    # remove these with elen smaller than minelen
    ex['len'] = ex['ed'] - ex['st']
    exsiz = ex.groupby('_gidx')['len'].sum()
    rgidx = exsiz[exsiz < minelen].index.values
    ex2 = ex[~ex['_gidx'].isin(rgidx)]
    sj2 = sj[~sj['_gidx'].isin(rgidx)]
    # rgidx holds gene ids, not exons, so report the real exon count after
    # filtering rather than len(ex) - len(rgidx).
    LOG.info('minelen filter #ex {0}=>{1}'.format(len(ex), len(ex2)))

    # write
    UT.write_pandas(ex2, dstpre + '.ex.txt.gz', 'h')
    UT.write_pandas(sj2, dstpre + '.sj.txt.gz', 'h')
    UT.chopintervals(ex2, dstpre + '.ci.txt.gz')
    return sj2, ex2
def filter_sj(bwsjpre, statspath, chrom, csize, params):
    """Filter the junction paths of one chromosome and write them as BED12.

    Args:
        bwsjpre: path prefix.  ``bwsjpre + '.sjpath.<chrom>.bed.gz'`` is read
            when it exists, otherwise ``bwsjpre + '.sjpath.bed.gz'`` is read
            and restricted to ``chrom``.  Also passed to ``A2.SjExBigWigs``.
        statspath: junction stats table.  Needs a 'locus' column unless 'chr'
            and 'pc' are already present, plus 'detected' (or '#detected'),
            'maxcnt' and 'maxoverhang'.
        chrom: chromosome to process.
        csize: end position handed to the bigwig ``get(chrom, 0, csize)``
            calls (presumably the chromosome size -- confirm with caller).
        params: mapping with keys 'th_detected', 'th_maxcnt',
            'th_maxoverhang', 'th_minedgeexon' and 'th_sjratio2'.

    Generates:
        ``bwsjpre + '.sjpath.<chrom>.filtered.bed.gz'``.
    """
    # read in junction stats
    stats = UT.read_pandas(statspath)
    if 'chr' not in stats:
        stats['chr'] = [x.split(':')[0] for x in stats['locus']]
    if '#detected' in stats:
        stats.rename(columns={'#detected': 'detected'}, inplace=True)
    stats = stats[stats['chr'] == chrom].copy()
    if 'pc' not in stats:
        stats['pc'] = [locus2pc(x) for x in stats['locus']]
    flds = ['detected', 'maxcnt', 'maxoverhang']
    # one lookup (pc => value) per stats field
    dics = {f: UT.df2dict(stats, 'pc', f) for f in flds}
    # read sjpath
    fpath_chr = bwsjpre + '.sjpath.{0}.bed.gz'.format(chrom)
    dstpath = bwsjpre + '.sjpath.{0}.filtered.bed.gz'.format(chrom)
    if os.path.exists(fpath_chr):
        sj = GGB.read_bed(fpath_chr)
    else:
        fpath = bwsjpre + '.sjpath.bed.gz'
        sj = GGB.read_bed(fpath)
        sj = sj[sj['chr'] == chrom].copy()
    # NOTE(review): raises IndexError when no path exists for this chromosome.
    name0 = sj.iloc[0]['name']
    if len(name0.split('|')) < len(name0.split(',')):  # exons attached?
        # drop the first and last comma-separated elements of the name
        sj['name'] = [','.join(x.split(',')[1:-1]) for x in sj['name']]
    # filter unstranded
    sj = sj[sj['strand'].isin(['+', '-'])].copy()
    # filter with stats: a path gets the minimum value over its name elements
    # (0 for elements missing from the stats) and must exceed the threshold
    for f in flds:
        sj[f] = [
            N.min([dics[f].get(x, 0) for x in y.split(',')])
            for y in sj['name']
        ]
        sj = sj[sj[f] > params['th_' + f]].copy()  # filter
    # edge exon size
    sj['eflen'] = [int(x.split(',')[0]) for x in sj['esizes']]
    # [-2]: presumably esizes ends with a trailing comma -- confirm
    sj['ellen'] = [int(x.split(',')[-2]) for x in sj['esizes']]
    eth = params['th_minedgeexon']
    sj = sj[(sj['eflen'] > eth) & (sj['ellen'] > eth)].copy()
    # calculate sjratio, sjratio2
    sjexbw = A2.SjExBigWigs(bwsjpre, mixunstranded=False)
    for s in ['+', '-']:
        idx = sj['strand'] == s
        with sjexbw:
            sa = sjexbw.bws['sj'][s].get(chrom, 0, csize)
            ea = sjexbw.bws['ex'][s].get(chrom, 0, csize)
        a = sa + ea
        # sjratio2 = sc1 / mean(sj + ex signal over [tst, ted))
        sj.loc[idx, 'sjratio2'] = [
            x / N.mean(a[int(s):int(e)])
            for x, s, e in sj[idx][['sc1', 'tst', 'ted']].values
        ]
    sj = sj[sj['sjratio2'] > params['th_sjratio2']]
    GGB.write_bed(sj, dstpath, ncols=12)
def __init__(self, j2pre, code, chromdir, rmskviz, dstpre, **kw):
    """Store the inputs, merge parameter overrides and load the path table.

    Args:
        j2pre: input prefix; ``j2pre + '.paths.txt.gz'`` is read.
        code: not used in the visible part of this method.
        chromdir: passed to ``FA.GenomeFASTAChroms``.
        rmskviz: stored as ``self.rmskviz``.
        dstpre: not used in the visible part of this method.
        **kw: overrides applied on top of a copy of ``RMSKPARAMS``.
    """
    self.j2pre = j2pre
    # NOTE(review): `prefix` is neither a parameter nor a local here; unless a
    # module-level `prefix` exists this raises NameError.  Presumably `dstpre`
    # (or a path built from it) was intended -- confirm.
    self.fnobj = FN.FileNamesBase(prefix)
    self.chromdir = chromdir
    self.rmskviz = rmskviz
    self.gfc = FA.GenomeFASTAChroms(chromdir)
    # copy so that the module-level defaults are not mutated by the overrides
    self.params = RMSKPARAMS.copy()
    self.params.update(kw)
    # get exons from paths
    self.paths = paths = UT.read_pandas(j2pre + '.paths.txt.gz', names=A2.PATHCOLS)
def ci(self):
    """Return the chopped-interval table, using the cached file if present.

    When ``self.cipath()`` exists it is read back with the columns
    chr, st, ed, name, id.  Otherwise the exons from ``self.sjex()`` are
    chopped into ``self.cipath()``.

    Raises:
        RuntimeError: if neither the cache nor ``self.gtfpath`` exists.
    """
    path = self.cipath()
    if not os.path.exists(path):
        if not os.path.exists(self.gtfpath):
            raise RuntimeError('file {0} does not exist'.format(self.gtfpath))
        LOG.info('making ci..')
        _, exons = self.sjex()
        return UT.chopintervals(exons, path)
    LOG.info('reading ci({0}) from cache...'.format(path))
    return UT.read_pandas(path, names=['chr', 'st', 'ed', 'name', 'id'])
def filter_sjexdf(mdstpre, rdstpre):
    """Restrict the exdf/sjdf tables of ``mdstpre`` to names present in ``rdstpre``.

    Exons (exdf + sedf) are kept when their name occurs, on the same
    chromosome, in ``rdstpre + '.ex.txt.gz'``; junctions are kept the same
    way against ``rdstpre + '.sj.txt.gz'``.  Results go to
    ``rdstpre + '.exdf.txt.gz'`` and ``rdstpre + '.sjdf.txt.gz'``.
    """

    def _keep_known(table, reference):
        # chromosome by chromosome, keep rows whose name is in the reference
        kept = []
        for chrom in table['chr'].unique():
            rows = table[table['chr'] == chrom]
            known = set(reference[reference['chr'] == chrom]['name'].values)
            mask = [name in known for name in rows['name']]
            kept.append(rows[mask])
        return PD.concat(kept, ignore_index=True)

    multi_exons = UT.read_pandas(mdstpre + '.exdf.txt.gz', names=A3.EXDFCOLS)
    single_exons = UT.read_pandas(mdstpre + '.sedf.txt.gz', names=A3.EXDFCOLS)
    all_exons = PD.concat([multi_exons, single_exons], ignore_index=True)
    junctions = UT.read_pandas(mdstpre + '.sjdf.txt.gz', names=A3.SJDFCOLS)
    ref_ex = UT.read_pandas(rdstpre + '.ex.txt.gz')
    ref_sj = UT.read_pandas(rdstpre + '.sj.txt.gz')

    kept_exons = _keep_known(all_exons, ref_ex)
    kept_junctions = _keep_known(junctions, ref_sj)
    UT.write_pandas(kept_exons, rdstpre + '.exdf.txt.gz', '')
    UT.write_pandas(kept_junctions, rdstpre + '.sjdf.txt.gz', '')
def read_sj(path, parsename=False):
    """Read junctions from BED (input) or TXT (output) with consistent column names.

    Paths ending in '.bed' or '.bed.gz' go through ``read_bed`` and get
    'sc1'/'tst' renamed to 'ucnt'/'mcnt'; anything else is read with
    ``UT.read_pandas`` (header should be there).

    With ``parsename`` the BED name, encoded as
    'motif-k0[k1]-u(reads)-m(reads)-o(maxoverhang)', is split into the
    'motif', 'annotated' and 'maxoverhang' columns.
    """
    if not path.endswith(('.bed.gz', '.bed')):
        return UT.read_pandas(path)
    df = read_bed(path).rename(columns={'sc1': 'ucnt', 'tst': 'mcnt'})
    if parsename:
        # fields: motif(0), known(1), u(2), m(3), o(4)
        fields = df['name'].str.split('-')
        df['motif'] = fields.str[0]
        df['annotated'] = fields.str[1].str[1]
        df['maxoverhang'] = fields.str[4].str[1:].astype(int)
    return df
def trim_ex(expath, dstpath, dstcipath, length=1000, gidfld='_gidx', np=7):
    """Generate trimmed version of genes for calculating coverage to avoid length bias.

    Args:
        expath (str): path exon tsv
        dstpath (str): path to trimmed exon
        dstcipath (str): path to ci (chopped interval)
        length (pos int): length to trim from 3' end in base pair (default 1000 bp)
        gidfld (str): column name for gene id (default _gidx)
        np (pos int): number of CPU to use; 1 runs the worker in-process,
            otherwise one task per chromosome is sent to a process pool

    Generates:
        Two files (dstpath, dstcipath).

    Returns:
        a dataframe containing trimmed exons
    """
    ex = UT.read_pandas(expath)
    if 'len' not in ex.columns:
        ex['len'] = ex['ed'] - ex['st']
    if np == 1:
        recs = trim_ex_worker((ex, length, gidfld))
    else:
        chroms = sorted(ex['chr'].unique())
        data = [(ex[ex['chr'] == c], length, gidfld) for c in chroms]
        recs = []
        # Create the pool before entering `try`: if construction fails there
        # is nothing to close, and the real error is no longer masked by a
        # NameError on `p` in the `finally` clause.
        p = multiprocessing.Pool(np)
        try:
            for v in p.map(trim_ex_worker, data):
                recs.extend(v)
        finally:
            p.close()
    cols = list(ex.columns.values)
    nex = PD.DataFrame(recs, columns=cols)
    nex['len'] = nex['ed'] - nex['st']
    # edge case: widen zero-length records to one base
    # NOTE(review): 'len' is computed before this fix, so these rows keep
    # len == 0 -- confirm that is intended.
    nex.loc[nex['st'] == nex['ed'], 'ed'] = nex['st'] + 1
    UT.save_tsv_nidx_whead(nex, dstpath)
    UT.chopintervals(nex, dstcipath)
    return nex
def filter_paths(mdstpre, rdstpre):
    """Copy the path files of ``mdstpre`` to ``rdstpre``, keeping known exons only.

    A path is kept when every '|'-separated element of its name occurs, on
    the same chromosome, in the 'name' column of ``rdstpre + '.ex.txt.gz'``.
    Both '.paths.withse.bed.gz' and '.paths.txt.gz' are read with
    ``GGB.read_bed`` and written with ``GGB.write_bed(..., ncols=12)``.
    """
    ex = UT.read_pandas(rdstpre + '.ex.txt.gz')

    def _select(paths):
        kept = []
        for chrom in paths['chr'].unique():
            rows = paths[paths['chr'] == chrom]
            known = set(ex[ex['chr'] == chrom]['name'].values)
            mask = [
                all([part in known for part in name.split('|')])
                for name in rows['name']
            ]
            kept.append(rows[mask])
        return PD.concat(kept, ignore_index=True)

    for suffix in ('.paths.withse.bed.gz', '.paths.txt.gz'):
        selected = _select(GGB.read_bed(mdstpre + suffix))
        GGB.write_bed(selected, rdstpre + suffix, ncols=12)
def gtf_from_bed12(modelpre, dstpath=None, source='.'):
    """Convert the BED12 paths of a model into a GTF of exon records.

    Args:
        modelpre (str): model prefix; reads ``modelpre + '.paths.withse.bed.gz'``
            and ``modelpre + '.ex.txt.gz'``.
        dstpath (str): GTF output path.  When None it is derived from the
            BED path by replacing '.bed' with '.gtf'.
        source (str): value of the GTF source column (default '.').

    Generates:
        The GTF at ``dstpath`` and ``modelpre + '.idmap.txt.gz'`` holding the
        id, chr, name, tname and gname of every path.

    Returns:
        the dataframe written as GTF
    """
    # Previously `bedpath` was never defined, so dstpath=None raised NameError.
    bedpath = modelpre + '.paths.withse.bed.gz'
    paths = GGB.read_bed(bedpath)
    ex = UT.read_pandas(modelpre + '.ex.txt.gz')
    # key on chr:name -- there may be same st,ed in different chromosome
    ex['id'] = ex['chr'] + ':' + ex['name']
    n2gn = UT.df2dict(ex, 'id', 'gname')
    paths['id'] = paths['chr'] + ':' + paths['name']
    # the gene of a path is looked up through its first '|'-separated element
    paths['id0'] = paths['chr'] + ':' + paths['name'].str.split('|').str[0]
    paths['gname'] = [n2gn[x] for x in paths['id0']]

    # transcript names: <gname>.<running number within the gene>, from 1
    g2cnt = {}
    tnames = []
    for gname in paths['gname']:
        i = g2cnt.get(gname, 1)
        tnames.append('{0}.{1}'.format(gname, i))
        g2cnt[gname] = i + 1
    paths['tname'] = tnames

    txt = 'gene_id "{0}"; transcript_id "{1}"; exon_number "{2}";'

    def _gen():
        cols = ['chr', 'st', 'ed', 'gname', 'tname', 'esizes', 'estarts', 'strand']
        for c, s, e, gn, tn, esi, est, strand in paths[cols].values:
            # the element after the last comma is dropped
            esizes = [int(x) for x in esi.split(',')[:-1]]
            estarts = [int(x) for x in est.split(',')[:-1]]
            for i, (size, offset) in enumerate(zip(esizes, estarts)):
                exst = s + offset
                exed = exst + size
                extra = txt.format(gn, tn, i + 1)
                # the start is shifted by one for the GTF record
                yield (c, source, 'exon', exst + 1, exed, '.', strand, '.', extra)

    df = PD.DataFrame(list(_gen()), columns=GGB.GTFCOLS)
    if dstpath is None:
        dstpath = bedpath.replace('.bed', '.gtf')
    GGB.write_gtf(df, dstpath)

    idf = paths[['id', 'chr', 'name', 'tname', 'gname']]
    UT.write_pandas(idf, modelpre + '.idmap.txt.gz', 'h')
    return df
def calc_53gap_params(self, covfactor=0, np=10, emaxth=1, eth=1):
    """Fit logistic-regression gap parameters for 5' and 3' exons.

    Per-exon parameters come from ``self.calc_params_mp`` on ``self.ne_5``
    and ``self.ne_3`` and are cached at
    ``self.dstpre + '.<refcode>.<covfactor>.gap5params.txt.gz'`` (and
    ``gap3params``).  Rows passing the ``eth`` / ``emaxth`` thresholds are
    fitted, the coefficients are saved through ``self.write_params`` and
    scatter plots through ``self.plot_gap53_fit``.

    Args:
        covfactor: passed to ``calc_params_mp``; also part of the cache file
            names and stored as 'th' in the saved parameters.
        np: passed to ``calc_params_mp``.
        emaxth: rows need ``emax`` above this value.
        eth: rows need ``eOut`` (5') or ``eIn`` (3') above this value.

    Returns:
        ``locals()`` of this call -- the local variable names below are part
        of the returned mapping, so do not rename them.
    """
    zoom = self.zoom
    d5path = self.dstpre + '.{0}.{1}.gap5params.txt.gz'.format(
        self.refcode, covfactor)
    d3path = self.dstpre + '.{0}.{1}.gap3params.txt.gz'.format(
        self.refcode, covfactor)
    if os.path.exists(d5path):
        print('reading from cache {0}'.format(d5path))
        d5 = UT.read_pandas(d5path)
    else:
        d5 = self.calc_params_mp(self.ne_5,
                                 win=8192,
                                 np=np,
                                 gapmode='53',
                                 direction='<',
                                 covfactor=covfactor)
        UT.write_pandas(d5, d5path, 'h')
    if os.path.exists(d3path):
        print('reading from cache {0}'.format(d3path))
        d3 = UT.read_pandas(d3path)
    else:
        d3 = self.calc_params_mp(self.ne_3,
                                 win=8192,
                                 np=np,
                                 gapmode='53',
                                 direction='>',
                                 covfactor=covfactor)
        UT.write_pandas(d3, d3path, 'h')
    # keep only rows above both thresholds
    i5 = (d5['eOut'] > eth) & (d5['emax'] > emaxth)
    i3 = (d3['eIn'] > eth) & (d3['emax'] > emaxth)
    d50 = d5[i5]
    d30 = d3[i3]

    def _fitone(d0, x, y1, y2, rx='sin', lrx='lsin'):
        # Two labelled copies of d0: (x, y1) rows get kind=1, (x, y2) rows
        # get kind=0; both gap columns are renamed to 'gap' and x to rx.
        da = d0[[x, y1]].copy().rename(columns={y1: 'gap', x: rx})
        db = d0[[x, y2]].copy().rename(columns={y2: 'gap', x: rx})
        da['kind'] = 1
        db['kind'] = 0
        D = PD.concat([da, db], ignore_index=True)
        # features are log2-transformed
        D[lrx] = N.log2(zoom * D[rx] + 1)
        D['lgap'] = N.log2(D['gap'] + 1)
        X = D[[lrx, 'lgap']].values
        Y = D['kind'].values
        lr = LogisticRegression()
        lr.fit(X, Y)
        Z = lr.predict(X)
        # the caller reads 'lr', 'Y' and 'Z' from this mapping
        return locals()

    fit5 = _fitone(d50, 'eOut', 'gap', 'gapIn', 'ein', 'lein')
    fit3 = _fitone(d30, 'eIn', 'gap', 'gapOut', 'ein', 'lein')
    # max exon size
    m5 = N.max(self.ne_5['len'])
    m3 = N.max(self.ne_3['len'])
    rx, lrx = 'ein', 'lein'
    # save coefs
    p5path = self.dstpre + '.{0}.gap5params.json'.format(self.refcode)
    f = fit5
    self.write_params(p5path, f['lr'], f['Y'], f['Z'], [lrx, 'lgap'], {
        'th': covfactor,
        'zoom': zoom,
        'maxsize': int(m5)
    })
    p3path = self.dstpre + '.{0}.gap3params.json'.format(self.refcode)
    f = fit3
    self.write_params(p3path, f['lr'], f['Y'], f['Z'], [lrx, 'lgap'], {
        'th': covfactor,
        'zoom': zoom,
        'maxsize': int(m3)
    })
    # save scatter plots
    spath = self.dstpre + '.{0}.gap53params'.format(self.refcode)
    title = self.dstpre.split('/')[-1]
    self.plot_gap53_fit(fit5, fit3, spath + '.0.png', title, ptyp='both')
    self.plot_gap53_fit(fit5, fit3, spath + '.pdf', title, ptyp='pdf')
    self.plot_gap53_fit(fit5, fit3, spath + '.png', title, ptyp='png')
    return locals()
def ex(self):
    """Return the exon table.

    The file at the exon path from ``self.sjexpaths()`` is read when
    ``UT.notstale`` accepts it; otherwise the exons come from
    ``self.sjex()``.
    """
    _, expath = self.sjexpaths()
    if not UT.notstale(expath):
        _, exons = self.sjex()
        return exons
    return UT.read_pandas(expath)
def extract_53_pair(self):
    """Find gaps between neighbouring 3'/5' exons of different genes.

    Per chromosome and strand, 3' and 5' exons are sorted by 'spos' (the
    position parsed from 'a_pos' for cat '3' and from 'd_pos' for cat '5').
    Adjacent exons of different genes form a pair when they are 3' then 5'
    on '+', or 5' then 3' on anything else, and the first ends before the
    second starts.  Pairs overlapping an internal exon (cat 'i') are
    dropped, as are pairs not longer than 20 or not shorter than
    ``max(2 * longest self.ne_i exon, 20000)``.

    Adds the '_apos', '_dpos' and 'spos' columns to ``self.ex``, writes
    ``self.refpre + '.e53pair.bed.gz'`` (plus intermediate '.53.*' files)
    and stores the result in ``self.e53``.
    """
    # between genes
    ex = self.ex
    tmpprefix = self.refpre
    ex['_apos'] = ex['a_pos'].str.split(':').str[1].astype(int)
    ex['_dpos'] = ex['d_pos'].str.split(':').str[1].astype(int)
    ex.loc[ex['cat'] == '3', 'spos'] = ex['_apos']
    ex.loc[ex['cat'] == '5', 'spos'] = ex['_dpos']
    cols = ['chr', 'st', 'ed', 'name', 'strand', '_gidx1', '_gidx2']
    fields = ['spos', 'cat', '_gidx', '_id', 'st', 'ed']

    def _find(ecs, chrom, strand):
        e53 = ecs[ecs['cat'].isin(['3', '5'])].sort_values('spos')
        rows = e53[fields].values
        if strand == '+':
            first_cat, second_cat = '3', '5'
            name_fmt = '+g{0}e{1}|g{2}e{3}'
        else:
            first_cat, second_cat = '5', '3'
            name_fmt = '-g{0}e{1}|g{2}e{3}'
        pairs = []
        for up, dn in zip(rows[:-1], rows[1:]):
            if up[2] == dn[2]:  # same gene
                continue
            if up[1] != first_cat or dn[1] != second_cat:
                continue
            if not (up[5] < dn[4]):  # overlapping
                continue
            name = name_fmt.format(up[2], up[3], dn[2], dn[3])
            pairs.append((chrom, up[0], dn[0], name, strand, up[2], dn[2]))
        return PD.DataFrame(pairs, columns=cols)

    rslts = [
        _find(ex[(ex['chr'] == chrom) & (ex['strand'] == strand)],
              chrom, strand)
        for chrom in ex['chr'].unique()
        for strand in ['+', '-']
    ]
    df = PD.concat(rslts, ignore_index=True).sort_values(['chr', 'st', 'ed'])

    # intersect with internal exons
    exipath = tmpprefix + '.53.exi.bed'  # ncol 3
    pairpath = tmpprefix + '.53.bed'  # ncol 5
    ovlpath = tmpprefix + '.53.exi.ovl.txt'
    exi = ex[ex['cat'] == 'i'].sort_values(['chr', 'st', 'ed'])
    UT.write_pandas(exi[['chr', 'st', 'ed']], exipath, '')
    UT.write_pandas(df, pairpath, '')
    ovlpath = BT.bedtoolintersect(pairpath, exipath, ovlpath, wao=True)
    cdf = UT.read_pandas(ovlpath, names=cols + ['b_chr', 'b_st', 'b_ed', 'ovl'])

    sdf = cdf[cdf['ovl'] == 0][cols]
    sdf['locus'] = UT.calc_locus(sdf)
    sdf['len'] = sdf['ed'] - sdf['st']
    maxexonsize = self.ne_i['len'].max()
    upper = max(2 * maxexonsize, 20000)
    sdf = sdf[(sdf['len'] > 20) & (sdf['len'] < upper)]
    UT.write_pandas(sdf, tmpprefix + '.e53pair.bed.gz')
    sdf.index.name = '_id'
    self.e53 = sdf.reset_index()
def calc_53_params(self, sdiffth=1, np=10, alpha=0.1):
    """Fit a logistic regression separating edge exons from internal exons.

    Flux tables for the exon sets 'ne_i', 'ne_5', 'ne_3', 'e5i' and 'e3i'
    come from ``self.calc_flux_mp`` and are cached at
    ``self.dstpre + '.<refcode>.<set>.flux.txt.gz'``.  The model is fitted
    on the features (sdiff, smean) built from log2-transformed 'sin' and
    'sout'; the coefficients are saved through ``self.write_params`` and
    scatter plots through ``self.plot_sin_sout``.

    Args:
        sdiffth: minimum |log2(zoom*sin+1) - log2(zoom*sout+1)| for a row of
            the kind=1 sets to be kept.
        np: passed to ``calc_flux_mp``.
        alpha: passed to ``plot_sin_sout``.

    Returns:
        ``locals()`` of this call -- the local variable names below are part
        of the returned mapping, so do not rename them.
    """
    # get parameters
    dic = {}
    zoom = self.zoom
    seta = ['ne_i', 'ne_5', 'ne_3', 'e5i', 'e3i']  # sets to load/compute
    setb = ['ne_5', 'ne_3', 'e5i', 'e3i']  # sets labelled kind=1
    setc = ['ne_i', 'ne_3', 'ne_5']  # sets used for fitting
    for x in seta:
        fpath = self.dstpre + '.{0}.{1}.flux.txt.gz'.format(
            self.refcode, x)
        if os.path.exists(fpath):
            print('reading from cache {0}'.format(fpath))
            dic[x] = UT.read_pandas(fpath)
        else:
            df = getattr(self, x)
            print('calculating {0}...'.format(x))
            dic[x] = self.calc_flux_mp(df, np=np)
            UT.write_pandas(dic[x], fpath, 'h')
    dicb = {}
    FN0 = 0
    for x in setb:
        f = dic[x]
        f['kind'] = 1
        idx0 = N.abs(
            N.log2(zoom * f['sin'] + 1) -
            N.log2(zoom * f['sout'] + 1)) > sdiffth
        idx1 = (f['sdin'] != 0) | (f['sdout'] != 0
                                   )  # should have either in or out
        idx = idx0 & idx1
        FN0 += N.sum((~idx0) & idx1)  # pre filtered positive
        dicb[x] = f[idx]
    f = dic['ne_i']
    f['kind'] = 0
    idx = (f['ecovmax'] > 1) & (
        (f['sdin'] != 0) & (f['sdout'] != 0))  # should have both in&out
    dicb['ne_i'] = f[idx]
    # D: training set; D2: only predicted on, never fitted
    D = PD.concat([dicb[x] for x in setc], ignore_index=True)
    D2 = PD.concat([dicb['ne_i'], dicb['e3i'], dicb['e5i']],
                   ignore_index=True)
    # don't use e3i, e5i too many non-actives
    D['lsin'] = N.log2(zoom * D['sin'] + 1)
    D['lsout'] = N.log2(zoom * D['sout'] + 1)
    D['sdiff'] = N.abs(D['lsin'] - D['lsout'])
    D['smean'] = (D['lsin'] + D['lsout']) / 2.
    X = D[['sdiff', 'smean']].values
    Y = D['kind'].values
    lr = LogisticRegression()
    lr.fit(X, Y)
    Z = lr.predict(X)
    # same feature construction for the prediction-only set
    D2['lsin'] = N.log2(zoom * D2['sin'] + 1)
    D2['lsout'] = N.log2(zoom * D2['sout'] + 1)
    D2['sdiff'] = N.abs(D2['lsin'] - D2['lsout'])
    D2['smean'] = (D2['lsin'] + D2['lsout']) / 2.
    X2 = D2[['sdiff', 'smean']].values
    Z2 = lr.predict(X2)
    # save fit coefficients
    ppath = self.dstpre + '.{0}.e53params.json'.format(self.refcode)
    self.write_params(ppath,
                      lr,
                      Y,
                      Z, ['sdiff', 'smean'], {
                          'sdiffth': sdiffth,
                          'zoom': zoom
                      },
                      FN0=FN0)
    # save scatter plots
    spath = self.dstpre + '.{0}.e53params'.format(self.refcode)
    title = self.dstpre.split('/')[-1]
    self.plot_sin_sout(dic,
                       D,
                       Y,
                       Z,
                       D2,
                       Z2,
                       sdiffth,
                       spath + '.0.png',
                       title,
                       alpha=alpha)
    self.plot_sin_sout(dic,
                       D,
                       Y,
                       Z,
                       D2,
                       Z2,
                       sdiffth,
                       spath + '.pdf',
                       title,
                       ptyp='pdf',
                       alpha=alpha)
    self.plot_sin_sout(dic,
                       D,
                       Y,
                       Z,
                       D2,
                       Z2,
                       sdiffth,
                       spath + '.png',
                       title,
                       ptyp='png',
                       alpha=alpha)
    return locals()
def calc_gcov(expath, cipath, bwpath, dstprefix, override=False, np=4):
    """Calculate gene coverages.

    Args:
        expath: merged ex
        cipath: chopped interval for ex
        bwpath: bigwig file (sample)
        dstprefix: prefix for outputs (used as is, no separator is inserted)
        override: passed to ``UT.notstale`` when deciding whether the cached
            covci file can be reused
        np: passed to ``calc_cov_mp``

    Outputs:
        1. dstprefix+'covci.txt.gz'
        2. dstprefix+'gcov.txt.gz' : DataFrame(col:_gidx,gcov)

    Returns:
        the dataframe saved to the gcov file

    Note:
        The length/position columns and the '*2' variants (restricted to
        intervals with val > 0) are still computed below, but only
        ``_gidx`` and ``gcov`` survive the final column selection.
    """
    ex = UT.read_pandas(expath)
    covcipath = dstprefix + 'covci.txt.gz'
    gcovpath = dstprefix + 'gcov.txt.gz'
    if UT.notstale([expath, cipath], covcipath, override):
        cc = UT.read_pandas(covcipath)
    else:
        if UT.notstale(expath, cipath, False):
            ci = UT.read_pandas(cipath,
                                names=['chr', 'st', 'ed', 'name', 'id'])
        else:
            ci = UT.chopintervals(ex, cipath, idcol='_id')
        cc = calc_cov_mp(ci, bwpath, covcipath, np=np)
    if 'id' not in cc.columns:
        cc['id'] = cc['sc1']
    if 'eid' not in cc.columns:
        # 'name' holds comma-separated integer ids; parse them into lists
        cc['eid'] = cc['name'].astype(str).apply(
            lambda x: [int(y) for y in x.split(',')])
    cc['len'] = cc['ed'] - cc['st']
    cc['val'] = cc['cov'] * cc['len']
    ccf = UT.flattendf(cc[['id', 'eid', 'len', 'val', 'st', 'ed']], 'eid')
    # exon id => gene id
    e2g = dict(UT.izipcols(ex, ['_id', '_gidx']))
    ccf['_gidx'] = [e2g[x] for x in ccf['eid']]
    # for normal gcov: take unique combination of (gid, id) (id=cid)
    # for gcov2 : first select ccf with val>0
    ccf2 = ccf[ccf['val'] > 0].groupby(['_gidx', 'id']).first().reset_index()
    ccf2g = ccf2.groupby('_gidx')
    df2 = ccf2g[['len', 'val']].sum()
    df2['gcov2'] = df2['val'] / df2['len']
    df2['cids'] = ccf2g['id'].apply(lambda x: ','.join([str(y) for y in x]))
    df2['gst2'] = ccf2g['st'].min()
    df2['ged2'] = ccf2g['ed'].max()
    df2['glen2'] = df2['ged2'] - df2['gst2']
    df2 = df2.reset_index()
    ccf1 = ccf.groupby(['_gidx', 'id']).first().reset_index()
    ccf1g = ccf1.groupby('_gidx')
    df = ccf1g[['len', 'val']].sum()
    df['gcov'] = df['val'] / df['len']
    df['st'] = ccf1g['st'].min()
    df['ed'] = ccf1g['ed'].max()
    df['glen'] = df['ed'] - df['st']
    df = df.reset_index()
    g2chr = dict(UT.izipcols(ex, ['_gidx', 'chr']))
    df['chr'] = [g2chr[x] for x in df['_gidx']]

    def _set_df2prop(src, tgt, default):
        # copy column `src` of df2 into df[tgt], matching on _gidx;
        # genes absent from df2 get `default`
        dic = dict(UT.izipcols(df2, ['_gidx', src]))
        df[tgt] = [dic.get(x, default) for x in df['_gidx']]

    _set_df2prop('gcov2', 'gcov2', 0)
    _set_df2prop('len', 'len2', 0)
    _set_df2prop('cids', 'cids', '')
    _set_df2prop('gst2', 'st2', -1)
    _set_df2prop('ged2', 'ed2', -1)
    _set_df2prop('glen2', 'glen2', 0)
    cols = [
        '_gidx', 'chr', 'st', 'ed', 'len', 'val', 'gcov', 'glen', 'len2',
        'gcov2', 'cids', 'st2', 'ed2', 'glen2'
    ]
    # the full column list above is immediately replaced by this short one
    cols = ['_gidx', 'gcov']
    df = df[cols]
    UT.save_tsv_nidx_whead(df, gcovpath)
    return df
def prep_sjex(self, en, np=1, savesjex=True, calccovs=True):
    """Assign ecov, gcov, jcnt.

    Args:
        en: model object providing ``model``, ``modelpath``, ``fname2``,
            ``savemodel`` and ``code``.
        np: number of CPU passed to the coverage calculations.
        savesjex: when True, tables that gained columns are saved through
            ``en.savemodel(..., category='output')``.
        calccovs: when True, missing ecov/gcov columns are calculated from
            ``self.bigwig``; when False missing ones are filled with 0.

    Side effects:
        Adds columns to the ex and sj tables of ``en``; when sj is not
        empty, junctions whose 'a_id'/'d_id' do not occur in ex are dropped
        and the result is stored in ``en.sj``.
    """
    dcode = self.datacode
    sj = en.model('sj', dcode)
    ex = en.model('ex', dcode)
    savesj = False
    saveex = False
    # check support
    if len(sj) > 0:
        dids = set(ex['d_id'].values)
        aids = set(ex['a_id'].values)
        idx = sj['a_id'].isin(aids) & sj['d_id'].isin(dids)
        sj = sj[idx].copy()
        en.sj = sj
    if '_id' not in ex.columns:  # edge case (len(sj)==0)
        ex['_id'] = N.arange(len(ex))
    if '_gidx' not in ex.columns:  # edge case (len(sj)==0)
        ex['_gidx'] = N.arange(len(ex))
    # length
    if 'len' not in sj.columns:
        sj['len'] = sj['ed'] - sj['st']
        savesj = True
    if 'len' not in ex.columns:
        ex['len'] = ex['ed'] - ex['st']
        saveex = True
    # ecov
    if calccovs:
        print('calccov for {0}'.format(en.code))
        ecovname = self.colname('ecov')
        if ecovname not in ex.columns:
            ecov = CC.calc_ecov(
                expath=en.modelpath('ex'),
                cipath=en.modelpath('ci'),
                bwpath=self.bigwig,
                dstprefix=en.fname2('', self.datacode),  # cov is data dependent
                override=False,  # override previous?
                np=np)
            # `.ix` was removed from pandas; `reindex` is the label lookup
            # that, like `.ix`, yields NaN for labels that are missing.
            ex[ecovname] = ecov.set_index('eid').reindex(
                ex['_id'].values)['ecov'].values
            saveex = True
        # gcov, glen
        gcovname = self.colname('gcov')
        if gcovname not in ex.columns:
            gcov = CC.calc_gcov(
                expath=en.modelpath('ex'),
                cipath=en.modelpath('ci'),
                bwpath=self.bigwig,
                dstprefix=en.fname2('', self.datacode),
                override=False,  # reuse covci from ecov calc
                np=np)
            tmp = gcov.set_index('_gidx').reindex(ex['_gidx'].values)
            ex[gcovname] = tmp['gcov'].values
            if 'glen' in tmp:
                # glen is only dependent on model not data
                ex['glen'] = tmp['glen'].values
            saveex = True
    else:
        ecovname = self.colname('ecov')
        if ecovname not in ex.columns:
            ex[ecovname] = 0
        gcovname = self.colname('gcov')
        if gcovname not in ex.columns:
            ex[gcovname] = 0
    # sjcnt
    ucntname = self.colname('ucnt')
    mcntname = self.colname('mcnt')
    jcntname = self.colname('jcnt')
    sjfile = self.sjfile
    if ucntname not in sj.columns:
        if sjfile.endswith('.bed') or sjfile.endswith('.bed.gz'):  # no header
            dsj = UT.read_pandas(
                sjfile,
                names=['chr', 'st', 'ed', 'name', 'ucnt', 'strand', 'mcnt'])
        else:  # assume txt file with header
            dsj = UT.read_pandas(sjfile)
        # locus based matching
        dsj['locus'] = UT.calc_locus_strand(dsj)
        sj['locus'] = UT.calc_locus_strand(sj)
        l2u = UT.df2dict(dsj, 'locus', 'ucnt')
        l2m = UT.df2dict(dsj, 'locus', 'mcnt')
        sj[ucntname] = [l2u.get(x, 0) for x in sj['locus']]
        sj[mcntname] = [l2m.get(x, 0) for x in sj['locus']]
        # jcnt is ucnt unless that is zero, in which case mcnt is used
        sj[jcntname] = [x or y for x, y in sj[[ucntname, mcntname]].values]
        savesj = True
    if saveex and savesjex:
        en.savemodel('ex', dcode, category='output')
    if savesj and savesjex:
        en.savemodel('sj', dcode, category='output')
def read_ucsc_knownGene(path):
    """Read the table at ``path`` with ``KGCOLS`` as its column names."""
    table = UT.read_pandas(path, names=KGCOLS)
    return table
def find_match(self):
    """Match exons and junctions of ``self.en1`` against ``self.en2``.

    Exons of both models are written to temporary BED files and intersected
    with ``BT.bedtoolintersect(..., wao=True)``; the overlap table is then
    sliced per exon category.

    Sets:
        self.e1, self.e2: exon tables of en1 / en2.
        self.ov: the overlap table (en1 columns, 'b_'-prefixed en2 columns,
            'ovl').
        self.ei, self.e5, self.e3, self.es: overlaps with equal category in
            both models ('i': both ends match; '5'/'3': the strand-dependent
            end matches; 's': any overlap).
        self.e5b, self.e3b, self.esb: same without the equal-category
            requirement.
        self.s1, self.s2, self.sj: junction tables; ``sj`` holds the en1
            junctions whose locus has a count > 0 in en2.
        self.e: dict bundling the above under 'i', '5', '3', 's', 'j', '5b',
            '3b', 'sb'.
    """
    en1 = self.en1
    en2 = self.en2
    # write internal,3,5,se exons separately for finding match
    a = en1.fname2(
        'emtmp.ex.bed.gz', en2.code
    )  # need to be unique to avoid parallel conflict (en1 ref shared)
    b = en2.fname('emtmp.ex.bed.gz')
    c = en1.fname2('emtmp.ex.ovl.txt.gz', en2.code)
    self.e1 = e1 = en1.model('ex')
    self.e2 = e2 = en2.model('ex')
    ecovname = self.colname('ecov')
    cols = [
        'chr', 'st', 'ed', 'cat', '_id', ecovname, '_gidx', 'len', 'strand'
    ]
    a = UT.write_pandas(e1[cols], a, '')
    b = UT.write_pandas(e2[cols], b, '')
    c = BT.bedtoolintersect(a, b, c, wao=True)
    ocols = cols + ['b_' + x for x in cols] + ['ovl']
    self.ov = ov = UT.read_pandas(c, names=ocols)  # overlaps of exons
    # Boolean masks over the overlap rows.  Their names are looked up via
    # locals() in the debug loop below, so keep names and list in sync.
    idxchr = ov['chr'] == ov['b_chr']  # str vs. str
    idxstrand = ov['strand'] == ov['b_strand']  # str vs. str
    idxp = (ov['strand'] == '+') & idxstrand
    idxn = (ov['strand'] == '-') & idxstrand
    idxst = ov['st'] == ov['b_st']  # b_st column mixed? type?
    idxed = ov['ed'] == ov['b_ed']  # b_ed column mixed? type?
    idxcat = ov['cat'] == ov['b_cat']
    idxcov = ov[ecovname] > 0  # exons with reads
    LOG.debug(
        '=' * 10 +
        'calculating match between {0} and {1}'.format(en1.code, en2.code))
    LOG.debug('len(ov):{0}'.format(len(ov)))
    for k in [
            'idxchr', 'idxstrand', 'idxp', 'idxn', 'idxst', 'idxed',
            'idxcat', 'idxcov'
    ]:
        v = locals()[k]
        LOG.debug('#{0}:{1}'.format(k, N.sum(v)))
    # internal exon cat='i' and chr,st,ed,strand match
    self.ei = ei = ov[idxchr & idxstrand & idxst & idxed & idxcat &
                      (ov['cat'] == 'i')].copy()
    # 5' cat='5' and chr,donor (+,ed)|(-,st) match, find closest
    self.e5 = e5 = ov[idxchr & ((idxp & idxed) | (idxn & idxst)) & idxcat &
                      (ov['cat'] == '5')].copy()
    # 3' cat='3' and chr,acceptor (+,st)|(-,ed) match
    self.e3 = e3 = ov[idxchr & ((idxn & idxed) | (idxp & idxst)) & idxcat &
                      (ov['cat'] == '3')].copy()
    # se cat='s' and chr,
    self.es = es = ov[idxchr & (ov['cat'] == 's') & idxcat].copy()
    # allow overlap to other categories
    self.e5b = e5b = ov[idxchr & ((idxp & idxed) | (idxn & idxst)) &
                        (ov['cat'] == '5')].copy()
    # 3' cat='3' and chr,acceptor (+,st)|(-,ed) match
    self.e3b = e3b = ov[idxchr & ((idxn & idxed) | (idxp & idxst)) &
                        (ov['cat'] == '3')].copy()
    # se cat='s' and chr,
    self.esb = esb = ov[idxchr & (ov['cat'] == 's')].copy()
    # splice junction
    self.s1 = s1 = en1.model('sj')
    self.s2 = s2 = en2.model('sj')
    jcntname = self.colname('jcnt')
    l2c = UT.df2dict(s2, 'locus', jcntname)
    jhitname = self.colname2('jhit', en2.code)
    s1[jhitname] = [l2c.get(x, 0)
                    for x in s1['locus']]  # corresponding s2 count
    self.sj = sj = s1[
        s1[jhitname] > 0].copy()  # only consider s2 count > 0
    # for batch processing
    self.e = {
        'i': ei,
        '5': e5,
        '3': e3,
        's': es,
        'j': sj,
        '5b': e5b,
        '3b': e3b,
        'sb': esb
    }
def count_repeats_viz_mp(beddf,
                         rmskvizpath,
                         idcol='_id',
                         np=3,
                         prefix=None,
                         expand=0,
                         col='repnames'):
    """Report, for each interval, the names of the rmsk-viz repeats it overlaps.

    Uses Bedtools and calculates chromosome-wise.

    Args:
        beddf: Pandas DataFrame with chr,st,ed cols, when calculating repeats
            bp for genes, unioned bed should be used (use utils.make_unionex)
        rmskvizpath: path to repeat masker viz BED7 file (created using
            rmskviz2bed7); per-chromosome copies named
            rmskvizpath + '.<chrom>.bed.gz' are reused, or (re)written when
            any of them is missing
        idcol: colname for unique row id (default _id)
        np: number of CPU to use
        prefix: path prefix for temp file, if not None temp files are kept.
            (default None)
        expand: how many bases to expand exon region in each side (default 0)
        col: column name to put in overlapping repeat names (if multiple
            comma separated)

    Outputs:
        are put into beddf columns with colname col (default repnames)

    Returns:
        beddf
    """
    cleanup = prefix is None
    if cleanup:
        prefix = os.path.join(os.path.dirname(rmskvizpath),
                              str(uuid.uuid4()) + '_')

    def _rmsk_chr_path(chrom):
        return rmskvizpath + '.{0}.bed.gz'.format(chrom)

    # chrom-wise
    chroms = sorted(beddf['chr'].unique())
    # split rmskviz unless every per-chromosome file is already there
    splitrmsk = not all(os.path.exists(_rmsk_chr_path(c)) for c in chroms)
    if splitrmsk:
        rmsk = GGB.read_bed(rmskvizpath)

    args = []
    bfiles = []
    ofiles = []
    for chrom in chroms:
        bpath = prefix + 'tgt.{0}.bed'.format(chrom)  # don't compress
        rpath = _rmsk_chr_path(chrom)  # reuse
        bchr = beddf[beddf['chr'] == chrom]
        if expand > 0:
            bchr = bchr.copy()
            bchr['st'] = bchr['st'] - expand
            bchr['ed'] = bchr['ed'] + expand
            bchr.loc[bchr['st'] < 0, 'st'] = 0
        UT.write_pandas(bchr[['chr', 'st', 'ed', idcol]], bpath, '')
        bfiles.append(bpath)
        if splitrmsk:
            rchr = rmsk[rmsk['chr'] == chrom]
            UT.write_pandas(rchr[['chr', 'st', 'ed', 'name', 'strand']],
                            rpath, '')
        opath = prefix + 'out.{0}.bed'.format(chrom)
        ofiles.append(opath)
        args.append([bpath, rpath, opath])

    UT.process_mp(count_repeats_viz_chr, args, np=np, doreduce=False)

    # gather outputs
    outcols = ['name', 'repnames']
    merged = PD.concat([UT.read_pandas(f, names=outcols) for f in ofiles],
                       ignore_index=True)
    merged['name'] = merged['name'].astype(str)
    i2rn = UT.df2dict(merged, 'name', 'repnames')
    beddf[col] = [i2rn[str(x)] for x in beddf[idcol]]

    # cleanup
    if cleanup:
        for f in bfiles + ofiles:
            os.unlink(f)
    return beddf
def read_ucsc_refGene(path):
    """Read the table at ``path`` with ``RGCOLS`` as its column names."""
    table = UT.read_pandas(path, names=RGCOLS)
    return table
def read_txt(self, suffix, category='read'):
    """Read the file named by ``self.txtname(suffix, category)`` as a table."""
    path = self.txtname(suffix, category)
    return UT.read_pandas(path)
def prep_sjpath_chr(j2pres, libsizes, dstpre, chrom):
    """Merge the junction paths of several samples for one chromosome.

    Paths from every ``pre + '.paths.txt.gz'`` are keyed by their intron
    chain (the name without its first and last element).  For each chain
    the smallest st and largest ed are kept, tcov is accumulated (scaled by
    1e6/libsize and averaged over the samples when ``libsizes`` is given)
    and the result is written as BED12.

    Args:
        j2pres: list of sample path prefixes.
        libsizes: library sizes matching ``j2pres``, or None for no scaling.
        dstpre: output prefix.
        chrom: chromosome to process.

    Returns:
        ``dstpre + '.sjpath.<chrom>.bed.gz'``.  Nothing is computed when
        that file or ``dstpre + '.sjpath.bed.gz'`` already exists; note the
        per-chromosome path is returned even when only the latter exists.
    """
    # intron chain (pc) => value
    pc2st = {}
    pc2ed = {}
    pc2tst = {}
    pc2ted = {}
    pc2strand = {}
    pc2tcov = {}
    # chr,st,ed,name,sc1(tcov),strand,tst,ted,sc2(),#exons,estarts,esizes
    path = dstpre + '.sjpath.{0}.bed.gz'.format(chrom)
    path0 = dstpre + '.sjpath.bed.gz'
    if os.path.exists(path0):
        return path
    if os.path.exists(path):
        return path

    cols = ['st', 'ed', 'name', 'strand', 'tst', 'ted', 'tcov']

    if libsizes is None:
        n = 1
        scales = N.ones(len(j2pres))
    else:
        n = len(j2pres)
        scales = [1e6 / float(x) for x in libsizes]
    for pre, scale in zip(j2pres, scales):
        paths = UT.read_pandas(pre + '.paths.txt.gz', names=A2.PATHCOLS)
        paths = paths[paths['chr'] == chrom]
        for st, ed, name, s, tst, ted, tcov in paths[cols].values:
            pc = ','.join(
                name.split(',')[1:-1])  # trim 53exons => intron chain
            pc2st[pc] = min(st, pc2st.get(pc, st))
            pc2ed[pc] = max(ed, pc2ed.get(pc, ed))
            # tst, ted and strand: the last occurrence wins
            pc2tst[pc] = tst
            pc2ted[pc] = ted
            pc2strand[pc] = s
            pc2tcov[pc] = pc2tcov.get(pc, 0) + scale * tcov
    df = PD.DataFrame({
        'st': pc2st,
        'ed': pc2ed,
        'tst': pc2tst,
        'ted': pc2ted,
        'strand': pc2strand,
        'tcov': pc2tcov
    })
    df['chr'] = chrom
    df.index.name = 'name'
    df.reset_index(inplace=True)
    # create bed12: parse name => #exons, esizes, estarts
    df['pc'] = df['name'].copy()
    idxp = df['strand'].isin(['+', '.+'])
    if libsizes is not None:
        df['tcov'] = df['tcov'] / float(n)
    # rebuild the full name by re-attaching the outer coordinates:
    # 'st,pc,ed' for '+'/'.+' strands, 'ed,pc,st' for the others
    df.loc[idxp, 'name'] = [
        '{0},{1},{2}'.format(s, p, e)
        for s, p, e in df[idxp][['st', 'pc', 'ed']].values
    ]
    df.loc[~idxp, 'name'] = [
        '{2},{1},{0}'.format(s, p, e)
        for s, p, e in df[~idxp][['st', 'pc', 'ed']].values
    ]
    df = df.groupby('pc').first()  # get rid of unstranded duplicates
    cmax = 9 + N.log2(N.mean(scales))
    bed = A2.path2bed12(df, cmax)
    # reset sc1 to tcov (from log2(tcov+2)*100)
    bed['sc1'] = bed['tcov']
    GGB.write_bed(bed, path, ncols=12)
    return path
def testsampleinfo(datadir):
    """Load 'bedtools/test-si.txt' from ``datadir`` and add full file paths.

    'bw_path' and 'sjbed_path' are ``datadir + '/'`` prepended to the
    'bwfile' and 'sjbed' columns.
    """
    sipath = os.path.join(datadir, 'bedtools/test-si.txt')
    si = UT.read_pandas(sipath)
    for src, dst in (('bwfile', 'bw_path'), ('sjbed', 'sjbed_path')):
        si[dst] = datadir + '/' + si[src]
    return si
def calc_ecov(expath,
              cipath,
              bwpath,
              dstprefix,
              blocksize=100,
              override=False,
              np=4):
    """Calculate exon coverages.

    Args:
        expath: merged ex
        cipath: chopped interval for ex
        bwpath: bigwig file (sample)
        dstprefix: prefix for outputs (used as is, no separator is added)
        blocksize: passed through to calc_ecov_mp
        override: passed to UT.notstale when deciding whether the cached
            chopped-interval coverage can be reused
        np: number of processes passed to calc_cov_mp / calc_ecov_mp

    Outputs:
        1. dstprefix+'covci.txt.gz': coverage for ci
        2. dstprefix+'ecov.txt.gz' : DataFrame(cols: eid, pid, ecov)

    Returns:
        DataFrame with columns eid, pid, ecov (the table saved as output 2).
    """

    def _to_ints(text):
        # "1,2,3" -> [1, 2, 3]
        return [int(tok) for tok in text.split(',')]

    covcipath = dstprefix + 'covci.txt.gz'
    ecovpath = dstprefix + 'ecov.txt.gz'
    exons = UT.read_pandas(expath)

    if UT.notstale([expath, cipath], covcipath, override):
        cicov = UT.read_pandas(covcipath)
    else:
        # you do not want to override ci, hence the fixed False
        if UT.notstale(expath, cipath, False):
            chopped = UT.read_pandas(
                cipath, names=['chr', 'st', 'ed', 'name', 'id'])
        else:
            chopped = UT.chopintervals(exons, cipath, idcol='_id')
        cicov = calc_cov_mp(chopped, bwpath, covcipath, np=np)

    if 'id' not in cicov.columns:
        cicov['id'] = cicov['sc1']
    if 'pid' not in cicov.columns:
        cicov['pid'] = cicov['name'].astype(str).apply(_to_ints)
        cicov['name1'] = cicov['pid']

    result = exons[['_id', '_pid']].rename(columns={
        '_id': 'eid',
        '_pid': 'pid'
    })
    pid2cov = calc_ecov_mp(cicov, None, np, blocksize)  # pid => cov
    result['ecov'] = [pid2cov[pid] for pid in result['pid']]
    UT.save_tsv_nidx_whead(result[['eid', 'pid', 'ecov']], ecovpath)
    return result
def read_ovl(c, acols, bcols=None):
    """Read an overlap table whose columns are A's, B's, then 'ovl'.

    Args:
        c: path handed to UT.read_pandas.
        acols: list of column names for the A side.
        bcols: column names for the B side; each one gets a 'b_' prefix.
            When None, acols is reused for the B side.

    Returns:
        Whatever UT.read_pandas returns for those column names.
    """
    bnames = acols if bcols is None else bcols
    prefixed = ['b_' + x for x in bnames]
    return UT.read_pandas(c, names=acols + prefixed + ['ovl'])
def calc_cov_ovl_mp(srcname,
                    bwname,
                    dstname,
                    np=1,
                    covciname=None,
                    ciname=None,
                    colname='cov',
                    override=False):
    """Calculate coverage (from BigWig) over intervals (from srcname).

    A column (default 'cov') which contains coverages is added to the
    source dataframe and the result is written to dstname.

    Each exon's coverage is the length-weighted mean of the coverages of
    the chopped intervals that list it: sum(cov * len) / sum(len).

    Args:
        srcname: path to exons tsv, or an already loaded dataframe
        bwname: path to bigwig
        dstname: path for result
        np: number of processors
        covciname: path to covci (coverage for chopped interval dataframe);
            when None it is derived from srcname, which must then be a path
        ciname: path to ci (chopped interval dataframe); when None it is
            derived from srcname, which must then be a path
        colname: name for column which contain calculated coverages
        override: recompute ci and covci even if their files exist

    Returns:
        source dataframe with column (colname) added

    SideEffects:
        The dataframe is saved to dstname. When srcname is a dataframe it
        is modified in place (the new column is added to it). The ci and
        covci files may be (re)written by the helpers that compute them.
    """
    if UT.isstring(srcname):
        exons = UT.read_pandas(srcname)
    else:
        exons = srcname
    # cache paths: default to siblings of the source path.
    # [:-7] drops a 7-character suffix, presumably '.txt.gz' -- TODO confirm
    if covciname is None:
        assert UT.isstring(srcname), \
            'covciname is required when srcname is not a path'
        covciname = srcname[:-7] + '.covci.txt.gz'
    if ciname is None:
        assert UT.isstring(srcname), \
            'ciname is required when srcname is not a path'
        ciname = srcname[:-7] + '.ci.txt.gz'

    if override or (not os.path.exists(covciname)):
        LOG.debug('calculating covci...')
        _sttime = time.time()
        if override or not (os.path.exists(ciname)):
            ci = UT.chopintervals(exons, ciname)
        else:
            ci = UT.read_pandas(ciname,
                                names=['chr', 'st', 'ed', 'name', 'id'])
            ci['name'] = ci['name'].astype(str)
        covci = calc_cov_mp(ci, bwname, covciname, np)
        LOG.debug(' time: {0:.3f}s'.format(time.time() - _sttime))
    else:
        LOG.debug('loading cached covci...')
        covci = UT.read_pandas(covciname)
    # 'name' is split on ',' below, so it has to be a string whichever
    # branch produced covci (a lone id may otherwise be parsed as a number)
    covci['name'] = covci['name'].astype(str)

    # covci: chopped interval's cov => reverse
    # ci => exon id ====> reverse: exon => ci indices
    # exon cov = sum(cicov*cilen)/totlen
    LOG.debug('calculating exon cov...')
    if 'id' not in covci.columns:
        covci['id'] = covci['sc1']
    e2c = {}  # exon id => ids of the chopped intervals that list it
    for i, name in covci[['id', 'name']].values:
        for eid in name.split(','):
            e2c.setdefault(int(eid), []).append(i)
    covci['len'] = covci['ed'] - covci['st']
    covci['val'] = covci['cov'] * covci['len']

    # unique (interval id, exon id) pairs for the exons we were given
    pairs = {(cid, eid) for eid in exons['_id'] for cid in e2c[eid]}
    tmp = PD.DataFrame(list(pairs), columns=['cid', 'eid'])
    c2len = dict(covci[['id', 'len']].values)
    c2val = dict(covci[['id', 'val']].values)
    tmp['val'] = [c2val[x] for x in tmp['cid']]
    tmp['len'] = [c2len[x] for x in tmp['cid']]
    tmpg = tmp.groupby('eid')[['val', 'len']].sum().reset_index()
    tmpg['cov'] = tmpg['val'] / tmpg['len']
    e2cov = dict(tmpg[['eid', 'cov']].values)
    exons[colname] = [e2cov[x] for x in exons['_id']]
    UT.save_tsv_nidx_whead(exons, dstname)
    return exons
def _process_mapbed_chr(dstpre, chrom, genome, chromdir, stranded):
    """Build exon/junction coverage wiggles and a junction-path BED for one chromosome.

    Reads ``dstpre + '.<chrom>.bed'`` line by line (7 tab-separated
    columns: chr, st, ed, name, mapq, strand, mapid) and

    * adds every segment to the '.ex' coverage arrays,
    * for consecutive lines sharing a mapid, adds the gap between them to
      the '.sj' coverage arrays and records it as a splice junction,
    * writes one wiggle per kind ('.ex', '.sj'), strand ('.p', '.n', '.u')
      and suffix ('' for all reads, '.uniq' for reads whose name is not in
      the dupitems table),
    * writes the collected junction paths to
      ``dstpre + '.<chrom>.sjpath.bed'``.

    Existing output files with those names are deleted first.

    Args:
        dstpre: prefix of both the inputs and the outputs.
        chrom: chromosome name.
        genome: genome name, used to look up the chromosome size.
        chromdir: directory handed to FA.GenomeFASTAChroms for sequence
            lookups at junction boundaries.
        stranded: second element of the STRANDMAP lookup key used to pick
            the exon strand suffix.

    Returns:
        None.
    """
    # 1st pass: calc dupdic
    bedpath = dstpre+'.{0}.bed'.format(chrom)
    dupids = UT.read_pandas(dstpre+'.dupitems.txt.gz', index_col=[0]).index
    # 2nd pass make wiggles
    gfc = FA.GenomeFASTAChroms(chromdir)
    chromsize = UT.df2dict(UT.chromdf(genome), 'chr', 'size')[chrom]
    # mqth MAPQ threshold there are ~6% <10
    # generator which makes an array
    fp = open(bedpath,'rb')  # closed by the `with fp:` around the read loop
    wigs = {}
    wigpaths = {}
    for kind in ['.ex','.sj']:
        wigs[kind] = {}
        wigpaths[kind] = {}
        for strand in ['.p','.n','.u']:
            wigs[kind][strand] = {}
            wigpaths[kind][strand] = {}
            for suf in ['','.uniq']:
                wigpath = dstpre+kind+suf+strand+'.{0}.wig'.format(chrom)
                if os.path.exists(wigpath):
                    os.unlink(wigpath)
                wigpaths[kind][strand][suf] = wigpath
                wigs[kind][strand][suf] = N.zeros(chromsize, dtype=float)
    sjs = [] # path: (chr, st, ed, pcode, ucnt, strand, acnt)
    # pcode = a(apos)d(dpos) = a(ed)d(st) if strand=='+' else a(st)d(ed)
    # ucnt = unique read counts
    # acnt = multi-read adjusted all counts (=ucnt+Sum(mcnt(i)/dup(i)))
    # delete previous
    sjbed12 = dstpre+'.{0}.sjpath.bed'.format(chrom)
    if os.path.exists(sjbed12):
        os.unlink(sjbed12)

    def _write_arrays():
        # dump every coverage array to its wiggle file
        for kind in ['.ex','.sj']:
            for strand in ['.p','.n','.u']:
                for suf in ['','.uniq']:
                    cybw.array2wiggle_chr64(wigs[kind][strand][suf], chrom, wigpaths[kind][strand][suf], 'w')

    def _write_sj(sjs):
        # sjs = [(chr,st,ed,pathcode(name),ureads(sc1),strand,tst,ted,areads(sc2),cse),...]
        # Records with the same path name are merged: counts are summed,
        # st/ed take the outermost values, everything else comes from the
        # first record of the group.
        sjdf = PD.DataFrame(sjs, columns=GGB.BEDCOLS[:9]+['cse'])
        sjdfgr = sjdf.groupby('name')
        sj = sjdfgr.first()
        sj['sc1'] = sjdfgr['sc1'].sum().astype(int) # ucnt
        sj['sc2'] = sjdfgr['sc2'].sum().astype(int) # jcnt=ucnt+mcnt
        sj['st'] = sjdfgr['st'].min()
        sj['ed'] = sjdfgr['ed'].max()
        sj['#exons'] = sj['cse'].apply(len)+1
        # block starts/ends relative to st: blocks are the stretches
        # between consecutive junctions
        sj['ests'] = [[0]+[z[1]-st for z in cse] for st,cse in sj[['st','cse']].values]
        sj['eeds'] = [[z[0]-st for z in cse]+[ed-st] for st,ed,cse in sj[['st','ed','cse']].values]
        esizes = [[u-v for u,v in zip(x,y)] for x,y in sj[['eeds','ests']].values]
        sj['estarts'] = ['{0},'.format(','.join([str(y) for y in x])) for x in sj['ests']]
        sj['esizes'] = ['{0},'.format(','.join([str(y) for y in x])) for x in esizes]
        sj['name'] = sj.index
        # sj = sj.reset_index()
        with open(sjbed12, 'w') as f:
            sj[GGB.BEDCOLS].to_csv(f, index=False, header=False, sep='\t', quoting=csv.QUOTE_NONE)

    def _append_sj(cse, css, csj, chrom,ureads,areads):
        # record the junction path of the alignment just finished, if it
        # had any junctions
        if (len(cse)>0): # spits out splice rec
            # chr,st,ed,pathcode,ureads,strand,tst,ted,areads
            tst = cse[0][0]
            ted = cse[-1][1]
            if len(css)>0:
                # majority vote over the strands seen at its junctions
                strand = Counter(css).most_common()[0][0]
            else:
                strand = '.'
            name = pathcode(cse, strand)
            st = int(csj[0][1]) # first segment start
            ed = int(csj[-1][2]) # last segment end
            sjs.append((chrom,st,ed,name,ureads,strand,tst,ted,areads,cse))

    def _add_to_ex_arrays(st,ed,dup,strand):
        kind='.ex'
        strand = STRANDMAP[(strand,stranded)]
        dic = wigs[kind][strand]
        dic[''][st:ed] += 1
        if not dup:
            dic['.uniq'][st:ed] += 1

    def _add_to_sj_arrays(sst,sed,dup,strand):
        kind='.sj'
        s = {'+':'.p','-':'.n','.':'.u'}[strand]
        dic = wigs[kind][s]
        # add to the arrays
        dic[''][sst:sed] += 1
        if not dup:
            dic['.uniq'][sst:sed] += 1
            ureads,areads = 1,1
        else:
            ureads,areads = 0,1
        return ureads,areads

    csj = [] # current collection of spliced reads
    css = [] # current strands
    cse = [] # current (sst,sed)
    csn = 0 # current segment number
    ureads,areads = 1,1 # uniq, total reads it's either 1,1 or 0,1
    pmid = None # previous map id common to spliced segments
    with fp:
        for line in fp:
            rec = line.strip().split(b'\t')
            # 7 column bed: chr(0), st(1), ed(2), name(3), mapq(4), strand(5), mapid(6)
            # NOTE(review): the file is read in binary mode, so rec[3] and
            # rec[5] are bytes; confirm that dupids and the STRANDMAP keys
            # hold bytes too, otherwise these lookups cannot match on
            # Python 3.
            cchr = rec[0].decode()
            st,ed = int(rec[1]),int(rec[2])
            dup = rec[3] in dupids #dic[rec[3]]
            estrand = rec[5]
            _add_to_ex_arrays(st,ed,dup,estrand)
            # process splice
            if pmid != rec[6]: # new map
                _append_sj(cse, css, csj, chrom, ureads, areads)
                csj,css,cse,csn = [rec],[],[],0 # reset running params
            else: # add segments
                csj.append(rec)
                prec = csj[-2] # previous rec
                sst = int(prec[2]) # ed of previous segment
                sed = int(rec[1]) # st of current segment
                cse.append((sst,sed))
                # find strand from the two bases at each end of the gap
                sted = gfc.get(chrom,sst,sst+2)+gfc.get(chrom,sed-2,sed)
                strand = STED2STRAND.get(sted,'.')
                if strand != '.':
                    css.append(strand)
                ureads,areads = _add_to_sj_arrays(sst,sed,dup,strand)
            pmid = rec[6]
    # flush the last alignment
    _append_sj(cse, css, csj, chrom, ureads, areads)
    _write_arrays()
    _write_sj(sjs)