def __init__(self, sjexpre, code, chromdir, rmskviz, outdir, **kw):
    """Load an exon/junction model and make sure derived tables exist.

    Args:
        sjexpre: path prefix of the model; ``<sjexpre>.ex.txt.gz`` and
            ``<sjexpre>.sj.txt.gz`` are read.
        code: identifier joined to ``outdir`` to build the output prefix.
        chromdir: directory passed to ``FA.GenomeFASTAChroms``.
        rmskviz: stored unchanged as ``self.rmskviz``.
        outdir: output directory.
        **kw: overrides applied on top of a copy of ``RMSKPARAMS``.

    Side effects:
        When the exon table lacks a 'glen' or 'tlen' column, they are set
        through ``UT.set_glen_tlen`` and ``<sjexpre>.ex.txt.gz`` is
        rewritten. ``<sjexpre>.unionex.txt.gz`` is created when missing.
    """
    self.sjexpre = sjexpre
    self.prefix = prefix = os.path.join(outdir, code)
    self.fnobj = FN.FileNamesBase(prefix)
    self.chromdir = chromdir
    self.rmskviz = rmskviz
    self.gfc = FA.GenomeFASTAChroms(chromdir)
    self.params = RMSKPARAMS.copy()
    self.params.update(kw)

    self.ex = UT.read_pandas(sjexpre + '.ex.txt.gz')
    self.sj = UT.read_pandas(sjexpre + '.sj.txt.gz')

    if 'glen' not in self.ex or 'tlen' not in self.ex:
        cipath = sjexpre + '.ci.txt.gz'
        if not os.path.exists(cipath):
            # The original passed the undefined name `ex` here (NameError).
            ci = UT.chopintervals(self.ex, cipath)
        else:
            ci = UT.read_ci(cipath)
        UT.set_glen_tlen(self.ex, ci, gidx='_gidx')
        UT.write_pandas(self.ex, sjexpre + '.ex.txt.gz', 'h')

    uexpath = sjexpre + '.unionex.txt.gz'
    if os.path.exists(uexpath):
        self.uex = UT.read_pandas(uexpath)
    else:
        LOG.info('making union exons...saving to {0}'.format(uexpath))
        self.uex = UT.make_unionex(self.ex, '_gidx')
        UT.write_pandas(self.uex, uexpath, 'h')
def _scan_make_map(paths, dstpath):
    """Count, per read name, the distinct map ids seen and save the multi-mappers.

    Each input file is read as bytes, one tab-separated record per line.
    Only 7-field records are used: field 3 is the read name and field 6 the
    map id (the original comments label the fields
    chr,st,ed,name,sc1,strand,tst). Lines with another field count are
    reported with ``print`` and skipped.

    Args:
        paths: iterable of file paths; a path ending in '.gz' is opened
            with gzip, anything else as a plain binary file.
        dstpath: destination handed to ``UT.write_pandas`` (format 'ih').

    The table written has one row per read name that has more than one
    distinct map id, with a single column 'cnt' holding that number.
    """
    cnt = defaultdict(set)

    def _scan(fp, path):
        # Shared by both branches below; the original duplicated this loop.
        for line in fp:
            rec = line.strip().split(b'\t')
            if len(rec) == 7:
                cnt[rec[3]].add(rec[6])  # for each read how many locations?
            else:
                print('wrong#fields:{0} in {1}'.format(len(rec), path))

    for path in paths:
        if path.endswith('.gz'):
            with gzip.open(path) as gz_file:
                with io.BufferedReader(gz_file) as fp:
                    _scan(fp, path)
        else:
            with open(path, 'rb') as fp:
                _scan(fp, path)

    # dict.items() exists on both Python 2 and 3, so the former
    # try-iteritems / bare-except fallback (which would also have hidden
    # unrelated errors) is not needed.
    multi = {k: len(v) for k, v in cnt.items() if len(v) > 1}
    dup = PD.DataFrame(multi, index=['cnt']).T
    UT.write_pandas(dup, dstpath, 'ih')
def __init__(self, sj, me, filepre, depth=500, maxcnt=10000):
    """Initialise as MEGraph3 and add an exon-overlaps-exon lookup.

    The exon table is written twice under ``filepre``, intersected with
    itself through ``BT.bedtoolintersect`` and the same-strand, non-self
    hits are collected into ``self.eoe`` (built from '_id' and the list of
    overlapping 'b__id' values). The three temporary files are removed at
    the end.

    Args:
        sj, me, depth, maxcnt: forwarded to ``MEGraph3.__init__``.
        filepre: path prefix for the temporary files; also kept as
            ``self.pre``.
    """
    MEGraph3.__init__(self, sj, me, depth, maxcnt)
    self.pre = filepre

    bedcols = ['chr', 'st', 'ed', 'strand', '_id']
    # single cell data contains float in st,ed in ex ???
    me = UT.check_int_nan(me)

    # calculate exon overlap to self
    path1 = UT.write_pandas(me[bedcols], filepre + 'ex1.txt.gz', '')
    path2 = UT.write_pandas(me[bedcols], filepre + 'ex2.txt.gz', '')
    ovpath = BT.bedtoolintersect(path1, path2, filepre + 'ov.txt.gz', wao=True)

    ovcols = bedcols + ['b_' + name for name in bedcols] + ['ovl']
    self.ov = ov = UT.read_pandas(ovpath, names=ovcols)

    # select same strand overlap to non-self
    not_self = ov['_id'] != ov['b__id']
    same_strand = ov['strand'] == ov['b_strand']
    self.ov1 = ov1 = ov[not_self & same_strand]

    # make connected dictionary _id => [b__id's]
    grouped = ov1.groupby('_id')['b__id'].apply(lambda grp: list(grp))
    tmp = grouped.reset_index()
    if 'index' in tmp.columns:
        tmp['_id'] = tmp['index']
    self.eoe = dict(UT.izipcols(tmp, ['_id', 'b__id']))

    # cleanup
    for tmppath in (path1, path2, ovpath):
        os.unlink(tmppath)
def as2exsj(dstpre, np=7):
    """Convert the A2 assembler tables under ``dstpre`` into sj/ex/ci files.

    Reads ``<dstpre>.exdf/.sjdf/.sedf/.paths.txt.gz``, shifts junction
    starts by +1, merges exons with the 'sedf' table, annotates both tables
    and runs ``GP.find_genes4``. Writes ``<dstpre>.ex.txt.gz``,
    ``<dstpre>.sj.txt.gz`` and ``<dstpre>.ci.txt.gz``.

    Args:
        dstpre: path prefix of the input and output files.
        np: passed to ``GP.find_genes4``.

    Returns:
        (sj, ex) dataframes.
    """
    def _load(suffix, names):
        return UT.read_pandas(dstpre + suffix, names=names)

    exdf = _load('.exdf.txt.gz', A2.EXDFCOLS)
    sj = _load('.sjdf.txt.gz', A2.SJDFCOLS)
    sedf = _load('.sedf.txt.gz', A2.EXDFCOLS)
    _load('.paths.txt.gz', A2.PATHCOLS)  # read as before; result not used here

    sj['st'] = sj['st'] + 1

    excols = A2.EXDFCOLS
    ex = PD.concat([exdf[excols], sedf[excols]], ignore_index=True)
    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components) set '_gidx'
    # need unique prefix for parallel processing
    prefix = os.path.abspath(dstpre + str(uuid.uuid4()) + '_')
    GP.find_genes4(sj, ex,
                   filepre=prefix,
                   np=np,
                   override=False,
                   separatese=True)

    for kind in ('3', '5'):
        ex.loc[ex['kind'] == kind, 'cat'] = kind

    UT.write_pandas(ex, dstpre + '.ex.txt.gz', 'h')
    UT.write_pandas(sj, dstpre + '.sj.txt.gz', 'h')
    UT.chopintervals(ex, dstpre + '.ci.txt.gz')
    return sj, ex
def test_count_repeats_viz_mp(outdir, testbed):
    """Run RP.count_repeats_viz_mp on a tiny table with expand=0 and expand=10.

    The tables are printed for manual inspection; the test only checks that
    the calls complete.
    """
    TESTDATA = StringIO("""st,ed,name,sc1,chr,strand,_id
0,10,a,0,chr1,+,1
5,20,a,1,chr1,-,2
25,30,a,1,chr1,+,3
40,45,b,2,chr1,-,4
45,50,b,2,chr1,+,5
49,55,c,2,chr1,+,6
255,260,d,3,chr2,-,7
260,270,d,4,chr2,+,8
370,380,e,4,chr2,-,9
380,390,e,5,chr2,+,10
""")
    TESTDATA2 = StringIO("""st,ed,name,sc1,chr,strand
0,5,a1,0,chr1,+
9,20,a2,1,chr1,-
31,35,a3,1,chr1,+
40,45,b1,2,chr1,-
45,47,b2,2,chr1,+
56,70,c1,2,chr1,+
200,210,d1,3,chr2,-
260,280,d2,4,chr2,+
391,400,e1,4,chr2,-
""")
    # DataFrame.from_csv was removed from pandas; read_csv is its replacement.
    df = PD.read_csv(TESTDATA, sep=",", index_col=False)
    rmsk = PD.read_csv(TESTDATA2, sep=",", index_col=False)
    path = os.path.join(outdir, 'rmsktest.bed.gz')
    UT.write_pandas(rmsk[['chr', 'st', 'ed', 'name', 'sc1', 'strand']], path, '')
    print(df)
    print(rmsk)
    rslt = RP.count_repeats_viz_mp(df, path, expand=0)
    print(df)
    rslt = RP.count_repeats_viz_mp(df, path, expand=10)
    print(df)
def calc_exon_params(self, np=10, covfactor=0.05): zoom = self.zoom # get params neipath = self.dstpre + '.{0}.{1}.nei0.params.txt.gz'.format( self.refcode, covfactor) e53path = self.dstpre + '.{0}.{1}.e53.params.txt.gz'.format( self.refcode, covfactor) if os.path.exists(neipath): print('reading from cache {0}'.format(neipath)) nei = UT.read_pandas(neipath) else: nei = self.calc_params_mp(self.ne_i0, np=np, gapmode='i', covfactor=covfactor) # ~ 1min UT.write_pandas(nei, neipath, 'h') if os.path.exists(e53path): print('reading from cache {0}'.format(e53path)) e53 = UT.read_pandas(e53path) else: e53 = self.calc_params_mp( self.e53, np=np, gapmode='i', covfactor=covfactor) # ~ 10min don't do long ones stupid UT.write_pandas(e53, e53path, 'h') # logistic fit cols = [ 'chr', 'st', 'ed', 'gap', 'emax', 'emin', 'sIn', 'sOut', 'locus', 'kind', 'len', 'sdIn', 'sdOut', 'mp' ] nei['kind'] = 1 e53['kind'] = 0 nei['len'] = nei['ed'] - nei['st'] e53['len'] = e53['ed'] - e53['st'] D = PD.concat([nei[cols], e53[cols]], ignore_index=True) D['llen'] = N.log10((D['len'])) D['lgap'] = N.log10(D['gap'] + 1) D['lemax'] = N.log2(zoom * D['emax'] + 1) D1 = D[(D['emax'] > 0) & (D['sdIn'] != 0) & (D['sdOut'] != 0)] print(len(D), len(D1)) X = D1[['lemax', 'lgap', 'llen', 'mp']].values Y = D1['kind'].values lr = LogisticRegression() lr.fit(X, Y) Z = lr.predict(X) # write json ppath = self.dstpre + '.{0}.exonparams.json'.format(self.refcode) self.write_params(ppath, lr, Y, Z, ['lemax', 'lgap', 'llen', 'mp'], { 'zoom': zoom, 'th': covfactor }) # make fig spath = self.dstpre + '.{0}.exonparams'.format(self.refcode) title = self.dstpre.split('/')[-1] self.plot_exon_fit(spath + '.0.png', title, X, Y, Z, ptyp='both') self.plot_exon_fit(spath + '.pdf', title, X, Y, Z, ptyp='pdf') self.plot_exon_fit(spath + '.png', title, X, Y, Z, ptyp='png') return locals()
def count_repeats_viz_chr(bedpath, rmskpath, outpath):
    """Intersect one chromosome's targets with repeats and list names per target.

    ``BT.bedtoolintersect`` is run with ``wao=True`` into ``outpath``; its
    result is read back, every hit is rendered as ``b_name(strand)`` and the
    hits of each 'name' are joined with commas. The grouped table is then
    written to ``outpath`` again, replacing the intersect output.

    Args:
        bedpath: target intervals (4 columns: chr, st, ed, name).
        rmskpath: repeat intervals (5 columns: chr, st, ed, name, strand).
        outpath: intermediate and final output path.
    """
    ovlpath = BT.bedtoolintersect(bedpath, rmskpath, outpath, wao=True)
    names = [
        'chr', 'st', 'ed', 'name',
        'b_chr', 'b_st', 'b_ed', 'b_name', 'strand',
        'ovl',
    ]
    hits = UT.read_pandas(ovlpath, names=names)
    hits['rn'] = hits['b_name'] + '(' + hits['strand'] + ')'
    # group and concat repname
    joined = hits.groupby('name')['rn'].apply(lambda rns: ','.join(list(rns)))
    UT.write_pandas(joined.reset_index(), outpath, 'h')
def make_sjex(gtfpath, dstpre, np=12):
    """Build sj/ex/ci files from a GTF.

    Args:
        gtfpath: either a path (anything ``UT.isstring`` accepts, read with
            ``GGB.read_gtf``) or an already loaded GTF table.
        dstpre: prefix for ``.ex.txt.gz``, ``.sj.txt.gz`` and ``.ci.txt.gz``.
        np: passed to ``gtf2exonsj``.

    Returns:
        dict with keys 'sj' and 'ex'.
    """
    gtf = GGB.read_gtf(gtfpath) if UT.isstring(gtfpath) else gtfpath
    sj, ex = gtf2exonsj(gtf, np=np)
    print(ex.groupby(['kind', 'cat']).size())

    # exons whose 'kind' is 5 or 3 get the same value as their 'cat'
    for kind in ('5', '3'):
        ex.loc[ex['kind'] == kind, 'cat'] = kind

    UT.write_pandas(ex, dstpre + '.ex.txt.gz', 'h')
    UT.write_pandas(sj, dstpre + '.sj.txt.gz', 'h')
    # make ci
    UT.chopintervals(ex, dstpre + '.ci.txt.gz')
    return {'sj': sj, 'ex': ex}
def make_sjex(self, np=4):
    """Create the sj and ex tables from ``self.gtfpath`` and save them.

    Args:
        np: passed to ``gtf2exonsj``.

    Returns:
        (sj, ex) dataframes.

    Raises:
        RuntimeError: if ``self.gtfpath`` does not exist.
    """
    sjpath, expath = self.sjexpaths()
    gtfpath = self.gtfpath
    if not os.path.exists(gtfpath):
        raise RuntimeError('file {0} does not exist'.format(gtfpath))

    LOG.info('making sj,ex...')
    gtf = GGB.read_gtf(self.gtfpath)  # ~ 1.5 min

    # convert gtf to sjex; a uuid keeps the graph prefix unique per call
    graphpre = self.fname('graphpre{0}_'.format(uuid.uuid4()))
    sj, ex = gtf2exonsj(gtf, np=np, graphpre=graphpre)

    # save
    for table, dest in ((sj, sjpath), (ex, expath)):
        UT.write_pandas(table, dest, 'h')
    return sj, ex
def test_trim_ex(g4sjex, tmpdir):
    """TE.trim_ex on the first 20 genes, half of them moved to 'chr2'."""
    _, ex0 = g4sjex
    if 'len' not in list(ex0.columns):
        ex0['len'] = ex0['ed'] - ex0['st']

    gene_ids = ex0['gene_id'].unique()[:20]
    ex = ex0[ex0['gene_id'].isin(gene_ids)].copy()
    # second half of the selected genes goes to another chromosome
    second_half = ex['gene_id'].isin(gene_ids[10:])
    ex.loc[second_half, 'chr'] = 'chr2'

    workdir = str(tmpdir)
    expath = os.path.join(workdir, 'ex.txt.gz')
    UT.write_pandas(ex, expath, 'h')
    dstpath = os.path.join(workdir, 'tex.txt.gz')
    dstcipath = os.path.join(workdir, 'texci.txt.gz')

    tex = TE.trim_ex(expath, dstpath, dstcipath, 1000, 'gene_id', 2)

    assert len(ex) == 58
    assert len(tex) == 25
    assert os.path.exists(dstpath)
    assert os.path.exists(dstcipath)
def extract_exi53(self):
    """Relate internal exons (cat 'i') to 5' and 3' exons (cat '5'/'3').

    Sets:
        self.e5ia / self.e3ia: rows of ``BT.calc_ovlratio`` (5'/3' exons
            against internal exons) with ovlratio == 1, with the column
            'name' renamed to '_id'.
        self.e5i / self.e3i: first intersect row per internal exon '_id'
            that shares 'ed' (+ strand) or 'st' (- strand) with a 5' exon,
            respectively 'st' (+ strand) or 'ed' (- strand) with a 3' exon.

    The ``.ex5-ovl-exi`` / ``.ex3-ovl-exi`` files are written first by
    ``calc_ovlratio`` and then overwritten by ``bedtoolintersect``, so the
    call order below matters.
    """
    # internal exons overlapping with either 5 or 3 prime exons?
    bedcols = ['chr', 'st', 'ed', '_id', 'sc1', 'strand']
    ovcols = bedcols + ['b_' + name for name in bedcols] + ['ovl']
    ex = self.ex
    refpre = self.refpre

    def _write_cat(cat, suffix):
        subset = ex[ex['cat'] == cat]
        return UT.write_pandas(subset[bedcols], refpre + suffix, '')

    exi_path = _write_cat('i', '.exi.bed.gz')  # internal exons
    ex5_path = _write_cat('5', '.ex5.bed.gz')
    ex3_path = _write_cat('3', '.ex3.bed.gz')

    ov5_path = refpre + '.ex5-ovl-exi.txt.gz'
    ov3_path = refpre + '.ex3-ovl-exi.txt.gz'
    ncols = len(bedcols)
    ratio5 = BT.calc_ovlratio(ex5_path, exi_path, ov5_path, ncols, ncols)
    ratio3 = BT.calc_ovlratio(ex3_path, exi_path, ov3_path, ncols, ncols)
    self.e5ia = ratio5[ratio5['ovlratio'] == 1].rename(columns={'name': '_id'})
    self.e3ia = ratio3[ratio3['ovlratio'] == 1].rename(columns={'name': '_id'})

    # find internal exons which shares st or ed with 5 or 3 exons
    ov5_path = BT.bedtoolintersect(exi_path, ex5_path, ov5_path, wao=True)
    ov3_path = BT.bedtoolintersect(exi_path, ex3_path, ov3_path, wao=True)
    # read tmp file
    ov5 = UT.read_pandas(ov5_path, names=ovcols)
    ov3 = UT.read_pandas(ov3_path, names=ovcols)

    minus5 = (ov5['strand'] == '-') & (ov5['st'] == ov5['b_st'])
    plus5 = (ov5['strand'] == '+') & (ov5['ed'] == ov5['b_ed'])
    minus3 = (ov3['strand'] == '-') & (ov3['ed'] == ov3['b_ed'])
    plus3 = (ov3['strand'] == '+') & (ov3['st'] == ov3['b_st'])

    self.e5i = ov5[minus5 | plus5].groupby('_id').first().reset_index()
    self.e3i = ov3[minus3 | plus3].groupby('_id').first().reset_index()
    LOG.info('#e5i={0}'.format(len(self.e5i)))
    LOG.info('#e3i={0}'.format(len(self.e3i)))
def extract_nonovl_exons(self):
    """Select exons that overlap no other exon and are not covered by a junction.

    Exons and junctions with st >= ed are ignored. Both are written as
    4-column bed files under ``self.refpre`` and intersected with
    ``BT.bedtoolintersect``.

    Sets:
        self.ne_i0: exons whose only exon-exon intersect row is with
            themselves (group size 1), with a 'len' column.
        self.nov_ex: the subset of those whose maximum junction overlap
            ratio is below 1, with a 'len' column.
        self.ne_i, self.ne_5, self.ne_3, self.ne_s: ``nov_ex`` split by
            'cat' ('i', '5', '3', 's').
    """
    ex = self.ex
    sj = self.sj
    ex = ex[ex['st'] < ex['ed']]
    sj = sj[sj['st'] < sj['ed']]
    # nonovl exons
    cols0 = ['chr', 'st', 'ed', '_id']
    a = UT.write_pandas(ex[cols0], self.refpre + '.ex.bed.gz', '')
    b = UT.write_pandas(sj[cols0], self.refpre + '.sj.bed.gz', '')
    c1 = self.refpre + '.ex-ovl-sj.txt.gz'
    c2 = self.refpre + '.ex-ovl-ex.txt.gz'
    c1 = BT.bedtoolintersect(a, b, c1, wao=True)
    c2 = BT.bedtoolintersect(a, a, c2, wo=True)
    cols = cols0 + ['b_' + x for x in cols0] + ['ovl']

    # exon vs junction: fraction of the exon covered by a junction
    sov = UT.read_pandas(c1, names=cols)
    sov['len'] = sov['ed'] - sov['st']
    sov['ovlratio'] = sov['ovl'] / sov['len']
    sovg = sov.groupby('_id')['ovlratio'].max()
    snonov = sovg[sovg < 1.]  # not completely covered by junction

    # exon vs exon
    eov = UT.read_pandas(c2, names=cols)
    eovsize = eov.groupby('_id').size()
    enonov = eovsize[eovsize == 1]  # only overlaps with self

    exbyid = ex.set_index('_id')
    # DataFrame.ix no longer exists in pandas; '_id' labels are looked up
    # with .loc instead.
    self.ne_i0 = ne_i0 = exbyid.loc[enonov.index].sort_values(
        ['chr', 'st', 'ed']).reset_index()
    self.ne_i0['len'] = ne_i0['ed'] - ne_i0['st']
    LOG.info('#non-ovl-ex0={0}'.format(len(enonov)))
    LOG.info('#non-ex-ovl-ex={0}, #non-sj-ovl-ex={1}'.format(
        len(enonov), len(snonov)))

    ids = set(enonov.index).intersection(snonov.index)
    LOG.info('#non-ovl-ex={0}'.format(len(ids)))
    # .loc does not accept a set as indexer; a sorted list also makes the
    # row order deterministic before sorting by position.
    self.nov_ex = novex = exbyid.loc[sorted(ids)].sort_values(
        ['chr', 'st', 'ed']).reset_index()
    novex['len'] = novex['ed'] - novex['st']
    self.ne_i = novex[novex['cat'] == 'i']
    self.ne_5 = novex[novex['cat'] == '5']
    self.ne_3 = novex[novex['cat'] == '3']
    self.ne_s = novex[novex['cat'] == 's']
def calc_overlaps(self):
    """Intersect target and reference models at exon and at gene level.

    Sets:
        self.ex_tgt, self.ex_ref: the 'ex' models of ``self.cn_tgt`` and
            ``self.cn_ref``. The reference gets '_gidx' copied from
            ``self.refgidxcol`` and both get a 'len' column if missing.
        self.ov: exon-level output of ``BT.bedtoolintersect(wao=True)``.
        self.gov: the same for per-gene spans (min 'st' to max 'ed' of
            each '_gidx' group).
    """
    ref = self.cn_ref
    tgt = self.cn_tgt
    tgt_bed = tgt.fname('cptmp.ex.bed.gz')
    ref_bed = ref.fname('cptmp.ex.bed.gz')
    ovl_out = tgt.fname2('cptmp.ex.ovl.txt.gz', ref.code)
    excols = ['chr', 'st', 'ed', 'cat', '_id', '_gidx', 'len', 'strand']

    self.ex_tgt = etgt = tgt.model('ex')
    self.ex_ref = eref = ref.model('ex')
    eref['_gidx'] = eref[self.refgidxcol]
    for table in (etgt, eref):
        if 'len' not in table.columns:
            table['len'] = table['ed'] - table['st']

    tgt_bed = UT.write_pandas(etgt[excols], tgt_bed, '')
    ref_bed = UT.write_pandas(eref[excols], ref_bed, '')
    ovl_out = BT.bedtoolintersect(tgt_bed, ref_bed, ovl_out, wao=True)
    self.ov = UT.read_pandas(
        ovl_out, names=excols + ['b_' + name for name in excols] + ['ovl'])

    # gene overlap
    gcols = ['chr', 'st', 'ed', 'strand']

    def _gbed(exons):
        bygene = exons.groupby('_gidx')
        span = bygene[gcols].first()
        span['st'] = bygene['st'].min()
        span['ed'] = bygene['ed'].max()
        return span.reset_index()

    gene_tgt = _gbed(etgt)
    gene_ref = _gbed(eref)
    gcols2 = gcols + ['_gidx']
    gtgt_bed = tgt.fname('cptmp.gene.bed.gz')
    gref_bed = ref.fname('cptmp.gene.bed.gz')
    govl_out = tgt.fname2('gene.ovl.txt.gz', ref.code)
    gtgt_bed = UT.write_pandas(gene_tgt[gcols2], gtgt_bed, '')
    gref_bed = UT.write_pandas(gene_ref[gcols2], gref_bed, '')
    govl_out = BT.bedtoolintersect(gtgt_bed, gref_bed, govl_out, wao=True)
    self.gov = UT.read_pandas(
        govl_out, names=gcols2 + ['b_' + name for name in gcols2] + ['ovl'])
def as3exsj(dstpre, minelen=150, np=7):
    """Convert the A3 assembler tables under ``dstpre`` into sj/ex/ci files.

    Reads ``<dstpre>.exdf/.sjdf/.sedf/.paths.txt.gz``, shifts junction
    starts by +1, merges exons with the 'sedf' table, annotates both tables
    and runs ``GP.find_genes3``. Genes (``_gidx`` groups) whose summed exon
    length is below ``minelen`` are dropped from both tables before
    ``.ex.txt.gz``, ``.sj.txt.gz`` and ``.ci.txt.gz`` are written.

    Args:
        dstpre: path prefix of the input and output files.
        minelen: minimum total exon length per gene.
        np: passed to ``GP.find_genes3``.

    Returns:
        (sj2, ex2): the filtered junction and exon dataframes.
    """
    ex = UT.read_pandas(dstpre + '.exdf.txt.gz', names=A3.EXDFCOLS)
    sj = UT.read_pandas(dstpre + '.sjdf.txt.gz', names=A3.SJDFCOLS)
    se = UT.read_pandas(dstpre + '.sedf.txt.gz', names=A3.EXDFCOLS)
    # NOTE(review): the paths table is not used below; the read is kept so a
    # missing paths file still fails here as before -- confirm it is needed.
    UT.read_pandas(dstpre + '.paths.txt.gz', names=A3.PATHCOLS)

    sj['st'] = sj['st'] + 1
    cols = A3.EXDFCOLS
    ex = PD.concat([ex[cols], se[cols]], ignore_index=True)
    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components) set '_gidx'
    # don't use exon overlap as connection
    GP.find_genes3(sj, ex, np=np, override=False)
    ex.loc[ex['kind'] == '3', 'cat'] = '3'
    ex.loc[ex['kind'] == '5', 'cat'] = '5'

    # remove these with elen smaller than minelen
    ex['len'] = ex['ed'] - ex['st']
    exsiz = ex.groupby('_gidx')['len'].sum()
    rgidx = exsiz[exsiz < minelen].index.values
    ex2 = ex[~ex['_gidx'].isin(rgidx)]
    sj2 = sj[~sj['_gidx'].isin(rgidx)]
    # Report real exon counts; the old message subtracted the number of
    # removed genes from the number of exons.
    LOG.info('minelen filter #ex {0}=>{1}'.format(len(ex), len(ex2)))

    # write
    UT.write_pandas(ex2, dstpre + '.ex.txt.gz', 'h')
    UT.write_pandas(sj2, dstpre + '.sj.txt.gz', 'h')
    UT.chopintervals(ex2, dstpre + '.ci.txt.gz')
    return sj2, ex2
def save(self):
    """Write the stats, ratio and detection-percentage outputs.

    Three files are produced through ``self.en2.fname2`` (category
    'output'), all tagged with ``<en1.code>.<datacode>``:

    * ``stats.json``: ``self.stats`` dumped as JSON.
    * ``ratios.txt.gz``: all frames of ``self.ratios`` concatenated; each
      frame first gets a 'kind' column holding its dictionary key (the
      frames in ``self.ratios`` are modified in place).
    * ``dp.txt.gz``: result of ``self.get_detection_percentages()``.
    """
    tag = '{0}.{1}'.format(self.en1.code, self.datacode)

    # light weight stats ==> json
    statspath = self.en2.fname2('stats.json', tag, category='output')
    UT.makedirs(os.path.dirname(statspath))
    with open(statspath, 'w') as fp:
        json.dump(self.stats, fp)

    # per-kind ratio tables ==> one dataframe with a 'kind' column
    ratiospath = self.en2.fname2('ratios.txt.gz', tag, category='output')
    frames = []
    for kind, frame in self.ratios.items():
        frame['kind'] = kind
        frames.append(frame)
    UT.write_pandas(PD.concat(frames, ignore_index=True), ratiospath, 'h')

    # DP
    dp = self.get_detection_percentages()
    dppath = self.en2.fname2('dp.txt.gz', tag, category='output')
    UT.write_pandas(dp, dppath, 'ih')
def calculate(self):
    """Annotate union exons with repeat information and summarise per gene.

    Base pair overlap to repeats is calculated with ``count_repeats_mp``
    (using the genome with repeats masked to lower case, column '#repbp')
    and exon level overlap with ``count_repeats_viz_mp`` (using the UCSC
    RepeatMaskerViz track, column 'repnames'). The per-gene summary built
    by ``self._make_gbed`` is kept as ``self.ugb`` and written to the
    'all.genes.stats' output file.
    """
    params = self.params
    ncpu = params['np']

    with_bp = count_repeats_mp(self.uex, self.gfc, np=ncpu, col='#repbp')
    with_names = count_repeats_viz_mp(with_bp,
                                      self.rmskviz,
                                      np=ncpu,
                                      idcol='_id',
                                      expand=0,
                                      col='repnames')
    summary = self._make_gbed(self.ex,
                              self.sj,
                              with_names,
                              datacode=params['datacode'],
                              gname=params['gname'])
    self.ugb = summary
    outpath = self.fnobj.txtname('all.genes.stats', category='output')
    UT.write_pandas(summary, outpath, 'h')
def filter_sjexdf(mdstpre, rdstpre):
    """Restrict the exdf/sjdf tables of one prefix to elements of another model.

    Rows of ``<mdstpre>.exdf`` + ``.sedf`` (and ``.sjdf``) are kept when
    their 'name' occurs on the same chromosome in ``<rdstpre>.ex.txt.gz``
    (respectively ``.sj.txt.gz``). Results go to ``<rdstpre>.exdf.txt.gz``
    and ``<rdstpre>.sjdf.txt.gz``.
    """
    def _keep_known(table, model):
        # compare names chromosome by chromosome
        kept = []
        for chrom in table['chr'].unique():
            rows = table[table['chr'] == chrom]
            known = set(model[model['chr'] == chrom]['name'].values)
            mask = [name in known for name in rows['name']]
            kept.append(rows[mask])
        return PD.concat(kept, ignore_index=True)

    exdf = PD.concat(
        [UT.read_pandas(mdstpre + '.exdf.txt.gz', names=A3.EXDFCOLS),
         UT.read_pandas(mdstpre + '.sedf.txt.gz', names=A3.EXDFCOLS)],
        ignore_index=True)
    sjdf = UT.read_pandas(mdstpre + '.sjdf.txt.gz', names=A3.SJDFCOLS)
    ex = UT.read_pandas(rdstpre + '.ex.txt.gz')
    sj = UT.read_pandas(rdstpre + '.sj.txt.gz')

    new_exdf = _keep_known(exdf, ex)
    new_sjdf = _keep_known(sjdf, sj)
    UT.write_pandas(new_exdf, rdstpre + '.exdf.txt.gz', '')
    UT.write_pandas(new_sjdf, rdstpre + '.sjdf.txt.gz', '')
def gtf_from_bed12(modelpre, dstpath=None, source='.'):
    """Write a GTF (exon records only) from a model's BED12 paths file.

    Reads ``<modelpre>.paths.withse.bed.gz`` and ``<modelpre>.ex.txt.gz``.
    Each path gets the gene name of the exon named by the part of its
    'name' before the first '|' on the same chromosome, and a transcript
    name ``<gname>.<k>`` where k counts the paths of that gene from 1.
    Also writes ``<modelpre>.idmap.txt.gz`` with the columns
    id, chr, name, tname, gname.

    Args:
        modelpre: path prefix of the model.
        dstpath: GTF destination; defaults to
            ``<modelpre>.paths.withse.gtf.gz``.
        source: value of the GTF source column.

    Returns:
        The GTF dataframe (columns ``GGB.GTFCOLS``).
    """
    bedpath = modelpre + '.paths.withse.bed.gz'
    paths = GGB.read_bed(bedpath)
    ex = UT.read_pandas(modelpre + '.ex.txt.gz')
    # key by chr:name -- there may be same st,ed in different chromosome
    ex['id'] = ex['chr'] + ':' + ex['name']
    n2gn = UT.df2dict(ex, 'id', 'gname')
    paths['id'] = paths['chr'] + ':' + paths['name']
    paths['id0'] = paths['chr'] + ':' + paths['name'].str.split('|').str[0]
    paths['gname'] = [n2gn[x] for x in paths['id0']]

    # number the transcripts of each gene: gname.1, gname.2, ...
    g2cnt = {}
    tnames = []
    for gname in paths['gname']:
        i = g2cnt.get(gname, 1)
        tnames.append('{0}.{1}'.format(gname, i))
        g2cnt[gname] = i + 1
    paths['tname'] = tnames

    txt = 'gene_id "{0}"; transcript_id "{1}"; exon_number "{2}";'

    def _gen():
        cols = ['chr', 'st', 'ed', 'gname', 'tname', 'esizes', 'estarts', 'strand']
        for c, s, e, gn, tn, esi, est, strand in paths[cols].values:
            # the last comma-separated field is dropped (as in the original;
            # presumably the lists end with a trailing comma)
            esizes = [int(x) for x in esi.split(',')[:-1]]
            estarts = [int(x) for x in est.split(',')[:-1]]
            for i, (size, offset) in enumerate(zip(esizes, estarts)):
                exst = s + offset
                exed = exst + size
                extra = txt.format(gn, tn, i + 1)
                # GTF start is written as exst + 1
                yield (c, source, 'exon', exst + 1, exed, '.', strand, '.', extra)

    df = PD.DataFrame(list(_gen()), columns=GGB.GTFCOLS)
    if dstpath is None:
        # The original referenced an undefined `bedpath` here (NameError).
        dstpath = modelpre + '.paths.withse.gtf.gz'
    GGB.write_gtf(df, dstpath)

    idf = paths[['id', 'chr', 'name', 'tname', 'gname']]
    UT.write_pandas(idf, modelpre + '.idmap.txt.gz', 'h')
    return df
def make_sjexci(path, np):
    """Create sj, ex and ci files next to a gene model file.

    The file type is taken from the extension (after an optional '.gz'):
    '.gtf', '.bed' (bed12) or '.txt' (UCSC download; the path must contain
    'knownGene' or 'refGene'). Outputs are written to
    ``<path without extension>.sj.txt.gz``, ``.ex.txt.gz`` and
    ``.ci.txt.gz``.

    Args:
        path: input file path.
        np: passed to the converter (gtf2exonsj / bed2exonsj / kg2exonsj).

    Returns:
        (sj, ex) dataframes.

    Raises:
        ValueError: unknown extension, missing file, or a '.txt' file whose
            name identifies neither knownGene nor refGene.
    """
    bpath = path[:-3] if path[-3:] == '.gz' else path
    ext = bpath[-4:]
    if ext not in ['.gtf', '.bed', '.txt']:
        raise ValueError('unknown filetype {0}, should be either .gtf,.bed (bed12),.txt (ucsc knownGene)'.format(ext))
    pathprefix = bpath[:-4]
    if not os.path.exists(path):
        # report the missing path (the old message only showed the extension)
        raise ValueError('{0} file does not exist'.format(path))

    if ext == '.gtf':
        df = GGB.read_gtf(path).sort_values(['chr', ])
        sj, ex = gtf2exonsj(df, np=np)
    elif ext == '.bed':
        df = GGB.read_bed(path)
        sj, ex = bed2exonsj(df, np=np)
    else:  # '.txt': UCSC download
        if 'knownGene' in path:
            df = GGB.read_ucsc_knownGene(path)
        elif 'refGene' in path:
            df = GGB.read_ucsc_refGene(path)
        else:
            # Previously fell through and failed later on undefined sj/ex.
            raise ValueError(
                'cannot tell UCSC table type of {0}: file name should '
                'contain knownGene or refGene'.format(path))
        sj, ex = kg2exonsj(df, np=np)  # refGene is handled same as kg

    # save
    LOG.info('saving sj to {0}'.format(pathprefix + '.sj.txt.gz'))
    UT.write_pandas(sj, pathprefix + '.sj.txt.gz', 'h')
    LOG.info('saving ex to {0}'.format(pathprefix + '.ex.txt.gz'))
    UT.write_pandas(ex, pathprefix + '.ex.txt.gz', 'h')
    # make ci
    UT.chopintervals(ex, pathprefix + '.ci.txt.gz')
    return sj, ex
def write(self):
    """Save this bundle's exon, junction and path tables plus a BED12 of paths.

    File names start with ``<dstpre>.<chrom>_<st>_<ed>``. The three tables
    are written without header (so bundles can be concatenated). Paths
    with 'tcov' >= ``self.tcovth`` are converted with ``A2.path2bed12``,
    kept as ``self.bed12`` and written as a 12-column BED.
    """
    pre = self.dstpre + '.{0}_{1}_{2}'.format(self.chrom, self.st, self.ed)

    # 1) exon, junctions, allpaths => csv (no header <= to concatenate bundles)
    tables = (
        (self.exdf, A2.EXDFCOLS, '.covs.exdf.txt.gz'),
        (self.sjdf, A2.SJDFCOLS, '.covs.sjdf.txt.gz'),
        (self.paths, A2.PATHCOLS, '.covs.paths.txt.gz'),
    )
    for frame, columns, suffix in tables:
        UT.write_pandas(frame[columns], pre + suffix, '')

    # write colored bed12 for tcov > th
    covered = self.paths['tcov'] >= self.tcovth
    selected = self.paths[covered].copy()
    self.bed12 = A2.path2bed12(selected, cmax=9, covfld='tcov')
    GGB.write_bed(self.bed12, pre + '.covs.paths.bed.gz', ncols=12)
def savemodel(self, which, code2=None, category='temp'):
    """Save model.

    If code2 is None, overwrite original, if code2 is provided,
    writes to outdir/(code).(code2).(which).txt.gz.

    Args:
        which: 'sj','ex','ci'
        code2: 2nd identifier
        category: filename category (default 'temp')

    Returns:
        file path or None (if model is not loaded)
    """
    # nothing to save when the attribute was never set
    if not hasattr(self, which):
        return None
    if code2 is not None:
        path = self.fname2('{0}.txt.gz'.format(which), code2, category=category)
    else:
        path = self.modelpath(which, None)
    return UT.write_pandas(getattr(self, which), path, 'h')
def count_repeats_viz_mp(beddf,
                         rmskvizpath,
                         idcol='_id',
                         np=3,
                         prefix=None,
                         expand=0,
                         col='repnames'):
    """Report names of rmsk-viz repeats overlapping each (unioned) exon.

    Work is done per chromosome with Bedtools (``count_repeats_viz_chr``
    run through ``UT.process_mp``). Per-chromosome repeat files
    ``<rmskvizpath>.<chrom>.bed.gz`` are reused when they exist for every
    chromosome in ``beddf``; otherwise they are (re)written for all of
    them.

    Args:
        beddf: Pandas DataFrame with chr,st,ed cols; when calculating for
            genes, unioned bed should be used (use utils.make_unionex)
        rmskvizpath: path to repeat masker viz BED7 file
            (created using rmskviz2bed7)
        idcol: colname for unique row id (default _id)
        np: number of CPU to use
        prefix: path prefix for temp files; if not None temp files are
            kept (default None: a uuid-based prefix next to rmskvizpath,
            and temp files are deleted afterwards)
        expand: how many bases to expand exon region in each side
            (default 0); starts are clipped at 0
        col: column name to put in overlapping repeat names

    Returns:
        beddf, modified in place with the new column ``col``.
    """
    cleanup = prefix is None
    if cleanup:
        prefix = os.path.join(os.path.dirname(rmskvizpath),
                              str(uuid.uuid4()) + '_')

    def _repeat_path(chrom):
        return rmskvizpath + '.{0}.bed.gz'.format(chrom)  # reuse

    # chrom-wise
    chroms = sorted(beddf['chr'].unique())
    # split rmskviz unless every per-chromosome file is already there
    splitrmsk = any(not os.path.exists(_repeat_path(c)) for c in chroms)
    if splitrmsk:
        rmsk = GGB.read_bed(rmskvizpath)

    args = []
    bfiles = []
    ofiles = []
    for chrom in chroms:
        bpath = prefix + 'tgt.{0}.bed'.format(chrom)  # don't compress
        rpath = _repeat_path(chrom)
        bchr = beddf[beddf['chr'] == chrom]
        if expand > 0:
            bchr = bchr.copy()
            bchr['st'] = bchr['st'] - expand
            bchr['ed'] = bchr['ed'] + expand
            bchr.loc[bchr['st'] < 0, 'st'] = 0
        UT.write_pandas(bchr[['chr', 'st', 'ed', idcol]], bpath, '')
        bfiles.append(bpath)
        if splitrmsk:
            rchr = rmsk[rmsk['chr'] == chrom]
            UT.write_pandas(rchr[['chr', 'st', 'ed', 'name', 'strand']],
                            rpath, '')
        opath = prefix + 'out.{0}.bed'.format(chrom)
        ofiles.append(opath)
        args.append([bpath, rpath, opath])

    UT.process_mp(count_repeats_viz_chr, args, np=np, doreduce=False)

    # gather outputs
    outcols = ['name', 'repnames']
    merged = PD.concat([UT.read_pandas(p, names=outcols) for p in ofiles],
                       ignore_index=True)
    merged['name'] = merged['name'].astype(str)
    id2names = UT.df2dict(merged, 'name', 'repnames')
    beddf[col] = [id2names[str(rowid)] for rowid in beddf[idcol]]

    # cleanup
    if cleanup:
        for tmppath in bfiles + ofiles:
            os.unlink(tmppath)
    return beddf
def write_txt(self, df, suffix, fm='h', category='temp', **kw):
    """Write ``df`` to the path given by ``self.txtname(suffix, category)``.

    Args:
        df: table handed to ``UT.write_pandas``.
        suffix, category: forwarded to ``self.txtname``.
        fm: forwarded to ``UT.write_pandas`` as ``fm``.
        **kw: extra keyword arguments for ``UT.write_pandas``.

    Returns:
        Whatever ``UT.write_pandas`` returns.
    """
    destination = self.txtname(suffix, category)
    result = UT.write_pandas(df, destination, fm=fm, **kw)
    return result
def calc_53_params(self, sdiffth=1, np=10, alpha=0.1):
    """Fit a logistic regression on junction-flux features of 5'/3' vs internal exons.

    For each exon set named in ``seta`` the flux table is computed with
    ``self.calc_flux_mp`` or read from a cache file under ``self.dstpre``.
    Rows of the 5'/3' sets are labelled kind=1, rows of 'ne_i' kind=0.
    The classifier is trained on 'sdiff' and 'smean' of the ``setc``
    tables (D) and additionally applied to ne_i + e3i + e5i (D2). The fit
    goes to ``self.write_params`` and three plots are made with
    ``self.plot_sin_sout``.

    Args:
        sdiffth: threshold on |log2(zoom*sin+1) - log2(zoom*sout+1)| that a
            kind=1 row must exceed to be used.
        np: forwarded to ``self.calc_flux_mp``.
        alpha: forwarded to ``self.plot_sin_sout``.

    Returns:
        ``locals()`` -- every local variable of this function by name, so
        local names here are effectively part of the return value.
    """
    # get parameters
    dic = {}
    zoom = self.zoom
    # seta = ['ne_i','ne_5','ne_3','e5i','e3i','e5ia','e3ia']
    # setb = ['ne_5','ne_3','e5i','e3i','e5ia','e3ia']
    # setc = ['ne_i','ne_3','ne_5','e5ia','e3ia']
    seta = ['ne_i', 'ne_5', 'ne_3', 'e5i', 'e3i']  # tables to load/compute
    setb = ['ne_5', 'ne_3', 'e5i', 'e3i']  # labelled kind=1
    setc = ['ne_i', 'ne_3', 'ne_5']  # used for training
    for x in seta:
        fpath = self.dstpre + '.{0}.{1}.flux.txt.gz'.format(
            self.refcode, x)
        if os.path.exists(fpath):
            print('reading from cache {0}'.format(fpath))
            dic[x] = UT.read_pandas(fpath)
        else:
            df = getattr(self, x)
            print('calculating {0}...'.format(x))
            dic[x] = self.calc_flux_mp(df, np=np)
            UT.write_pandas(dic[x], fpath, 'h')
    dicb = {}
    FN0 = 0  # kind=1 rows dropped only because of the sdiffth filter
    for x in setb:
        f = dic[x]
        f['kind'] = 1
        idx0 = N.abs(
            N.log2(zoom * f['sin'] + 1) -
            N.log2(zoom * f['sout'] + 1)) > sdiffth
        idx1 = (f['sdin'] != 0) | (f['sdout'] != 0
                                   )  # should have either in or out
        idx = idx0 & idx1
        FN0 += N.sum((~idx0) & idx1)  # pre filtered positive
        dicb[x] = f[idx]
    f = dic['ne_i']
    f['kind'] = 0
    idx = (f['ecovmax'] > 1) & (
        (f['sdin'] != 0) & (f['sdout'] != 0))  # should have both in&out
    dicb['ne_i'] = f[idx]
    D = PD.concat([dicb[x] for x in setc], ignore_index=True)
    D2 = PD.concat([dicb['ne_i'], dicb['e3i'], dicb['e5i']],
                   ignore_index=True)
    # don't use e3i, e5i too many non-actives
    D['lsin'] = N.log2(zoom * D['sin'] + 1)
    D['lsout'] = N.log2(zoom * D['sout'] + 1)
    D['sdiff'] = N.abs(D['lsin'] - D['lsout'])
    D['smean'] = (D['lsin'] + D['lsout']) / 2.
    X = D[['sdiff', 'smean']].values
    Y = D['kind'].values
    lr = LogisticRegression()
    lr.fit(X, Y)
    Z = lr.predict(X)  # predictions on the training rows themselves
    # same features for the second table, which is only predicted on
    D2['lsin'] = N.log2(zoom * D2['sin'] + 1)
    D2['lsout'] = N.log2(zoom * D2['sout'] + 1)
    D2['sdiff'] = N.abs(D2['lsin'] - D2['lsout'])
    D2['smean'] = (D2['lsin'] + D2['lsout']) / 2.
    X2 = D2[['sdiff', 'smean']].values
    Z2 = lr.predict(X2)
    # save fit coefficients
    ppath = self.dstpre + '.{0}.e53params.json'.format(self.refcode)
    self.write_params(ppath, lr, Y, Z, ['sdiff', 'smean'], {
        'sdiffth': sdiffth,
        'zoom': zoom
    }, FN0=FN0)
    # save scatter plots
    spath = self.dstpre + '.{0}.e53params'.format(self.refcode)
    title = self.dstpre.split('/')[-1]
    self.plot_sin_sout(dic, D, Y, Z, D2, Z2, sdiffth, spath + '.0.png',
                       title, alpha=alpha)
    self.plot_sin_sout(dic, D, Y, Z, D2, Z2, sdiffth, spath + '.pdf',
                       title, ptyp='pdf', alpha=alpha)
    self.plot_sin_sout(dic, D, Y, Z, D2, Z2, sdiffth, spath + '.png',
                       title, ptyp='png', alpha=alpha)
    return locals()
def calc_53gap_params(self, covfactor=0, np=10, emaxth=1, eth=1):
    """Fit gap-size classifiers for 5' and 3' exons and save them.

    Parameter tables for ``self.ne_5`` and ``self.ne_3`` are computed with
    ``self.calc_params_mp`` (gapmode='53', win=8192) or read from cache
    files under ``self.dstpre``. For each side a logistic regression is
    fit (see ``_fitone``), written with ``self.write_params`` together
    with the maximum exon length of that side, and plotted with
    ``self.plot_gap53_fit``.

    Args:
        covfactor: forwarded to ``self.calc_params_mp``; part of the cache
            file names and stored as 'th' in the parameter dict.
        np: forwarded to ``self.calc_params_mp``.
        emaxth: rows need 'emax' above this value to be used.
        eth: rows need 'eOut' (5') / 'eIn' (3') above this value.

    Returns:
        ``locals()`` -- every local variable of this function by name, so
        local names here are effectively part of the return value.
    """
    zoom = self.zoom
    d5path = self.dstpre + '.{0}.{1}.gap5params.txt.gz'.format(
        self.refcode, covfactor)
    d3path = self.dstpre + '.{0}.{1}.gap3params.txt.gz'.format(
        self.refcode, covfactor)
    if os.path.exists(d5path):
        print('reading from cache {0}'.format(d5path))
        d5 = UT.read_pandas(d5path)
    else:
        d5 = self.calc_params_mp(self.ne_5,
                                 win=8192,
                                 np=np,
                                 gapmode='53',
                                 direction='<',
                                 covfactor=covfactor)
        UT.write_pandas(d5, d5path, 'h')
    if os.path.exists(d3path):
        print('reading from cache {0}'.format(d3path))
        d3 = UT.read_pandas(d3path)
    else:
        d3 = self.calc_params_mp(self.ne_3,
                                 win=8192,
                                 np=np,
                                 gapmode='53',
                                 direction='>',
                                 covfactor=covfactor)
        UT.write_pandas(d3, d3path, 'h')
    # row filters on the expression columns
    i5 = (d5['eOut'] > eth) & (d5['emax'] > emaxth)
    i3 = (d3['eIn'] > eth) & (d3['emax'] > emaxth)
    d50 = d5[i5]
    d30 = d3[i3]

    def _fitone(d0, x, y1, y2, rx='sin', lrx='lsin'):
        # Stack (x, y1) rows labelled kind=1 on (x, y2) rows labelled
        # kind=0, with the y column renamed to 'gap' and x to rx, then fit
        # kind against log2(zoom*rx+1) and log2(gap+1).
        # Returns locals(); callers read 'lr', 'Y' and 'Z' from it.
        da = d0[[x, y1]].copy().rename(columns={y1: 'gap', x: rx})
        db = d0[[x, y2]].copy().rename(columns={y2: 'gap', x: rx})
        da['kind'] = 1
        db['kind'] = 0
        D = PD.concat([da, db], ignore_index=True)
        D[lrx] = N.log2(zoom * D[rx] + 1)
        D['lgap'] = N.log2(D['gap'] + 1)
        X = D[[lrx, 'lgap']].values
        Y = D['kind'].values
        lr = LogisticRegression()
        lr.fit(X, Y)
        Z = lr.predict(X)
        return locals()

    fit5 = _fitone(d50, 'eOut', 'gap', 'gapIn', 'ein', 'lein')
    fit3 = _fitone(d30, 'eIn', 'gap', 'gapOut', 'ein', 'lein')
    # max exon size
    m5 = N.max(self.ne_5['len'])
    m3 = N.max(self.ne_3['len'])
    rx, lrx = 'ein', 'lein'
    # save coefs
    p5path = self.dstpre + '.{0}.gap5params.json'.format(self.refcode)
    f = fit5
    self.write_params(p5path, f['lr'], f['Y'], f['Z'], [lrx, 'lgap'], {
        'th': covfactor,
        'zoom': zoom,
        'maxsize': int(m5)
    })
    p3path = self.dstpre + '.{0}.gap3params.json'.format(self.refcode)
    f = fit3
    self.write_params(p3path, f['lr'], f['Y'], f['Z'], [lrx, 'lgap'], {
        'th': covfactor,
        'zoom': zoom,
        'maxsize': int(m3)
    })
    # save scatter plots
    spath = self.dstpre + '.{0}.gap53params'.format(self.refcode)
    title = self.dstpre.split('/')[-1]
    self.plot_gap53_fit(fit5, fit3, spath + '.0.png', title, ptyp='both')
    self.plot_gap53_fit(fit5, fit3, spath + '.pdf', title, ptyp='pdf')
    self.plot_gap53_fit(fit5, fit3, spath + '.png', title, ptyp='png')
    return locals()
def extract_53_pair(self):
    """Collect regions between neighbouring 3'/5' exons of different genes.

    Adds '_apos', '_dpos' and 'spos' columns to ``self.ex``. Per chromosome
    and strand, 3'/5' exons are sorted by 'spos' and adjacent pairs from
    different genes are kept when they are 3' then 5' (+ strand) or 5' then
    3' (- strand) and the first exon ends before the second starts. Pairs
    overlapping an internal exon (cat 'i') are dropped, as are regions of
    length <= 20 or >= max(2 * longest ``self.ne_i`` exon, 20000).

    The result is written to ``<refpre>.e53pair.bed.gz`` and stored in
    ``self.e53`` with the row index exposed as '_id'.
    """
    # between genes
    ex = self.ex
    tmpprefix = self.refpre
    ex['_apos'] = ex['a_pos'].str.split(':').str[1].astype(int)
    ex['_dpos'] = ex['d_pos'].str.split(':').str[1].astype(int)
    ex.loc[ex['cat'] == '3', 'spos'] = ex['_apos']
    ex.loc[ex['cat'] == '5', 'spos'] = ex['_dpos']
    cols = ['chr', 'st', 'ed', 'name', 'strand', '_gidx1', '_gidx2']
    fields = ['spos', 'cat', '_gidx', '_id', 'st', 'ed']

    def _find(ecs, chrom, strand):
        ends = ecs[ecs['cat'].isin(['3', '5'])].sort_values('spos')
        firsts = ends.iloc[:-1][fields].values
        seconds = ends.iloc[1:][fields].values
        if strand == '+':
            cat1, cat2 = '3', '5'  # non overlapping 3=>5
            namefmt = '+g{0}e{1}|g{2}e{3}'
        else:
            cat1, cat2 = '5', '3'
            namefmt = '-g{0}e{1}|g{2}e{3}'
        pairs = []
        for r1, r2 in zip(firsts, seconds):
            if r1[2] == r2[2]:  # same gene
                continue
            if r1[1] == cat1 and r2[1] == cat2 and r1[5] < r2[4]:
                name = namefmt.format(r1[2], r1[3], r2[2], r2[3])
                pairs.append(
                    (chrom, r1[0], r2[0], name, strand, r1[2], r2[2]))
        return PD.DataFrame(pairs, columns=cols)

    rslts = [
        _find(ex[(ex['chr'] == chrom) & (ex['strand'] == strand)],
              chrom, strand)
        for chrom in ex['chr'].unique()
        for strand in ['+', '-']
    ]
    df = PD.concat(rslts, ignore_index=True).sort_values(['chr', 'st', 'ed'])

    # intersect with internal exons
    exipath = tmpprefix + '.53.exi.bed'  # ncol 3
    pairpath = tmpprefix + '.53.bed'  # ncol 5
    ovlpath = tmpprefix + '.53.exi.ovl.txt'
    exi = ex[ex['cat'] == 'i'].sort_values(['chr', 'st', 'ed'])
    UT.write_pandas(exi[['chr', 'st', 'ed']], exipath, '')
    UT.write_pandas(df, pairpath, '')
    ovlpath = BT.bedtoolintersect(pairpath, exipath, ovlpath, wao=True)
    cdf = UT.read_pandas(ovlpath,
                         names=cols + ['b_chr', 'b_st', 'b_ed', 'ovl'])

    sdf = cdf[cdf['ovl'] == 0][cols]
    sdf['locus'] = UT.calc_locus(sdf)
    sdf['len'] = sdf['ed'] - sdf['st']
    upper = max(2 * self.ne_i['len'].max(), 20000)
    sdf = sdf[(sdf['len'] > 20) & (sdf['len'] < upper)]
    UT.write_pandas(sdf, tmpprefix + '.e53pair.bed.gz')
    sdf.index.name = '_id'
    self.e53 = sdf.reset_index()
def filter(self, **kw):
    """Split genes into a kept set and a removed set and write both out.

    Keyword arguments are merged into ``self.params`` (the dict is updated
    in place) before the thresholds are read.

    A gene (row of ``self.ugb``) is removed when ALL of these hold:
        rep%     >= th_bp_ovl
        rviz%    >  th_ex_ovl   (strict comparison)
        #junc    is not null
        #uexons  <  th_uexon
    Every other gene is kept.  The threshold defaults come from the params
    dict populated outside this method (the earlier notes on this method
    quote 50, 50 and 4, and say single-exon genes are not affected).

    Sets on self:
        ugb2, ex2, sj2, uex2, gbed2 -- kept genes
        ugb3, ex3, sj3, uex3, gbed3 -- removed genes
    ex/sj/uex subsets are selected by '_gidx' membership in the index of
    ugb2 and sorted by chr, st, ed.  gbed* come from ``GGB.unionex2bed12``
    with their 'sc2' column cast to int.

    Files are written under names obtained from ``self.fnobj`` with
    category 'output'; removed-gene files carry a 'removed.' name prefix
    and, unlike the kept set, no ex/sj BED files are written for them.
    """
    stats = self.ugb
    params = self.params
    names = self.fnobj
    params.update(kw)

    repeat_heavy = ((stats['rep%'] >= params['th_bp_ovl'])
                    & (stats['rviz%'] > params['th_ex_ovl']))
    few_uexons = ((stats['#junc'].notnull())
                  & (stats['#uexons'] < params['th_uexon']))
    keep = ~(repeat_heavy & few_uexons)
    self.ugb2 = kept_stats = stats[keep]  # filtered
    self.ugb3 = removed_stats = stats[~keep]
    kept_ids = kept_stats.index.values

    all_ex = self.ex
    all_sj = self.sj
    all_uex = self.uex

    def _subset(frame, kept):
        # Rows whose gene is (kept=True) / is not (kept=False) in kept_ids.
        member = frame['_gidx'].isin(kept_ids)
        if not kept:
            member = ~member
        return frame[member].sort_values(['chr', 'st', 'ed'])

    def _txt(stem):
        return names.txtname(stem, category='output')

    def _bed(stem):
        return names.bedname(stem, category='output')

    # ---- kept genes: filter ex, sj, uex
    self.ex2 = kept_ex = _subset(all_ex, True)
    self.sj2 = kept_sj = _subset(all_sj, True)
    self.uex2 = kept_uex = _subset(all_uex, True)
    if params['datacode']:
        cov_field = 'gcov_' + params['datacode']
    else:
        cov_field = 'gcov'
    self.gbed2 = kept_bed = GGB.unionex2bed12(
        kept_uex, name=params['gname'], sc2=cov_field, sc1='tlen')
    kept_bed['sc2'] = kept_bed['sc2'].astype(int)

    # write out filtered ex, sj, ci, unionex, gbed
    UT.write_pandas(kept_ex, _txt('ex'), 'h')
    UT.write_pandas(kept_sj, _txt('sj'), 'h')
    UT.chopintervals(kept_ex, _txt('ci'))
    GGB.write_bed(kept_ex, _bed('ex'))
    GGB.write_bed(kept_sj, _bed('sj'))
    UT.write_pandas(kept_uex, _txt('unionex'), 'h')
    UT.write_pandas(kept_stats, _txt('genes.stats'), 'h')
    UT.write_pandas(kept_bed, _bed('genes'), '')  # BED12

    # ---- removed genes: same tables for everything filtered out
    self.ex3 = removed_ex = _subset(all_ex, False)
    self.sj3 = removed_sj = _subset(all_sj, False)
    self.uex3 = removed_uex = _subset(all_uex, False)
    self.gbed3 = removed_bed = GGB.unionex2bed12(
        removed_uex, name=params['gname'], sc2=cov_field, sc1='tlen')
    removed_bed['sc2'] = removed_bed['sc2'].astype(int)

    UT.write_pandas(removed_ex, _txt('removed.ex'), 'h')
    UT.write_pandas(removed_sj, _txt('removed.sj'), 'h')
    UT.chopintervals(removed_ex, _txt('removed.ci'))
    UT.write_pandas(removed_uex, _txt('removed.unionex'), 'h')
    UT.write_pandas(removed_stats, _txt('removed.genes.stats'), 'h')
    UT.write_pandas(removed_bed, _bed('removed.genes'), '')  # BED12
def find_match(self):
    """Match exons and junctions of ``self.en1`` against ``self.en2``.

    Exons: both exon tables (``model('ex')``) are written out, intersected
    with ``BT.bedtoolintersect(..., wao=True)`` and the result is read
    back into ``self.ov`` (en2 columns carry a 'b_' prefix, plus 'ovl').
    Subsets of ``self.ov`` are then stored, all requiring equal chr:
        ei  -- cat 'i', same category, same strand, st and ed equal
        e5  -- cat '5', same category, donor side equal
               ('+': ed, '-': st; strands must agree)
        e3  -- cat '3', same category, acceptor side equal
               ('+': st, '-': ed; strands must agree)
        es  -- cat 's', same category
        e5b, e3b, esb -- as e5/e3/es without the same-category requirement

    Junctions: a column named ``self.colname2('jhit', en2.code)`` is added
    to the frame returned by ``en1.model('sj')``; it holds the en2 value
    of column ``self.colname('jcnt')`` at the same 'locus' (0 if absent).
    Rows with a positive value are copied to ``self.sj``.

    Also sets ``self.e1``, ``self.e2``, ``self.s1``, ``self.s2`` and
    ``self.e``, a dict of the subsets keyed 'i', '5', '3', 's', 'j',
    '5b', '3b', 'sb'.  The intermediate files are not removed here.
    """
    first = self.en1
    second = self.en2

    # en1-side names also embed en2.code: they need to be unique to avoid
    # conflicts between parallel runs (en1 is shared).
    first_path = first.fname2('emtmp.ex.bed.gz', second.code)
    second_path = second.fname('emtmp.ex.bed.gz')
    overlap_path = first.fname2('emtmp.ex.ovl.txt.gz', second.code)

    self.e1 = first_ex = first.model('ex')
    self.e2 = second_ex = second.model('ex')
    ecovname = self.colname('ecov')
    cols = ['chr', 'st', 'ed', 'cat', '_id', ecovname, '_gidx', 'len',
            'strand']
    first_path = UT.write_pandas(first_ex[cols], first_path, '')
    second_path = UT.write_pandas(second_ex[cols], second_path, '')
    overlap_path = BT.bedtoolintersect(
        first_path, second_path, overlap_path, wao=True)
    ocols = cols + ['b_' + x for x in cols] + ['ovl']
    self.ov = ov = UT.read_pandas(overlap_path, names=ocols)

    # Row-wise comparisons between the en1 columns and their b_ partners.
    same_chr = ov['chr'] == ov['b_chr']
    same_strand = ov['strand'] == ov['b_strand']
    both_plus = (ov['strand'] == '+') & same_strand
    both_minus = (ov['strand'] == '-') & same_strand
    same_st = ov['st'] == ov['b_st']
    same_ed = ov['ed'] == ov['b_ed']
    same_cat = ov['cat'] == ov['b_cat']
    has_cov = ov[ecovname] > 0  # exons with reads (only logged below)

    LOG.debug('=' * 10 + 'calculating match between {0} and {1}'.format(
        first.code, second.code))
    LOG.debug('len(ov):{0}'.format(len(ov)))
    labelled = [
        ('idxchr', same_chr),
        ('idxstrand', same_strand),
        ('idxp', both_plus),
        ('idxn', both_minus),
        ('idxst', same_st),
        ('idxed', same_ed),
        ('idxcat', same_cat),
        ('idxcov', has_cov),
    ]
    for label, mask in labelled:
        LOG.debug('#{0}:{1}'.format(label, N.sum(mask)))

    def _rows(mask):
        # Independent copy of the selected overlap rows.
        return ov[mask].copy()

    # donor side: (+, ed) | (-, st); acceptor side: (-, ed) | (+, st)
    donor_eq = (both_plus & same_ed) | (both_minus & same_st)
    acceptor_eq = (both_minus & same_ed) | (both_plus & same_st)
    is_i = ov['cat'] == 'i'
    is_5 = ov['cat'] == '5'
    is_3 = ov['cat'] == '3'
    is_s = ov['cat'] == 's'

    # same category required
    self.ei = ei = _rows(
        same_chr & same_strand & same_st & same_ed & same_cat & is_i)
    self.e5 = e5 = _rows(same_chr & donor_eq & same_cat & is_5)
    self.e3 = e3 = _rows(same_chr & acceptor_eq & same_cat & is_3)
    self.es = es = _rows(same_chr & is_s & same_cat)
    # allow overlap to other categories
    self.e5b = e5b = _rows(same_chr & donor_eq & is_5)
    self.e3b = e3b = _rows(same_chr & acceptor_eq & is_3)
    self.esb = esb = _rows(same_chr & is_s)

    # splice junction
    self.s1 = first_sj = first.model('sj')
    self.s2 = second_sj = second.model('sj')
    jcntname = self.colname('jcnt')
    count_by_locus = UT.df2dict(second_sj, 'locus', jcntname)
    jhitname = self.colname2('jhit', second.code)
    # corresponding en2 count for every en1 junction
    first_sj[jhitname] = [
        count_by_locus.get(locus, 0) for locus in first_sj['locus']]
    # only consider junctions whose en2 count is > 0
    self.sj = sj = first_sj[first_sj[jhitname] > 0].copy()

    # for batch processing
    self.e = {
        'i': ei,
        '5': e5,
        '3': e3,
        's': es,
        'j': sj,
        '5b': e5b,
        '3b': e3b,
        'sb': esb,
    }
def save_bed_covstats(bedpath, dstpath, bed12=False, checkuniq=False):
    """Tabulate per-key base-pair totals of a BED file and write them out.

    Calls ``get_total_bp_bedfile(bedpath, bed12, returndics=True,
    checkuniq=checkuniq)`` and expects two mappings back, used here as
    'totbp' and 'covbp'.  One row is built per key of the second mapping
    (NOTE(review): keys are presumably chromosome names -- confirm against
    get_total_bp_bedfile), with columns:
        totbp -- value from the first mapping
        covbp -- value from the second mapping
        acov  -- totbp / covbp
    Rows are sorted by covbp, largest first, and the table is passed to
    ``UT.write_pandas(df, dstpath, 'ih')``.

    Returns whatever ``UT.write_pandas`` returns.
    """
    totals, covered = get_total_bp_bedfile(
        bedpath, bed12, returndics=True, checkuniq=checkuniq)

    per_key = {}
    for key in covered:
        per_key[key] = {'totbp': totals[key], 'covbp': covered[key]}

    # Keys become columns on construction; transpose to one row per key.
    stats = PD.DataFrame(per_key).T
    stats['acov'] = stats['totbp'] / stats['covbp']
    ranked = stats.sort_values('covbp', ascending=False)
    return UT.write_pandas(ranked, dstpath, 'ih')