def as2exsj(dstpre, np=7):
    """Build exon and junction tables from the files stored under ``dstpre``.

    Reads ``<dstpre>.exdf.txt.gz``, ``<dstpre>.sjdf.txt.gz``,
    ``<dstpre>.sedf.txt.gz`` and ``<dstpre>.paths.txt.gz``, merges the two
    exon tables, annotates exons/junctions (``UT.set_info``,
    ``UT.set_exon_category``, ``GP.find_genes4``) and writes
    ``<dstpre>.ex.txt.gz``, ``<dstpre>.sj.txt.gz`` and ``<dstpre>.ci.txt.gz``.

    Args:
        dstpre: path prefix shared by the input and output files
        np: forwarded to ``GP.find_genes4`` (presumably the number of
            worker processes -- TODO confirm)

    Returns:
        sj, ex: Pandas.DataFrames containing junctions and exons
    """
    def _load(suffix, colnames):
        # every input table lives at <dstpre><suffix>
        return UT.read_pandas(dstpre + suffix, names=colnames)

    ex_part = _load('.exdf.txt.gz', A2.EXDFCOLS)
    sj = _load('.sjdf.txt.gz', A2.SJDFCOLS)
    se_part = _load('.sedf.txt.gz', A2.EXDFCOLS)
    # The paths table is read but its content is not used below.
    _load('.paths.txt.gz', A2.PATHCOLS)

    # NOTE: normalization of '.+' / '.-' strands to '.' is currently disabled.

    # shift junction start by one
    sj['st'] = sj['st'] + 1

    # stack both exon tables, restricted to the shared column set
    exon_cols = A2.EXDFCOLS
    ex = PD.concat([ex_part[exon_cols], se_part[exon_cols]],
                   ignore_index=True)

    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components), sets '_gidx'
    # a unique prefix is needed for parallel processing
    prefix = os.path.abspath(dstpre + str(uuid.uuid4()) + '_')
    GP.find_genes4(sj, ex,
                   filepre=prefix,
                   np=np,
                   override=False,
                   separatese=True)

    # exons whose kind is 3' or 5' get the matching category
    for end in ('3', '5'):
        ex.loc[ex['kind'] == end, 'cat'] = end

    UT.write_pandas(ex, dstpre + '.ex.txt.gz', 'h')
    UT.write_pandas(sj, dstpre + '.sj.txt.gz', 'h')
    UT.chopintervals(ex, dstpre + '.ci.txt.gz')
    return sj, ex
def __init__(self, sj, me, depth=500, maxcnt=990):
    """Index exon-to-exon connections through splice junctions.

    Exons (``me``) and junctions (``sj``) are joined on their shared donor
    id (``d_id``) and acceptor id (``a_id``); the joined tables and several
    group-by views are stored on the instance.

    Args:
        sj: junction DataFrame; columns 'd_id', '_id', 'a_id' are read
        me: exon DataFrame; columns 'a_id', '_id', 'd_id' are read
        depth: stored as ``self.depth`` (not used in this method)
        maxcnt: stored as ``self.maxcnt`` (not used in this method)
    """
    UT.set_info(sj, me)
    self.sj = sj
    self.me = me
    self.exons = me
    self.depth = depth
    self.maxcnt = maxcnt

    # exon rows reduced to their id columns
    exon_ids = me[['a_id', '_id', 'd_id']]
    # exons with an acceptor id (a_id != -1), columns renamed for the join
    acceptor_side = exon_ids[exon_ids['a_id'] != -1].rename(
        columns={'d_id': 'd_id_a', '_id': 'e_id_a'})
    # exons with a donor id (d_id != -1), columns renamed for the join
    donor_side = exon_ids[exon_ids['d_id'] != -1].rename(
        columns={'a_id': 'a_id_d', '_id': 'e_id_d'})
    junction_ids = sj[['d_id', '_id', 'a_id']]

    # donor exon -> junction -> acceptor exon
    via_donor = PD.merge(donor_side, junction_ids,
                         how='outer', on='d_id', sort=False)
    joined = PD.merge(via_donor, acceptor_side,
                      how='outer', on='a_id', sort=False)

    # keep only rows with an exon on both ends (drops dangling exons/junctions)
    both_ends = joined['e_id_a'].notnull() & joined['e_id_d'].notnull()
    linked = joined[both_ends].copy()
    for col in ('e_id_d', 'e_id_a'):
        linked[col] = linked[col].astype(int)

    # exon id lookups: acceptor exon -> donor exons, donor exon -> acceptor exons
    self.a = linked.groupby('e_id_a')['e_id_d']
    self.d = linked.groupby('e_id_d')['e_id_a']
    self.j2 = joined
    self.j2nd = linked
    self.ga = joined.groupby('a_id')  # groupby acceptor
    self.gd = joined.groupby('d_id')  # groupby donor
def bed2exonsj(bed12, np=4, graphpre=None):
    """Extract exons and junctions from BED12.

    Block sizes/starts ('esizes', 'estarts') are parsed from their
    comma-separated form (empty items, e.g. from a trailing comma, are
    ignored) and converted to absolute coordinates.  Each transcript then
    contributes one exon row per block and one junction row per gap between
    consecutive blocks.  Rows sharing the same locus (as computed by
    ``UT.calc_locus_strand``) are collapsed to the first entry per locus.

    Exon ``kind`` values:
        's'  only block of a single-block transcript
        '5'  first block on '+' strand, last block otherwise
        '3'  last block on '+' strand, first block otherwise
        'i'  any block in between
    Junction rows have kind 'j' and span
    (previous block end + 1, next block start).

    Side effect: columns '_estarts' and '_eends' are added to ``bed12``
    in place.

    Args:
        bed12: Pandas.DataFrame containing BED12 data; columns 'chr', 'st',
            'name', 'strand', 'esizes', 'estarts' are read
        np: forwarded to ``GP.find_genes4`` (presumably number of worker
            processes -- TODO confirm)
        graphpre: file prefix forwarded (as absolute path) to
            ``GP.find_genes4``; a unique './<uuid4>_' prefix is generated
            when None

    Returns:
        sj, ex: Pandas.DataFrames containing junction and exons
    """
    def _parse_ints(field):
        # "10,20,30," -> array([10, 20, 30])
        return N.array([int(y) for y in field.split(',') if y])

    esizes = bed12['esizes'].apply(_parse_ints)
    estarts0 = bed12['estarts'].apply(_parse_ints)
    # block starts are relative to the transcript start
    bed12['_estarts'] = bed12['st'] + estarts0
    bed12['_eends'] = bed12['_estarts'] + esizes

    tcols = ['chr', 'name', 'strand', '_estarts', '_eends']

    def _egen():
        for chrom, tname, strand, est, eed in UT.izipcols(bed12, tcols):
            if len(est) == 1:
                # Use this transcript's own block.  The previous code yielded
                # the loop variables st/ed, which are unbound (or stale from
                # an earlier transcript) at this point.
                yield (chrom, est[0], eed[0], tname, 0, strand, 's')
                continue
            # leftmost block is the 5' end on '+', the 3' end otherwise
            if strand == '+':
                first_kind, last_kind = '5', '3'
            else:
                first_kind, last_kind = '3', '5'
            yield (chrom, est[0], eed[0], tname, 0, strand, first_kind)
            for st, ed in izip(est[1:-1], eed[1:-1]):
                yield (chrom, st, ed, tname, 0, strand, 'i')
            yield (chrom, est[-1], eed[-1], tname, 0, strand, last_kind)

    def _igen():
        for chrom, tname, strand, est, eed in UT.izipcols(bed12, tcols):
            for st, ed in izip(eed[:-1], est[1:]):
                # add 1 to match STAR SJ.tab.out
                yield (chrom, st + 1, ed, tname, 0, strand, 'j')

    out_cols = GGB.BEDCOLS[:6] + ['kind']

    ex = PD.DataFrame(list(_egen()), columns=out_cols)
    ex['locus'] = UT.calc_locus_strand(ex)
    ex = ex.groupby('locus').first().reset_index()

    sj = PD.DataFrame(list(_igen()), columns=out_cols)
    sj['locus'] = UT.calc_locus_strand(sj)
    sj = sj.groupby('locus').first().reset_index()

    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components), sets '_gidx'
    if graphpre is None:
        graphpre = './' + str(uuid.uuid4()) + '_'
    # need unique prefix for parallel processing
    prefix = os.path.abspath(graphpre)
    GP.find_genes4(sj, ex,
                   filepre=prefix,
                   np=np,
                   override=False,
                   separatese=True)
    return sj, ex
def as3exsj(dstpre, minelen=150, np=7):
    """Build exon and junction tables from the files stored under ``dstpre``,
    dropping genes whose total exon length is below ``minelen``.

    Reads ``<dstpre>.exdf.txt.gz``, ``<dstpre>.sjdf.txt.gz``,
    ``<dstpre>.sedf.txt.gz`` and ``<dstpre>.paths.txt.gz``, merges the two
    exon tables, annotates exons/junctions (``UT.set_info``,
    ``UT.set_exon_category``, ``GP.find_genes3``), filters by gene length and
    writes ``<dstpre>.ex.txt.gz``, ``<dstpre>.sj.txt.gz`` and
    ``<dstpre>.ci.txt.gz``.

    Args:
        dstpre: path prefix shared by the input and output files
        minelen: genes (rows sharing '_gidx') whose summed exon length
            (``ed - st``) is smaller than this are removed
        np: forwarded to ``GP.find_genes3`` (presumably the number of
            worker processes -- TODO confirm)

    Returns:
        sj2, ex2: filtered Pandas.DataFrames containing junctions and exons
    """
    ex = UT.read_pandas(dstpre + '.exdf.txt.gz', names=A3.EXDFCOLS)
    sj = UT.read_pandas(dstpre + '.sjdf.txt.gz', names=A3.SJDFCOLS)
    se = UT.read_pandas(dstpre + '.sedf.txt.gz', names=A3.EXDFCOLS)
    # The paths table is read but its content is not used below; the call is
    # kept so that a missing/unreadable paths file still fails here.
    UT.read_pandas(dstpre + '.paths.txt.gz', names=A3.PATHCOLS)

    # NOTE: normalization of '.+' / '.-' strands is currently disabled.

    # shift junction start by one
    sj['st'] = sj['st'] + 1

    # stack both exon tables, restricted to the shared column set
    cols = A3.EXDFCOLS
    ex = PD.concat([ex[cols], se[cols]], ignore_index=True)

    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components), sets '_gidx'
    # find_genes3: don't use exon overlap as connection
    GP.find_genes3(sj, ex,
                   np=np,
                   override=False)

    # exons whose kind is 3' or 5' get the matching category
    ex.loc[ex['kind'] == '3', 'cat'] = '3'
    ex.loc[ex['kind'] == '5', 'cat'] = '5'

    # remove genes whose total exon length is smaller than minelen
    ex['len'] = ex['ed'] - ex['st']
    exsiz = ex.groupby('_gidx')['len'].sum()
    rgidx = exsiz[exsiz < minelen].index.values
    ex2 = ex[~ex['_gidx'].isin(rgidx)]
    sj2 = sj[~sj['_gidx'].isin(rgidx)]
    # Report actual exon counts: rgidx holds removed *gene* indices, so
    # len(ex) - len(rgidx) is not the number of remaining exons.
    LOG.info('minelen filter #ex {0}=>{1}'.format(len(ex), len(ex2)))

    # write
    UT.write_pandas(ex2, dstpre + '.ex.txt.gz', 'h')
    UT.write_pandas(sj2, dstpre + '.sj.txt.gz', 'h')
    UT.chopintervals(ex2, dstpre + '.ci.txt.gz')
    return sj2, ex2
def gtf2exonsj(gtf, np=12, graphpre=None):
    """Extract exons and sj from GTF

    exon, junction coordinates = zero based (same as BED)
    junction start-1 = exon end
    junction end = exon start

    Concretely, junctions are built per transcript from consecutive exons as
    (exon 'ed' + 1, next exon 'st' - 1) in the input coordinates, and exon
    'st' is decremented by one afterwards, which yields the relations above.

    Exon ``kind`` values assigned here:
        's'  exon of a transcript with exactly one exon
        '5'  first exon (by 'st','ed') on '+' strand / last exon on '-' strand
        '3'  first exon on '-' strand / last exon on '+' strand
        'i'  everything else

    Args:
        gtf: Pandas.DataFrame; columns 'typ', 'chr', 'st', 'ed', 'strand',
            'transcript_id' and 'gene_id' are read
        np: forwarded to GP.find_genes4 (presumably number of worker
            processes -- TODO confirm)
        graphpre: file prefix forwarded (as absolute path) to
            GP.find_genes4; a unique './<uuid4>_' prefix is generated
            when None

    Returns:
        sj, ex: Pandas.DataFrames for splice junctions and exons

    """
    if len(gtf)==0: # edge case
        # empty input: return two empty tables with the basic columns
        cols = GGB.BEDCOLS[:6]+['locus','_id','cat']
        sj = UT.make_empty_df(cols)
        ex = UT.make_empty_df(cols)
        return sj,ex
    # temporary '_id' numbers exons in (chr, st, ed) order; it is used below
    # only to mark the terminal exons of each transcript
    exons = gtf[gtf['typ']=='exon'].sort_values(['chr','st','ed'])
    exons['_id'] = N.arange(len(exons))
    exons.sort_values(['transcript_id','st','ed'],inplace=True)
    # 5',3'
    ex_s = exons.groupby('transcript_id').size()
    tid_s = ex_s[ex_s==1].index  # transcripts with a single exon
    id_m = ex_s[ex_s>1].index    # transcripts with multiple exons
    ex_m = exons[exons['transcript_id'].isin(id_m)].copy()
    ex_m.sort_values(['transcript_id','st','ed'], inplace=True)
    # first/last exon of each multi-exon transcript in coordinate order
    ex_f = ex_m.groupby('transcript_id').first()
    ex_l = ex_m.groupby('transcript_id').last()
    # i{f,l}{5,3}: ids of first/last exons that are the 5'/3' end given strand
    if5 = list(ex_f[ex_f['strand']=='+']['_id'].values)
    if3 = list(ex_f[ex_f['strand']=='-']['_id'].values)
    il5 = list(ex_l[ex_l['strand']=='-']['_id'].values)
    il3 = list(ex_l[ex_l['strand']=='+']['_id'].values)
    exons['kind'] = 'i'
    exons.loc[exons['transcript_id'].isin(tid_s),'kind'] = 's'
    exons.loc[exons['_id'].isin(if5+il5),'kind'] = '5'
    exons.loc[exons['_id'].isin(if3+il3),'kind'] = '3'
    # find junctions
    def _igen():
        # yields one junction per pair of consecutive exons in a transcript
        for k, g in exons.groupby('transcript_id'):
            if len(g)<2:
                continue
            g = g.sort_values(['st','ed'])
            chrom,strand,gid=g.iloc[0][['chr','strand','gene_id']]
            ists = g['ed'].values[:-1] + 1
            ieds = g['st'].values[1:] - 1
            for st,ed in izip(ists,ieds):
                # chr,st,ed,name,sc1,strand -- the name field carries gene_id
                yield (chrom,st,ed,gid,0,strand)
    sj = PD.DataFrame([x for x in _igen()], columns=GGB.BEDCOLS[:6])
    sj['locus'] = UT.calc_locus_strand(sj)
    # keep one junction per locus
    sj = sj.groupby('locus').first().reset_index()
    #cols = ['chr','st','ed','gene_id','sc1','strand']
    #ex = exons #[cols]
    exons['locus'] = UT.calc_locus_strand(exons)
    ex = exons.groupby(['locus','kind']).first().reset_index() # remove duplicated
    ex['name'] = ex['gene_id']
    # shift exon start by one (applied after junctions were computed above)
    ex['st'] = ex['st'] - 1
    # position id == locus converted to number
    ex.sort_values(['chr','st','ed'],inplace=True)
    # final '_id': row number in (chr, st, ed) order
    ex['_id'] = N.arange(len(ex))
    exg = ex.groupby(['chr','st','ed'])[['_id','locus']].first()
    exg['_pid'] = N.arange(len(exg)) # position id
    # exons sharing (chr, st, ed) receive the same '_pid'
    cse2pid = dict(zip(exg.index,exg['_pid']))
    ex['_pid'] = [cse2pid[tuple(x)] for x in ex[['chr','st','ed']].values]
    if len(sj)==0:
        # no junctions: every exon becomes its own gene
        ex['_gidx'] = N.arange(len(ex))
        return sj, ex
    UT.set_info(sj,ex)
    UT.set_exon_category(sj, ex)
    # find genes (connected components) set '_gidx'
    if graphpre is None:
        graphpre = './'+str(uuid.uuid4())+'_'
    prefix = os.path.abspath(graphpre) # need unique prefix for parallel processing
    genes = GP.find_genes4(sj,ex,
                           filepre=prefix,
                           np=np,
                           override=False,
                           separatese=True)
    return sj, ex