def as2exsj(dstpre, np=7):
    ex = UT.read_pandas(dstpre+'.exdf.txt.gz', names=A2.EXDFCOLS)
    sj = UT.read_pandas(dstpre+'.sjdf.txt.gz', names=A2.SJDFCOLS)
    se = UT.read_pandas(dstpre+'.sedf.txt.gz', names=A2.EXDFCOLS)
    paths = UT.read_pandas(dstpre+'.paths.txt.gz', names=A2.PATHCOLS)
    #ex.loc[ex['strand'].isin(['.+','.-']),'strand'] = '.'
    #sj.loc[sj['strand'].isin(['.+','.-']),'strand'] = '.'
    sj['st'] = sj['st']+1
    cols = A2.EXDFCOLS
    ex = PD.concat([ex[cols], se[cols]], ignore_index=True)
    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components) set '_gidx'
    graphpre = dstpre+str(uuid.uuid4())+'_'
    prefix = os.path.abspath(graphpre)  # need unique prefix for parallel processing
    genes = GP.find_genes4(sj, ex,
        filepre=prefix,
        np=np,
        override=False,
        separatese=True)
    ex.loc[ex['kind']=='3','cat'] = '3'
    ex.loc[ex['kind']=='5','cat'] = '5'
    UT.write_pandas(ex, dstpre+'.ex.txt.gz', 'h')
    UT.write_pandas(sj, dstpre+'.sj.txt.gz', 'h')
    ci = UT.chopintervals(ex, dstpre+'.ci.txt.gz')
    return sj, ex

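# Illustrative usage sketch (not called by the pipeline): as2exsj reads the
# assembler output tables <dstpre>.exdf/.sjdf/.sedf/.paths.txt.gz, so the
# prefix below is hypothetical and those files must already exist.
def _example_as2exsj():
    dstpre = '/path/to/assembly/sample1'  # hypothetical output prefix
    sj, ex = as2exsj(dstpre, np=4)
    # results are also written to <dstpre>.ex.txt.gz, <dstpre>.sj.txt.gz, <dstpre>.ci.txt.gz
    return sj, ex
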
def bed2exonsj(bed12, np=4, graphpre=None):
    """Extract exons and junctions from BED12

    Args:
        bed12: Pandas.DataFrame containing BED12 data

    Returns:
        sj, ex: Pandas.DataFrames containing junctions and exons

    """
    esizes = bed12['esizes'].apply(lambda x: N.array([int(y) for y in x.split(',') if y]))
    estarts0 = bed12['estarts'].apply(lambda x: N.array([int(y) for y in x.split(',') if y]))
    bed12['_estarts'] = bed12['st'] + estarts0
    bed12['_eends'] = bed12['_estarts']+esizes
    #istarts = eends[:-1]
    #iends = estarts[1:]
    cols = ['chr','st','ed','tname','strand']
    def _egen():
        for chrom,tname,strand,est,eed in UT.izipcols(bed12,['chr','name','strand','_estarts','_eends']):
            if len(est)==1:  # single exon
                yield (chrom,est[0],eed[0],tname,0,strand,'s')
            else:
                if strand=='+':
                    yield (chrom,est[0],eed[0],tname,0,strand,'5')
                    for st,ed in izip(est[1:-1],eed[1:-1]):
                        yield (chrom,st,ed,tname,0,strand,'i')
                    yield (chrom,est[-1],eed[-1],tname,0,strand,'3')
                else: # '-'
                    yield (chrom,est[0],eed[0],tname,0,strand,'3')
                    for st,ed in izip(est[1:-1],eed[1:-1]):
                        yield (chrom,st,ed,tname,0,strand,'i')
                    yield (chrom,est[-1],eed[-1],tname,0,strand,'5')
    def _igen():
        for chrom,tname,strand,est,eed in UT.izipcols(bed12,['chr','name','strand','_estarts','_eends']):
            for st,ed in izip(eed[:-1],est[1:]):
                yield (chrom,st+1,ed,tname,0,strand,'j')  # add 1 to match STAR SJ.out.tab
    ex = PD.DataFrame([x for x in _egen()], columns=GGB.BEDCOLS[:6]+['kind'])
    ex['locus'] = UT.calc_locus_strand(ex)
    ex = ex.groupby('locus').first().reset_index()
    sj = PD.DataFrame([x for x in _igen()], columns=GGB.BEDCOLS[:6]+['kind'])
    sj['locus'] = UT.calc_locus_strand(sj)
    sj = sj.groupby('locus').first().reset_index()
    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components) set '_gidx'
    if graphpre is None:
        graphpre = './'+str(uuid.uuid4())+'_'
    prefix = os.path.abspath(graphpre)  # need unique prefix for parallel processing
    genes = GP.find_genes4(sj, ex,
        filepre=prefix,
        np=np,
        override=False,
        separatese=True)
    return sj, ex

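# Illustrative sketch of bed2exonsj on a minimal hand-made BED12-like frame
# (a 3-exon '+' transcript and a single-exon '-' transcript). Column names
# follow the usage above ('chr','st','ed','name','strand','esizes','estarts');
# coordinates and names are made up. Assumes the module-level alias PD (pandas).
def _example_bed2exonsj():
    bed12 = PD.DataFrame({
        'chr':     ['chr1', 'chr1'],
        'st':      [100, 2000],
        'ed':      [900, 2300],
        'name':    ['tx1', 'tx2'],
        'strand':  ['+', '-'],
        'esizes':  ['100,200,100,', '300,'],  # comma-separated block sizes
        'estarts': ['0,300,700,', '0,'],      # block starts relative to 'st'
    })
    sj, ex = bed2exonsj(bed12, np=1)
    return sj, ex
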
def as3exsj(dstpre, minelen=150, np=7):
    ex = UT.read_pandas(dstpre+'.exdf.txt.gz', names=A3.EXDFCOLS)
    sj = UT.read_pandas(dstpre+'.sjdf.txt.gz', names=A3.SJDFCOLS)
    se = UT.read_pandas(dstpre+'.sedf.txt.gz', names=A3.EXDFCOLS)
    paths = UT.read_pandas(dstpre+'.paths.txt.gz', names=A3.PATHCOLS)
    #ex.loc[ex['strand'].isin(['.+','.-']),'strand'] = '.'
    # sj.loc[sj['strand']=='.+','strand'] = '+'
    # sj.loc[sj['strand']=='.-','strand'] = '-'
    sj['st'] = sj['st']+1
    cols = A3.EXDFCOLS
    ex = PD.concat([ex[cols], se[cols]], ignore_index=True)
    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components) set '_gidx'
    graphpre = dstpre+str(uuid.uuid4())+'_'
    prefix = os.path.abspath(graphpre)  # need unique prefix for parallel processing
    # genes = GP.find_genes4(sj,ex,
    #     filepre=prefix,
    #     np=np,
    #     override=False,
    #     separatese=True)
    genes = GP.find_genes3(sj, ex,  # don't use exon overlap as connection
        np=np,
        override=False)
    ex.loc[ex['kind']=='3','cat'] = '3'
    ex.loc[ex['kind']=='5','cat'] = '5'

    # remove genes whose total exon length is smaller than minelen
    ex['len'] = ex['ed']-ex['st']
    exsiz = ex.groupby('_gidx')['len'].sum()
    rgidx = exsiz[exsiz<minelen].index.values
    LOG.info('minelen filter #ex {0}=>{1}'.format(len(ex), len(ex)-len(rgidx)))
    ex2 = ex[~ex['_gidx'].isin(rgidx)]
    sj2 = sj[~sj['_gidx'].isin(rgidx)]

    # write
    UT.write_pandas(ex2, dstpre+'.ex.txt.gz', 'h')
    UT.write_pandas(sj2, dstpre+'.sj.txt.gz', 'h')
    ci = UT.chopintervals(ex2, dstpre+'.ci.txt.gz')
    return sj2, ex2

def test_selectseme(asm):
    f = AS.SELECTSEME(asm)
    UT.set_exon_category(asm.sj, asm.ae)
    f()

def gtf2exonsj(gtf, np=12, graphpre=None):
    """Extract exons and sj from GTF
    exon, junction coordinates = zero based (same as BED)
    junction start-1 = exon end
    junction end = exon start

    Args:
        gtf: Pandas.DataFrame

    Returns:
        sj, ex: Pandas.DataFrames for splice junctions and exons

    """
    if len(gtf)==0:  # edge case
        cols = GGB.BEDCOLS[:6]+['locus','_id','cat']
        sj = UT.make_empty_df(cols)
        ex = UT.make_empty_df(cols)
        return sj, ex
    exons = gtf[gtf['typ']=='exon'].sort_values(['chr','st','ed'])
    exons['_id'] = N.arange(len(exons))
    exons.sort_values(['transcript_id','st','ed'], inplace=True)

    # assign 5',3' exon kinds
    ex_s = exons.groupby('transcript_id').size()
    tid_s = ex_s[ex_s==1].index  # single-exon transcripts
    id_m = ex_s[ex_s>1].index    # multi-exon transcripts
    ex_m = exons[exons['transcript_id'].isin(id_m)].copy()
    ex_m.sort_values(['transcript_id','st','ed'], inplace=True)
    ex_f = ex_m.groupby('transcript_id').first()
    ex_l = ex_m.groupby('transcript_id').last()
    if5 = list(ex_f[ex_f['strand']=='+']['_id'].values)
    if3 = list(ex_f[ex_f['strand']=='-']['_id'].values)
    il5 = list(ex_l[ex_l['strand']=='-']['_id'].values)
    il3 = list(ex_l[ex_l['strand']=='+']['_id'].values)
    exons['kind'] = 'i'
    exons.loc[exons['transcript_id'].isin(tid_s),'kind'] = 's'
    exons.loc[exons['_id'].isin(if5+il5),'kind'] = '5'
    exons.loc[exons['_id'].isin(if3+il3),'kind'] = '3'

    # find junctions
    def _igen():
        for k, g in exons.groupby('transcript_id'):
            if len(g)<2:
                continue
            g = g.sort_values(['st','ed'])
            chrom,strand,gid = g.iloc[0][['chr','strand','gene_id']]
            ists = g['ed'].values[:-1] + 1
            ieds = g['st'].values[1:] - 1
            for st,ed in izip(ists,ieds):
                # chr,st,ed,name=tid,sc1,strand,gene_id
                yield (chrom,st,ed,gid,0,strand)
    sj = PD.DataFrame([x for x in _igen()], columns=GGB.BEDCOLS[:6])
    sj['locus'] = UT.calc_locus_strand(sj)
    sj = sj.groupby('locus').first().reset_index()

    #cols = ['chr','st','ed','gene_id','sc1','strand']
    #ex = exons #[cols]
    exons['locus'] = UT.calc_locus_strand(exons)
    ex = exons.groupby(['locus','kind']).first().reset_index()  # remove duplicated
    ex['name'] = ex['gene_id']
    ex['st'] = ex['st'] - 1

    # position id == locus converted to number
    ex.sort_values(['chr','st','ed'], inplace=True)
    ex['_id'] = N.arange(len(ex))
    exg = ex.groupby(['chr','st','ed'])[['_id','locus']].first()
    exg['_pid'] = N.arange(len(exg))  # position id
    cse2pid = dict(zip(exg.index, exg['_pid']))
    ex['_pid'] = [cse2pid[tuple(x)] for x in ex[['chr','st','ed']].values]

    if len(sj)==0:
        ex['_gidx'] = N.arange(len(ex))
        return sj, ex

    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components) set '_gidx'
    if graphpre is None:
        graphpre = './'+str(uuid.uuid4())+'_'
    prefix = os.path.abspath(graphpre)  # need unique prefix for parallel processing
    genes = GP.find_genes4(sj, ex,
        filepre=prefix,
        np=np,
        override=False,
        separatese=True)
    return sj, ex

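# Illustrative sketch of gtf2exonsj on a minimal exon-level GTF-like frame
# (one 2-exon '+' transcript). Column names ('typ','transcript_id','gene_id',...)
# follow the usage above; 1-based coordinates and identifiers are made up.
# Assumes the module-level alias PD (pandas).
def _example_gtf2exonsj():
    gtf = PD.DataFrame({
        'chr':           ['chr1', 'chr1'],
        'st':            [101, 501],
        'ed':            [200, 700],
        'typ':           ['exon', 'exon'],
        'strand':        ['+', '+'],
        'transcript_id': ['tx1', 'tx1'],
        'gene_id':       ['gene1', 'gene1'],
    })
    sj, ex = gtf2exonsj(gtf, np=1)
    return sj, ex
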
def find_genes4(sj, ae, filepre, cachename=None, np=1, override=False, depth=500, separatese=True):
    """
    Adds _gidx column to ae
    Connection: 1) by junctions, 2) by overlap in the same strand

    Returns genes [set([_id,..]), ...]
    """
    if '_id' not in ae.columns:
        LOG.info('setting ex _id...')
        UT.set_ids(ae)
    if '_id' not in sj.columns:
        LOG.info('setting sj _id...')
        UT.set_ids(sj)
    if 'cat' not in ae.columns:
        UT.set_exon_category(sj, ae)
    if 'a_id' not in ae.columns:
        UT.set_ad_info(sj, ae)

    ### FIND GENES
    if cachename and os.path.exists(cachename) and not override:
        LOG.info('loading cached genes (connected components)...')
        genes = pickle.load(open(cachename, 'rb'))
    else:
        LOG.info('finding genes (connected components)...')
        _sttime = time.time()
        if separatese:
            me, se = UT.mese(ae)
            genes = mcore_allcomponents4(sj, me, filepre, np, depth=depth)
            # SE genes
            genes += [set([x]) for x in se['_id']]
        else:
            genes = mcore_allcomponents4(sj, ae, filepre, np, depth=depth)
        # version 4 graph: uses overlaps in addition to junctions to connect
        # genes = [set([_id's]),...]
        if cachename:
            UT.makedirs(os.path.dirname(cachename))
            pickle.dump(genes, open(cachename, 'wb'))
        LOG.info(' time: {0:.3f}s'.format(time.time() - _sttime))

    ### WRITE EXONS W/ GENE number
    LOG.info('assigning gidx...')
    _sttime = time.time()
    i2g = {}   # eid => _gidx
    i2gn = {}  # eid => gname
    g2gn = {}
    i2s = dict(UT.izipcols(ae, ['_id', 'strand']))  # eid => strand
    #i2c = dict(UT.izipcols(ae, ['_id','cat'])) # eid => category
    s2n = {'+': 'P', '-': 'N', '.': '', '.+': '', '.-': ''}
    c2n = {'s': 'S', 'i': 'G', '5': 'G', '3': 'G'}
    for i, ids in enumerate(genes):
        gid = i + 1
        strand = s2n[i2s[list(ids)[0]]]
        cat = 'S' if len(ids) == 1 else 'G'
        if strand == 'N':  # negative strand
            gid = -gid
        gname = 'J{0}{1}{2}'.format(strand, cat, abs(gid))
        g2gn[gid] = gname
        for x in ids:
            i2g[x] = gid
            i2gn[x] = gname

    ae['_gidx'] = [i2g[x] for x in ae['_id']]
    ae['gname'] = [i2gn[x] for x in ae['_id']]

    ## set sj _gidx, use acceptor=>_gidx map (exon a_id, sj a_id)
    a2g = dict(UT.izipcols(ae, ['a_id', '_gidx']))
    d2g = dict(UT.izipcols(ae, ['d_id', '_gidx']))
    sj['_gidx'] = [a2g.get(x, d2g.get(y, 0)) for x, y in UT.izipcols(sj, ['a_id', 'd_id'])]
    sj['gname'] = [g2gn.get(x, '') for x in sj['_gidx']]

    # This shouldn't happen
    nidx = ae['_gidx'] == 0
    if N.sum(nidx) > 0:
        LOG.warning('###### WARNING!!!!!! exons with no gene assignment:{0}'.format(N.sum(nidx)))
        #ae.loc[nidx, '_gidx'] = N.arange(len(ae),len(ae)+N.sum(nidx))

    return genes

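# Illustrative sketch of calling find_genes4 directly (as the converters above
# do via GP.find_genes4): sj/ex are junction/exon DataFrames with at least
# 'chr','st','ed','strand' (the '_id'/'cat'/'a_id' columns are filled in if
# missing), and filepre should be a unique writable prefix (it is handed to the
# parallel graph computation). The return value is a list of sets of exon
# '_id's, one per connected component; '_gidx' and 'gname' columns are added
# to both frames in place.
def _example_find_genes4(sj, ex):
    prefix = os.path.abspath('./'+str(uuid.uuid4())+'_')  # unique scratch prefix
    genes = find_genes4(sj, ex, filepre=prefix, np=1, separatese=True)
    return genes
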