def bed2exonsj(bed12, np=4, graphpre=None):
    """Extract exons and junctions from BED12

    Every transcript row is expanded into its exons (tagged with a 'kind':
    's' single exon, '5'/'3' terminal exons according to strand, 'i'
    internal) and into the junctions between consecutive exons (kind 'j').
    Rows sharing the same locus (UT.calc_locus_strand) are collapsed to one.

    Args:
        bed12: Pandas.DataFrame containing BED12 data. Columns 'chr', 'st',
            'name', 'strand', 'esizes' and 'estarts' are read. NOTE: the
            helper columns '_estarts' and '_eends' are added to this frame
            in place.
        np: forwarded to GP.find_genes4 as np (presumably the number of
            processes -- confirm against GP.find_genes4).
        graphpre: path prefix forwarded (made absolute) to GP.find_genes4 as
            filepre. When None a uuid4-based prefix in the current directory
            is generated.

    Returns:
        sj, ex: Pandas.DataFrames containing junction and exons
    """
    def _parse_ints(text):
        # comma separated integers; empty fields (e.g. after a trailing
        # comma) are skipped
        return N.array([int(y) for y in text.split(',') if y])

    esizes = bed12['esizes'].apply(_parse_ints)
    estarts0 = bed12['estarts'].apply(_parse_ints)
    # block starts are relative to the transcript start
    bed12['_estarts'] = bed12['st'] + estarts0
    bed12['_eends'] = bed12['_estarts'] + esizes
    fields = ['chr', 'name', 'strand', '_estarts', '_eends']

    def _egen():
        for chrom, tname, strand, est, eed in UT.izipcols(bed12, fields):
            if len(est) == 1:
                # single exon transcript: use its own coordinates
                # (previously referenced the not-yet-bound loop variables
                # st/ed of the internal-exon loops below)
                yield (chrom, est[0], eed[0], tname, 0, strand, 's')
                continue
            # on '-' strand the left-most exon is the 3' end
            if strand == '+':
                left, right = '5', '3'
            else:
                left, right = '3', '5'
            yield (chrom, est[0], eed[0], tname, 0, strand, left)
            for st, ed in izip(est[1:-1], eed[1:-1]):
                yield (chrom, st, ed, tname, 0, strand, 'i')
            yield (chrom, est[-1], eed[-1], tname, 0, strand, right)

    def _igen():
        for chrom, tname, strand, est, eed in UT.izipcols(bed12, fields):
            # junction spans from the end of one exon to the start of the next
            for st, ed in izip(eed[:-1], est[1:]):
                # add 1 to match STAR SJ.tab.out
                yield (chrom, st + 1, ed, tname, 0, strand, 'j')

    outcols = GGB.BEDCOLS[:6] + ['kind']

    ex = PD.DataFrame(list(_egen()), columns=outcols)
    ex['locus'] = UT.calc_locus_strand(ex)
    ex = ex.groupby('locus').first().reset_index()  # one row per locus

    sj = PD.DataFrame(list(_igen()), columns=outcols)
    sj['locus'] = UT.calc_locus_strand(sj)
    sj = sj.groupby('locus').first().reset_index()  # one row per locus

    # NOTE(review): unlike gtf2exonsj, an input without any junction is not
    # special-cased here -- confirm the helpers below accept an empty sj.
    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components) set '_gidx'
    if graphpre is None:
        graphpre = './' + str(uuid.uuid4()) + '_'
    prefix = os.path.abspath(graphpre)  # need unique prefix for parallel processing
    GP.find_genes4(sj, ex,
                   filepre=prefix,
                   np=np,
                   override=False,
                   separatese=True)
    return sj, ex
def assign_tcode_sj(self):
    """Tag target junctions with codes relative to the reference.

    Loads the 'sj' models of self.cn_tgt and self.cn_ref (kept as
    self.sj_tgt / self.sj_ref), makes sure both carry a 'locus' column and
    then writes two columns into the target junctions, both suffixed with
    the reference code (self.cn_ref.code):

    * 'etcode_<code>': 'k.me' when the junction locus also occurs in the
      reference, 'u.me' otherwise.
    * 'gtcode_<code>': value of the same-named column of self.ex_tgt looked
      up through '_gidx' ('u.me' when the '_gidx' is not found).
    """
    tgt_sj = self.cn_tgt.model('sj')
    self.sj_tgt = tgt_sj
    ref_sj = self.cn_ref.model('sj')
    self.sj_ref = ref_sj

    # make sure both tables can be matched by locus
    for table in (tgt_sj, ref_sj):
        if 'locus' not in table.columns:
            table['locus'] = UT.calc_locus_strand(table)

    known_loci = set(ref_sj['locus'])
    ref_code = self.cn_ref.code
    e_field = 'etcode_' + ref_code
    g_field = 'gtcode_' + ref_code

    # junction level code: known/unknown by exact locus match
    tgt_sj[e_field] = [
        'k.me' if locus in known_loci else 'u.me'
        for locus in tgt_sj['locus']
    ]

    # gene level code: inherited from the target exons via '_gidx'
    gidx2code = UT.df2dict(self.ex_tgt, '_gidx', 'gtcode_' + ref_code)
    tgt_sj[g_field] = [gidx2code.get(gidx, 'u.me') for gidx in tgt_sj['_gidx']]
def test_calc_locus_strand():
    """UT.calc_locus_strand yields one 'chr:st-ed:strand' string per row."""
    frame = PD.DataFrame({
        'chr': ['chr1', 'chr2', 'chr3'],
        'st': [0, 1, 2],
        'ed': [10, 20, 30],
        'strand': ['+', '-', '+']
    })
    expected = ['chr1:0-10:+', 'chr2:1-20:-', 'chr3:2-30:+']

    loci = UT.calc_locus_strand(frame)

    assert list(loci.values) == expected
def test_make_sj_bed(sampleinfo, outdir):
    """make_sj_bed writes every junction file named by MergeInputNames."""
    names = MG.MergeInputNames(sampleinfo, 'Fev_merge_test', outdir)
    merger = MG.MergeInputs(names, genome='mm10', np=1)

    merger.make_sj_bed()

    # all expected outputs must exist on disk
    assert os.path.exists(names.sj0_bed())
    assert os.path.exists(names.allsj_txt())
    assert os.path.exists(names.allsj_stats())
    assert os.path.exists(names.sj_bed('p'))
    assert os.path.exists(names.sj_bed('n'))

    # sj include unstranded junction
    plus_sj = GGB.read_sj(names.sj_bed('p'))
    plus_sj['locus'] = UT.calc_locus_strand(plus_sj)
def gtf2exonsj(gtf, np=12, graphpre=None):
    """Extract exons and sj from GTF

    exon, junction coordinates = zero based (same as BED)
    junction start-1 = exon end
    junction end = exon start

    Args:
        gtf: Pandas.DataFrame; rows with typ == 'exon' are used and the
            columns 'chr', 'st', 'ed', 'strand', 'transcript_id' and
            'gene_id' are read.
        np: forwarded to GP.find_genes4 as np (presumably the number of
            processes -- confirm against GP.find_genes4).
        graphpre: path prefix forwarded (made absolute) to GP.find_genes4 as
            filepre; a uuid4-based prefix in the current directory is used
            when None.

    Returns:
        sj, ex: Pandas.DataFrames for splice junctions and exons
    """
    if len(gtf) == 0:
        # edge case: nothing to extract, hand back two empty frames
        empty_cols = GGB.BEDCOLS[:6] + ['locus', '_id', 'cat']
        return UT.make_empty_df(empty_cols), UT.make_empty_df(empty_cols)

    exons = gtf[gtf['typ'] == 'exon'].sort_values(['chr', 'st', 'ed'])
    exons['_id'] = N.arange(len(exons))
    exons.sort_values(['transcript_id', 'st', 'ed'], inplace=True)

    # --- exon kind: 's' single, '5'/'3' terminal, 'i' internal ---
    n_exons = exons.groupby('transcript_id').size()
    single_tids = n_exons[n_exons == 1].index
    multi_tids = n_exons[n_exons > 1].index
    multi = exons[exons['transcript_id'].isin(multi_tids)].copy()
    multi.sort_values(['transcript_id', 'st', 'ed'], inplace=True)
    per_transcript = multi.groupby('transcript_id')
    leftmost = per_transcript.first()
    rightmost = per_transcript.last()
    # the left-most exon is the 5' end on '+' and the 3' end on '-'
    first5 = list(leftmost[leftmost['strand'] == '+']['_id'].values)
    first3 = list(leftmost[leftmost['strand'] == '-']['_id'].values)
    last5 = list(rightmost[rightmost['strand'] == '-']['_id'].values)
    last3 = list(rightmost[rightmost['strand'] == '+']['_id'].values)
    exons['kind'] = 'i'
    exons.loc[exons['transcript_id'].isin(single_tids), 'kind'] = 's'
    exons.loc[exons['_id'].isin(first5 + last5), 'kind'] = '5'
    exons.loc[exons['_id'].isin(first3 + last3), 'kind'] = '3'

    # --- junctions: gaps between consecutive exons of a transcript ---
    def _junctions():
        for _tid, grp in exons.groupby('transcript_id'):
            if len(grp) < 2:
                continue
            grp = grp.sort_values(['st', 'ed'])
            chrom, strand, gid = grp.iloc[0][['chr', 'strand', 'gene_id']]
            j_starts = grp['ed'].values[:-1] + 1
            j_ends = grp['st'].values[1:] - 1
            for st, ed in izip(j_starts, j_ends):
                # chr, st, ed, name (= gene_id), sc1, strand
                yield (chrom, st, ed, gid, 0, strand)

    sj = PD.DataFrame(list(_junctions()), columns=GGB.BEDCOLS[:6])
    sj['locus'] = UT.calc_locus_strand(sj)
    sj = sj.groupby('locus').first().reset_index()

    # --- exons: one row per (locus, kind) ---
    exons['locus'] = UT.calc_locus_strand(exons)
    ex = exons.groupby(['locus', 'kind']).first().reset_index()  # remove duplicated
    ex['name'] = ex['gene_id']
    ex['st'] = ex['st'] - 1  # shift start by one (zero based, see docstring)

    # position id == locus converted to number
    ex.sort_values(['chr', 'st', 'ed'], inplace=True)
    ex['_id'] = N.arange(len(ex))
    positions = ex.groupby(['chr', 'st', 'ed'])[['_id', 'locus']].first()
    positions['_pid'] = N.arange(len(positions))  # position id
    pos2pid = dict(zip(positions.index, positions['_pid']))
    ex['_pid'] = [pos2pid[tuple(row)] for row in ex[['chr', 'st', 'ed']].values]

    if len(sj) == 0:
        # no junction at all: every exon is its own gene
        ex['_gidx'] = N.arange(len(ex))
        return sj, ex

    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components) set '_gidx'
    if graphpre is None:
        graphpre = './' + str(uuid.uuid4()) + '_'
    prefix = os.path.abspath(graphpre)  # need unique prefix for parallel processing
    genes = GP.find_genes4(sj, ex,
                           filepre=prefix,
                           np=np,
                           override=False,
                           separatese=True)
    return sj, ex
def prep_sjex(self, en, np=1, savesjex=True, calccovs=True):
    """Assign ecov, gcov, jcnt

    Loads the 'sj' and 'ex' models of *en* for self.datacode and makes sure
    they carry the columns needed downstream: 'len' on both, the data
    specific ecov/gcov columns on ex and the ucnt/mcnt/jcnt columns on sj
    (names come from self.colname). Junctions whose 'a_id'/'d_id' are not
    found among the exons are dropped and the filtered table is stored back
    as en.sj.

    Args:
        en: model holder; its model(), modelpath(), fname2(), savemodel()
            methods and its code attribute are used.
        np: forwarded to CC.calc_ecov / CC.calc_gcov as np.
        savesjex: when True, tables that gained columns are written with
            en.savemodel(..., category='output').
        calccovs: when True missing coverage columns are computed from
            self.bigwig; when False missing coverage columns are set to 0
            (and that alone does not trigger a save).
    """
    dcode = self.datacode
    sj = en.model('sj', dcode)
    ex = en.model('ex', dcode)
    savesj = False
    saveex = False

    # check support: keep junctions whose acceptor and donor both exist in ex
    if len(sj) > 0:
        dids = set(ex['d_id'].values)
        aids = set(ex['a_id'].values)
        idx = sj['a_id'].isin(aids) & sj['d_id'].isin(dids)
        sj = sj[idx].copy()
        en.sj = sj
    if '_id' not in ex.columns:  # edge case (len(sj)==0)
        ex['_id'] = N.arange(len(ex))
    if '_gidx' not in ex.columns:  # edge case (len(sj)==0)
        ex['_gidx'] = N.arange(len(ex))

    # length
    if 'len' not in sj.columns:
        sj['len'] = sj['ed'] - sj['st']
        savesj = True
    if 'len' not in ex.columns:
        ex['len'] = ex['ed'] - ex['st']
        saveex = True

    if calccovs:
        print('calccov for {0}'.format(en.code))
        # ecov
        ecovname = self.colname('ecov')
        if ecovname not in ex.columns:
            ecov = CC.calc_ecov(
                expath=en.modelpath('ex'),
                cipath=en.modelpath('ci'),
                bwpath=self.bigwig,
                dstprefix=en.fname2('', self.datacode),  # cov is data dependent
                override=False,  # override previous?
                np=np)
            # label based lookup by exon id; reindex replaces the removed
            # .ix indexer and yields NaN for ids without a coverage row
            by_eid = ecov.set_index('eid').reindex(ex['_id'].values)
            ex[ecovname] = by_eid['ecov'].values
            saveex = True
        # gcov, glen
        gcovname = self.colname('gcov')
        if gcovname not in ex.columns:
            gcov = CC.calc_gcov(
                expath=en.modelpath('ex'),
                cipath=en.modelpath('ci'),
                bwpath=self.bigwig,
                dstprefix=en.fname2('', self.datacode),
                override=False,  # reuse covci from ecov calc
                np=np)
            # one gcov row is looked up per exon through its gene index
            tmp = gcov.set_index('_gidx').reindex(ex['_gidx'].values)
            ex[gcovname] = tmp['gcov'].values
            if 'glen' in tmp:
                # glen is only dependent on model not data
                ex['glen'] = tmp['glen'].values
            saveex = True
    else:
        # coverage calculation skipped: fill missing columns with 0
        ecovname = self.colname('ecov')
        if ecovname not in ex.columns:
            ex[ecovname] = 0
        gcovname = self.colname('gcov')
        if gcovname not in ex.columns:
            ex[gcovname] = 0

    # sjcnt
    ucntname = self.colname('ucnt')
    mcntname = self.colname('mcnt')
    jcntname = self.colname('jcnt')
    sjfile = self.sjfile
    if ucntname not in sj.columns:
        if sjfile.endswith(('.bed', '.bed.gz')):  # no header
            dsj = UT.read_pandas(
                sjfile,
                names=['chr', 'st', 'ed', 'name', 'ucnt', 'strand', 'mcnt'])
        else:  # assume txt file with header
            dsj = UT.read_pandas(sjfile)
        # locus based matching
        dsj['locus'] = UT.calc_locus_strand(dsj)
        sj['locus'] = UT.calc_locus_strand(sj)
        l2u = UT.df2dict(dsj, 'locus', 'ucnt')
        l2m = UT.df2dict(dsj, 'locus', 'mcnt')
        sj[ucntname] = [l2u.get(x, 0) for x in sj['locus']]
        sj[mcntname] = [l2m.get(x, 0) for x in sj['locus']]
        # jcnt: ucnt when it is non-zero, otherwise mcnt
        sj[jcntname] = [x or y for x, y in sj[[ucntname, mcntname]].values]
        savesj = True

    if saveex and savesjex:
        en.savemodel('ex', dcode, category='output')
    if savesj and savesjex:
        en.savemodel('sj', dcode, category='output')