Esempio n. 1
0
def bed2exonsj(bed12, np=4, graphpre=None):
    """Extract exons and junctions from BED12.

    Args:
        bed12: Pandas.DataFrame containing BED12 data. The columns 'chr',
            'st', 'name', 'strand', 'esizes' and 'estarts' are read; the
            last two hold comma separated integers. Two helper columns,
            '_estarts' and '_eends' (per-transcript arrays of absolute exon
            starts and ends), are added to this frame in place.
        np: forwarded unchanged to GP.find_genes4 as ``np`` (presumably the
            number of worker processes -- confirm against GP).
        graphpre: path prefix handed (as an absolute path) to
            GP.find_genes4 as ``filepre``. When None, a random uuid4 based
            prefix in the current directory is used.

    Returns:
        sj, ex: Pandas.DataFrames containing junction and exons, one row
            per distinct locus. Exon 'kind' is 's' for the only exon of a
            transcript, '5'/'3' for terminal exons (strand aware) and 'i'
            for internal exons; junction 'kind' is always 'j'.

    """
    def _parse_ints(field):
        # ``if y`` skips the empty token left behind by a trailing comma
        return N.array([int(y) for y in field.split(',') if y])

    esizes = bed12['esizes'].apply(_parse_ints)
    estarts0 = bed12['estarts'].apply(_parse_ints)
    # block starts are relative to the transcript start -> make them absolute
    bed12['_estarts'] = bed12['st'] + estarts0
    bed12['_eends'] = bed12['_estarts'] + esizes
    tcols = ['chr', 'name', 'strand', '_estarts', '_eends']

    def _egen():
        for chrom, tname, strand, est, eed in UT.izipcols(bed12, tcols):
            if len(est) == 1:
                # single exon transcript: use its own coordinates
                # (the previous code referenced unbound/stale st, ed here)
                yield (chrom, est[0], eed[0], tname, 0, strand, 's')
                continue
            # blocks are ordered by coordinate, so on the minus strand the
            # first block is the 3' exon and the last block the 5' exon
            first, last = ('5', '3') if strand == '+' else ('3', '5')
            yield (chrom, est[0], eed[0], tname, 0, strand, first)
            for st, ed in zip(est[1:-1], eed[1:-1]):
                yield (chrom, st, ed, tname, 0, strand, 'i')
            yield (chrom, est[-1], eed[-1], tname, 0, strand, last)

    def _igen():
        for chrom, tname, strand, est, eed in UT.izipcols(bed12, tcols):
            # a junction spans from one exon end to the next exon start
            for st, ed in zip(eed[:-1], est[1:]):
                # add 1 to match STAR SJ.tab.out
                yield (chrom, st + 1, ed, tname, 0, strand, 'j')

    bedcols = GGB.BEDCOLS[:6] + ['kind']
    ex = PD.DataFrame(list(_egen()), columns=bedcols)
    ex['locus'] = UT.calc_locus_strand(ex)
    # keep one row per locus (exons shared between transcripts)
    ex = ex.groupby('locus').first().reset_index()
    sj = PD.DataFrame(list(_igen()), columns=bedcols)
    sj['locus'] = UT.calc_locus_strand(sj)
    sj = sj.groupby('locus').first().reset_index()

    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components) set '_gidx'
    if graphpre is None:
        graphpre = './' + str(uuid.uuid4()) + '_'
    prefix = os.path.abspath(graphpre)  # need unique prefix for parallel processing
    # return value is not needed here; the call is made for its effect on sj/ex
    GP.find_genes4(sj, ex,
                   filepre=prefix,
                   np=np,
                   override=False,
                   separatese=True)

    return sj, ex
Esempio n. 2
0
 def assign_tcode_sj(self):
     """Tag target junctions against the reference model.

     Loads the 'sj' tables of the target and reference models (stored on
     ``self.sj_tgt`` / ``self.sj_ref``), makes sure both carry a 'locus'
     column, and writes two columns into the target table:

     * 'etcode_<refcode>': 'k.me' when the junction locus also occurs in
       the reference junctions, otherwise 'u.me'.
     * 'gtcode_<refcode>': the gene level code looked up by '_gidx' from
       ``self.ex_tgt``, defaulting to 'u.me'.
     """
     target_sj = self.cn_tgt.model('sj')
     self.sj_tgt = target_sj
     reference_sj = self.cn_ref.model('sj')
     self.sj_ref = reference_sj

     # locus strings are the join key between the two models
     for table in (target_sj, reference_sj):
         if 'locus' not in table.columns:
             table['locus'] = UT.calc_locus_strand(table)

     reference_loci = set(reference_sj['locus'])
     ref_code = self.cn_ref.code
     junction_field = 'etcode_' + ref_code
     gene_field = 'gtcode_' + ref_code

     target_sj[junction_field] = [
         'k.me' if locus in reference_loci else 'u.me'
         for locus in target_sj['locus']
     ]
     gene_codes = UT.df2dict(self.ex_tgt, '_gidx', gene_field)
     target_sj[gene_field] = [
         gene_codes.get(gidx, 'u.me') for gidx in target_sj['_gidx']
     ]
Esempio n. 3
0
def test_calc_locus_strand():
    """UT.calc_locus_strand renders each row as 'chr:st-ed:strand'."""
    chroms = ['chr1', 'chr2', 'chr3']
    starts = [0, 1, 2]
    ends = [10, 20, 30]
    strands = ['+', '-', '+']
    frame = PD.DataFrame({
        'chr': chroms,
        'st': starts,
        'ed': ends,
        'strand': strands,
    })
    expected = [
        '{0}:{1}-{2}:{3}'.format(*row)
        for row in zip(chroms, starts, ends, strands)
    ]
    loci = UT.calc_locus_strand(frame)
    assert expected == list(loci.values)
Esempio n. 4
0
def test_make_sj_bed(sampleinfo, outdir):
    """make_sj_bed must create every junction output file it advertises."""
    names = MG.MergeInputNames(sampleinfo, 'Fev_merge_test', outdir)
    merger = MG.MergeInputs(names, genome='mm10', np=1)
    merger.make_sj_bed()
    # (path getter, arguments); each path is resolved right before its check
    expected_outputs = (
        (names.sj0_bed, ()),
        (names.allsj_txt, ()),
        (names.allsj_stats, ()),
        (names.sj_bed, ('p',)),
        (names.sj_bed, ('n',)),
    )
    for path_getter, args in expected_outputs:
        assert os.path.exists(path_getter(*args))
    # sj include unstranded junction
    plus_sj = GGB.read_sj(names.sj_bed('p'))
    plus_sj['locus'] = UT.calc_locus_strand(plus_sj)
Esempio n. 5
0
def gtf2exonsj(gtf, np=12, graphpre=None):
    """Extract exons and sj from GTF
    exon, junction coordinates = zero based (same as BED)
    junction start-1 = exon end
    junction end = exon start

    Args:
        gtf: Pandas.DataFrame; the columns 'typ', 'chr', 'st', 'ed',
            'strand', 'transcript_id' and 'gene_id' are read. Only rows
            with typ == 'exon' are used.
        np: forwarded unchanged to GP.find_genes4 as ``np`` (presumably the
            number of worker processes -- TODO confirm against GP).
        graphpre: path prefix handed (as an absolute path) to
            GP.find_genes4 as ``filepre``. When None, a random uuid4 based
            prefix in the current directory is used.

    Returns:
        sj, ex: Pandas.DataFrames for splice junctions and exons.
            For an empty ``gtf`` both are empty frames with the first six
            BED columns plus 'locus', '_id' and 'cat'. When no junction is
            found, ``ex`` gets one '_gidx' per row and the function returns
            before the set_info / set_exon_category / find_genes4 steps.

    """
    if len(gtf)==0: # edge case
        cols = GGB.BEDCOLS[:6]+['locus','_id','cat']
        sj = UT.make_empty_df(cols)
        ex = UT.make_empty_df(cols)
        return sj,ex
    # number the exon rows in genomic order, then regroup them by transcript
    exons = gtf[gtf['typ']=='exon'].sort_values(['chr','st','ed'])
    exons['_id'] = N.arange(len(exons))
    exons.sort_values(['transcript_id','st','ed'],inplace=True)
    # 5',3'
    # ex_s: exon count per transcript; tid_s: single exon transcripts,
    # id_m: transcripts with more than one exon
    ex_s = exons.groupby('transcript_id').size()
    tid_s = ex_s[ex_s==1].index
    id_m = ex_s[ex_s>1].index
    ex_m = exons[exons['transcript_id'].isin(id_m)].copy()
    ex_m.sort_values(['transcript_id','st','ed'], inplace=True)
    # first/last exon (by coordinate) of every multi exon transcript
    ex_f = ex_m.groupby('transcript_id').first()
    ex_l = ex_m.groupby('transcript_id').last()
    # if5/if3: _ids of first exons that are 5'/3'; il5/il3: same for last
    # exons. On '+' the first exon is 5' and the last 3'; on '-' it is swapped.
    if5 = list(ex_f[ex_f['strand']=='+']['_id'].values)
    if3 = list(ex_f[ex_f['strand']=='-']['_id'].values)
    il5 = list(ex_l[ex_l['strand']=='-']['_id'].values)
    il3 = list(ex_l[ex_l['strand']=='+']['_id'].values)
    # kind: 'i' (default), 's' single exon transcript, '5' / '3' terminal exon
    exons['kind'] = 'i'
    exons.loc[exons['transcript_id'].isin(tid_s),'kind'] = 's'
    exons.loc[exons['_id'].isin(if5+il5),'kind'] = '5'
    exons.loc[exons['_id'].isin(if3+il3),'kind'] = '3'

    # find junctions
    def _igen():
        # one junction per pair of consecutive exons within a transcript
        for k, g in exons.groupby('transcript_id'):
            if len(g)<2:
                continue
            g = g.sort_values(['st','ed'])
            # chr/strand/gene_id are taken from the transcript's first exon
            chrom,strand,gid=g.iloc[0][['chr','strand','gene_id']]
            ists = g['ed'].values[:-1] + 1
            ieds = g['st'].values[1:] - 1
            for st,ed in izip(ists,ieds):
                # chr,st,ed,name=tid,sc1,strand,gene_id
                # NOTE(review): the name slot actually receives gene_id (gid)
                yield (chrom,st,ed,gid,0,strand)
    sj = PD.DataFrame([x for x in _igen()], columns=GGB.BEDCOLS[:6])
    sj['locus'] = UT.calc_locus_strand(sj)
    # keep one row per junction locus (junctions shared between transcripts)
    sj = sj.groupby('locus').first().reset_index()

    # NOTE(review): 'locus' is computed here from the unshifted 'st'; the
    # 'st' column is decremented a few lines below, so the exon locus string
    # and the final 'st' differ by one -- confirm this is intended.
    exons['locus'] = UT.calc_locus_strand(exons)
    ex = exons.groupby(['locus','kind']).first().reset_index() # remove duplicated
    ex['name'] = ex['gene_id']
    ex['st'] = ex['st'] - 1
    # position id == locus converted to number
    # '_id' is reassigned in (chr, st, ed) order; rows sharing the same
    # (chr, st, ed) receive the same '_pid'
    ex.sort_values(['chr','st','ed'],inplace=True) 
    ex['_id'] = N.arange(len(ex))
    exg = ex.groupby(['chr','st','ed'])[['_id','locus']].first()
    exg['_pid'] = N.arange(len(exg)) # position id
    cse2pid = dict(zip(exg.index,exg['_pid']))
    ex['_pid'] = [cse2pid[tuple(x)] for x in ex[['chr','st','ed']].values]

    if len(sj)==0:
        # no junctions: every exon row becomes its own gene
        ex['_gidx'] = N.arange(len(ex))
        return sj, ex

    # return values are ignored; presumably both helpers annotate sj/ex in
    # place -- verify against UT
    UT.set_info(sj,ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components) set '_gidx'
    if graphpre is None:
        graphpre = './'+str(uuid.uuid4())+'_'
    prefix = os.path.abspath(graphpre) # need unique prefix for parallel processing
    # 'genes' is not used afterwards; the call is kept for its effect on sj/ex
    genes = GP.find_genes4(sj,ex,
        filepre=prefix,
        np=np,
        override=False,
        separatese=True)

    return sj, ex
Esempio n. 6
0
    def prep_sjex(self, en, np=1, savesjex=True, calccovs=True):
        """Assign ecov, gcov and junction counts (ucnt, mcnt, jcnt).

        The 'ex' and 'sj' tables of ``en`` for this data code are annotated
        in place. Columns that already exist are left untouched.

        Args:
            en: model holder providing ``model``, ``modelpath``, ``fname2``,
                ``savemodel`` and ``code``; its ``sj`` attribute is replaced
                by the filtered junction table when junctions exist.
            np: forwarded to CC.calc_ecov / CC.calc_gcov as ``np``.
            savesjex: when True, tables that received new 'len', coverage
                or count columns are written back through ``en.savemodel``.
            calccovs: when True, coverages are computed from
                ``self.bigwig``; when False, missing coverage columns are
                filled with 0 (this alone does not trigger a save).
        """
        dcode = self.datacode
        sj = en.model('sj', dcode)
        ex = en.model('ex', dcode)
        savesj = False
        saveex = False
        # check support: keep only junctions whose acceptor and donor ids
        # both occur among the exons
        if len(sj) > 0:
            dids = set(ex['d_id'].values)
            aids = set(ex['a_id'].values)
            idx = sj['a_id'].isin(aids) & sj['d_id'].isin(dids)
            sj = sj[idx].copy()
            en.sj = sj
        if '_id' not in ex.columns:  # edge case (len(sj)==0)
            ex['_id'] = N.arange(len(ex))
        if '_gidx' not in ex.columns:  # edge case (len(sj)==0)
            ex['_gidx'] = N.arange(len(ex))

        # length
        if 'len' not in sj.columns:
            sj['len'] = sj['ed'] - sj['st']
            savesj = True
        if 'len' not in ex.columns:
            ex['len'] = ex['ed'] - ex['st']
            saveex = True

        ecovname = self.colname('ecov')
        gcovname = self.colname('gcov')
        if calccovs:
            print('calccov for {0}'.format(en.code))
            # ecov
            if ecovname not in ex.columns:
                ecov = CC.calc_ecov(
                    expath=en.modelpath('ex'),
                    cipath=en.modelpath('ci'),
                    bwpath=self.bigwig,
                    dstprefix=en.fname2(
                        '', self.datacode),  # cov is data dependent
                    override=False,  # override previous?
                    np=np)
                # label based lookup by exon id; reindex replaces the
                # removed DataFrame.ix and yields NaN for unknown ids
                ecov_by_eid = ecov.set_index('eid')['ecov']
                ex[ecovname] = ecov_by_eid.reindex(ex['_id'].values).values
                saveex = True
            # gcov, glen
            if gcovname not in ex.columns:
                gcov = CC.calc_gcov(
                    expath=en.modelpath('ex'),
                    cipath=en.modelpath('ci'),
                    bwpath=self.bigwig,
                    dstprefix=en.fname2('', self.datacode),
                    override=False,  # reuse covci from ecov calc
                    np=np)
                # same label based lookup, keyed by gene index
                tmp = gcov.set_index('_gidx').reindex(ex['_gidx'].values)
                ex[gcovname] = tmp['gcov'].values
                if 'glen' in tmp:
                    # glen is only dependent on model not data
                    ex['glen'] = tmp['glen'].values
                saveex = True
        else:
            if ecovname not in ex.columns:
                ex[ecovname] = 0
            if gcovname not in ex.columns:
                ex[gcovname] = 0

        # sjcnt
        ucntname = self.colname('ucnt')
        mcntname = self.colname('mcnt')
        jcntname = self.colname('jcnt')
        sjfile = self.sjfile
        if ucntname not in sj.columns:
            if sjfile.endswith(('.bed', '.bed.gz')):  # no header
                dsj = UT.read_pandas(sjfile,
                                     names=[
                                         'chr', 'st', 'ed', 'name', 'ucnt',
                                         'strand', 'mcnt'
                                     ])
            else:  # assume txt file with header
                dsj = UT.read_pandas(sjfile)
            # locus based matching
            dsj['locus'] = UT.calc_locus_strand(dsj)
            sj['locus'] = UT.calc_locus_strand(sj)
            l2u = UT.df2dict(dsj, 'locus', 'ucnt')
            l2m = UT.df2dict(dsj, 'locus', 'mcnt')
            sj[ucntname] = [l2u.get(x, 0) for x in sj['locus']]
            sj[mcntname] = [l2m.get(x, 0) for x in sj['locus']]
            # jcnt: ucnt when it is non-zero, otherwise mcnt
            sj[jcntname] = [x or y for x, y in sj[[ucntname, mcntname]].values]
            savesj = True
        if saveex and savesjex:
            en.savemodel('ex', dcode, category='output')
        if savesj and savesjex:
            en.savemodel('sj', dcode, category='output')