Exemple #1
0
def as2exsj(dstpre, np=7):
    """Build exon and junction tables from the files found under ``dstpre``.

    Reads ``<dstpre>.exdf.txt.gz``, ``<dstpre>.sjdf.txt.gz``,
    ``<dstpre>.sedf.txt.gz`` and ``<dstpre>.paths.txt.gz``, merges the two
    exon tables, annotates them, and writes ``<dstpre>.ex.txt.gz``,
    ``<dstpre>.sj.txt.gz`` and ``<dstpre>.ci.txt.gz``.

    Args:
        dstpre: path prefix shared by all input and output files
        np: forwarded to GP.find_genes4 (presumably the number of worker
            processes -- confirm against GP)

    Returns:
        sj, ex: Pandas.DataFrames containing junctions and exons

    """
    exon_cols = A2.EXDFCOLS
    exdf = UT.read_pandas(dstpre + '.exdf.txt.gz', names=exon_cols)
    sj = UT.read_pandas(dstpre + '.sjdf.txt.gz', names=A2.SJDFCOLS)
    sedf = UT.read_pandas(dstpre + '.sedf.txt.gz', names=exon_cols)
    # The paths table is loaded (so a missing file still fails here) but its
    # content is not used by this function.
    UT.read_pandas(dstpre + '.paths.txt.gz', names=A2.PATHCOLS)

    # junction start is shifted by one before annotation
    sj['st'] = sj['st'] + 1

    # stack both exon tables into a single frame with a fresh index
    ex = PD.concat([exdf[exon_cols], sedf[exon_cols]], ignore_index=True)
    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components) set '_gidx'
    # a random uuid keeps the prefix unique for parallel processing
    prefix = os.path.abspath(dstpre + str(uuid.uuid4()) + '_')
    GP.find_genes4(
        sj, ex,
        filepre=prefix,
        np=np,
        override=False,
        separatese=True,
    )

    # exons whose 'kind' is 3 or 5 get the same value as their 'cat'
    for end in ('3', '5'):
        ex.loc[ex['kind'] == end, 'cat'] = end

    UT.write_pandas(ex, dstpre + '.ex.txt.gz', 'h')
    UT.write_pandas(sj, dstpre + '.sj.txt.gz', 'h')
    UT.chopintervals(ex, dstpre + '.ci.txt.gz')
    return sj, ex
Exemple #2
0
    def __init__(self, sj, me, depth=500, maxcnt=990):
        """Store the inputs and precompute exon-junction-exon join tables.

        Args:
            sj: Pandas.DataFrame of junctions; columns 'd_id', '_id' and
                'a_id' are read
            me: Pandas.DataFrame of exons; columns 'a_id', '_id' and 'd_id'
                are read
            depth: stored as ``self.depth`` (not used in this method)
            maxcnt: stored as ``self.maxcnt`` (not used in this method)

        """
        UT.set_info(sj, me)
        self.sj = sj
        self.me = me
        self.depth = depth
        self.maxcnt = maxcnt

        # prepare joined table
        exon_ids = me[['a_id', '_id', 'd_id']]
        # exons with an acceptor id (a_id != -1); the exon id becomes 'e_id_a'
        acceptor_side = exon_ids[exon_ids['a_id'] != -1].rename(
            columns={'d_id': 'd_id_a', '_id': 'e_id_a'})
        # exons with a donor id (d_id != -1); the exon id becomes 'e_id_d'
        donor_side = exon_ids[exon_ids['d_id'] != -1].rename(
            columns={'a_id': 'a_id_d', '_id': 'e_id_d'})
        junction_ids = sj[['d_id', '_id', 'a_id']]

        # donor exon -> junction (on d_id), then junction -> acceptor exon (on a_id)
        donor_joined = PD.merge(donor_side, junction_ids,
                                how='outer', on='d_id', sort=False)
        joined = PD.merge(donor_joined, acceptor_side,
                          how='outer', on='a_id', sort=False)

        # remove dangling exons, junctions: keep rows having both exon ids
        both_ends = joined['e_id_a'].notnull() & joined['e_id_d'].notnull()
        linked = joined[both_ends].copy()

        # the outer joins leave the id columns non-integer; cast them back
        for col in ('e_id_d', 'e_id_a'):
            linked[col] = linked[col].astype(int)

        # groupby exon id
        self.a = linked.groupby('e_id_a')['e_id_d']
        self.d = linked.groupby('e_id_d')['e_id_a']
        self.j2 = joined
        self.j2nd = linked
        self.ga = joined.groupby('a_id')  # groupby acceptor
        self.gd = joined.groupby('d_id')  # groupby donor
        self.exons = me
Exemple #3
0
def bed2exonsj(bed12, np=4, graphpre=None):
    """Extract exons and junctions from BED12

    Args:
        bed12: Pandas.DataFrame containing BED12 data. Columns 'chr', 'st',
            'name', 'strand', 'esizes' and 'estarts' are read. Two helper
            columns, '_estarts' and '_eends' (per-row arrays of exon
            starts/ends), are added to this frame in place.
        np: forwarded to GP.find_genes4
        graphpre: path prefix forwarded (as an absolute path) to
            GP.find_genes4; when None a unique prefix in the current
            directory is generated

    Returns:
        sj, ex: Pandas.DataFrames containing junction and exons

    """
    def _to_int_array(field):
        # "a,b,c," -> array([a, b, c]); empty items (trailing comma) are skipped
        return N.array([int(y) for y in field.split(',') if y])

    esizes = bed12['esizes'].apply(_to_int_array)
    estarts0 = bed12['estarts'].apply(_to_int_array)
    # block starts in BED12 are relative to 'st'; make them absolute
    bed12['_estarts'] = bed12['st'] + estarts0
    bed12['_eends'] = bed12['_estarts'] + esizes

    fields = ['chr', 'name', 'strand', '_estarts', '_eends']

    def _egen():
        for chrom, tname, strand, est, eed in UT.izipcols(bed12, fields):
            if len(est) == 1:
                # Single-exon transcript. The coordinates must come from
                # est/eed: the previous code yielded the loop variables
                # st/ed, which are unbound here (or stale values left from an
                # earlier transcript's internal exons).
                yield (chrom, est[0], eed[0], tname, 0, strand, 's')
                continue
            # leftmost exon is the 5' end on '+', the 3' end otherwise
            if strand == '+':
                first_kind, last_kind = '5', '3'
            else:  # '-'
                first_kind, last_kind = '3', '5'
            yield (chrom, est[0], eed[0], tname, 0, strand, first_kind)
            for st, ed in izip(est[1:-1], eed[1:-1]):
                yield (chrom, st, ed, tname, 0, strand, 'i')
            yield (chrom, est[-1], eed[-1], tname, 0, strand, last_kind)

    def _igen():
        for chrom, tname, strand, est, eed in UT.izipcols(bed12, fields):
            # a junction spans from one exon's end to the next exon's start
            for st, ed in izip(eed[:-1], est[1:]):
                # add 1 to match STAR SJ.tab.out
                yield (chrom, st + 1, ed, tname, 0, strand, 'j')

    out_cols = GGB.BEDCOLS[:6] + ['kind']

    # keep one row per locus for both exons and junctions
    ex = PD.DataFrame(list(_egen()), columns=out_cols)
    ex['locus'] = UT.calc_locus_strand(ex)
    ex = ex.groupby('locus').first().reset_index()
    sj = PD.DataFrame(list(_igen()), columns=out_cols)
    sj['locus'] = UT.calc_locus_strand(sj)
    sj = sj.groupby('locus').first().reset_index()

    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components) set '_gidx'
    if graphpre is None:
        graphpre = './' + str(uuid.uuid4()) + '_'
    prefix = os.path.abspath(graphpre)  # need unique prefix for parallel processing
    GP.find_genes4(
        sj, ex,
        filepre=prefix,
        np=np,
        override=False,
        separatese=True,
    )

    return sj, ex
Exemple #4
0
def as3exsj(dstpre, minelen=150, np=7):
    """Build exon and junction tables from the files found under ``dstpre``.

    Reads ``<dstpre>.exdf.txt.gz``, ``<dstpre>.sjdf.txt.gz``,
    ``<dstpre>.sedf.txt.gz`` and ``<dstpre>.paths.txt.gz``, merges the two
    exon tables, assigns genes, drops genes whose summed exon length is
    below ``minelen``, and writes ``<dstpre>.ex.txt.gz``,
    ``<dstpre>.sj.txt.gz`` and ``<dstpre>.ci.txt.gz``.

    Args:
        dstpre: path prefix shared by all input and output files
        minelen: genes ('_gidx' groups) whose exon lengths (ed - st) sum to
            less than this value are removed
        np: forwarded to GP.find_genes3

    Returns:
        sj2, ex2: Pandas.DataFrames containing the filtered junctions and
            exons

    """
    cols = A3.EXDFCOLS
    ex = UT.read_pandas(dstpre + '.exdf.txt.gz', names=cols)
    sj = UT.read_pandas(dstpre + '.sjdf.txt.gz', names=A3.SJDFCOLS)
    se = UT.read_pandas(dstpre + '.sedf.txt.gz', names=cols)
    # NOTE(review): the paths table is read but never used below; the read is
    # kept so that a missing file is still reported here.
    UT.read_pandas(dstpre + '.paths.txt.gz', names=A3.PATHCOLS)

    # junction start is shifted by one before annotation
    sj['st'] = sj['st'] + 1
    ex = PD.concat([ex[cols], se[cols]], ignore_index=True)
    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components) set '_gidx'
    # find_genes3 is used instead of find_genes4:
    # don't use exon overlap as connection
    GP.find_genes3(sj, ex, np=np, override=False)
    ex.loc[ex['kind'] == '3', 'cat'] = '3'
    ex.loc[ex['kind'] == '5', 'cat'] = '5'

    # remove these with elen smaller than minelen
    ex['len'] = ex['ed'] - ex['st']
    exsiz = ex.groupby('_gidx')['len'].sum()
    rgidx = exsiz[exsiz < minelen].index.values  # gene indices to drop
    ex2 = ex[~ex['_gidx'].isin(rgidx)]
    sj2 = sj[~sj['_gidx'].isin(rgidx)]
    # Report the real exon counts before/after filtering. rgidx holds gene
    # indices, so len(ex)-len(rgidx) is not the number of remaining exons.
    LOG.info('minelen filter #ex {0}=>{1}'.format(len(ex), len(ex2)))

    # write
    UT.write_pandas(ex2, dstpre + '.ex.txt.gz', 'h')
    UT.write_pandas(sj2, dstpre + '.sj.txt.gz', 'h')
    UT.chopintervals(ex2, dstpre + '.ci.txt.gz')
    return sj2, ex2
Exemple #5
0
def gtf2exonsj(gtf, np=12, graphpre=None):
    """Extract exons and sj from GTF
    exon, junction coordinates = zero based (same as BED)
    junction start-1 = exon end
    junction end = exon start

    Args:
        gtf: Pandas.DataFrame; rows with typ == 'exon' are used, and columns
            'chr', 'st', 'ed', 'strand', 'transcript_id' and 'gene_id' are
            read
        np: forwarded to GP.find_genes4
        graphpre: path prefix forwarded (as an absolute path) to
            GP.find_genes4; when None a unique prefix in the current
            directory is generated

    Returns:
        sj, ex: Pandas.DataFrames for splice junctions and exons

    """
    if len(gtf) == 0:  # edge case
        empty_cols = GGB.BEDCOLS[:6] + ['locus', '_id', 'cat']
        return UT.make_empty_df(empty_cols), UT.make_empty_df(empty_cols)

    exons = gtf[gtf['typ'] == 'exon'].sort_values(['chr', 'st', 'ed'])
    # temporary row id, used only to tag 5'/3' exons below
    exons['_id'] = N.arange(len(exons))
    exons.sort_values(['transcript_id', 'st', 'ed'], inplace=True)

    # 5',3'
    n_exons = exons.groupby('transcript_id').size()
    single_tids = n_exons[n_exons == 1].index
    multi_tids = n_exons[n_exons > 1].index
    multi = exons[exons['transcript_id'].isin(multi_tids)].copy()
    multi.sort_values(['transcript_id', 'st', 'ed'], inplace=True)
    by_tid = multi.groupby('transcript_id')
    leftmost = by_tid.first()
    rightmost = by_tid.last()

    def _ids(tbl, strand):
        # row ids of the exons in tbl lying on the given strand
        return list(tbl[tbl['strand'] == strand]['_id'].values)

    # leftmost exon is 5' on '+' and 3' on '-'; rightmost is the opposite
    ids5 = _ids(leftmost, '+') + _ids(rightmost, '-')
    ids3 = _ids(leftmost, '-') + _ids(rightmost, '+')
    exons['kind'] = 'i'
    exons.loc[exons['transcript_id'].isin(single_tids), 'kind'] = 's'
    exons.loc[exons['_id'].isin(ids5), 'kind'] = '5'
    exons.loc[exons['_id'].isin(ids3), 'kind'] = '3'

    # find junctions
    def _igen():
        for _tid, tx in exons.groupby('transcript_id'):
            if len(tx) >= 2:
                tx = tx.sort_values(['st', 'ed'])
                chrom, strand, gid = tx.iloc[0][['chr', 'strand', 'gene_id']]
                # each junction lies between consecutive exons
                junc_sts = tx['ed'].values[:-1] + 1
                junc_eds = tx['st'].values[1:] - 1
                for st, ed in izip(junc_sts, junc_eds):
                    # chr,st,ed,name=gene_id,sc1,strand
                    yield (chrom, st, ed, gid, 0, strand)

    sj = PD.DataFrame(list(_igen()), columns=GGB.BEDCOLS[:6])
    sj['locus'] = UT.calc_locus_strand(sj)
    sj = sj.groupby('locus').first().reset_index()

    exons['locus'] = UT.calc_locus_strand(exons)
    # remove duplicated (same locus and kind)
    ex = exons.groupby(['locus', 'kind']).first().reset_index()
    ex['name'] = ex['gene_id']
    ex['st'] = ex['st'] - 1

    # position id == locus converted to number
    ex.sort_values(['chr', 'st', 'ed'], inplace=True)
    ex['_id'] = N.arange(len(ex))
    positions = ex.groupby(['chr', 'st', 'ed'])[['_id', 'locus']].first()
    positions['_pid'] = N.arange(len(positions))  # position id
    pid_of = dict(zip(positions.index, positions['_pid']))
    ex['_pid'] = [pid_of[tuple(key)] for key in ex[['chr', 'st', 'ed']].values]

    if len(sj) == 0:
        # no junctions: every exon becomes its own gene
        ex['_gidx'] = N.arange(len(ex))
        return sj, ex

    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components) set '_gidx'
    if graphpre is None:
        graphpre = './' + str(uuid.uuid4()) + '_'
    prefix = os.path.abspath(graphpre)  # need unique prefix for parallel processing
    GP.find_genes4(
        sj, ex,
        filepre=prefix,
        np=np,
        override=False,
        separatese=True,
    )

    return sj, ex