Exemple #1
0
def as2exsj(dstpre, np=7):
    """Build exon and junction tables from the files stored under *dstpre*.

    Reads ``<dstpre>.exdf.txt.gz``, ``.sjdf.txt.gz``, ``.sedf.txt.gz`` and
    ``.paths.txt.gz``, merges the two exon tables, annotates them, runs
    ``GP.find_genes4`` and writes ``<dstpre>.ex.txt.gz``,
    ``<dstpre>.sj.txt.gz`` and ``<dstpre>.ci.txt.gz``.

    Args:
        dstpre: path prefix shared by all input and output files
        np: number of processes handed to GP.find_genes4

    Returns:
        sj, ex: Pandas.DataFrames containing junctions and exons

    """
    def _load(suffix, colnames):
        return UT.read_pandas(dstpre + suffix, names=colnames)

    exdf = _load('.exdf.txt.gz', A2.EXDFCOLS)
    sj = _load('.sjdf.txt.gz', A2.SJDFCOLS)
    sedf = _load('.sedf.txt.gz', A2.EXDFCOLS)
    # NOTE(review): the paths table is loaded but never used below -- confirm
    # whether this read can be dropped.
    _load('.paths.txt.gz', A2.PATHCOLS)

    # junction start is shifted by one before it is combined with the exons
    sj['st'] = sj['st'] + 1

    excols = A2.EXDFCOLS
    ex = PD.concat([exdf[excols], sedf[excols]], ignore_index=True)
    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components); the uuid makes the file prefix
    # unique, which is needed for parallel processing
    prefix = os.path.abspath(dstpre + str(uuid.uuid4()) + '_')
    GP.find_genes4(sj, ex,
                   filepre=prefix,
                   np=np,
                   override=False,
                   separatese=True)

    # exons whose kind is 3' or 5' get that kind as their category
    for end in ('3', '5'):
        ex.loc[ex['kind'] == end, 'cat'] = end

    UT.write_pandas(ex, dstpre + '.ex.txt.gz', 'h')
    UT.write_pandas(sj, dstpre + '.sj.txt.gz', 'h')
    UT.chopintervals(ex, dstpre + '.ci.txt.gz')
    return sj, ex
Exemple #2
0
def bed2exonsj(bed12, np=4, graphpre=None):
    """Extract exons and junctions from BED12

    Each exon is tagged with a 'kind': 's' for the only exon of a
    single-exon transcript, 'i' for internal exons, and '5'/'3' for the
    terminal exons (first exon is '5' when strand is '+', otherwise '3').
    Junctions get kind 'j' and their start is shifted by +1 to match
    STAR SJ.tab.out. Records sharing the same locus are collapsed to the
    first one.

    Args:
        bed12: Pandas.DataFrame containing BED12 data. The columns 'chr',
            'st', 'name', 'strand', 'esizes' and 'estarts' are read.
            NOTE: helper columns '_estarts' and '_eends' are added to this
            frame in place.
        np: number of processes handed to GP.find_genes4
        graphpre: file prefix handed to GP.find_genes4; when None a unique
            prefix in the current directory is generated

    Returns:
        sj, ex: Pandas.DataFrames containing junction and exons

    """
    def _to_int_array(field):
        # BED12 block fields are comma separated and may end with a comma
        return N.array([int(y) for y in field.split(',') if y])

    esizes = bed12['esizes'].apply(_to_int_array)
    estarts0 = bed12['estarts'].apply(_to_int_array)
    # block starts are relative to the record start; make them absolute
    bed12['_estarts'] = bed12['st'] + estarts0
    bed12['_eends'] = bed12['_estarts'] + esizes

    reccols = ['chr', 'name', 'strand', '_estarts', '_eends']

    def _egen():
        for chrom, tname, strand, est, eed in UT.izipcols(bed12, reccols):
            if len(est) == 1:
                # single exon: use this record's own coordinates (previously
                # referenced st/ed, which are not bound in this branch)
                yield (chrom, est[0], eed[0], tname, 0, strand, 's')
                continue
            # any strand other than '+' is treated like '-'
            if strand == '+':
                firstkind, lastkind = '5', '3'
            else:
                firstkind, lastkind = '3', '5'
            yield (chrom, est[0], eed[0], tname, 0, strand, firstkind)
            for st, ed in izip(est[1:-1], eed[1:-1]):
                yield (chrom, st, ed, tname, 0, strand, 'i')
            yield (chrom, est[-1], eed[-1], tname, 0, strand, lastkind)

    def _igen():
        for chrom, tname, strand, est, eed in UT.izipcols(bed12, reccols):
            # an intron runs from the end of one exon to the start of the next
            for st, ed in izip(eed[:-1], est[1:]):
                # add 1 to match STAR SJ.tab.out
                yield (chrom, st + 1, ed, tname, 0, strand, 'j')

    outcols = GGB.BEDCOLS[:6] + ['kind']
    ex = PD.DataFrame(list(_egen()), columns=outcols)
    ex['locus'] = UT.calc_locus_strand(ex)
    ex = ex.groupby('locus').first().reset_index()  # remove duplicates
    sj = PD.DataFrame(list(_igen()), columns=outcols)
    sj['locus'] = UT.calc_locus_strand(sj)
    sj = sj.groupby('locus').first().reset_index()  # remove duplicates

    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components) set '_gidx'
    if graphpre is None:
        graphpre = './' + str(uuid.uuid4()) + '_'
    prefix = os.path.abspath(graphpre)  # need unique prefix for parallel processing
    GP.find_genes4(sj, ex,
                   filepre=prefix,
                   np=np,
                   override=False,
                   separatese=True)

    return sj, ex
Exemple #3
0
def as3exsj(dstpre, minelen=150, np=7):
    """Build exon and junction tables from the files stored under *dstpre*.

    Reads ``<dstpre>.exdf.txt.gz``, ``.sjdf.txt.gz``, ``.sedf.txt.gz`` and
    ``.paths.txt.gz``, merges the two exon tables, annotates them and runs
    ``GP.find_genes3``. Genes whose summed exon length is below *minelen*
    are dropped, and the remaining records are written to
    ``<dstpre>.ex.txt.gz``, ``<dstpre>.sj.txt.gz`` and ``<dstpre>.ci.txt.gz``.

    Args:
        dstpre: path prefix shared by all input and output files
        minelen: minimum total exon length (sum of ed-st over the exons of
            a '_gidx' group) required to keep a gene
        np: number of processes handed to GP.find_genes3

    Returns:
        sj2, ex2: Pandas.DataFrames containing the junctions and exons that
            passed the length filter

    """
    ex = UT.read_pandas(dstpre+'.exdf.txt.gz', names=A3.EXDFCOLS)
    sj = UT.read_pandas(dstpre+'.sjdf.txt.gz', names=A3.SJDFCOLS)
    se = UT.read_pandas(dstpre+'.sedf.txt.gz', names=A3.EXDFCOLS)
    # NOTE(review): the paths table is loaded but never used below -- confirm
    # whether this read can be dropped.
    UT.read_pandas(dstpre+'.paths.txt.gz', names=A3.PATHCOLS)

    # junction start is shifted by one before it is combined with the exons
    sj['st'] = sj['st']+1
    cols = A3.EXDFCOLS
    ex = PD.concat([ex[cols], se[cols]], ignore_index=True)
    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components) set '_gidx'
    # find_genes3 is used instead of find_genes4:
    # don't use exon overlap as connection
    GP.find_genes3(sj, ex,
                   np=np,
                   override=False)
    ex.loc[ex['kind']=='3', 'cat'] = '3'
    ex.loc[ex['kind']=='5', 'cat'] = '5'

    # remove genes whose total exon length is smaller than minelen
    ex['len'] = ex['ed']-ex['st']
    exsiz = ex.groupby('_gidx')['len'].sum()
    rgidx = exsiz[exsiz<minelen].index.values
    ex2 = ex[~ex['_gidx'].isin(rgidx)]
    sj2 = sj[~sj['_gidx'].isin(rgidx)]
    # report exon counts on both sides; rgidx holds gene ids, so its length
    # is not the number of removed exons
    LOG.info('minelen filter #ex {0}=>{1}'.format(len(ex), len(ex2)))
    # write
    UT.write_pandas(ex2, dstpre+'.ex.txt.gz', 'h')
    UT.write_pandas(sj2, dstpre+'.sj.txt.gz', 'h')
    UT.chopintervals(ex2, dstpre+'.ci.txt.gz')
    return sj2, ex2
Exemple #4
0
def test_selectseme(asm):
    """Smoke test: build AS.SELECTSEME on *asm*, set exon categories, run it."""
    selector = AS.SELECTSEME(asm)
    # exon categories are set after construction and before the call
    UT.set_exon_category(asm.sj, asm.ae)
    selector()
Exemple #5
0
def gtf2exonsj(gtf, np=12, graphpre=None):
    """Derive exon and splice junction tables from GTF records.

    Output coordinates are zero based (same as BED), with
        junction start-1 = exon end
        junction end = exon start

    Args:
        gtf: Pandas.DataFrame; rows with typ=='exon' are used
        np: number of processes handed to GP.find_genes4
        graphpre: file prefix handed to GP.find_genes4; when None a unique
            prefix in the current directory is generated

    Returns:
        sj, ex: Pandas.DataFrames for splice junctions and exons

    """
    if len(gtf) == 0:  # edge case: nothing to convert
        emptycols = GGB.BEDCOLS[:6] + ['locus', '_id', 'cat']
        return UT.make_empty_df(emptycols), UT.make_empty_df(emptycols)

    is_exon = gtf['typ'] == 'exon'
    exons = gtf[is_exon].sort_values(['chr', 'st', 'ed'])
    exons['_id'] = N.arange(len(exons))
    exons.sort_values(['transcript_id', 'st', 'ed'], inplace=True)

    # classify exons: single (s), internal (i), 5' end (5), 3' end (3)
    n_per_tx = exons.groupby('transcript_id').size()
    single_tids = n_per_tx[n_per_tx == 1].index
    multi_tids = n_per_tx[n_per_tx > 1].index
    multi = exons[exons['transcript_id'].isin(multi_tids)].copy()
    multi.sort_values(['transcript_id', 'st', 'ed'], inplace=True)
    firsts = multi.groupby('transcript_id').first()
    lasts = multi.groupby('transcript_id').last()
    # first exon is the 5' end on '+' and the 3' end on '-'; last exon is
    # the opposite
    first5 = list(firsts[firsts['strand'] == '+']['_id'].values)
    first3 = list(firsts[firsts['strand'] == '-']['_id'].values)
    last5 = list(lasts[lasts['strand'] == '-']['_id'].values)
    last3 = list(lasts[lasts['strand'] == '+']['_id'].values)
    exons['kind'] = 'i'
    exons.loc[exons['transcript_id'].isin(single_tids), 'kind'] = 's'
    exons.loc[exons['_id'].isin(first5 + last5), 'kind'] = '5'
    exons.loc[exons['_id'].isin(first3 + last3), 'kind'] = '3'

    # find junctions: gaps between consecutive exons of a transcript
    def _junctions():
        for _tid, grp in exons.groupby('transcript_id'):
            if len(grp) < 2:
                continue
            grp = grp.sort_values(['st', 'ed'])
            head = grp.iloc[0]
            chrom, strand, gid = head['chr'], head['strand'], head['gene_id']
            jstarts = grp['ed'].values[:-1] + 1
            jends = grp['st'].values[1:] - 1
            for jst, jed in izip(jstarts, jends):
                # chr, st, ed, name=gene_id, sc1, strand
                yield (chrom, jst, jed, gid, 0, strand)

    sj = PD.DataFrame(list(_junctions()), columns=GGB.BEDCOLS[:6])
    sj['locus'] = UT.calc_locus_strand(sj)
    sj = sj.groupby('locus').first().reset_index()

    exons['locus'] = UT.calc_locus_strand(exons)
    # remove duplicated (same locus and kind)
    ex = exons.groupby(['locus', 'kind']).first().reset_index()
    ex['name'] = ex['gene_id']
    ex['st'] = ex['st'] - 1
    # position id == locus converted to number
    ex.sort_values(['chr', 'st', 'ed'], inplace=True)
    ex['_id'] = N.arange(len(ex))
    bypos = ex.groupby(['chr', 'st', 'ed'])[['_id', 'locus']].first()
    bypos['_pid'] = N.arange(len(bypos))  # position id
    pos2pid = dict(zip(bypos.index, bypos['_pid']))
    ex['_pid'] = [pos2pid[tuple(row)] for row in ex[['chr', 'st', 'ed']].values]

    if len(sj) == 0:
        # no junctions: every exon becomes its own gene
        ex['_gidx'] = N.arange(len(ex))
        return sj, ex

    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components) set '_gidx'
    if graphpre is None:
        graphpre = './' + str(uuid.uuid4()) + '_'
    # need unique prefix for parallel processing
    prefix = os.path.abspath(graphpre)
    GP.find_genes4(sj, ex,
                   filepre=prefix,
                   np=np,
                   override=False,
                   separatese=True)

    return sj, ex
Exemple #6
0
def find_genes4(sj,
                ae,
                filepre,
                cachename=None,
                np=1,
                override=False,
                depth=500,
                separatese=True):
    """ 
    Adds _gidx and gname columns to ae and sj
    Connection: 1) by junctions, 2) by overlap in the same strand

    Args:
        sj: junction DataFrame; '_id' is added when missing
        ae: exon DataFrame; '_id', 'cat' and 'a_id' (via UT.set_ad_info)
            are added when missing
        filepre: file prefix passed on to mcore_allcomponents4
        cachename: optional path of a pickle file holding the genes; it is
            loaded when it exists and override is False, otherwise the
            computed genes are written to it
        np: number of processes passed on to mcore_allcomponents4
        override: recompute genes even if the cache file exists
        depth: passed on to mcore_allcomponents4
        separatese: when True only the first frame returned by UT.mese(ae)
            is passed to mcore_allcomponents4 and every '_id' of the second
            frame becomes a gene of its own

    Returns genes [set([_id,..]), ...]
    """
    if '_id' not in ae.columns:
        LOG.info('setting ex _id...')
        UT.set_ids(ae)
    if '_id' not in sj.columns:
        LOG.info('setting sj _id...')
        UT.set_ids(sj)
    if 'cat' not in ae.columns:
        UT.set_exon_category(sj, ae)
    if 'a_id' not in ae.columns:
        UT.set_ad_info(sj, ae)

    ### FIND GENES
    if cachename and os.path.exists(cachename) and not override:
        LOG.info('loading cached genes (connected components)...')
        # NOTE: unpickling can execute arbitrary code; cachename must point
        # to a trusted file.
        with open(cachename, 'rb') as cachefile:
            genes = pickle.load(cachefile)
    else:
        LOG.info('finding genes (connected components)...')
        _sttime = time.time()
        if separatese:
            me, se = UT.mese(ae)
            genes = mcore_allcomponents4(sj, me, filepre, np, depth=depth)
            # SE genes
            genes += [set([x]) for x in se['_id']]
        else:
            genes = mcore_allcomponents4(sj, ae, filepre, np, depth=depth)
        # version 4 graph: uses overlaps in addition to junctions to connect
        # genes = [set([_id's]),...]
        if cachename:
            UT.makedirs(os.path.dirname(cachename))
            # close the file explicitly so the cache is fully written
            with open(cachename, 'wb') as cachefile:
                pickle.dump(genes, cachefile)
        LOG.info(' time: {0:.3f}s'.format(time.time() - _sttime))

    ### WRITE EXONS W/ GENE number
    LOG.info('assigning gidx...')
    _sttime = time.time()
    i2g = {}  # eid => _gidx
    i2gn = {}  # eid => gname
    g2gn = {}  # _gidx => gname
    i2s = dict(UT.izipcols(ae, ['_id', 'strand']))  # eid => strand
    s2n = {'+': 'P', '-': 'N', '.': '', '.+': '', '.-': ''}
    for i, ids in enumerate(genes):
        gid = i + 1
        # strand of the gene is taken from one of its exons
        strand = s2n[i2s[next(iter(ids))]]
        cat = 'S' if len(ids) == 1 else 'G'
        if strand == 'N':  # negative strand
            gid = -gid
        gname = 'J{0}{1}{2}'.format(strand, cat, abs(gid))
        g2gn[gid] = gname
        for x in ids:
            i2g[x] = gid
            i2gn[x] = gname

    ae['_gidx'] = [i2g[x] for x in ae['_id']]
    ae['gname'] = [i2gn[x] for x in ae['_id']]

    ## set sj _gidx, use acceptor=>_gidx map (exon a_id, sj a_id)
    ## and fall back to the donor map, then to 0
    a2g = dict(UT.izipcols(ae, ['a_id', '_gidx']))
    d2g = dict(UT.izipcols(ae, ['d_id', '_gidx']))
    sj['_gidx'] = [
        a2g.get(x, d2g.get(y, 0))
        for x, y in UT.izipcols(sj, ['a_id', 'd_id'])
    ]
    sj['gname'] = [g2gn.get(x, '') for x in sj['_gidx']]

    # This shouldn't happen
    nidx = ae['_gidx'] == 0
    if N.sum(nidx) > 0:
        LOG.warning(
            '###### WARNING!!!!!! exons with no gene assignment:{0}'.format(
                N.sum(nidx)))

    return genes