Ejemplo n.º 1
0
 def __init__(self, sj, me, filepre, depth=500, maxcnt=10000):
     """Initialise the base graph and add same-strand exon overlap links.

     Args:
         sj: forwarded to MEGraph3.__init__.
         me: exon DataFrame; columns chr, st, ed, strand, _id are read.
         filepre: prefix of the temporary files used for the intersection.
         depth: forwarded to MEGraph3.__init__.
         maxcnt: forwarded to MEGraph3.__init__.

     Attributes set here:
         pre: filepre.
         ov: every row reported by the exon-vs-exon intersection.
         ov1: rows of ov on the same strand whose two exon ids differ.
         eoe: dict _id => list of b__id's taken from ov1.
     """
     MEGraph3.__init__(self, sj, me, depth, maxcnt)
     self.pre = filepre

     bedcols = ['chr', 'st', 'ed', 'strand', '_id']
     # original note: single cell data may contain floats in st/ed
     me = UT.check_int_nan(me)

     # the same exon table is written twice and intersected with itself
     path1 = UT.write_pandas(me[bedcols], filepre + 'ex1.txt.gz', '')
     path2 = UT.write_pandas(me[bedcols], filepre + 'ex2.txt.gz', '')
     path3 = BT.bedtoolintersect(
         path1, path2, filepre + 'ov.txt.gz', wao=True)

     ovcols = bedcols + ['b_' + name for name in bedcols] + ['ovl']
     overlaps = UT.read_pandas(path3, names=ovcols)
     self.ov = overlaps

     # keep same-strand overlaps between two different exons
     is_other = overlaps['_id'] != overlaps['b__id']
     same_strand = overlaps['strand'] == overlaps['b_strand']
     nonself = overlaps[is_other & same_strand]
     self.ov1 = nonself

     # connection dictionary: _id => [b__id, ...]
     partners = nonself.groupby('_id')['b__id'].apply(
         lambda ids: list(ids)).reset_index()
     if 'index' in partners.columns:
         partners['_id'] = partners['index']
     self.eoe = dict(UT.izipcols(partners, ['_id', 'b__id']))

     # remove the temporary files
     for path in (path1, path2, path3):
         os.unlink(path)
Ejemplo n.º 2
0
 def __init__(self, mg, se=None):
     """Store the graph and derive exon lookups from mg.exons.

     Attributes set here:
         mg: the graph object passed in.
         se: the value passed in (None by default).
         ex: mg.exons indexed by _id.
         genes: _gidx => index labels of ex; because ex is indexed by _id
             these labels are the exon _id's.
         i2g: dict _id => gname.
     """
     exons = mg.exons
     self.mg = mg
     self.se = se
     indexed = exons.set_index('_id')
     self.ex = indexed
     # group on the _id-indexed frame so that the groups hold _id's
     self.genes = indexed.groupby('_gidx').groups
     self.i2g = dict(UT.izipcols(exons, ['_id', 'gname']))
Ejemplo n.º 3
0
def calc_ecov_mp(covci, fname, np, blocksize=100):
    """Calculate exon coverage per chromosome, optionally in parallel.

    WARNING: this assumes _id is assigned according to sorted (chr,st,ed)

    Args:
        covci: DataFrame; 'chr' and 'name' are always read, 'st' and 'ed'
            when fname is given. The helper columns 'name1' (list of int
            ids parsed from 'name'), 'eidmax' and 'eidmin' are added
            in place when they are missing.
        fname: output path. When None nothing is written.
        np: number of worker processes; 1 runs everything in-process.
        blocksize: forwarded to calc_ecov_chrom.

    Returns:
        The merged per-chromosome dict from calc_ecov_chrom when fname is
        None; otherwise the DataFrame (eid, ecov, chr, st, ed) that was
        written to fname.
    """
    LOG.debug('calc_ecov...')
    chroms = sorted(covci['chr'].unique())
    if 'name1' not in covci.columns:
        covci['name1'] = covci['name'].astype(str).apply(
            lambda x: [int(y) for y in x.split(',')])
    if 'eidmax' not in covci.columns:
        covci['eidmax'] = covci['name1'].apply(lambda x: max(x))
    if 'eidmin' not in covci.columns:
        covci['eidmin'] = covci['name1'].apply(lambda x: min(x))
    # one (sub-frame, blocksize) task per chromosome
    args = [(covci[covci['chr'] == c].copy(), blocksize) for c in chroms]
    e2cs = {}
    if np == 1:
        for arg in args:
            e2cs.update(calc_ecov_chrom(*arg))
    else:
        # Create the pool before entering the try block: if construction
        # fails there is nothing to close, and the cleanup must not mask
        # the original error with an unbound-name failure.
        p = multiprocessing.Pool(np)
        try:
            rslts = p.map(mp_worker, zip(repeat(calc_ecov_chrom), args))
        finally:
            LOG.debug('closing pool')
            p.close()
            p.join()  # wait for the workers to exit
        for x in rslts:
            e2cs.update(x)
    LOG.debug('writing rslts...')
    if fname is None:
        return e2cs
    # per-id chromosome and extent, taken from the intervals listing the id
    ccf = UT.flattendf(covci, 'name1')
    ccfg = ccf.groupby('name1')
    e2chr = dict(
        UT.izipcols(ccfg['chr'].first().reset_index(), ['name1', 'chr']))
    e2st = dict(UT.izipcols(ccfg['st'].min().reset_index(), ['name1', 'st']))
    e2ed = dict(UT.izipcols(ccfg['ed'].max().reset_index(), ['name1', 'ed']))
    df = PD.DataFrame(e2cs, index=['ecov']).T
    df.index.name = 'eid'
    df = df.reset_index()
    df['chr'] = [e2chr[x] for x in df['eid']]
    df['st'] = [e2st[x] for x in df['eid']]
    df['ed'] = [e2ed[x] for x in df['eid']]
    UT.save_tsv_nidx_whead(df[['eid', 'chr', 'st', 'ed', 'ecov']], fname)
    return df
Ejemplo n.º 4
0
 def _gen():
     """Yield one 7-item list per block of every row of df.

     df and cols come from the enclosing scope. For each row the
     third-from-last and second-from-last fields are comma separated
     block sizes and block starts. Blocks with a negative start are
     skipped. Each record is
     [row[0], row[1] + start, row[1] + start + size,
      row[3], row[4], row[5], row[-1]].
     """
     for row in UT.izipcols(df, cols):
         sizes = [int(tok) for tok in row[-3].split(',')]
         starts = [int(tok) for tok in row[-2].split(',')]
         base = row[1]
         for offset, size in zip(starts, sizes):
             if offset < 0:
                 continue
             # a fresh list per block, so consumers never share state
             yield [row[0], base + offset, base + offset + size,
                    row[3], row[4], row[5], row[-1]]
Ejemplo n.º 5
0
 def __init__(self, sjexpre, th=0.1):
     """Load junction and exon tables and build lookup structures.

     Args:
         sjexpre: path prefix; sjexpre+'.sj.txt.gz' and
             sjexpre+'.ex.txt.gz' are read.
         th: stored as self.th.

     Attributes set here: sjexpre, th, sj, ex, mg (GP.MEGraph3 built from
     sj and ex), exg (ex indexed by _gidx), exi (ex indexed by _id),
     nullidx (UT.find_nullidx(ex)) and e2c (dict _id => cat).
     Finishes by calling self.precalc_branch_p().
     """
     self.sjexpre = sjexpre
     self.th = th

     junctions = UT.read_pandas(sjexpre + '.sj.txt.gz')
     self.sj = junctions
     exons = UT.read_pandas(sjexpre + '.ex.txt.gz')
     self.ex = exons

     # only consider splice junction connections
     self.mg = GP.MEGraph3(junctions, exons)

     self.exg = exons.set_index('_gidx')
     self.exi = exons.set_index('_id')
     self.nullidx = UT.find_nullidx(exons)
     self.e2c = dict(UT.izipcols(exons, ['_id', 'cat']))
     self.precalc_branch_p()
Ejemplo n.º 6
0
 def _egen():
     """Yield one tuple per exon of every row of kg.

     kg comes from the enclosing scope; its '_ests' / '_eeds' fields are
     indexable sequences of exon starts / ends. Each tuple is
     (chrom, st, ed, name, 0, strand, category) where category is
         's'       the only exon of a single-exon row,
         '5' / '3' first / last exon when strand is '+',
         '3' / '5' first / last exon for any other strand,
         'i'       every exon in between.
     """
     for chrom, tname, strand, est, eed in UT.izipcols(
             kg, ['chr', 'name', 'strand', '_ests', '_eeds']):
         if len(est) == 1:
             # Use the exon's own coordinates. The previous code yielded
             # `st, ed`, which are unbound for a leading single-exon row
             # and otherwise stale values left over from an earlier row.
             yield (chrom, est[0], eed[0], tname, 0, strand, 's')
             continue
         # on the '+' strand the first exon is the 5' one, otherwise the 3'
         first, last = ('5', '3') if strand == '+' else ('3', '5')
         yield (chrom, est[0], eed[0], tname, 0, strand, first)
         for st, ed in zip(est[1:-1], eed[1:-1]):
             yield (chrom, st, ed, tname, 0, strand, 'i')
         yield (chrom, est[-1], eed[-1], tname, 0, strand, last)
Ejemplo n.º 7
0
def calc_glen(ex, cipath):
    """Return a dict _gidx => summed length of the gene's intervals.

    Args:
        ex: exon DataFrame with '_id' and '_gidx'. When it has no 'cid'
            column one is added in place, holding for each exon the list
            of interval ids whose 'name' field lists that exon.
        cipath: path of the interval bed file (5 col bed, name: comma
            separated exon ids, sc1: interval id).

    Each (interval, gene) pair is counted once, even when several exons
    of the gene share the interval.
    """
    ci = GGB.read_bed(cipath)
    ci['len'] = ci['ed'] - ci['st']
    ci['cid'] = ci['sc1']
    cid2len = dict(UT.izipcols(ci, ['cid', 'len']))

    if 'cid' not in ex.columns:
        eid2cids = {}
        for cid, names in ci[['cid', 'name']].values:
            for token in names.split(','):
                eid2cids.setdefault(int(token), []).append(cid)
        ex['cid'] = [eid2cids[eid] for eid in ex['_id']]

    # unique (interval id, gene id) pairs
    pairs = set()
    for gidx, cids in UT.izipcols(ex, ['_gidx', 'cid']):
        for cid in cids:
            pairs.add((cid, gidx))

    table = PD.DataFrame(list(pairs), columns=['cid', '_gidx'])
    table['len'] = [cid2len[cid] for cid in table['cid']]
    totals = table.groupby('_gidx')['len'].sum()
    return dict(zip(totals.index, totals.values))
Ejemplo n.º 8
0
 def _gen():
     """Yield a (cid, _gidx) pair for every cid listed on every row of ex.

     ex comes from the enclosing scope; its 'cid' field is iterable.
     """
     for gidx, cid_list in UT.izipcols(ex, ['_gidx', 'cid']):
         for cid in cid_list:
             yield (cid, gidx)
Ejemplo n.º 9
0
 def _set_df2prop(src, tgt, default):
     """Copy column src of df2 into column tgt of df, matched on _gidx.

     df and df2 come from the enclosing scope. Rows of df whose _gidx
     does not occur in df2 receive default.
     """
     lookup = dict(UT.izipcols(df2, ['_gidx', src]))
     values = [lookup.get(gidx, default) for gidx in df['_gidx']]
     df[tgt] = values
Ejemplo n.º 10
0
def calc_gcov(expath, cipath, bwpath, dstprefix, override=False, np=4):
    """Calculate gene coverages.

    Args:
        expath: merged ex
        cipath: chopped interval for ex
        bwpath: bigwig file (sample)
        dstprefix: prefix for outputs (used verbatim; no separator is
            inserted between the prefix and the file name)
        override: forwarded to UT.notstale when checking the covci cache
        np: forwarded to calc_cov_mp

    Outputs:
        1. dstprefix+'covci.txt.gz' (path handed to calc_cov_mp;
           presumably written there -- confirm against calc_cov_mp)
        2. dstprefix+'gcov.txt.gz' : DataFrame(col:_gidx,gcov)
            gcov = sum(cov*len) / sum(len) over the distinct chopped
            intervals belonging to the gene

    Returns:
        The DataFrame written to output 2.
    """
    ex = UT.read_pandas(expath)
    covcipath = dstprefix + 'covci.txt.gz'
    gcovpath = dstprefix + 'gcov.txt.gz'

    if UT.notstale([expath, cipath], covcipath, override):
        cc = UT.read_pandas(covcipath)
    else:
        if UT.notstale(expath, cipath, False):
            ci = UT.read_pandas(cipath,
                                names=['chr', 'st', 'ed', 'name', 'id'])
        else:
            ci = UT.chopintervals(ex, cipath, idcol='_id')
        cc = calc_cov_mp(ci, bwpath, covcipath, np=np)

    if 'id' not in cc.columns:
        cc['id'] = cc['sc1']
    if 'eid' not in cc.columns:
        cc['eid'] = cc['name'].astype(str).apply(
            lambda x: [int(y) for y in x.split(',')])
    cc['len'] = cc['ed'] - cc['st']
    cc['val'] = cc['cov'] * cc['len']
    ccf = UT.flattendf(cc[['id', 'eid', 'len', 'val', 'st', 'ed']], 'eid')
    e2g = dict(UT.izipcols(ex, ['_id', '_gidx']))
    ccf['_gidx'] = [e2g[x] for x in ccf['eid']]

    # Take each (gene, interval) combination once so that an interval
    # shared by several exons of the same gene is not counted repeatedly.
    ccf1 = ccf.groupby(['_gidx', 'id']).first().reset_index()
    df = ccf1.groupby('_gidx')[['len', 'val']].sum()
    df['gcov'] = df['val'] / df['len']

    # NOTE: earlier revisions also derived gcov2/len2/cids/st/ed/glen/chr
    # here, but those columns were dropped before saving and returning.
    # Only the two columns that are actually emitted are computed now.
    df = df.reset_index()[['_gidx', 'gcov']]
    UT.save_tsv_nidx_whead(df, gcovpath)
    return df
Ejemplo n.º 11
0
 def ex_d_e2p(self,eid):
     """Return a dict e_id_a => p for the self.d2ep group keyed by eid.

     self.d2ep must expose get_group (presumably a pandas GroupBy --
     confirm where it is built). An empty dict is returned when there is
     no group for eid.
     """
     try:
         group = self.d2ep.get_group(eid)
     except KeyError:
         # No such group. Only this case is treated as "no data", so
         # unrelated failures are no longer silently swallowed.
         return {}
     return dict(UT.izipcols(group, ['e_id_a','p']))
Ejemplo n.º 12
0
 def _igen():
     """Yield one tuple per gap between consecutive exons of each kg row.

     kg comes from the enclosing scope. Each tuple is
     (chrom, previous exon end + 1, next exon start, name, 0, strand, 'j').
     """
     fields = ['chr','name','strand','_ests','_eeds']
     for chrom, tname, strand, starts, ends in UT.izipcols(kg, fields):
         # pair every exon end with the start of the following exon
         for prev_end, next_start in izip(ends[:-1], starts[1:]):
             yield (chrom, prev_end+1, next_start, tname, 0, strand, 'j')
Ejemplo n.º 13
0
def find_genes4(sj,
                ae,
                filepre,
                cachename=None,
                np=1,
                override=False,
                depth=500,
                separatese=True):
    """Find genes (connected components of exons) and label ae and sj.

    Connection: 1) by junctions, 2) by overlap in the same strand

    Args:
        sj: junction DataFrame. '_id' is set via UT.set_ids when missing;
            '_gidx' and 'gname' columns are added in place.
        ae: exon DataFrame. '_id', 'cat' and 'a_id' are filled in through
            the UT helpers when missing; '_gidx' and 'gname' columns are
            added in place.
        filepre: forwarded to mcore_allcomponents4.
        cachename: optional pickle path. An existing file is loaded
            instead of recomputing unless override is set; otherwise the
            computed genes are saved there.
        np: forwarded to mcore_allcomponents4.
        override: ignore an existing cache file.
        depth: forwarded to mcore_allcomponents4.
        separatese: when true the result of UT.mese(ae) is used: the first
            part is connected, every _id of the second part becomes its
            own one-element gene.

    Returns genes [set([_id,..]), ...]

    Gene ids are 1-based positions in genes, negated for '-' strand genes.
    Names are 'J' + strand code ('P', 'N' or '') + 'S' (one exon) or
    'G' + abs(gene id).
    """
    if '_id' not in ae.columns:
        LOG.info('setting ex _id...')
        UT.set_ids(ae)
    if '_id' not in sj.columns:
        LOG.info('setting sj _id...')
        UT.set_ids(sj)
    if 'cat' not in ae.columns:
        UT.set_exon_category(sj, ae)
    if 'a_id' not in ae.columns:
        UT.set_ad_info(sj, ae)

    ### FIND GENES
    if cachename and os.path.exists(cachename) and not override:
        LOG.info('loading cached genes (connected components)...')
        # NOTE(security): unpickling runs arbitrary code; cachename must
        # only ever point at files this pipeline wrote itself.
        with open(cachename, 'rb') as fobj:
            genes = pickle.load(fobj)
    else:
        LOG.info('finding genes (connected components)...')
        _sttime = time.time()
        # version 4 graph: uses overlaps in addition to junctions to connect
        if separatese:
            me, se = UT.mese(ae)
            genes = mcore_allcomponents4(sj, me, filepre, np, depth=depth)
            # SE genes
            genes += [set([x]) for x in se['_id']]
        else:
            genes = mcore_allcomponents4(sj, ae, filepre, np, depth=depth)
        # genes = [set([_id's]),...]
        if cachename:
            UT.makedirs(os.path.dirname(cachename))
            # the context manager guarantees the cache is flushed and closed
            with open(cachename, 'wb') as fobj:
                pickle.dump(genes, fobj)
        LOG.info(' time: {0:.3f}s'.format(time.time() - _sttime))

    ### WRITE EXONS W/ GENE number
    LOG.info('assigning gidx...')
    _sttime = time.time()
    i2g = {}  # eid => _gidx
    i2gn = {}  # eid => gname
    g2gn = {}  # _gidx => gname
    i2s = dict(UT.izipcols(ae, ['_id', 'strand']))  # eid => strand
    s2n = {'+': 'P', '-': 'N', '.': '', '.+': '', '.-': ''}
    for i, ids in enumerate(genes):
        gid = i + 1
        # the strand of an arbitrary member decides the strand of the gene
        strand = s2n[i2s[next(iter(ids))]]
        cat = 'S' if len(ids) == 1 else 'G'
        if strand == 'N':  # negative strand
            gid = -gid
        gname = 'J{0}{1}{2}'.format(strand, cat, abs(gid))
        g2gn[gid] = gname
        for x in ids:
            i2g[x] = gid
            i2gn[x] = gname

    ae['_gidx'] = [i2g[x] for x in ae['_id']]
    ae['gname'] = [i2gn[x] for x in ae['_id']]

    ## set sj _gidx: acceptor => _gidx first, donor => _gidx second, else 0
    a2g = dict(UT.izipcols(ae, ['a_id', '_gidx']))
    d2g = dict(UT.izipcols(ae, ['d_id', '_gidx']))
    sj['_gidx'] = [
        a2g.get(x, d2g.get(y, 0))
        for x, y in UT.izipcols(sj, ['a_id', 'd_id'])
    ]
    sj['gname'] = [g2gn.get(x, '') for x in sj['_gidx']]

    # This shouldn't happen
    nidx = ae['_gidx'] == 0
    if N.sum(nidx) > 0:
        LOG.warning(
            '###### WARNING!!!!!! exons with no gene assignment:{0}'.format(
                N.sum(nidx)))

    return genes
Ejemplo n.º 14
0
    def __init__(self,
                 ex,
                 sj,
                 xmargin=None,
                 ymargin=0.25,
                 compress=True,
                 ecov='ecov',
                 ucnt='ucnt',
                 mcnt='mcnt',
                 minlw=1,
                 drawscalebar=True,
                 ecovth=None,
                 jcntth=None,
                 origin=None,
                 sortexby=None,
                 fontsize=7):
        """Prepare exon/junction tables and compute per-exon coordinates.

        Args:
            ex: exon DataFrame (copied). Columns strand, st, ed, a_id and
                d_id are read.
            sj: junction DataFrame (copied).
            xmargin: largest gap kept between consecutive exon groups when
                compressing; defaults to int(mean exon length).
            ymargin: stored as self.ymargin.
            compress: shrink gaps wider than xmargin between exon groups.
            ecov: name of the exon coverage column; created with value 1
                when ex lacks it.
            ucnt, mcnt: names of two junction count columns. When sj lacks
                either, jcnt is 1 and the line style 'solid'; otherwise
                jcnt is ucnt when non-zero, else mcnt, and the line style
                is 'solid' for non-zero ucnt and 'dashed' otherwise.
            minlw, drawscalebar, fontsize: stored for later use.
            ecovth: when given, keep only exons with ecov > ecovth.
            jcntth: when given, keep only junctions with jcnt > jcntth.
            origin: x origin; defaults to min(st) when the first exon is
                on '+', otherwise max(ed).
            sortexby: column used to order exons inside a group; defaults
                to ecov. Pass the same column when several figures should
                share the same ordering.

        Columns added to self.ex: xst, xed, len, asize, dsize, group, gst,
        ged, gsize, eypos, cst0, ced0, cst, ced, ey. self.gr holds one row
        per group (gst, ged, gsize, len, gypos, cst).

        If no exon is left after thresholding, the constructor returns
        early and the layout attributes/columns are not created.
        """
        self.ymargin = ymargin
        self.ecov = ecov
        self.ucnt = ucnt
        self.jcnt = jcnt = 'jcnt'
        self.mcnt = mcnt
        self.minlw = minlw
        self.drawscalebar = drawscalebar
        self.ecovth = ecovth
        self.jcntth = jcntth
        self.ex = ex = ex.copy()
        self.sj = sj = sj.copy()
        self.compress = compress
        self.fontsize = fontsize
        if sortexby is None:
            self.sortexby = ecov
        else:
            self.sortexby = sortexby

        # x coordinates relative to origin; the axis is flipped unless the
        # first exon is on the '+' strand
        if ex.iloc[0]['strand'] == '+':
            if origin is None:
                origin = ex['st'].min()
            ex['xst'] = ex['st'] - origin
            ex['xed'] = ex['ed'] - origin
            self.strand = '+'
            self.origin = origin
        else:
            if origin is None:
                origin = ex['ed'].max()
            ex['xst'] = origin - ex['ed']
            ex['xed'] = origin - ex['st']
            self.strand = '-'
            self.origin = origin
        # fix old a_id null: older data marks "no id" with -1 instead of 0
        if (ex['a_id'].min() == -1) and (N.sum(ex['a_id'] == 0) == 0):
            ex.loc[ex['a_id'] == -1, 'a_id'] = 0
            ex.loc[ex['d_id'] == -1, 'd_id'] = 0
            sj.loc[sj['a_id'] == -1, 'a_id'] = 0
            sj.loc[sj['d_id'] == -1, 'd_id'] = 0

        ex['len'] = ex['xed'] - ex['xst']
        if xmargin is None:
            xmargin = int(ex['len'].mean())
        self.xmargin = xmargin

        if ecov not in ex.columns:
            ex[ecov] = 1
        if (ucnt not in sj.columns) or (mcnt not in sj.columns):
            sj[jcnt] = 1
            sj[jcnt + '_ls'] = 'solid'
        else:
            # sj uniq, mult
            sj[jcnt] = [x or y for x, y in sj[[ucnt, mcnt]].values]
            sj[jcnt + '_ls'] = ['solid' if x else 'dashed' for x in sj[ucnt]]

        if ecovth is not None:
            self.ex = ex = ex[ex[ecov] > ecovth].copy()
        if jcntth is not None:
            self.sj = sj = sj[sj[jcnt] > jcntth].copy()
        if len(ex) == 0:
            return

        # find exon groups: number of exons sharing each a_id / d_id
        if 'asize' not in ex.columns:
            a2size = dict(
                UT.izipcols(
                    ex.groupby('a_id').size().reset_index(), ['a_id', 0]))
            d2size = dict(
                UT.izipcols(
                    ex.groupby('d_id').size().reset_index(), ['d_id', 0]))
            a2size[0] = 0  # id 0 means "none": never forms a group
            d2size[0] = 0
            ex['asize'] = [a2size[x] for x in ex['a_id']]
            ex['dsize'] = [d2size[x] for x in ex['d_id']]
        # group by a_id when it is shared by more exons than the d_id,
        # otherwise by d_id
        ex['group'] = [
            'a{0}'.format(ai) if (a != 0 and a > d) else 'd{0}'.format(di)
            for a, ai, d, di in ex[['asize', 'a_id', 'dsize', 'd_id']].values
        ]
        # find exon group st, ed
        exg = ex.groupby('group')
        g2st = dict(
            UT.izipcols(exg['xst'].min().reset_index(), ['group', 'xst']))
        g2ed = dict(
            UT.izipcols(exg['xed'].max().reset_index(), ['group', 'xed']))
        g2size = dict(UT.izipcols(exg.size().reset_index(), ['group', 0]))
        ex['gst'] = [g2st[x] for x in ex['group']]
        ex['ged'] = [g2ed[x] for x in ex['group']]
        ex['gsize'] = [g2size[x] for x in ex['group']]
        self.ex = ex = ex.sort_values(['group', self.sortexby])

        # find exon y pos within group: consecutive rows of one group are
        # stacked one unit apart, centred around 0
        def _eypos(gs):
            g0, s0 = gs[0]  # first g
            cnt = 0
            yield cnt - (s0 - 1) / 2.
            for g1, s1 in gs[1:]:
                if g1 == g0:
                    cnt += 1
                else:
                    cnt = 0
                yield cnt - (s1 - 1) / 2.
                g0 = g1

        ex['eypos'] = [x for x in _eypos(ex[['group', 'gsize']].values)]
        # find group y center pos
        self.gr = gr = ex.groupby('group')[['gst', 'ged',
                                            'gsize']].first().sort_values(
                                                ['gst', 'ged'])
        gr['len'] = gr['ged'] - gr['gst']

        # groups overlapping in x are stacked alternately above and below
        # the centre line; a group that overlaps nothing resets the stack
        def _gypos(gr):
            side = 1
            r0 = gr.iloc[0]
            h = r0['gsize'] / 2.
            ged0 = r0['ged']
            gy0 = {1: h, -1: -h}  # remember filled height both side (1,-1)
            yield 0  # first one gets center
            for gst1, ged1, gsiz1 in gr[['gst', 'ged', 'gsize']].values[1:]:
                h = gsiz1 / 2.
                if ged0 <= gst1:  # no overlap
                    gy0 = {1: h, -1: -h}
                    yield 0
                else:
                    gy1 = gy0[side] + side * gsiz1 / 2.
                    gy0[side] = gy0[side] + side * gsiz1
                    side = -1 * side  # flip side
                    yield gy1
                ged0 = max(ged0, ged1)

        gr['gypos'] = [x for x in _gypos(gr)]
        # compress x coord: gaps wider than xmargin shrink to xmargin
        if compress:

            def _gxst(gr):
                r0 = gr.iloc[0]
                delta = 0
                yield r0['gst'] - delta  # 0
                ged0 = r0['ged']
                for i, r1 in gr.iloc[1:].iterrows():
                    gst1 = r1['gst']
                    if gst1 - ged0 > self.xmargin:
                        delta += (gst1 - ged0 - self.xmargin)
                    yield gst1 - delta
                    ged0 = r1['ged']

            gr['cst'] = [x for x in _gxst(gr)]
        else:
            gr['cst'] = gr['gst']
        # gr is indexed by the group label, so use label-based .loc
        # (.ix has been removed from pandas)
        g2cst = gr['cst']
        ex['cst0'] = [
            g2cst.loc[g] + (xst - gst)
            for g, xst, gst in ex[['group', 'xst', 'gst']].values
        ]
        ex['ced0'] = ex['cst0'] + ex['len']
        # map back to the original orientation
        if self.strand == '+':
            ex['cst'] = origin + ex['cst0']
            ex['ced'] = origin + ex['ced0']
        else:
            ex['cst'] = origin - ex['ced0']
            ex['ced'] = origin - ex['cst0']
        g2ypos = gr['gypos']
        ex['ey'] = [
            ey + g2ypos.loc[g] for ey, g in ex[['eypos', 'group']].values
        ]