Beispiel #1
0
    def __init__(self, sjexpre, code, chromdir, rmskviz, outdir, **kw):
        """Load the exon/junction model and make sure derived tables exist.

        Reads ``<sjexpre>.ex.txt.gz`` and ``<sjexpre>.sj.txt.gz``. If the exon
        table lacks ``glen`` or ``tlen`` they are computed from the chopped
        intervals (``<sjexpre>.ci.txt.gz``, created when missing) and the exon
        table is written back. Union exons are read from
        ``<sjexpre>.unionex.txt.gz`` or created and saved there.

        Args:
            sjexpre: path prefix of the model files.
            code: identifier joined to ``outdir`` to form the output prefix.
            chromdir: directory handed to ``FA.GenomeFASTAChroms``.
            rmskviz: stored as ``self.rmskviz``; not used in this method.
            outdir: output directory.
            **kw: overrides applied on top of a copy of ``RMSKPARAMS``.
        """
        self.sjexpre = sjexpre
        self.prefix = prefix = os.path.join(outdir, code)
        self.fnobj = FN.FileNamesBase(prefix)
        self.chromdir = chromdir
        self.rmskviz = rmskviz
        self.gfc = FA.GenomeFASTAChroms(chromdir)

        self.params = RMSKPARAMS.copy()
        self.params.update(kw)

        self.ex = UT.read_pandas(sjexpre + '.ex.txt.gz')
        self.sj = UT.read_pandas(sjexpre + '.sj.txt.gz')
        if 'glen' not in self.ex or 'tlen' not in self.ex:
            cipath = sjexpre + '.ci.txt.gz'
            if not os.path.exists(cipath):
                # the exon table lives on self; a bare `ex` is undefined here
                ci = UT.chopintervals(self.ex, cipath)
            else:
                ci = UT.read_ci(cipath)
            UT.set_glen_tlen(self.ex, ci, gidx='_gidx')
            UT.write_pandas(self.ex, sjexpre + '.ex.txt.gz', 'h')
        uexpath = sjexpre + '.unionex.txt.gz'
        if os.path.exists(uexpath):
            self.uex = UT.read_pandas(uexpath)
        else:
            LOG.info('making union exons...saving to {0}'.format(uexpath))
            self.uex = UT.make_unionex(self.ex, '_gidx')
            UT.write_pandas(self.uex, uexpath, 'h')
Beispiel #2
0
def _scan_make_map(paths, dstpath):
    """Collect reads that map to more than one location and save the counts.

    Each input line is expected to have 7 tab separated fields
    (chr, st, ed, name, sc1, strand, tst). Field 3 (name) is used as the read
    id and field 6 (tst) as the map id. For every read id the set of distinct
    map ids is collected; reads with more than one map id are written to
    ``dstpath`` with their count in column ``cnt``.

    Lines with a different number of fields are reported with ``print`` and
    skipped.

    Args:
        paths: iterable of file paths; names ending in ``.gz`` are read
            through gzip, all others as plain binary files.
        dstpath: destination passed to ``UT.write_pandas``.
    """
    cnt = defaultdict(set)

    def _scan(fp, path):
        # shared by the gzip and the plain file branch
        for line in fp:  # chr,st,ed,name,sc1,strand,tst
            rec = line.strip().split(b'\t')  # read_id:name(3), map_id:tst(6)
            if len(rec) == 7:
                cnt[rec[3]].add(rec[6])  # for each read how many locations?
            else:
                print('wrong#fields:{0} in {1}'.format(len(rec), path))

    for path in paths:
        if path[-3:] == '.gz':
            with gzip.open(path) as gz_file:
                with io.BufferedReader(gz_file) as fp:
                    _scan(fp, path)
        else:
            with open(path, 'rb') as fp:
                _scan(fp, path)

    # dict.items() exists on both py2 and py3, so no try/except switch is
    # needed and genuine errors from DataFrame construction are not masked.
    multi = {k: len(v) for k, v in cnt.items() if len(v) > 1}
    dup = PD.DataFrame(multi, index=['cnt']).T
    UT.write_pandas(dup, dstpath, 'ih')
Beispiel #3
0
 def __init__(self, sj, me, filepre, depth=500, maxcnt=10000):
     """Initialise the parent graph and build the exon-overlaps-exon map.

     The exon table is written twice to temporary files, intersected with
     itself, and same-strand overlaps between different exons are stored
     in ``self.eoe`` as ``_id => [overlapping _id's]``. The temporary files
     are removed afterwards.

     Args:
         sj, me, depth, maxcnt: forwarded to ``MEGraph3.__init__``.
         filepre: path prefix used for the temporary files.
     """
     MEGraph3.__init__(self, sj, me, depth, maxcnt)
     self.pre = filepre
     bedcols = ['chr', 'st', 'ed', 'strand', '_id']
     ovcols = bedcols + ['b_' + name for name in bedcols] + ['ovl']
     # single cell data contains float in st,ed in ex ???
     me = UT.check_int_nan(me)
     # calculate exon overlap to self
     path1 = UT.write_pandas(me[bedcols], filepre + 'ex1.txt.gz', '')
     path2 = UT.write_pandas(me[bedcols], filepre + 'ex2.txt.gz', '')
     ovpath = BT.bedtoolintersect(path1, path2, filepre + 'ov.txt.gz',
                                  wao=True)
     ov = UT.read_pandas(ovpath, names=ovcols)
     self.ov = ov
     # select same strand overlap to non-self
     notself = ov['_id'] != ov['b__id']
     samestrand = ov['strand'] == ov['b_strand']
     ov1 = ov[notself & samestrand]
     self.ov1 = ov1

     # make connected dictionary _id => [b__id's]
     def _aslist(values):
         return list(values)

     grouped = ov1.groupby('_id')['b__id'].apply(_aslist).reset_index()
     if 'index' in grouped.columns:
         grouped['_id'] = grouped['index']
     self.eoe = dict(UT.izipcols(grouped, ['_id', 'b__id']))
     # cleanup
     for tmpfile in (path1, path2, ovpath):
         os.unlink(tmpfile)
Beispiel #4
0
def as2exsj(dstpre, np=7):
    """Convert assembler output tables into sj/ex model files.

    Reads ``.exdf``, ``.sjdf``, ``.sedf`` and ``.paths`` tables under
    ``dstpre``, merges exons with single exons, annotates them, runs gene
    finding and writes ``.ex.txt.gz``, ``.sj.txt.gz`` and ``.ci.txt.gz``.

    Args:
        dstpre: path prefix of the input and output files.
        np: number of processes handed to ``GP.find_genes4``.

    Returns:
        tuple ``(sj, ex)`` of the junction and exon dataframes.
    """
    def _load(suffix, names):
        return UT.read_pandas(dstpre + suffix, names=names)

    exdf = _load('.exdf.txt.gz', A2.EXDFCOLS)
    sj = _load('.sjdf.txt.gz', A2.SJDFCOLS)
    sedf = _load('.sedf.txt.gz', A2.EXDFCOLS)
    _load('.paths.txt.gz', A2.PATHCOLS)  # read, but not used below
    sj['st'] = sj['st'] + 1
    excols = A2.EXDFCOLS
    ex = PD.concat([exdf[excols], sedf[excols]], ignore_index=True)
    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components) set '_gidx'
    # need unique prefix for parallel processing
    unique_pre = os.path.abspath(dstpre + str(uuid.uuid4()) + '_')
    GP.find_genes4(sj, ex,
                   filepre=unique_pre,
                   np=np,
                   override=False,
                   separatese=True)
    for kind in ('3', '5'):
        ex.loc[ex['kind'] == kind, 'cat'] = kind
    UT.write_pandas(ex, dstpre + '.ex.txt.gz', 'h')
    UT.write_pandas(sj, dstpre + '.sj.txt.gz', 'h')
    UT.chopintervals(ex, dstpre + '.ci.txt.gz')
    return sj, ex
Beispiel #5
0
def test_count_repeats_viz_mp(outdir, testbed):
    """Smoke test: run count_repeats_viz_mp with and without expansion.

    Builds a small exon table and a small repeat table from inline CSV,
    writes the repeat table as a BED file under ``outdir`` and calls
    ``RP.count_repeats_viz_mp`` with ``expand=0`` and ``expand=10``. The
    results are only printed; nothing is asserted.
    """
    TESTDATA = StringIO("""st,ed,name,sc1,chr,strand,_id
0,10,a,0,chr1,+,1
5,20,a,1,chr1,-,2
25,30,a,1,chr1,+,3
40,45,b,2,chr1,-,4
45,50,b,2,chr1,+,5
49,55,c,2,chr1,+,6
255,260,d,3,chr2,-,7
260,270,d,4,chr2,+,8
370,380,e,4,chr2,-,9
380,390,e,5,chr2,+,10
	""")
    TESTDATA2 = StringIO("""st,ed,name,sc1,chr,strand
0,5,a1,0,chr1,+
9,20,a2,1,chr1,-
31,35,a3,1,chr1,+
40,45,b1,2,chr1,-
45,47,b2,2,chr1,+
56,70,c1,2,chr1,+
200,210,d1,3,chr2,-
260,280,d2,4,chr2,+
391,400,e1,4,chr2,-
	""")
    # DataFrame.from_csv has been removed from pandas; read_csv with the
    # same arguments is the documented replacement.
    df = PD.read_csv(TESTDATA, sep=",", index_col=False)
    rmsk = PD.read_csv(TESTDATA2, sep=",", index_col=False)
    path = os.path.join(outdir, 'rmsktest.bed.gz')
    UT.write_pandas(rmsk[['chr', 'st', 'ed', 'name', 'sc1', 'strand']], path,
                    '')
    print(df)
    print(rmsk)
    rslt = RP.count_repeats_viz_mp(df, path, expand=0)
    print(df)
    rslt = RP.count_repeats_viz_mp(df, path, expand=10)
    print(df)
Beispiel #6
0
    def calc_exon_params(self, np=10, covfactor=0.05):
        """Fit a logistic regression separating ``ne_i0`` from ``e53`` exons.

        Parameter tables for ``self.ne_i0`` and ``self.e53`` are read from
        cache files under ``self.dstpre`` when present, otherwise computed
        with ``self.calc_params_mp`` and written to those cache files. The
        two tables are labelled (``kind`` 1 for ``ne_i0``, 0 for ``e53``),
        combined, and a ``LogisticRegression`` is fitted on the features
        ``lemax``, ``lgap``, ``llen`` and ``mp``. The fit is saved through
        ``self.write_params`` and plotted through ``self.plot_exon_fit``.

        Args:
            np: forwarded to ``self.calc_params_mp``.
            covfactor: forwarded to ``self.calc_params_mp``; also part of the
                cache file names and stored as ``th`` in the saved params.

        Returns:
            dict: ``locals()`` of this call, i.e. every local variable by
            name. Callers may rely on these names, so locals must not be
            renamed.
        """
        zoom = self.zoom
        # get params
        neipath = self.dstpre + '.{0}.{1}.nei0.params.txt.gz'.format(
            self.refcode, covfactor)
        e53path = self.dstpre + '.{0}.{1}.e53.params.txt.gz'.format(
            self.refcode, covfactor)
        if os.path.exists(neipath):
            print('reading from cache {0}'.format(neipath))
            nei = UT.read_pandas(neipath)
        else:
            nei = self.calc_params_mp(self.ne_i0,
                                      np=np,
                                      gapmode='i',
                                      covfactor=covfactor)  # ~ 1min
            UT.write_pandas(nei, neipath, 'h')
        if os.path.exists(e53path):
            print('reading from cache {0}'.format(e53path))
            e53 = UT.read_pandas(e53path)
        else:
            e53 = self.calc_params_mp(
                self.e53, np=np, gapmode='i',
                covfactor=covfactor)  # ~ 10min don't do long ones stupid
            UT.write_pandas(e53, e53path, 'h')
        # logistic fit
        cols = [
            'chr', 'st', 'ed', 'gap', 'emax', 'emin', 'sIn', 'sOut', 'locus',
            'kind', 'len', 'sdIn', 'sdOut', 'mp'
        ]
        # class labels: 1 for the ne_i0 table, 0 for the e53 table
        nei['kind'] = 1
        e53['kind'] = 0
        nei['len'] = nei['ed'] - nei['st']
        e53['len'] = e53['ed'] - e53['st']
        D = PD.concat([nei[cols], e53[cols]], ignore_index=True)
        # log-transformed features
        D['llen'] = N.log10((D['len']))
        D['lgap'] = N.log10(D['gap'] + 1)
        D['lemax'] = N.log2(zoom * D['emax'] + 1)
        # fit only on rows with emax > 0 and non-zero sdIn and sdOut
        D1 = D[(D['emax'] > 0) & (D['sdIn'] != 0) & (D['sdOut'] != 0)]
        print(len(D), len(D1))
        X = D1[['lemax', 'lgap', 'llen', 'mp']].values
        Y = D1['kind'].values
        lr = LogisticRegression()
        lr.fit(X, Y)
        # predictions on the training rows themselves
        Z = lr.predict(X)
        # write json
        ppath = self.dstpre + '.{0}.exonparams.json'.format(self.refcode)
        self.write_params(ppath, lr, Y, Z, ['lemax', 'lgap', 'llen', 'mp'], {
            'zoom': zoom,
            'th': covfactor
        })
        # make fig
        spath = self.dstpre + '.{0}.exonparams'.format(self.refcode)
        title = self.dstpre.split('/')[-1]
        self.plot_exon_fit(spath + '.0.png', title, X, Y, Z, ptyp='both')
        self.plot_exon_fit(spath + '.pdf', title, X, Y, Z, ptyp='pdf')
        self.plot_exon_fit(spath + '.png', title, X, Y, Z, ptyp='png')

        return locals()
Beispiel #7
0
def count_repeats_viz_chr(bedpath, rmskpath, outpath):
    """Intersect one BED file with a repeat BED file and list repeat names.

    The intersection is written to ``outpath``, read back, and for every
    ``name`` the overlapping repeat names are joined with commas in the form
    ``repname(strand)``. The grouped table then overwrites ``outpath``.

    Args:
        bedpath: BED file with the target regions.
        rmskpath: BED file with the repeats.
        outpath: path used first for the raw intersection and then for the
            grouped result.
    """
    names = [
        'chr', 'st', 'ed', 'name', 'b_chr', 'b_st', 'b_ed', 'b_name', 'strand',
        'ovl'
    ]
    ovpath = BT.bedtoolintersect(bedpath, rmskpath, outpath, wao=True)
    ov = UT.read_pandas(ovpath, names=names)
    ov['rn'] = ov['b_name'] + '(' + ov['strand'] + ')'

    # group and concat repname
    def _join(values):
        return ','.join(list(values))

    joined = ov.groupby('name')['rn'].apply(_join).reset_index()
    UT.write_pandas(joined, outpath, 'h')
Beispiel #8
0
def make_sjex(gtfpath, dstpre, np=12):
    """Create sj, ex and ci files from a GTF.

    Args:
        gtfpath: path to a GTF file, or an already loaded GTF object (used
            as is when it is not a string).
        dstpre: path prefix for ``.ex.txt.gz``, ``.sj.txt.gz`` and
            ``.ci.txt.gz``.
        np: forwarded to ``gtf2exonsj``.

    Returns:
        dict with keys ``'sj'`` and ``'ex'``.
    """
    gtf = GGB.read_gtf(gtfpath) if UT.isstring(gtfpath) else gtfpath
    sj, ex = gtf2exonsj(gtf, np=np)
    print(ex.groupby(['kind', 'cat']).size())
    # exons of kind 5/3 get the matching category
    for kind in ('5', '3'):
        ex.loc[ex['kind'] == kind, 'cat'] = kind
    UT.write_pandas(ex, dstpre + '.ex.txt.gz', 'h')
    UT.write_pandas(sj, dstpre + '.sj.txt.gz', 'h')
    # make ci
    UT.chopintervals(ex, dstpre + '.ci.txt.gz')
    return dict(sj=sj, ex=ex)
Beispiel #9
0
 def make_sjex(self, np=4):
     """Convert the GTF at ``self.gtfpath`` into sj/ex tables and save them.

     Args:
         np: forwarded to ``gtf2exonsj``.

     Returns:
         tuple ``(sj, ex)``.

     Raises:
         RuntimeError: if ``self.gtfpath`` does not exist.
     """
     sjpath, expath = self.sjexpaths()
     gtfpath = self.gtfpath
     if not os.path.exists(gtfpath):
         raise RuntimeError('file {0} does not exist'.format(gtfpath))
     LOG.info('making sj,ex...')
     gtf = GGB.read_gtf(gtfpath)  # ~ 1.5 min
     # convert gtf to sjex, using a uuid based graph prefix
     graphpre = self.fname('graphpre{0}_'.format(uuid.uuid4()))
     sj, ex = gtf2exonsj(gtf, np=np, graphpre=graphpre)
     # save
     for table, path in ((sj, sjpath), (ex, expath)):
         UT.write_pandas(table, path, 'h')
     return sj, ex
Beispiel #10
0
def test_trim_ex(g4sjex, tmpdir):
    """Check TE.trim_ex on the first 20 genes, split over two chromosomes."""
    _, ex0 = g4sjex
    if 'len' not in list(ex0.columns):
        ex0['len'] = ex0['ed'] - ex0['st']
    workdir = str(tmpdir)
    genes = ex0['gene_id'].unique()[:20]
    ex = ex0[ex0['gene_id'].isin(genes)].copy()
    # move the second half of the genes to another chromosome
    ex.loc[ex['gene_id'].isin(genes[10:]), 'chr'] = 'chr2'
    expath = os.path.join(workdir, 'ex.txt.gz')
    UT.write_pandas(ex, expath, 'h')
    dstpath = os.path.join(workdir, 'tex.txt.gz')
    dstcipath = os.path.join(workdir, 'texci.txt.gz')
    tex = TE.trim_ex(expath, dstpath, dstcipath, 1000, 'gene_id', 2)
    assert len(ex) == 58
    assert len(tex) == 25
    for produced in (dstpath, dstcipath):
        assert os.path.exists(produced)
Beispiel #11
0
    def extract_exi53(self):
        """Find internal exons related to 5' or 3' exons.

        Sets:
            self.e5ia / self.e3ia: rows of the overlap-ratio tables (5'/3'
                exons against internal exons) with ``ovlratio == 1``,
                column ``name`` renamed to ``_id``.
            self.e5i / self.e3i: internal exons sharing ``st`` or ``ed``
                with a 5'/3' exon (strand dependent), one row per ``_id``.
        """
        # internal exons overlapping with either 5 or 3 prime exons?
        bedcols = ['chr', 'st', 'ed', '_id', 'sc1', 'strand']
        ovcols = bedcols + ['b_' + c for c in bedcols] + ['ovl']
        ex = self.ex

        # one BED file per exon category
        internal = ex[ex['cat'] == 'i']  # internal exons
        ipath = UT.write_pandas(internal[bedcols],
                                self.refpre + '.exi.bed.gz', '')
        fiveprime = ex[ex['cat'] == '5']
        p5 = UT.write_pandas(fiveprime[bedcols],
                             self.refpre + '.ex5.bed.gz', '')
        threeprime = ex[ex['cat'] == '3']
        p3 = UT.write_pandas(threeprime[bedcols],
                             self.refpre + '.ex3.bed.gz', '')

        out5 = self.refpre + '.ex5-ovl-exi.txt.gz'
        out3 = self.refpre + '.ex3-ovl-exi.txt.gz'

        ncol = len(bedcols)
        ratio5 = BT.calc_ovlratio(p5, ipath, out5, ncol, ncol)
        ratio3 = BT.calc_ovlratio(p3, ipath, out3, ncol, ncol)

        renamer = {'name': '_id'}
        self.e5ia = ratio5[ratio5['ovlratio'] == 1].rename(columns=renamer)
        self.e3ia = ratio3[ratio3['ovlratio'] == 1].rename(columns=renamer)

        # find internal exons which shares st or ed with 5 or 3 exons
        # (the same output paths are reused for the intersections)
        ov5path = BT.bedtoolintersect(ipath, p5, out5, wao=True)
        ov3path = BT.bedtoolintersect(ipath, p3, out3, wao=True)
        # read tmp file
        ov5 = UT.read_pandas(ov5path, names=ovcols)
        ov3 = UT.read_pandas(ov3path, names=ovcols)
        minus5 = (ov5['strand'] == '-') & (ov5['st'] == ov5['b_st'])
        plus5 = (ov5['strand'] == '+') & (ov5['ed'] == ov5['b_ed'])
        minus3 = (ov3['strand'] == '-') & (ov3['ed'] == ov3['b_ed'])
        plus3 = (ov3['strand'] == '+') & (ov3['st'] == ov3['b_st'])
        self.e5i = ov5[minus5 | plus5].groupby('_id').first().reset_index()
        self.e3i = ov3[minus3 | plus3].groupby('_id').first().reset_index()
        LOG.info('#e5i={0}'.format(len(self.e5i)))
        LOG.info('#e3i={0}'.format(len(self.e3i)))
Beispiel #12
0
    def extract_nonovl_exons(self):
        """Select exons that overlap no other exon and are not covered by a junction.

        Exons and junctions with ``st < ed`` are written to BED files and
        intersected (exon vs junction, exon vs exon).

        Sets:
            self.ne_i0: exons that only overlap themselves, with ``len``.
            self.nov_ex: exons that only overlap themselves and are not
                completely covered by a junction, with ``len``.
            self.ne_i, self.ne_5, self.ne_3, self.ne_s: rows of
                ``self.nov_ex`` with ``cat`` equal to 'i', '5', '3', 's'.
        """
        ex = self.ex
        sj = self.sj
        ex = ex[ex['st'] < ex['ed']]
        sj = sj[sj['st'] < sj['ed']]
        # nonovl exons
        cols0 = ['chr', 'st', 'ed', '_id']
        a = self.refpre + '.ex.bed.gz'
        a = UT.write_pandas(ex[cols0], a, '')
        b = self.refpre + '.sj.bed.gz'
        b = UT.write_pandas(sj[cols0], b, '')
        c1 = self.refpre + '.ex-ovl-sj.txt.gz'
        c2 = self.refpre + '.ex-ovl-ex.txt.gz'
        c1 = BT.bedtoolintersect(a, b, c1, wao=True)
        c2 = BT.bedtoolintersect(a, a, c2, wo=True)

        cols = cols0 + ['b_' + x for x in cols0] + ['ovl']
        sov = UT.read_pandas(c1, names=cols)
        sov['len'] = sov['ed'] - sov['st']
        sov['ovlratio'] = sov['ovl'] / sov['len']
        sovg = sov.groupby('_id')['ovlratio'].max()
        snonov = sovg[sovg < 1.]  # not completely covered by junction

        eov = UT.read_pandas(c2, names=cols)
        eovsize = eov.groupby('_id').size()
        enonov = eovsize[eovsize == 1]  # only overlaps with self

        # label based selection by _id; `.ix` has been removed from pandas,
        # `.loc` is the label based replacement
        exbyid = ex.set_index('_id')
        self.ne_i0 = ne_i0 = exbyid.loc[enonov.index].sort_values(
            ['chr', 'st', 'ed']).reset_index()
        self.ne_i0['len'] = ne_i0['ed'] - ne_i0['st']
        LOG.info('#non-ovl-ex0={0}'.format(len(enonov)))

        LOG.info('#non-ex-ovl-ex={0}, #non-sj-ovl-ex={1}'.format(
            len(enonov), len(snonov)))
        # a list, not a set: sets are not accepted as indexers by `.loc`
        ids = sorted(set(enonov.index).intersection(snonov.index))
        LOG.info('#non-ovl-ex={0}'.format(len(ids)))
        self.nov_ex = novex = exbyid.loc[ids].sort_values(
            ['chr', 'st', 'ed']).reset_index()
        novex['len'] = novex['ed'] - novex['st']
        self.ne_i = novex[novex['cat'] == 'i']
        self.ne_5 = novex[novex['cat'] == '5']
        self.ne_3 = novex[novex['cat'] == '3']
        self.ne_s = novex[novex['cat'] == 's']
Beispiel #13
0
    def calc_overlaps(self):
        """Intersect target and reference models at exon and gene level.

        Sets:
            self.ex_tgt, self.ex_ref: exon tables of target and reference
                (``len`` added when missing; the reference gets ``_gidx``
                copied from ``self.refgidxcol``).
            self.ov: exon level intersection table.
            self.gov: gene level intersection table, genes spanning from the
                smallest ``st`` to the largest ``ed`` of their exons.
        """
        cref, ctgt = self.cn_ref, self.cn_tgt
        tgtbed = ctgt.fname('cptmp.ex.bed.gz')
        refbed = cref.fname('cptmp.ex.bed.gz')
        exovl = ctgt.fname2('cptmp.ex.ovl.txt.gz', cref.code)
        excols = ['chr', 'st', 'ed', 'cat', '_id', '_gidx', 'len', 'strand']
        self.ex_tgt = etgt = ctgt.model('ex')
        self.ex_ref = eref = cref.model('ex')

        eref['_gidx'] = eref[self.refgidxcol]

        for frame in (etgt, eref):
            if 'len' not in frame.columns:
                frame['len'] = frame['ed'] - frame['st']
        tgtbed = UT.write_pandas(etgt[excols], tgtbed, '')
        refbed = UT.write_pandas(eref[excols], refbed, '')
        exovl = BT.bedtoolintersect(tgtbed, refbed, exovl, wao=True)
        exovcols = excols + ['b_' + name for name in excols] + ['ovl']
        self.ov = UT.read_pandas(exovl, names=exovcols)

        # gene overlap
        gcols = ['chr', 'st', 'ed', 'strand']

        def _gene_spans(exons):
            # one row per _gidx covering all of its exons
            grouped = exons.groupby('_gidx')
            span = grouped[gcols].first()
            span['st'] = grouped['st'].min()
            span['ed'] = grouped['ed'].max()
            return span.reset_index()

        gtgt = _gene_spans(etgt)
        gref = _gene_spans(eref)
        genecols = gcols + ['_gidx']
        tgtgene = ctgt.fname('cptmp.gene.bed.gz')
        refgene = cref.fname('cptmp.gene.bed.gz')
        geneovl = ctgt.fname2('gene.ovl.txt.gz', cref.code)
        tgtgene = UT.write_pandas(gtgt[genecols], tgtgene, '')
        refgene = UT.write_pandas(gref[genecols], refgene, '')
        geneovl = BT.bedtoolintersect(tgtgene, refgene, geneovl, wao=True)
        geneovcols = genecols + ['b_' + name for name in genecols] + ['ovl']
        self.gov = UT.read_pandas(geneovl, names=geneovcols)
Beispiel #14
0
def as3exsj(dstpre, minelen=150, np=7):
    """Convert assembler output tables into sj/ex model files with a length filter.

    Reads ``.exdf``, ``.sjdf``, ``.sedf`` and ``.paths`` tables under
    ``dstpre``, merges exons with single exons, annotates them and runs gene
    finding. Genes whose summed exon length is below ``minelen`` are dropped
    before ``.ex.txt.gz``, ``.sj.txt.gz`` and ``.ci.txt.gz`` are written.

    Args:
        dstpre: path prefix of the input and output files.
        minelen: minimum summed exon length per ``_gidx`` to keep a gene.
        np: number of processes handed to ``GP.find_genes3``.

    Returns:
        tuple ``(sj2, ex2)`` of the filtered junction and exon dataframes.
    """
    ex = UT.read_pandas(dstpre+'.exdf.txt.gz', names=A3.EXDFCOLS)
    sj = UT.read_pandas(dstpre+'.sjdf.txt.gz', names=A3.SJDFCOLS)
    se = UT.read_pandas(dstpre+'.sedf.txt.gz', names=A3.EXDFCOLS)
    # NOTE(review): the paths table is not used below; the read is kept so
    # that a missing file still fails exactly as before.
    UT.read_pandas(dstpre+'.paths.txt.gz', names=A3.PATHCOLS)
    sj['st'] = sj['st']+1
    cols = A3.EXDFCOLS
    ex = PD.concat([ex[cols], se[cols]], ignore_index=True)
    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components) set '_gidx'
    GP.find_genes3(sj, ex,  # don't use exon overlap as connection
                   np=np,
                   override=False)
    ex.loc[ex['kind'] == '3', 'cat'] = '3'
    ex.loc[ex['kind'] == '5', 'cat'] = '5'

    # remove these with elen smaller than minelen
    ex['len'] = ex['ed']-ex['st']
    exsiz = ex.groupby('_gidx')['len'].sum()
    rgidx = exsiz[exsiz < minelen].index.values
    ex2 = ex[~ex['_gidx'].isin(rgidx)]
    sj2 = sj[~sj['_gidx'].isin(rgidx)]
    # report exon counts before and after; rgidx holds genes, so its length
    # must not be subtracted from the number of exons
    LOG.info('minelen filter #ex {0}=>{1}'.format(len(ex), len(ex2)))
    # write
    UT.write_pandas(ex2, dstpre+'.ex.txt.gz', 'h')
    UT.write_pandas(sj2, dstpre+'.sj.txt.gz', 'h')
    UT.chopintervals(ex2, dstpre+'.ci.txt.gz')
    return sj2, ex2
Beispiel #15
0
 def save(self):
     """Write stats, ratios and detection percentages to output files.

     Three files are produced through ``self.en2.fname2`` with the code
     ``<en1.code>.<datacode>``:

     * ``stats.json``: ``self.stats`` dumped as JSON
     * ``ratios.txt.gz``: all dataframes in ``self.ratios`` concatenated,
       each tagged with its key in column ``kind``
     * ``dp.txt.gz``: result of ``self.get_detection_percentages()``
     """
     tag = '{0}.{1}'.format(self.en1.code, self.datacode)
     # light weight stats ==> json
     statspath = self.en2.fname2('stats.json', tag, category='output')
     UT.makedirs(os.path.dirname(statspath))
     with open(statspath, 'w') as fp:
         json.dump(self.stats, fp)
     # ratios ==> one dataframe, 'kind' tells which entry a row came from
     ratiopath = self.en2.fname2('ratios.txt.gz', tag, category='output')
     for kind, frame in self.ratios.items():
         frame['kind'] = kind
     merged = PD.concat(self.ratios.values(), ignore_index=True)
     UT.write_pandas(merged, ratiopath, 'h')
     # DP
     percentages = self.get_detection_percentages()
     dppath = self.en2.fname2('dp.txt.gz', tag, category='output')
     UT.write_pandas(percentages, dppath, 'ih')
Beispiel #16
0
    def calculate(self):
        """Calculate repeat overlap for union exons and save a gene summary.

        Base pair overlap is calculated with ``count_repeats_mp`` (UCSC
        genome with repeats masked to lower case) and exon level overlap
        with ``count_repeats_viz_mp`` (UCSC RepeatMaskerViz track). The
        per-gene summary dataframe is stored in ``self.ugb`` and written to
        the ``all.genes.stats`` output file.
        """
        params = self.params
        names = self.fnobj
        ncpu = params['np']

        with_bp = count_repeats_mp(self.uex, self.gfc, np=ncpu, col='#repbp')
        with_names = count_repeats_viz_mp(with_bp,
                                          self.rmskviz,
                                          np=ncpu,
                                          idcol='_id',
                                          expand=0,
                                          col='repnames')
        summary = self._make_gbed(self.ex,
                                  self.sj,
                                  with_names,
                                  datacode=params['datacode'],
                                  gname=params['gname'])
        self.ugb = summary
        outpath = names.txtname('all.genes.stats', category='output')
        UT.write_pandas(summary, outpath, 'h')
Beispiel #17
0
def filter_sjexdf(mdstpre, rdstpre):
    """Restrict exdf/sjdf tables under ``mdstpre`` to the entries in the model under ``rdstpre``.

    Exon (including single exon) and junction tables are read from
    ``mdstpre``. Rows are kept when their ``name`` occurs, on the same
    chromosome, in ``rdstpre``'s ``.ex.txt.gz`` / ``.sj.txt.gz``. The kept
    rows are written to ``rdstpre`` + ``.exdf.txt.gz`` / ``.sjdf.txt.gz``.

    Args:
        mdstpre: path prefix of the source exdf/sedf/sjdf tables.
        rdstpre: path prefix of the reference ex/sj tables and of the output.
    """
    def _keep_named(table, reference):
        # compare names chromosome by chromosome
        def _one_chrom(chrom):
            rows = table[table['chr'] == chrom]
            wanted = set(reference[reference['chr'] == chrom]['name'].values)
            mask = [name in wanted for name in rows['name']]
            return rows[mask]

        parts = [_one_chrom(chrom) for chrom in table['chr'].unique()]
        return PD.concat(parts, ignore_index=True)

    exons = UT.read_pandas(mdstpre + '.exdf.txt.gz', names=A3.EXDFCOLS)
    singles = UT.read_pandas(mdstpre + '.sedf.txt.gz', names=A3.EXDFCOLS)
    exons = PD.concat([exons, singles], ignore_index=True)
    junctions = UT.read_pandas(mdstpre + '.sjdf.txt.gz', names=A3.SJDFCOLS)

    refex = UT.read_pandas(rdstpre + '.ex.txt.gz')
    refsj = UT.read_pandas(rdstpre + '.sj.txt.gz')

    kept_ex = _keep_named(exons, refex)
    kept_sj = _keep_named(junctions, refsj)
    UT.write_pandas(kept_ex, rdstpre + '.exdf.txt.gz', '')
    UT.write_pandas(kept_sj, rdstpre + '.sjdf.txt.gz', '')
Beispiel #18
0
def gtf_from_bed12(modelpre, dstpath=None, source='.'):
    """Write a GTF of exon records from a BED12 paths file.

    Reads ``<modelpre>.paths.withse.bed.gz`` and ``<modelpre>.ex.txt.gz``.
    The gene name of a path is looked up through ``chr:name`` of its first
    ``|`` separated name component; transcripts are numbered per gene in
    order of appearance (``<gname>.<n>``). An id map is written to
    ``<modelpre>.idmap.txt.gz``.

    Args:
        modelpre: path prefix of the model files.
        dstpath: destination GTF path. When None,
            ``<modelpre>.paths.withse.gtf.gz`` is used.
        source: value for the GTF source column.

    Returns:
        dataframe with the GTF exon records.

    Raises:
        KeyError: if a path's first name component is not in the exon table.
    """
    # path['gname'] contains gene id
    paths = GGB.read_bed(modelpre+'.paths.withse.bed.gz')
    ex = UT.read_pandas(modelpre+'.ex.txt.gz')
    # there may be same st,ed in different chromosome => prefix with chr
    ex['id'] = ex['chr']+':'+ex['name']
    n2gn = UT.df2dict(ex, 'id', 'gname')
    paths['id'] = paths['chr']+':'+paths['name']
    paths['id0'] = paths['chr']+':'+paths['name'].str.split('|').str[0]
    paths['gname'] = [n2gn[x] for x in paths['id0']]
    # number transcripts within each gene, starting at 1
    g2cnt = {}
    tnames = []
    for x in paths['gname']:
        i = g2cnt.get(x, 1)
        tnames.append('{0}.{1}'.format(x, i))
        g2cnt[x] = i+1
    paths['tname'] = tnames
    txt = 'gene_id "{0}"; transcript_id "{1}"; exon_number "{2}";'

    def _gen():
        cols = ['chr', 'st', 'ed', 'gname', 'tname', 'esizes', 'estarts', 'strand']
        for chrom, st, _ed, gn, tn, esi, esta, strand in paths[cols].values:
            # the last comma separated element is dropped
            # (assumes a trailing comma in esizes/estarts -- TODO confirm)
            esizes = [int(x) for x in esi.split(',')[:-1]]
            estarts = [int(x) for x in esta.split(',')[:-1]]
            for i, (size, offset) in enumerate(zip(esizes, estarts)):
                exst = st+offset
                exed = exst+size
                extra = txt.format(gn, tn, i+1)
                # the start written to the GTF is exst+1
                yield (chrom, source, 'exon', exst+1, exed, '.', strand, '.', extra)

    df = PD.DataFrame([x for x in _gen()], columns=GGB.GTFCOLS)
    if dstpath is None:
        # derive the default from the BED path read above
        # (the original referenced an undefined name here)
        dstpath = modelpre+'.paths.withse.gtf.gz'
    GGB.write_gtf(df, dstpath)

    idf = paths[['id', 'chr', 'name', 'tname', 'gname']]
    UT.write_pandas(idf, modelpre+'.idmap.txt.gz', 'h')
    return df
Beispiel #19
0
def make_sjexci(path, np):
    """Create sj, ex and ci files next to an annotation file.

    The file type is taken from the extension (after an optional ``.gz``):
    ``.gtf``, ``.bed`` (bed12) or ``.txt`` (UCSC knownGene / refGene
    download, recognised by the table name occurring in the path).
    Outputs are ``<prefix>.sj.txt.gz``, ``<prefix>.ex.txt.gz`` and
    ``<prefix>.ci.txt.gz`` where prefix is the path without extension.

    Args:
        path: path to the annotation file.
        np: number of processes forwarded to the converters.

    Returns:
        tuple ``(sj, ex)``.

    Raises:
        ValueError: unknown extension, missing file, or a ``.txt`` file whose
            path contains neither ``knownGene`` nor ``refGene``.
    """
    bpath = path[:-3] if path[-3:] == '.gz' else path
    ext = bpath[-4:]
    if ext not in ['.gtf', '.bed', '.txt']:
        raise ValueError('unknown filetype {0}, should be either .gtf,.bed (bed12),.txt (ucsc knownGene)'.format(ext))
    pathprefix = bpath[:-4]

    if not os.path.exists(path):
        # report the missing path, not just its extension
        raise ValueError('{0} file does not exists'.format(path))

    if ext == '.gtf':
        df = GGB.read_gtf(path).sort_values(['chr',])
        sj, ex = gtf2exonsj(df, np=np)
    elif ext == '.bed':
        df = GGB.read_bed(path)
        sj, ex = bed2exonsj(df, np=np)
    else:  # '.txt': UCSC download
        if 'knownGene' in path:
            df = GGB.read_ucsc_knownGene(path)
            sj, ex = kg2exonsj(df, np=np)
        elif 'refGene' in path:
            df = GGB.read_ucsc_refGene(path)
            sj, ex = kg2exonsj(df, np=np)  # same as kg
        else:
            # without this branch sj/ex would be unbound further down
            raise ValueError('unknown UCSC table in {0}, file name should contain knownGene or refGene'.format(path))

    # save
    LOG.info('saving sj to {0}'.format(pathprefix+'.sj.txt.gz'))
    UT.write_pandas(sj, pathprefix+'.sj.txt.gz', 'h')
    LOG.info('saving ex to {0}'.format(pathprefix+'.ex.txt.gz'))
    UT.write_pandas(ex, pathprefix+'.ex.txt.gz', 'h')

    # make ci
    UT.chopintervals(ex, pathprefix+'.ci.txt.gz')
    return sj, ex
Beispiel #20
0
 def write(self):
     """Write this bundle's exon, junction and path tables plus a BED12 file.

     All files share the prefix ``<dstpre>.<chrom>_<st>_<ed>``. The three
     tables are written without header so that bundles can be concatenated.
     Paths with ``tcov >= self.tcovth`` are converted to BED12, stored in
     ``self.bed12`` and written as well.
     """
     bundle = '.{0}_{1}_{2}'.format(self.chrom, self.st, self.ed)
     pre = self.dstpre + bundle
     # 1) exon, junctions, allpaths => csv (no header <= to concatenate bundles)
     exons = self.exdf[A2.EXDFCOLS]
     UT.write_pandas(exons, pre + '.covs.exdf.txt.gz', '')
     junctions = self.sjdf[A2.SJDFCOLS]
     UT.write_pandas(junctions, pre + '.covs.sjdf.txt.gz', '')
     allpaths = self.paths[A2.PATHCOLS]
     UT.write_pandas(allpaths, pre + '.covs.paths.txt.gz', '')
     # write colored bed12 for tcov > th
     covered = self.paths['tcov'] >= self.tcovth
     selected = self.paths[covered].copy()
     self.bed12 = A2.path2bed12(selected, cmax=9, covfld='tcov')
     GGB.write_bed(self.bed12, pre + '.covs.paths.bed.gz', ncols=12)
Beispiel #21
0
    def savemodel(self, which, code2=None, category='temp'):
        """Save a loaded model table.

        Without ``code2`` the table is written to ``self.modelpath(which,
        None)``, i.e. over the original. With ``code2`` the destination is
        ``self.fname2('<which>.txt.gz', code2, category=category)``.

        Args:
            which: 'sj','ex','ci'
            code2: 2nd identifier
            category: filename category (default 'temp')

        Returns:
            file path or None (if model is not loaded)

        """
        if not hasattr(self, which):
            return None
        if code2 is not None:
            target = self.fname2('{0}.txt.gz'.format(which),
                                 code2,
                                 category=category)
        else:
            target = self.modelpath(which, None)
        table = getattr(self, which)
        return UT.write_pandas(table, target, 'h')
Beispiel #22
0
def count_repeats_viz_mp(beddf,
                         rmskvizpath,
                         idcol='_id',
                         np=3,
                         prefix=None,
                         expand=0,
                         col='repnames'):
    """Report names of repeats overlapping each (unioned) exon, chromosome by chromosome.

    Uses the rmsk-viz track through Bedtools. Per chromosome a target BED
    file is written and intersected with the per-chromosome repeat file
    ``<rmskvizpath>.<chrom>.bed.gz``. If any of these per-chromosome repeat
    files is missing, the whole track is read and the files for all
    chromosomes in ``beddf`` are (re)written.

    Args:
        beddf: Pandas DataFrame with chr,st,ed cols, when calculating repeats bp
         for genes, unioned bed should be used (use utils.make_unionex)
        rmskvizpath: path to repeat masker viz BED7 file (created using rmskviz2bed7)
        idcol: colname for unique row id (default _id)
        np: number of CPU to use
        prefix: path prefix for temp file, if not None temp files are kept. (default None)
        expand: how many bases to expand exon region in each side (default 0)
        col: column name to put in overlapping repeat names (if multiple comma separated)

    Returns:
        beddf, with the repeat names added in place as column ``col``.

    """
    # temp files are only removed when we picked the prefix ourselves
    cleanup = prefix is None
    if cleanup:
        prefix = os.path.join(os.path.dirname(rmskvizpath),
                              str(uuid.uuid4()) + '_')

    def _rmskpath(chrom):
        return rmskvizpath + '.{0}.bed.gz'.format(chrom)  # reuse

    # chrom-wise
    chroms = sorted(beddf['chr'].unique())
    # check whether rmskviz is already split
    splitrmsk = any(not os.path.exists(_rmskpath(c)) for c in chroms)
    if splitrmsk:
        rmsk = GGB.read_bed(rmskvizpath)

    args = []
    for chrom in chroms:
        bpath = prefix + 'tgt.{0}.bed'.format(chrom)  # don't compress
        opath = prefix + 'out.{0}.bed'.format(chrom)
        rpath = _rmskpath(chrom)
        bchr = beddf[beddf['chr'] == chrom]
        if expand > 0:
            # widen on both sides, clipping the start at 0
            bchr = bchr.copy()
            bchr['st'] = bchr['st'] - expand
            bchr['ed'] = bchr['ed'] + expand
            bchr.loc[bchr['st'] < 0, 'st'] = 0
        UT.write_pandas(bchr[['chr', 'st', 'ed', idcol]], bpath, '')
        if splitrmsk:
            rchr = rmsk[rmsk['chr'] == chrom]
            UT.write_pandas(rchr[['chr', 'st', 'ed', 'name', 'strand']], rpath,
                            '')
        args.append([bpath, rpath, opath])
    bfiles = [triple[0] for triple in args]
    ofiles = [triple[2] for triple in args]

    UT.process_mp(count_repeats_viz_chr, args, np=np, doreduce=False)

    # gather outputs
    outcols = ['name', 'repnames']
    gathered = PD.concat([UT.read_pandas(f, names=outcols) for f in ofiles],
                         ignore_index=True)
    gathered['name'] = gathered['name'].astype(str)
    i2rn = UT.df2dict(gathered, 'name', 'repnames')
    beddf[col] = [i2rn[str(x)] for x in beddf[idcol]]

    # cleanup
    if cleanup:
        for f in bfiles + ofiles:
            os.unlink(f)

    return beddf
Beispiel #23
0
 def write_txt(self, df, suffix, fm='h', category='temp', **kw):
     """Write ``df`` to the text file named by ``self.txtname(suffix, category)``.

     ``fm`` and any extra keyword arguments are passed on to
     ``UT.write_pandas``, whose return value is returned.
     """
     target = self.txtname(suffix, category)
     written = UT.write_pandas(df, target, fm=fm, **kw)
     return written
Beispiel #24
0
    def calc_53_params(self, sdiffth=1, np=10, alpha=0.1):
        """Fit a logistic regression on junction flux of 5'/3' vs internal exons.

        For each of ``ne_i``, ``ne_5``, ``ne_3``, ``e5i`` and ``e3i`` a flux
        table is read from a cache file under ``self.dstpre`` or computed
        with ``self.calc_flux_mp`` from the attribute of the same name and
        cached. The ``ne_5``/``ne_3``/``e5i``/``e3i`` tables are labelled
        ``kind`` 1 and filtered, ``ne_i`` is labelled ``kind`` 0 and
        filtered. A ``LogisticRegression`` is fitted on ``sdiff`` and
        ``smean`` of the ``ne_i``, ``ne_3``, ``ne_5`` rows and additionally
        applied to the ``ne_i``, ``e3i``, ``e5i`` rows. The fit is saved via
        ``self.write_params`` and plotted via ``self.plot_sin_sout``.

        Args:
            sdiffth: threshold on ``|log2(zoom*sin+1) - log2(zoom*sout+1)|``
                used to filter the ``kind`` 1 tables; also saved with the fit.
            np: forwarded to ``self.calc_flux_mp``.
            alpha: forwarded to ``self.plot_sin_sout``.

        Returns:
            dict: ``locals()`` of this call, i.e. every local variable by
            name. Callers may rely on these names, so locals must not be
            renamed.
        """
        # get parameters
        dic = {}
        zoom = self.zoom
        # seta = ['ne_i','ne_5','ne_3','e5i','e3i','e5ia','e3ia']
        # setb = ['ne_5','ne_3','e5i','e3i','e5ia','e3ia']
        # setc = ['ne_i','ne_3','ne_5','e5ia','e3ia']
        seta = ['ne_i', 'ne_5', 'ne_3', 'e5i', 'e3i']
        setb = ['ne_5', 'ne_3', 'e5i', 'e3i']
        setc = ['ne_i', 'ne_3', 'ne_5']
        # load (or compute and cache) the flux table of every set
        for x in seta:
            fpath = self.dstpre + '.{0}.{1}.flux.txt.gz'.format(
                self.refcode, x)
            if os.path.exists(fpath):
                print('reading from cache {0}'.format(fpath))
                dic[x] = UT.read_pandas(fpath)
            else:
                df = getattr(self, x)
                print('calculating {0}...'.format(x))
                dic[x] = self.calc_flux_mp(df, np=np)
                UT.write_pandas(dic[x], fpath, 'h')
        dicb = {}
        FN0 = 0
        # kind 1 tables: keep rows whose in/out difference exceeds sdiffth
        for x in setb:
            f = dic[x]
            f['kind'] = 1  # note: sets the column on the table held in dic
            idx0 = N.abs(
                N.log2(zoom * f['sin'] + 1) -
                N.log2(zoom * f['sout'] + 1)) > sdiffth
            idx1 = (f['sdin'] != 0) | (f['sdout'] != 0
                                       )  # should have either in or out
            idx = idx0 & idx1
            FN0 += N.sum((~idx0) & idx1)  # pre filtered positive
            dicb[x] = f[idx]
        # kind 0 table
        f = dic['ne_i']
        f['kind'] = 0
        idx = (f['ecovmax'] > 1) & (
            (f['sdin'] != 0) & (f['sdout'] != 0))  # should have both in&out
        dicb['ne_i'] = f[idx]
        # D: rows used for fitting, D2: rows only used for prediction
        D = PD.concat([dicb[x] for x in setc], ignore_index=True)
        D2 = PD.concat([dicb['ne_i'], dicb['e3i'], dicb['e5i']],
                       ignore_index=True)
        # don't use e3i, e5i too many non-actives

        D['lsin'] = N.log2(zoom * D['sin'] + 1)
        D['lsout'] = N.log2(zoom * D['sout'] + 1)
        D['sdiff'] = N.abs(D['lsin'] - D['lsout'])
        D['smean'] = (D['lsin'] + D['lsout']) / 2.
        X = D[['sdiff', 'smean']].values
        Y = D['kind'].values
        lr = LogisticRegression()
        lr.fit(X, Y)
        # predictions on the training rows themselves
        Z = lr.predict(X)
        # same features for D2, predicted with the model fitted on D
        D2['lsin'] = N.log2(zoom * D2['sin'] + 1)
        D2['lsout'] = N.log2(zoom * D2['sout'] + 1)
        D2['sdiff'] = N.abs(D2['lsin'] - D2['lsout'])
        D2['smean'] = (D2['lsin'] + D2['lsout']) / 2.
        X2 = D2[['sdiff', 'smean']].values
        Z2 = lr.predict(X2)
        # save fit coefficients
        ppath = self.dstpre + '.{0}.e53params.json'.format(self.refcode)
        self.write_params(ppath,
                          lr,
                          Y,
                          Z, ['sdiff', 'smean'], {
                              'sdiffth': sdiffth,
                              'zoom': zoom
                          },
                          FN0=FN0)
        # save scatter plots
        spath = self.dstpre + '.{0}.e53params'.format(self.refcode)
        title = self.dstpre.split('/')[-1]
        self.plot_sin_sout(dic,
                           D,
                           Y,
                           Z,
                           D2,
                           Z2,
                           sdiffth,
                           spath + '.0.png',
                           title,
                           alpha=alpha)
        self.plot_sin_sout(dic,
                           D,
                           Y,
                           Z,
                           D2,
                           Z2,
                           sdiffth,
                           spath + '.pdf',
                           title,
                           ptyp='pdf',
                           alpha=alpha)
        self.plot_sin_sout(dic,
                           D,
                           Y,
                           Z,
                           D2,
                           Z2,
                           sdiffth,
                           spath + '.png',
                           title,
                           ptyp='png',
                           alpha=alpha)
        return locals()
Beispiel #25
0
    def calc_53gap_params(self, covfactor=0, np=10, emaxth=1, eth=1):
        """Fit, save and plot logistic-regression parameters for 5'/3' gaps.

        Per-exon parameter tables for ``self.ne_5`` and ``self.ne_3`` are
        obtained from ``self.calc_params_mp`` (or read back from a cached
        ``.gap5params.txt.gz`` / ``.gap3params.txt.gz`` file when it already
        exists), filtered by ``eth``/``emaxth``, and each is used to fit a
        two-feature logistic regression. The fitted models are written with
        ``self.write_params`` and plotted with ``self.plot_gap53_fit``.

        Args:
            covfactor: forwarded to ``calc_params_mp``; it is also part of the
                cache file names and is stored under the key ``'th'`` in the
                saved parameter dicts.
            np: forwarded to ``calc_params_mp`` as ``np`` (presumably the
                number of worker processes -- TODO confirm).
            emaxth: rows are kept only when ``emax > emaxth``.
            eth: rows are kept only when ``eOut > eth`` (5' table) or
                ``eIn > eth`` (3' table).

        Returns:
            dict: ``locals()`` of this method, i.e. every local name defined
            below (``d5``, ``d3``, ``fit5``, ``fit3``, ...). Callers may rely
            on these keys, so local names here are part of the interface.
        """
        zoom = self.zoom
        # Cache files for the raw per-exon tables; their names include both
        # refcode and covfactor.
        d5path = self.dstpre + '.{0}.{1}.gap5params.txt.gz'.format(
            self.refcode, covfactor)
        d3path = self.dstpre + '.{0}.{1}.gap3params.txt.gz'.format(
            self.refcode, covfactor)
        if os.path.exists(d5path):
            print('reading from cache {0}'.format(d5path))
            d5 = UT.read_pandas(d5path)
        else:
            # 5' side: computed from self.ne_5 with direction '<'
            d5 = self.calc_params_mp(self.ne_5,
                                     win=8192,
                                     np=np,
                                     gapmode='53',
                                     direction='<',
                                     covfactor=covfactor)
            UT.write_pandas(d5, d5path, 'h')
        if os.path.exists(d3path):
            print('reading from cache {0}'.format(d3path))
            d3 = UT.read_pandas(d3path)
        else:
            # 3' side: computed from self.ne_3 with direction '>'
            d3 = self.calc_params_mp(self.ne_3,
                                     win=8192,
                                     np=np,
                                     gapmode='53',
                                     direction='>',
                                     covfactor=covfactor)
            UT.write_pandas(d3, d3path, 'h')

        # Keep only rows above both thresholds before fitting.
        i5 = (d5['eOut'] > eth) & (d5['emax'] > emaxth)
        i3 = (d3['eIn'] > eth) & (d3['emax'] > emaxth)
        d50 = d5[i5]
        d30 = d3[i3]

        def _fitone(d0, x, y1, y2, rx='sin', lrx='lsin'):
            """Fit a logistic regression separating column y1 from column y2.

            Two labelled copies of ``d0`` are stacked: ``(x, y1)`` with
            ``kind=1`` and ``(x, y2)`` with ``kind=0``; in both, ``x`` is
            renamed to ``rx`` and the y column to ``'gap'``. The features are
            ``log2(zoom * rx + 1)`` (stored as ``lrx``) and
            ``log2(gap + 1)`` (stored as ``'lgap'``).

            Returns ``locals()`` so the caller can reach ``D``, ``X``, ``Y``,
            ``Z`` (predictions on the training data) and ``lr`` by name.
            """
            da = d0[[x, y1]].copy().rename(columns={y1: 'gap', x: rx})
            db = d0[[x, y2]].copy().rename(columns={y2: 'gap', x: rx})
            da['kind'] = 1
            db['kind'] = 0
            D = PD.concat([da, db], ignore_index=True)
            D[lrx] = N.log2(zoom * D[rx] + 1)
            D['lgap'] = N.log2(D['gap'] + 1)
            X = D[[lrx, 'lgap']].values
            Y = D['kind'].values
            lr = LogisticRegression()
            lr.fit(X, Y)
            Z = lr.predict(X)
            return locals()

        # 5': 'gap' (kind=1) vs 'gapIn' (kind=0); 3': 'gap' vs 'gapOut'.
        fit5 = _fitone(d50, 'eOut', 'gap', 'gapIn', 'ein', 'lein')
        fit3 = _fitone(d30, 'eIn', 'gap', 'gapOut', 'ein', 'lein')

        # max exon size
        m5 = N.max(self.ne_5['len'])
        m3 = N.max(self.ne_3['len'])

        # Must match the rx/lrx names passed to _fitone above: lrx is written
        # out as the name of the first feature.
        rx, lrx = 'ein', 'lein'
        # save coefs
        # NOTE: unlike the cached tables, these JSON names (and the plot names
        # below) do not include covfactor, so runs with a different covfactor
        # write to the same paths.
        p5path = self.dstpre + '.{0}.gap5params.json'.format(self.refcode)
        f = fit5
        self.write_params(p5path, f['lr'], f['Y'], f['Z'], [lrx, 'lgap'], {
            'th': covfactor,
            'zoom': zoom,
            'maxsize': int(m5)
        })
        p3path = self.dstpre + '.{0}.gap3params.json'.format(self.refcode)
        f = fit3
        self.write_params(p3path, f['lr'], f['Y'], f['Z'], [lrx, 'lgap'], {
            'th': covfactor,
            'zoom': zoom,
            'maxsize': int(m3)
        })

        # save scatter plots
        spath = self.dstpre + '.{0}.gap53params'.format(self.refcode)
        title = self.dstpre.split('/')[-1]
        self.plot_gap53_fit(fit5, fit3, spath + '.0.png', title, ptyp='both')
        self.plot_gap53_fit(fit5, fit3, spath + '.pdf', title, ptyp='pdf')
        self.plot_gap53_fit(fit5, fit3, spath + '.png', title, ptyp='png')

        # Exposes every local defined above to the caller.
        return locals()
Beispiel #26
0
    def extract_53_pair(self):
        """Find 3'-exon/5'-exon pairs between different genes; set ``self.e53``.

        Steps:

        1. Add helper columns to ``self.ex`` **in place**: ``_apos`` and
           ``_dpos`` (the second ``':'``-separated field of ``a_pos`` /
           ``d_pos`` as int) and ``spos`` (``_apos`` for ``cat == '3'``,
           ``_dpos`` for ``cat == '5'``).
        2. For every chromosome and strand, sort the ``'3'``/``'5'`` exons by
           ``spos`` and inspect neighbouring rows. A pair is recorded when the
           two exons belong to different genes (``_gidx``), do not overlap
           (``ed`` of the first < ``st`` of the second) and appear in the
           order 3' then 5' on ``'+'``, or 5' then 3' on any other strand.
           The recorded interval is ``(spos of first, spos of second)``.
        3. Intersect the pairs with internal exons (``cat == 'i'``) through
           ``BT.bedtoolintersect(..., wao=True)`` and keep rows with
           ``ovl == 0``.
        4. Keep pairs with ``20 < len < max(2 * max(self.ne_i['len']), 20000)``,
           write them to ``<refpre>.e53pair.bed.gz`` and store them, with the
           row index exposed as an ``_id`` column, in ``self.e53``.

        Temporary files ``<refpre>.53.exi.bed``, ``<refpre>.53.bed`` and
        ``<refpre>.53.exi.ovl.txt`` are written along the way.
        """
        # between genes
        ex = self.ex
        tmpprefix = self.refpre
        ex['_apos'] = ex['a_pos'].str.split(':').str[1].astype(int)
        ex['_dpos'] = ex['d_pos'].str.split(':').str[1].astype(int)
        ex.loc[ex['cat'] == '3', 'spos'] = ex['_apos']
        ex.loc[ex['cat'] == '5', 'spos'] = ex['_dpos']
        cols = ['chr', 'st', 'ed', 'name', 'strand', '_gidx1', '_gidx2']
        fields = ['spos', 'cat', '_gidx', '_id', 'st', 'ed']

        def _find(ecs, chrom, strand):
            """Return the qualifying neighbouring pairs for one chrom/strand."""
            e53 = ecs[ecs['cat'].isin(['3', '5'])].sort_values('spos')
            rows = e53[fields].values
            # Expected category order (in coordinate order) and name prefix.
            if strand == '+':
                first_cat, second_cat, sign = '3', '5', '+'
            else:
                first_cat, second_cat, sign = '5', '3', '-'
            pairs = []
            for r1, r2 in zip(rows[:-1], rows[1:]):
                spos1, cat1, gidx1, id1, _, ed1 = r1
                spos2, cat2, gidx2, id2, st2, _ = r2
                if gidx1 != gidx2:  # not same gene
                    # right categories in the right order, and non overlapping
                    if cat1 == first_cat and cat2 == second_cat and ed1 < st2:
                        name = '{0}g{1}e{2}|g{3}e{4}'.format(
                            sign, gidx1, id1, gidx2, id2)
                        pairs.append((chrom, spos1, spos2, name, strand,
                                      gidx1, gidx2))
            return PD.DataFrame(pairs, columns=cols)

        rslts = []
        for chrom in ex['chr'].unique():
            for strand in ['+', '-']:
                echrstrand = ex[(ex['chr'] == chrom)
                                & (ex['strand'] == strand)]
                rslts.append(_find(echrstrand, chrom, strand))
        df = PD.concat(rslts,
                       ignore_index=True).sort_values(['chr', 'st', 'ed'])
        # intersect with internal exons
        a = tmpprefix + '.53.exi.bed'  # ncol 3
        b = tmpprefix + '.53.bed'  # ncol 7 (see cols)
        c = tmpprefix + '.53.exi.ovl.txt'
        exi = ex[ex['cat'] == 'i'].sort_values(['chr', 'st', 'ed'])
        UT.write_pandas(exi[['chr', 'st', 'ed']], a, '')
        UT.write_pandas(df, b, '')
        c = BT.bedtoolintersect(b, a, c, wao=True)
        cols1 = cols + ['b_chr', 'b_st', 'b_ed', 'ovl']
        cdf = UT.read_pandas(c, names=cols1)
        # Single .loc selection plus an explicit copy: columns are added to
        # this frame below, which must not happen on a chained-indexing slice.
        sdf = cdf.loc[cdf['ovl'] == 0, cols].copy()
        sdf['locus'] = UT.calc_locus(sdf)
        sdf['len'] = sdf['ed'] - sdf['st']
        maxexonsize = self.ne_i['len'].max()
        sdf = sdf[(sdf['len'] > 20)
                  & (sdf['len'] < max(2 * maxexonsize, 20000))]
        UT.write_pandas(sdf, tmpprefix + '.e53pair.bed.gz')
        # The index is named only after writing; reset_index() then turns it
        # into the '_id' column of self.e53.
        sdf.index.name = '_id'

        self.e53 = sdf.reset_index()
Beispiel #27
0
    def filter(self, **kw):
        """Split genes into a kept and a removed set and write both out.

        A gene (row of ``self.ugb``) is removed when ALL of these hold:

        * ``rep%``  >= ``th_bp_ovl``   (base pair repeat overlap)
        * ``rviz%`` >  ``th_ex_ovl``   (exon level repeat overlap, strict)
        * ``#junc`` is not null
        * ``#uexons`` < ``th_uexon``

        Thresholds are read from ``self.params`` after it has been updated in
        place with ``kw``. (Earlier notes on this method quote defaults of
        50 / 50 / 4 -- TODO confirm against the parameter defaults.)

        Results are stored on the instance (``ugb2``, ``ex2``, ``sj2``,
        ``uex2``, ``gbed2`` for kept genes; the ``*3`` counterparts for removed
        genes) and written under the ``output`` category of ``self.fnobj``:
        ex / sj / ci / unionex / genes.stats tables and a genes BED12 for both
        sets (removed ones prefixed with ``removed.``), plus ex and sj BED
        files for the kept set only.
        """
        stats = self.ugb
        params = self.params
        names = self.fnobj
        params.update(kw)
        sortkeys = ['chr', 'st', 'ed']

        def _txt(tag):
            return names.txtname(tag, category='output')

        def _bed(tag):
            return names.bedname(tag, category='output')

        high_bp = stats['rep%'] >= params['th_bp_ovl']
        high_ex = stats['rviz%'] > params['th_ex_ovl']
        has_junc = stats['#junc'].notnull()
        few_uexons = stats['#uexons'] < params['th_uexon']
        keep = ~((high_bp & high_ex) & (has_junc & few_uexons))
        self.ugb2 = kept_stats = stats[keep]  # genes that pass
        self.ugb3 = dropped_stats = stats[~keep]  # genes filtered out

        kept_ids = kept_stats.index.values
        exons = self.ex
        juncs = self.sj
        unionex = self.uex
        # membership of each row in the kept gene set, reused for both halves
        ex_in = exons['_gidx'].isin(kept_ids)
        sj_in = juncs['_gidx'].isin(kept_ids)
        uex_in = unionex['_gidx'].isin(kept_ids)

        # ---- kept genes ----
        self.ex2 = ex2 = exons[ex_in].sort_values(sortkeys)
        self.sj2 = sj2 = juncs[sj_in].sort_values(sortkeys)
        self.uex2 = uex2 = unionex[uex_in].sort_values(sortkeys)
        if params['datacode']:
            gcovfld = 'gcov_' + params['datacode']
        else:
            gcovfld = 'gcov'
        self.gbed2 = gbed2 = GGB.unionex2bed12(uex2,
                                               name=params['gname'],
                                               sc2=gcovfld,
                                               sc1='tlen')
        gbed2['sc2'] = gbed2['sc2'].astype(int)
        UT.write_pandas(ex2, _txt('ex'), 'h')
        UT.write_pandas(sj2, _txt('sj'), 'h')
        UT.chopintervals(ex2, _txt('ci'))

        GGB.write_bed(ex2, _bed('ex'))
        GGB.write_bed(sj2, _bed('sj'))

        UT.write_pandas(uex2, _txt('unionex'), 'h')
        UT.write_pandas(kept_stats, _txt('genes.stats'), 'h')
        UT.write_pandas(gbed2, _bed('genes'), '')  # BED12

        # ---- removed genes ----
        self.ex3 = ex3 = exons[~ex_in].sort_values(sortkeys)
        self.sj3 = sj3 = juncs[~sj_in].sort_values(sortkeys)
        self.uex3 = uex3 = unionex[~uex_in].sort_values(sortkeys)
        self.gbed3 = gbed3 = GGB.unionex2bed12(uex3,
                                               name=params['gname'],
                                               sc2=gcovfld,
                                               sc1='tlen')
        gbed3['sc2'] = gbed3['sc2'].astype(int)
        UT.write_pandas(ex3, _txt('removed.ex'), 'h')
        UT.write_pandas(sj3, _txt('removed.sj'), 'h')
        UT.chopintervals(ex3, _txt('removed.ci'))
        UT.write_pandas(uex3, _txt('removed.unionex'), 'h')
        UT.write_pandas(dropped_stats, _txt('removed.genes.stats'), 'h')
        UT.write_pandas(gbed3, _bed('removed.genes'), '')  # BED12
Beispiel #28
0
    def find_match(self):
        """Match exons and junctions of ``self.en1`` against ``self.en2``.

        The exon tables of both models are written to temporary files and
        intersected with ``BT.bedtoolintersect(..., wao=True)``; the overlap
        table is kept in ``self.ov``. From it the following subsets are
        derived (all require matching chromosome):

        * ``ei``: ``cat == 'i'`` with same strand, ``st``, ``ed`` and category
        * ``e5``: ``cat == '5'`` with matching donor side -- ``ed`` on '+',
          ``st`` on '-' -- and same category
        * ``e3``: ``cat == '3'`` with matching acceptor side -- ``st`` on '+',
          ``ed`` on '-' -- and same category
        * ``es``: ``cat == 's'`` with same category
        * ``e5b`` / ``e3b`` / ``esb``: as above but without the same-category
          requirement

        For junctions, a hit-count column (name from ``self.colname2('jhit',
        en2.code)``) is added **in place** to the frame returned by
        ``en1.model('sj')``; it holds the ``en2`` count for the same ``locus``
        (0 when absent). ``self.sj`` keeps the rows with a positive count.

        All subsets are also collected in the dict ``self.e``.
        """
        en1 = self.en1
        en2 = self.en2
        # write internal,3,5,se exons separately for finding match
        a = en1.fname2(
            'emtmp.ex.bed.gz', en2.code
        )  # need to be unique to avoid parallel conflict (en1 ref shared)
        b = en2.fname('emtmp.ex.bed.gz')
        c = en1.fname2('emtmp.ex.ovl.txt.gz', en2.code)
        self.e1 = e1 = en1.model('ex')
        self.e2 = e2 = en2.model('ex')
        ecovname = self.colname('ecov')
        cols = [
            'chr', 'st', 'ed', 'cat', '_id', ecovname, '_gidx', 'len', 'strand'
        ]
        a = UT.write_pandas(e1[cols], a, '')
        b = UT.write_pandas(e2[cols], b, '')
        c = BT.bedtoolintersect(a, b, c, wao=True)
        ocols = cols + ['b_' + x for x in cols] + ['ovl']
        self.ov = ov = UT.read_pandas(c, names=ocols)  # overlaps of exons

        idxchr = ov['chr'] == ov['b_chr']  # str vs. str
        idxstrand = ov['strand'] == ov['b_strand']  # str vs. str
        idxp = (ov['strand'] == '+') & idxstrand
        idxn = (ov['strand'] == '-') & idxstrand
        idxst = ov['st'] == ov['b_st']  # b_st column mixed? type?
        idxed = ov['ed'] == ov['b_ed']  # b_ed column mixed? type?
        idxcat = ov['cat'] == ov['b_cat']
        idxcov = ov[ecovname] > 0  # exons with reads (only reported below)
        LOG.debug(
            '=' * 10 +
            'calculating match between {0} and {1}'.format(en1.code, en2.code))
        LOG.debug('len(ov):{0}'.format(len(ov)))
        # Explicit (label, mask) pairs instead of looking names up in locals().
        named_masks = (
            ('idxchr', idxchr),
            ('idxstrand', idxstrand),
            ('idxp', idxp),
            ('idxn', idxn),
            ('idxst', idxst),
            ('idxed', idxed),
            ('idxcat', idxcat),
            ('idxcov', idxcov),
        )
        for k, v in named_masks:
            LOG.debug('#{0}:{1}'.format(k, N.sum(v)))

        # donor side: (+,ed)|(-,st); acceptor side: (-,ed)|(+,st)
        donor_match = (idxp & idxed) | (idxn & idxst)
        acceptor_match = (idxn & idxed) | (idxp & idxst)
        is_i = ov['cat'] == 'i'
        is_5 = ov['cat'] == '5'
        is_3 = ov['cat'] == '3'
        is_s = ov['cat'] == 's'

        # internal exon cat='i' and chr,st,ed,strand match
        self.ei = ei = ov[idxchr & idxstrand & idxst & idxed & idxcat &
                          is_i].copy()
        # 5' cat='5' and chr,donor match
        self.e5 = e5 = ov[idxchr & donor_match & idxcat & is_5].copy()
        # 3' cat='3' and chr,acceptor match
        self.e3 = e3 = ov[idxchr & acceptor_match & idxcat & is_3].copy()
        # se cat='s' and chr,
        self.es = es = ov[idxchr & is_s & idxcat].copy()

        # allow overlap to other categories
        self.e5b = e5b = ov[idxchr & donor_match & is_5].copy()
        self.e3b = e3b = ov[idxchr & acceptor_match & is_3].copy()
        self.esb = esb = ov[idxchr & is_s].copy()

        # splice junction
        self.s1 = s1 = en1.model('sj')
        self.s2 = s2 = en2.model('sj')
        jcntname = self.colname('jcnt')
        l2c = UT.df2dict(s2, 'locus', jcntname)
        jhitname = self.colname2('jhit', en2.code)
        s1[jhitname] = [l2c.get(x, 0)
                        for x in s1['locus']]  # corresponding s2 count
        self.sj = sj = s1[
            s1[jhitname] > 0].copy()  # only consider s2 count > 0

        # for batch processing
        self.e = {
            'i': ei,
            '5': e5,
            '3': e3,
            's': es,
            'j': sj,
            '5b': e5b,
            '3b': e3b,
            'sb': esb
        }
Beispiel #29
0
def save_bed_covstats(bedpath, dstpath, bed12=False, checkuniq=False):
    """Tabulate the per-key base-pair totals of a BED file and save them.

    The two dicts returned by ``get_total_bp_bedfile(..., returndics=True)``
    become the columns ``totbp`` and ``covbp`` (one row per key of the second
    dict; keys are presumably chromosome names -- TODO confirm). A third
    column ``acov`` holds ``totbp / covbp``. Rows are sorted by ``covbp`` in
    descending order.

    Args:
        bedpath: BED file to summarise.
        dstpath: where the table is written.
        bed12: forwarded positionally to ``get_total_bp_bedfile``.
        checkuniq: forwarded to ``get_total_bp_bedfile``.

    Returns:
        Whatever ``UT.write_pandas`` returns for the written table.
    """
    totals, covered = get_total_bp_bedfile(bedpath,
                                           bed12,
                                           returndics=True,
                                           checkuniq=checkuniq)
    per_key = {}
    for key in covered:
        per_key[key] = {'totbp': totals[key], 'covbp': covered[key]}
    # dict-of-dicts gives one column per key; transpose to one row per key
    stats = PD.DataFrame(per_key).T
    stats['acov'] = stats['totbp'] / stats['covbp']
    stats = stats.sort_values('covbp', ascending=False)
    return UT.write_pandas(stats, dstpath, 'ih')