Exemple #1
0
    def extract_nonovl_exons(self):
        """Select exons that overlap no other exon and are not covered by a junction.

        Exons and junctions with ``st < ed`` are written as BED files under
        ``self.refpre`` and intersected with ``BT.bedtoolintersect``.  The
        results are stored on the instance:

        * ``self.ne_i0``: exons whose only exon overlap is with themselves
        * ``self.nov_ex``: those exons that are additionally not completely
          covered by any junction
        * ``self.ne_i``, ``self.ne_5``, ``self.ne_3``, ``self.ne_s``: rows of
          ``nov_ex`` with ``cat`` equal to 'i', '5', '3' and 's'

        All stored frames are sorted by (chr, st, ed) and carry a ``len``
        column (``ed - st``).
        """
        ex = self.ex
        sj = self.sj
        # keep only intervals with positive length
        ex = ex[ex['st'] < ex['ed']]
        sj = sj[sj['st'] < sj['ed']]

        cols0 = ['chr', 'st', 'ed', '_id']
        a = UT.write_pandas(ex[cols0], self.refpre + '.ex.bed.gz', '')
        b = UT.write_pandas(sj[cols0], self.refpre + '.sj.bed.gz', '')
        c1 = BT.bedtoolintersect(
            a, b, self.refpre + '.ex-ovl-sj.txt.gz', wao=True)
        c2 = BT.bedtoolintersect(
            a, a, self.refpre + '.ex-ovl-ex.txt.gz', wo=True)

        cols = cols0 + ['b_' + x for x in cols0] + ['ovl']

        # exon vs junction: largest covered fraction per exon
        sov = UT.read_pandas(c1, names=cols)
        sov['len'] = sov['ed'] - sov['st']
        sov['ovlratio'] = sov['ovl'] / sov['len']
        sovg = sov.groupby('_id')['ovlratio'].max()
        snonov = sovg[sovg < 1.]  # not completely covered by junction

        # exon vs exon: a single hit means the exon only overlaps itself
        eov = UT.read_pandas(c2, names=cols)
        eovsize = eov.groupby('_id').size()
        enonov = eovsize[eovsize == 1]  # only overlaps with self

        # .loc replaces the removed DataFrame.ix; the labels come from the
        # exon table itself so a pure label lookup is what is wanted here.
        exi = ex.set_index('_id')
        self.ne_i0 = ne_i0 = exi.loc[enonov.index].sort_values(
            ['chr', 'st', 'ed']).reset_index()
        self.ne_i0['len'] = ne_i0['ed'] - ne_i0['st']
        LOG.info('#non-ovl-ex0={0}'.format(len(enonov)))

        LOG.info('#non-ex-ovl-ex={0}, #non-sj-ovl-ex={1}'.format(
            len(enonov), len(snonov)))
        ids = set(enonov.index).intersection(snonov.index)
        LOG.info('#non-ovl-ex={0}'.format(len(ids)))
        # a set is not a valid indexer; use a (deterministically ordered) list
        self.nov_ex = novex = exi.loc[sorted(ids)].sort_values(
            ['chr', 'st', 'ed']).reset_index()
        novex['len'] = novex['ed'] - novex['st']
        self.ne_i = novex[novex['cat'] == 'i']
        self.ne_5 = novex[novex['cat'] == '5']
        self.ne_3 = novex[novex['cat'] == '3']
        self.ne_s = novex[novex['cat'] == 's']
Exemple #2
0
def count_repeats_viz_chr(bedpath, rmskpath, outpath):
    """List, for every interval name in bedpath, the repeats it intersects.

    The intervals are intersected with rmskpath (``wao=True``), each hit is
    rendered as ``b_name(strand)`` and the hits of one ``name`` are joined
    with commas.  The resulting two-column table (``name``, ``rn``) is
    written to outpath with a header.
    """
    colnames = [
        'chr', 'st', 'ed', 'name',
        'b_chr', 'b_st', 'b_ed', 'b_name', 'strand',
        'ovl',
    ]

    def _comma_join(values):
        return ','.join(list(values))

    ovlpath = BT.bedtoolintersect(bedpath, rmskpath, outpath, wao=True)
    hits = UT.read_pandas(ovlpath, names=colnames)
    hits['rn'] = hits['b_name'] + '(' + hits['strand'] + ')'
    # collapse all repeat labels of one interval into a single string
    per_name = hits.groupby('name')['rn'].apply(_comma_join)
    UT.write_pandas(per_name.reset_index(), outpath, 'h')
Exemple #3
0
def as3exsj(dstpre, minelen=150, np=7):
    """Build exon/junction tables from assembler output and drop short genes.

    Reads ``dstpre + '.exdf.txt.gz'``, ``'.sjdf.txt.gz'`` and
    ``'.sedf.txt.gz'``, merges the two exon tables, annotates them through
    ``UT.set_info`` / ``UT.set_exon_category`` and groups them into genes with
    ``GP.find_genes3``.  Genes whose summed exon length is below ``minelen``
    are removed.

    Args:
        dstpre (str): path prefix of the input and output files
        minelen (int): minimum total exon length of a gene to be kept
        np (int): number of processes handed to ``GP.find_genes3``

    Generates:
        ``dstpre + '.ex.txt.gz'``, ``'.sj.txt.gz'`` and ``'.ci.txt.gz'``

    Returns:
        tuple ``(sj2, ex2)`` of the filtered junction and exon dataframes
    """
    ex = UT.read_pandas(dstpre+'.exdf.txt.gz', names=A3.EXDFCOLS)
    sj = UT.read_pandas(dstpre+'.sjdf.txt.gz', names=A3.SJDFCOLS)
    se = UT.read_pandas(dstpre+'.sedf.txt.gz', names=A3.EXDFCOLS)
    # NOTE(review): start is shifted by one, presumably to match the
    # coordinate convention expected downstream - confirm.
    sj['st'] = sj['st']+1
    cols = A3.EXDFCOLS
    ex = PD.concat([ex[cols], se[cols]], ignore_index=True)
    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components); the '_gidx' column used below is
    # expected to be set on sj and ex by this call
    GP.find_genes3(sj, ex,  # don't use exon overlap as connection
                   np=np,
                   override=False)
    ex.loc[ex['kind'] == '3', 'cat'] = '3'
    ex.loc[ex['kind'] == '5', 'cat'] = '5'

    # remove genes whose total exon length is smaller than minelen
    ex['len'] = ex['ed']-ex['st']
    exsiz = ex.groupby('_gidx')['len'].sum()
    rgidx = exsiz[exsiz < minelen].index.values
    ex2 = ex[~ex['_gidx'].isin(rgidx)]
    sj2 = sj[~sj['_gidx'].isin(rgidx)]
    # report the real number of exons left (rgidx counts genes, not exons)
    LOG.info('minelen filter #ex {0}=>{1}'.format(len(ex), len(ex2)))
    # write
    UT.write_pandas(ex2, dstpre+'.ex.txt.gz', 'h')
    UT.write_pandas(sj2, dstpre+'.sj.txt.gz', 'h')
    UT.chopintervals(ex2, dstpre+'.ci.txt.gz')
    return sj2, ex2
Exemple #4
0
def filter_sj(bwsjpre, statspath, chrom, csize, params):
    """Filter the junction paths of one chromosome and write them as BED12.

    Paths are read from ``bwsjpre + '.sjpath.<chrom>.bed.gz'`` when that file
    exists, otherwise from ``bwsjpre + '.sjpath.bed.gz'`` restricted to
    ``chrom``.  Rows are dropped successively when they

    * are not on strand '+' or '-',
    * have a minimum per-junction ``detected`` / ``maxcnt`` / ``maxoverhang``
      (looked up in the stats table by ``pc``, missing entries count as 0)
      not above ``params['th_<field>']``,
    * have a first or second-to-last ``esizes`` entry not above
      ``params['th_minedgeexon']``,
    * have ``sjratio2`` (``sc1`` divided by the mean of junction+exon bigwig
      signal over ``[tst, ted)``) not above ``params['th_sjratio2']``.

    The survivors are written to
    ``bwsjpre + '.sjpath.<chrom>.filtered.bed.gz'``.
    """
    stat_fields = ['detected', 'maxcnt', 'maxoverhang']

    # junction statistics, restricted to this chromosome
    stats = UT.read_pandas(statspath)
    if 'chr' not in stats:
        stats['chr'] = [locus.split(':')[0] for locus in stats['locus']]
    if '#detected' in stats:
        stats.rename(columns={'#detected': 'detected'}, inplace=True)
    stats = stats[stats['chr'] == chrom].copy()
    if 'pc' not in stats:
        stats['pc'] = [locus2pc(locus) for locus in stats['locus']]
    lookups = {fld: UT.df2dict(stats, 'pc', fld) for fld in stat_fields}

    # junction paths: per-chromosome file if present, else subset the full one
    chrpath = bwsjpre + '.sjpath.{0}.bed.gz'.format(chrom)
    if os.path.exists(chrpath):
        sj = GGB.read_bed(chrpath)
    else:
        sj = GGB.read_bed(bwsjpre + '.sjpath.bed.gz')
        sj = sj[sj['chr'] == chrom].copy()
    first_name = sj.iloc[0]['name']
    if len(first_name.split('|')) < len(first_name.split(',')):
        # exons attached: strip the first and last comma-separated element
        sj['name'] = [','.join(nm.split(',')[1:-1]) for nm in sj['name']]

    # filter unstranded
    stranded = sj['strand'].isin(['+', '-'])
    sj = sj[stranded].copy()

    # filter with stats: a path is only as good as its weakest junction
    for fld in stat_fields:
        table = lookups[fld]
        weakest = []
        for nm in sj['name']:
            weakest.append(N.min([table.get(pc, 0) for pc in nm.split(',')]))
        sj[fld] = weakest
        sj = sj[sj[fld] > params['th_' + fld]].copy()

    # edge exon size
    sj['eflen'] = [int(es.split(',')[0]) for es in sj['esizes']]
    sj['ellen'] = [int(es.split(',')[-2]) for es in sj['esizes']]
    min_edge = params['th_minedgeexon']
    long_first = sj['eflen'] > min_edge
    long_last = sj['ellen'] > min_edge
    sj = sj[long_first & long_last].copy()

    # sjratio2, computed strand by strand
    bigwigs = A2.SjExBigWigs(bwsjpre, mixunstranded=False)
    for strand in ['+', '-']:
        on_strand = sj['strand'] == strand
        with bigwigs:
            sjcov = bigwigs.bws['sj'][strand].get(chrom, 0, csize)
            excov = bigwigs.bws['ex'][strand].get(chrom, 0, csize)
        total = sjcov + excov
        ratios = []
        for cnt, lo, hi in sj[on_strand][['sc1', 'tst', 'ted']].values:
            ratios.append(cnt / N.mean(total[int(lo):int(hi)]))
        sj.loc[on_strand, 'sjratio2'] = ratios
    sj = sj[sj['sjratio2'] > params['th_sjratio2']]

    dstpath = bwsjpre + '.sjpath.{0}.filtered.bed.gz'.format(chrom)
    GGB.write_bed(sj, dstpath, ncols=12)
Exemple #5
0
    def __init__(self, j2pre, code, chromdir, rmskviz, dstpre, **kw):
        """Set up inputs, file naming and parameters.

        Args:
            j2pre: prefix of the input files; ``j2pre + '.paths.txt.gz'`` is
                read here
            code: not used in this method
            chromdir: passed to ``FA.GenomeFASTAChroms``
            rmskviz: stored as ``self.rmskviz``
            dstpre: prefix handed to ``FN.FileNamesBase`` for output naming
            **kw: overrides applied on top of a copy of ``RMSKPARAMS``
        """
        self.j2pre = j2pre
        # NOTE(review): the file-name helper is built from dstpre, the only
        # prefix argument available in this scope - confirm that this is the
        # intended destination prefix.
        self.fnobj = FN.FileNamesBase(dstpre)
        self.chromdir = chromdir
        self.rmskviz = rmskviz
        self.gfc = FA.GenomeFASTAChroms(chromdir)

        # copy the defaults so per-instance overrides do not leak into them
        self.params = RMSKPARAMS.copy()
        self.params.update(kw)

        # get exons from paths
        self.paths = UT.read_pandas(j2pre + '.paths.txt.gz',
                                    names=A2.PATHCOLS)
Exemple #6
0
 def ci(self):
     """Return the chopped-interval table, creating it when it is not cached.

     If the file at ``self.cipath()`` exists it is read back; otherwise the
     exons from ``self.sjex()`` are passed to ``UT.chopintervals``, which is
     given that path as destination.

     Raises:
         RuntimeError: no cached file exists and ``self.gtfpath`` is missing.
     """
     path = self.cipath()
     if not os.path.exists(path):
         if not os.path.exists(self.gtfpath):
             raise RuntimeError('file {0} does not exist'.format(self.gtfpath))
         LOG.info('making ci..')
         _, exons = self.sjex()
         return UT.chopintervals(exons, path)
     LOG.info('reading ci({0}) from cache...'.format(path))
     return UT.read_pandas(path, names=['chr','st','ed','name','id'])
Exemple #7
0
def filter_sjexdf(mdstpre, rdstpre):
    """Restrict the exdf/sjdf tables of mdstpre to elements kept in rdstpre.

    The exon table (``.exdf`` plus ``.sedf``) and the junction table
    (``.sjdf``) under ``mdstpre`` are filtered, chromosome by chromosome, to
    rows whose ``name`` occurs in ``rdstpre + '.ex.txt.gz'`` /
    ``'.sj.txt.gz'`` respectively, and written headerless to
    ``rdstpre + '.exdf.txt.gz'`` / ``'.sjdf.txt.gz'``.
    """

    def _keep_named(src, ref):
        # names are only compared within the same chromosome
        kept = []
        for chrom in src['chr'].unique():
            src_chr = src[src['chr'] == chrom]
            allowed = set(ref[ref['chr'] == chrom]['name'].values)
            mask = [nm in allowed for nm in src_chr['name']]
            kept.append(src_chr[mask])
        return PD.concat(kept, ignore_index=True)

    multi_ex = UT.read_pandas(mdstpre + '.exdf.txt.gz', names=A3.EXDFCOLS)
    single_ex = UT.read_pandas(mdstpre + '.sedf.txt.gz', names=A3.EXDFCOLS)
    all_ex = PD.concat([multi_ex, single_ex], ignore_index=True)
    all_sj = UT.read_pandas(mdstpre + '.sjdf.txt.gz', names=A3.SJDFCOLS)

    ref_ex = UT.read_pandas(rdstpre + '.ex.txt.gz')
    ref_sj = UT.read_pandas(rdstpre + '.sj.txt.gz')

    kept_ex = _keep_named(all_ex, ref_ex)
    kept_sj = _keep_named(all_sj, ref_sj)
    UT.write_pandas(kept_ex, rdstpre + '.exdf.txt.gz', '')
    UT.write_pandas(kept_sj, rdstpre + '.sjdf.txt.gz', '')
Exemple #8
0
def read_sj(path, parsename=False):
    """Read junctions from a BED file or a headered text file.

    Paths ending in ``.bed`` or ``.bed.gz`` are read with ``read_bed`` and the
    columns ``sc1`` / ``tst`` are renamed to ``ucnt`` / ``mcnt``; any other
    path is read with ``UT.read_pandas`` and must carry its own header.

    With ``parsename=True`` (BED input only) the ``name`` column, encoded as
    'motif-k0[k1]-u(reads)-m(reads)-o(maxoverhang)', is split on '-' to add
    the columns ``motif``, ``annotated`` and ``maxoverhang``.
    """
    if not path.endswith(('.bed.gz', '.bed')):
        return UT.read_pandas(path)  # header should be there

    df = read_bed(path).rename(columns={'sc1':'ucnt','tst':'mcnt'})
    if not parsename:
        return df
    # fields: motif(0), known(1), u(2), m(3), o(4)
    fields = df['name'].str.split('-')
    df['motif'] = fields.str[0]
    df['annotated'] = fields.str[1].str[1]
    df['maxoverhang'] = fields.str[4].str[1:].astype(int)
    return df
Exemple #9
0
def trim_ex(expath, dstpath, dstcipath, length=1000, gidfld='_gidx', np=7):
    """Generate trimmed version of genes for calculating coverage to avoid length bias.

    Args:
        expath (str): path exon tsv
        dstpath (str): path to trimmed exon
        dstcipath (str): path to ci (chopped interval)
        length (pos int): length to trim from 3' end in base pair (default 1000 bp)
        gidfld (str): column name for gene id (default _gidx)
        np (pos int): number of CPU to use; 1 runs in-process, any other
            value fans the work out per chromosome to a process pool

    Generates:
        Two files (dstpath, dstcipath).

    Returns:
        a dataframe containing trimmed exons
    """
    ex = UT.read_pandas(expath)
    if 'len' not in ex.columns:
        ex['len'] = ex['ed'] - ex['st']
    if np == 1:
        recs = trim_ex_worker((ex, length, gidfld))
    else:
        chroms = sorted(ex['chr'].unique())
        data = [(ex[ex['chr'] == c], length, gidfld) for c in chroms]
        recs = []
        # Create the pool before entering the try block: if construction
        # fails there is nothing to close, and the real error must not be
        # masked by a NameError raised from the finally clause.
        p = multiprocessing.Pool(np)
        try:
            for v in p.map(trim_ex_worker, data):
                recs.extend(v)
        finally:
            p.close()
    cols = list(ex.columns.values)
    nex = PD.DataFrame(recs, columns=cols)
    nex['len'] = nex['ed'] - nex['st']
    # edge case: give zero-length records a 1 bp extent
    # (their 'len', computed above, stays 0)
    nex.loc[nex['st'] == nex['ed'], 'ed'] = nex['st'] + 1
    UT.save_tsv_nidx_whead(nex, dstpath)
    UT.chopintervals(nex, dstcipath)

    return nex
Exemple #10
0
def filter_paths(mdstpre, rdstpre):
    """Copy the path files of mdstpre to rdstpre, keeping supported paths only.

    A path is kept when every '|'-separated element of its ``name`` is the
    ``name`` of an exon on the same chromosome in
    ``rdstpre + '.ex.txt.gz'``.  Both ``.paths.withse.bed.gz`` and
    ``.paths.txt.gz`` are read with ``GGB.read_bed`` and written with
    ``GGB.write_bed(..., ncols=12)``.
    """
    ex = UT.read_pandas(rdstpre + '.ex.txt.gz')

    def _supported(paths):
        kept = []
        for chrom in paths['chr'].unique():
            on_chr = paths[paths['chr'] == chrom]
            known = set(ex[ex['chr'] == chrom]['name'].values)
            mask = []
            for pname in on_chr['name']:
                mask.append(all(part in known for part in pname.split('|')))
            kept.append(on_chr[mask])
        return PD.concat(kept, ignore_index=True)

    # same treatment for both path files, in this order
    for suffix in ('.paths.withse.bed.gz', '.paths.txt.gz'):
        paths = GGB.read_bed(mdstpre + suffix)
        GGB.write_bed(_supported(paths), rdstpre + suffix, ncols=12)
Exemple #11
0
def gtf_from_bed12(modelpre, dstpath=None, source='.'):
    """Convert the BED12 paths of a model into a GTF of exon records.

    Reads ``modelpre + '.paths.withse.bed.gz'`` and
    ``modelpre + '.ex.txt.gz'``.  Each path gets the gene name of its first
    exon (looked up by ``chr:name``) and a transcript name
    ``<gname>.<running number>``.  One GTF 'exon' row is emitted per BED
    block.

    Args:
        modelpre (str): prefix of the model files
        dstpath (str or None): GTF destination; defaults to
            ``modelpre + '.paths.withse.gtf.gz'``
        source (str): value of the GTF source column

    Generates:
        the GTF file and ``modelpre + '.idmap.txt.gz'`` (columns id, chr,
        name, tname, gname)

    Returns:
        the GTF dataframe
    """
    bedpath = modelpre+'.paths.withse.bed.gz'
    paths = GGB.read_bed(bedpath)
    ex = UT.read_pandas(modelpre+'.ex.txt.gz')
    # key on chr:name, there may be same st,ed in different chromosome
    ex['id'] = ex['chr']+':'+ex['name']
    n2gn = UT.df2dict(ex, 'id', 'gname')
    paths['id'] = paths['chr']+':'+paths['name']
    paths['id0'] = paths['chr']+':'+paths['name'].str.split('|').str[0]
    paths['gname'] = [n2gn[x] for x in paths['id0']]

    # number the transcripts of each gene in order of appearance, from 1
    g2cnt = {}
    tnames = []
    for gname in paths['gname']:
        i = g2cnt.get(gname, 1)
        tnames.append('{0}.{1}'.format(gname, i))
        g2cnt[gname] = i+1
    paths['tname'] = tnames

    txt = 'gene_id "{0}"; transcript_id "{1}"; exon_number "{2}";'

    def _gen():
        cols = ['chr','st','ed','gname','tname','esizes','estarts','strand']
        for c, s, e, gn, tn, esi, est, strand in paths[cols].values:
            # the size/start lists end with a trailing comma
            esizes = [int(x) for x in esi.split(',')[:-1]]
            estarts = [int(x) for x in est.split(',')[:-1]]
            for i, (size, offset) in enumerate(zip(esizes, estarts)):
                exst = s+offset
                exed = exst+size
                extra = txt.format(gn, tn, i+1)
                # BED start is 0-based, GTF start is 1-based
                yield (c, source, 'exon', exst+1, exed, '.', strand, '.', extra)

    df = PD.DataFrame(list(_gen()), columns=GGB.GTFCOLS)
    if dstpath is None:
        # derive the default from the BED path actually read
        dstpath = modelpre+'.paths.withse.gtf.gz'
    GGB.write_gtf(df, dstpath)

    idf = paths[['id','chr','name','tname','gname']]
    UT.write_pandas(idf, modelpre+'.idmap.txt.gz','h')
    return df
Exemple #12
0
    def calc_53gap_params(self, covfactor=0, np=10, emaxth=1, eth=1):
        """Fit and save logistic-regression gap parameters for 5' and 3' exons.

        Per-exon parameter tables for ``self.ne_5`` and ``self.ne_3`` are
        computed with ``self.calc_params_mp`` (or read back from their cache
        files), filtered by ``eth`` / ``emaxth``, and one logistic regression
        is fitted per side.  Coefficients are written through
        ``self.write_params`` and scatter plots through
        ``self.plot_gap53_fit``.

        Args:
            covfactor: forwarded to ``calc_params_mp``; also part of the
                cache file names and stored as 'th' in the saved parameters
            np: forwarded to ``calc_params_mp``
            emaxth: rows need ``emax`` above this value to be used
            eth: rows need ``eOut`` (5') / ``eIn`` (3') above this value

        Returns:
            dict: ``locals()`` of this method, i.e. every local name
            (including ``self``) mapped to its final value.
        """
        zoom = self.zoom
        # cache files are keyed by reference code and covfactor
        d5path = self.dstpre + '.{0}.{1}.gap5params.txt.gz'.format(
            self.refcode, covfactor)
        d3path = self.dstpre + '.{0}.{1}.gap3params.txt.gz'.format(
            self.refcode, covfactor)
        if os.path.exists(d5path):
            print('reading from cache {0}'.format(d5path))
            d5 = UT.read_pandas(d5path)
        else:
            d5 = self.calc_params_mp(self.ne_5,
                                     win=8192,
                                     np=np,
                                     gapmode='53',
                                     direction='<',
                                     covfactor=covfactor)
            UT.write_pandas(d5, d5path, 'h')
        if os.path.exists(d3path):
            print('reading from cache {0}'.format(d3path))
            d3 = UT.read_pandas(d3path)
        else:
            d3 = self.calc_params_mp(self.ne_3,
                                     win=8192,
                                     np=np,
                                     gapmode='53',
                                     direction='>',
                                     covfactor=covfactor)
            UT.write_pandas(d3, d3path, 'h')

        # keep only rows above both thresholds
        i5 = (d5['eOut'] > eth) & (d5['emax'] > emaxth)
        i3 = (d3['eIn'] > eth) & (d3['emax'] > emaxth)
        d50 = d5[i5]
        d30 = d3[i3]

        def _fitone(d0, x, y1, y2, rx='sin', lrx='lsin'):
            # Build a two-class data set from one table: column y1 gives the
            # 'gap' values labelled kind=1, column y2 those labelled kind=0;
            # column x is carried along under the name rx for both.
            da = d0[[x, y1]].copy().rename(columns={y1: 'gap', x: rx})
            db = d0[[x, y2]].copy().rename(columns={y2: 'gap', x: rx})
            da['kind'] = 1
            db['kind'] = 0
            D = PD.concat([da, db], ignore_index=True)
            # features are log2-transformed (rx is scaled by zoom first)
            D[lrx] = N.log2(zoom * D[rx] + 1)
            D['lgap'] = N.log2(D['gap'] + 1)
            X = D[[lrx, 'lgap']].values
            Y = D['kind'].values
            lr = LogisticRegression()
            lr.fit(X, Y)
            Z = lr.predict(X)
            # callers pick 'lr', 'Y' and 'Z' out of this dict by name
            return locals()

        fit5 = _fitone(d50, 'eOut', 'gap', 'gapIn', 'ein', 'lein')
        fit3 = _fitone(d30, 'eIn', 'gap', 'gapOut', 'ein', 'lein')

        # max exon size
        m5 = N.max(self.ne_5['len'])
        m3 = N.max(self.ne_3['len'])

        # must match the rx/lrx names passed to _fitone above
        rx, lrx = 'ein', 'lein'
        # save coefs
        p5path = self.dstpre + '.{0}.gap5params.json'.format(self.refcode)
        f = fit5
        self.write_params(p5path, f['lr'], f['Y'], f['Z'], [lrx, 'lgap'], {
            'th': covfactor,
            'zoom': zoom,
            'maxsize': int(m5)
        })
        p3path = self.dstpre + '.{0}.gap3params.json'.format(self.refcode)
        f = fit3
        self.write_params(p3path, f['lr'], f['Y'], f['Z'], [lrx, 'lgap'], {
            'th': covfactor,
            'zoom': zoom,
            'maxsize': int(m3)
        })

        # save scatter plots
        spath = self.dstpre + '.{0}.gap53params'.format(self.refcode)
        title = self.dstpre.split('/')[-1]
        self.plot_gap53_fit(fit5, fit3, spath + '.0.png', title, ptyp='both')
        self.plot_gap53_fit(fit5, fit3, spath + '.pdf', title, ptyp='pdf')
        self.plot_gap53_fit(fit5, fit3, spath + '.png', title, ptyp='png')

        # local names are part of the return value; renaming them changes it
        return locals()
Exemple #13
0
 def ex(self):
     """Return the exon table, read from file when ``UT.notstale`` accepts it.

     Otherwise the table is obtained from ``self.sjex()``.
     """
     _, expath = self.sjexpaths()
     if not UT.notstale(expath):
         _, exons = self.sjex()
         return exons
     return UT.read_pandas(expath)
Exemple #14
0
    def extract_53_pair(self):
        """Collect intergenic intervals bounded by adjacent 3'/5' exons.

        Within every chromosome and strand the 3' and 5' exons are ordered by
        their splice position (``spos``: acceptor position for cat '3', donor
        position for cat '5').  Neighbouring exons of different genes form a
        pair when they are 3' followed by 5' on '+', or 5' followed by 3' on
        '-', and the first exon ends before the second starts.  The interval
        between the two splice positions is kept if it overlaps no internal
        exon and its length is above 20 and below
        ``max(2 * longest ne_i exon, 20000)``.

        Side effects:
            adds ``_apos``, ``_dpos`` and ``spos`` columns to ``self.ex``;
            writes ``.53.exi.bed``, ``.53.bed``, ``.53.exi.ovl.txt`` and
            ``.e53pair.bed.gz`` under ``self.refpre``; stores the result in
            ``self.e53``.
        """
        # between genes
        ex = self.ex
        tmpprefix = self.refpre
        ex['_apos'] = ex['a_pos'].str.split(':').str[1].astype(int)
        ex['_dpos'] = ex['d_pos'].str.split(':').str[1].astype(int)
        ex.loc[ex['cat'] == '3', 'spos'] = ex['_apos']
        ex.loc[ex['cat'] == '5', 'spos'] = ex['_dpos']
        cols = ['chr', 'st', 'ed', 'name', 'strand', '_gidx1', '_gidx2']
        # positions in the value rows used by _find:
        # 0=spos, 1=cat, 2=_gidx, 3=_id, 4=st, 5=ed
        flds = ['spos', 'cat', '_gidx', '_id', 'st', 'ed']

        def _find(ecs, chrom, strand):
            e53 = ecs[ecs['cat'].isin(['3', '5'])].sort_values('spos')
            v1 = e53.iloc[:-1][flds].values
            v2 = e53.iloc[1:][flds].values
            # the two strands differ only in the expected category order
            # and in the sign prefixed to the pair name
            if strand == '+':
                cat1, cat2, sign = '3', '5', '+'
            else:
                cat1, cat2, sign = '5', '3', '-'
            fmt = sign + 'g{0}e{1}|g{2}e{3}'
            pairs = []
            for r1, r2 in zip(v1, v2):
                if r1[2] == r2[2]:  # same gene
                    continue
                # right categories and non overlapping
                if r1[1] == cat1 and r2[1] == cat2 and r1[5] < r2[4]:
                    name = fmt.format(r1[2], r1[3], r2[2], r2[3])
                    pairs.append((chrom, r1[0], r2[0], name, strand,
                                  r1[2], r2[2]))
            return PD.DataFrame(pairs, columns=cols)

        rslts = []
        for chrom in ex['chr'].unique():
            for strand in ['+', '-']:
                echrstrand = ex[(ex['chr'] == chrom)
                                & (ex['strand'] == strand)]
                rslts.append(_find(echrstrand, chrom, strand))
        df = PD.concat(rslts,
                       ignore_index=True).sort_values(['chr', 'st', 'ed'])
        # intersect with internal exons
        a = tmpprefix + '.53.exi.bed'  # ncol 3
        b = tmpprefix + '.53.bed'  # ncol 5
        c = tmpprefix + '.53.exi.ovl.txt'
        exi = ex[ex['cat'] == 'i'].sort_values(['chr', 'st', 'ed'])
        UT.write_pandas(exi[['chr', 'st', 'ed']], a, '')
        UT.write_pandas(df, b, '')
        c = BT.bedtoolintersect(b, a, c, wao=True)
        cols1 = cols + ['b_chr', 'b_st', 'b_ed', 'ovl']
        cdf = UT.read_pandas(c, names=cols1)
        # explicit copy: columns are added below, which must not target a
        # chained-indexing slice of cdf
        sdf = cdf[cdf['ovl'] == 0][cols].copy()
        sdf['locus'] = UT.calc_locus(sdf)
        sdf['len'] = sdf['ed'] - sdf['st']
        maxexonsize = self.ne_i['len'].max()
        sdf = sdf[(sdf['len'] > 20)
                  & (sdf['len'] < max(2 * maxexonsize, 20000))]
        UT.write_pandas(sdf, tmpprefix + '.e53pair.bed.gz')
        sdf.index.name = '_id'

        self.e53 = sdf.reset_index()
Exemple #15
0
    def calc_53_params(self, sdiffth=1, np=10, alpha=0.1):
        """Fit and save the logistic regression separating edge from internal exons.

        Flux tables for several exon sets are computed with
        ``self.calc_flux_mp`` (or read back from their cache files).  Rows of
        the 5'/3' sets are labelled ``kind=1`` and rows of ``ne_i``
        ``kind=0``; a logistic regression on (``sdiff``, ``smean``) is fitted
        to ``ne_i`` + ``ne_3`` + ``ne_5``, applied to a second set
        (``ne_i`` + ``e3i`` + ``e5i``), and the coefficients and scatter
        plots are written out.

        Args:
            sdiffth: minimum absolute difference of the log2 in/out signals
                for a 5'/3' row to be used
            np: forwarded to ``calc_flux_mp``
            alpha: forwarded to ``plot_sin_sout``

        Returns:
            dict: ``locals()`` of this method, i.e. every local name
            (including ``self``) mapped to its final value.
        """
        # get parameters
        dic = {}
        zoom = self.zoom
        # seta = ['ne_i','ne_5','ne_3','e5i','e3i','e5ia','e3ia']
        # setb = ['ne_5','ne_3','e5i','e3i','e5ia','e3ia']
        # setc = ['ne_i','ne_3','ne_5','e5ia','e3ia']
        # seta: sets to load; setb: sets labelled kind=1; setc: sets fitted
        seta = ['ne_i', 'ne_5', 'ne_3', 'e5i', 'e3i']
        setb = ['ne_5', 'ne_3', 'e5i', 'e3i']
        setc = ['ne_i', 'ne_3', 'ne_5']
        for x in seta:
            fpath = self.dstpre + '.{0}.{1}.flux.txt.gz'.format(
                self.refcode, x)
            if os.path.exists(fpath):
                print('reading from cache {0}'.format(fpath))
                dic[x] = UT.read_pandas(fpath)
            else:
                # each set name is also the attribute holding its exon table
                df = getattr(self, x)
                print('calculating {0}...'.format(x))
                dic[x] = self.calc_flux_mp(df, np=np)
                UT.write_pandas(dic[x], fpath, 'h')
        dicb = {}
        # FN0 counts rows that have a signal but fail the sdiffth filter
        FN0 = 0
        for x in setb:
            f = dic[x]
            # NOTE: adds the column to the table stored in dic[x] as well
            f['kind'] = 1
            idx0 = N.abs(
                N.log2(zoom * f['sin'] + 1) -
                N.log2(zoom * f['sout'] + 1)) > sdiffth
            idx1 = (f['sdin'] != 0) | (f['sdout'] != 0
                                       )  # should have either in or out
            idx = idx0 & idx1
            FN0 += N.sum((~idx0) & idx1)  # pre filtered positive
            dicb[x] = f[idx]
        f = dic['ne_i']
        f['kind'] = 0
        idx = (f['ecovmax'] > 1) & (
            (f['sdin'] != 0) & (f['sdout'] != 0))  # should have both in&out
        dicb['ne_i'] = f[idx]
        # D: data the model is fitted on; D2: data it is only applied to
        D = PD.concat([dicb[x] for x in setc], ignore_index=True)
        D2 = PD.concat([dicb['ne_i'], dicb['e3i'], dicb['e5i']],
                       ignore_index=True)
        # don't use e3i, e5i too many non-actives

        # features: absolute difference and mean of the log2 in/out signals
        D['lsin'] = N.log2(zoom * D['sin'] + 1)
        D['lsout'] = N.log2(zoom * D['sout'] + 1)
        D['sdiff'] = N.abs(D['lsin'] - D['lsout'])
        D['smean'] = (D['lsin'] + D['lsout']) / 2.
        X = D[['sdiff', 'smean']].values
        Y = D['kind'].values
        lr = LogisticRegression()
        lr.fit(X, Y)
        Z = lr.predict(X)
        # same features for the second set, predicted with the fitted model
        D2['lsin'] = N.log2(zoom * D2['sin'] + 1)
        D2['lsout'] = N.log2(zoom * D2['sout'] + 1)
        D2['sdiff'] = N.abs(D2['lsin'] - D2['lsout'])
        D2['smean'] = (D2['lsin'] + D2['lsout']) / 2.
        X2 = D2[['sdiff', 'smean']].values
        Z2 = lr.predict(X2)
        # save fit coefficients
        ppath = self.dstpre + '.{0}.e53params.json'.format(self.refcode)
        self.write_params(ppath,
                          lr,
                          Y,
                          Z, ['sdiff', 'smean'], {
                              'sdiffth': sdiffth,
                              'zoom': zoom
                          },
                          FN0=FN0)
        # save scatter plots
        spath = self.dstpre + '.{0}.e53params'.format(self.refcode)
        title = self.dstpre.split('/')[-1]
        self.plot_sin_sout(dic,
                           D,
                           Y,
                           Z,
                           D2,
                           Z2,
                           sdiffth,
                           spath + '.0.png',
                           title,
                           alpha=alpha)
        self.plot_sin_sout(dic,
                           D,
                           Y,
                           Z,
                           D2,
                           Z2,
                           sdiffth,
                           spath + '.pdf',
                           title,
                           ptyp='pdf',
                           alpha=alpha)
        self.plot_sin_sout(dic,
                           D,
                           Y,
                           Z,
                           D2,
                           Z2,
                           sdiffth,
                           spath + '.png',
                           title,
                           ptyp='png',
                           alpha=alpha)
        # local names are part of the return value; renaming them changes it
        return locals()
Exemple #16
0
def calc_gcov(expath, cipath, bwpath, dstprefix, override=False, np=4):
    """Calculate gene coverages.

    Args:
        expath: merged ex
        cipath: chopped interval for ex
        bwpath: bigwig file (sample)
        dstprefix: prefix for outputs; the file names below are appended
            as-is, so the prefix has to bring its own separator
        override: passed to ``UT.notstale`` when deciding whether the cached
            covci file can be reused
        np: passed to ``calc_cov_mp``

    Outputs:
        1. dstprefix+'covci.txt.gz'
        2. dstprefix+'gcov.txt.gz' : DataFrame(col:_gidx,gcov)

    Returns:
        the DataFrame written to the gcov file (columns _gidx, gcov)

    Several further per-gene quantities are computed but not written:
        len2: calculate length from ci with cov > 0
        (normal length = use entire ci's belonging to the gene)
        gcov2 = val/len2
        cids: cid with cov > for the gene ','.joined
    """
    ex = UT.read_pandas(expath)
    covcipath = dstprefix + 'covci.txt.gz'
    gcovpath = dstprefix + 'gcov.txt.gz'

    # reuse the per-interval coverage if UT.notstale accepts it, otherwise
    # (re)compute it, creating the chopped intervals first when needed
    if UT.notstale([expath, cipath], covcipath, override):
        cc = UT.read_pandas(covcipath)
    else:
        if UT.notstale(expath, cipath, False):
            ci = UT.read_pandas(cipath,
                                names=['chr', 'st', 'ed', 'name', 'id'])
        else:
            ci = UT.chopintervals(ex, cipath, idcol='_id')
        cc = calc_cov_mp(ci, bwpath, covcipath, np=np)

    # if override or (not os.path.exists(covcipath)):
    #     # calc covci
    #     if not os.path.exists(cipath):
    #         ci = UT.chopintervals(ex, cipath, idcol='_id')
    #     else:
    #         ci = UT.read_pandas(cipath, names=['chr','st','ed','name','id'])
    #     cc = calc_cov_mp(ci, bwpath, covcipath, np=np)
    # else:
    #     cc = UT.read_pandas(covcipath)

    # fall back to the 'sc1' column when there is no 'id' column
    if 'id' not in cc.columns:
        cc['id'] = cc['sc1']
    # 'name' holds the comma-separated exon ids of each interval
    if 'eid' not in cc.columns:
        cc['eid'] = cc['name'].astype(str).apply(
            lambda x: [int(y) for y in x.split(',')])
    cc['len'] = cc['ed'] - cc['st']
    cc['val'] = cc['cov'] * cc['len']
    ccf = UT.flattendf(cc[['id', 'eid', 'len', 'val', 'st', 'ed']], 'eid')
    # map every exon id to its gene
    e2g = dict(UT.izipcols(ex, ['_id', '_gidx']))
    ccf['_gidx'] = [e2g[x] for x in ccf['eid']]
    # for normal gcov: take unique combination of (gid, id) (id=cid)
    # for gocv2 : first select ccf with val>0
    ccf2 = ccf[ccf['val'] > 0].groupby(['_gidx', 'id']).first().reset_index()
    ccf2g = ccf2.groupby('_gidx')
    df2 = ccf2g[['len', 'val']].sum()
    df2['gcov2'] = df2['val'] / df2['len']
    df2['cids'] = ccf2g['id'].apply(lambda x: ','.join([str(y) for y in x]))
    df2['gst2'] = ccf2g['st'].min()
    df2['ged2'] = ccf2g['ed'].max()
    df2['glen2'] = df2['ged2'] - df2['gst2']

    df2 = df2.reset_index()

    # one row per (gene, interval) so an interval is counted once per gene
    ccf1 = ccf.groupby(['_gidx', 'id']).first().reset_index()
    ccf1g = ccf1.groupby('_gidx')
    df = ccf1g[['len', 'val']].sum()
    df['gcov'] = df['val'] / df['len']
    df['st'] = ccf1g['st'].min()
    df['ed'] = ccf1g['ed'].max()
    df['glen'] = df['ed'] - df['st']
    df = df.reset_index()
    g2chr = dict(UT.izipcols(ex, ['_gidx', 'chr']))
    df['chr'] = [g2chr[x] for x in df['_gidx']]

    def _set_df2prop(src, tgt, default):
        # copy column src of df2 into column tgt of df, matched on _gidx;
        # genes absent from df2 get the default
        dic = dict(UT.izipcols(df2, ['_gidx', src]))
        df[tgt] = [dic.get(x, default) for x in df['_gidx']]

    _set_df2prop('gcov2', 'gcov2', 0)
    _set_df2prop('len', 'len2', 0)
    _set_df2prop('cids', 'cids', '')
    _set_df2prop('gst2', 'st2', -1)
    _set_df2prop('ged2', 'ed2', -1)
    _set_df2prop('glen2', 'glen2', 0)

    cols = [
        '_gidx', 'chr', 'st', 'ed', 'len', 'val', 'gcov', 'glen', 'len2',
        'gcov2', 'cids', 'st2', 'ed2', 'glen2'
    ]
    # NOTE(review): this reassignment discards the full column list above, so
    # only _gidx and gcov are saved and returned - confirm this is intended.
    cols = ['_gidx', 'gcov']
    df = df[cols]
    UT.save_tsv_nidx_whead(df, gcovpath)
    return df
Exemple #17
0
    def prep_sjex(self, en, np=1, savesjex=True, calccovs=True):
        """Assign ecov, gcov, jcnt.

        Makes sure the exon and junction models of ``en`` (for
        ``self.datacode``) carry length, coverage and junction-count columns,
        computing whatever is missing.

        Args:
            en: model holder providing ``model``, ``modelpath``, ``fname2``,
                ``savemodel`` and ``code``
            np: number of processes passed to the coverage calculations
            savesjex: save a model back through ``en.savemodel`` when columns
                were added to it
            calccovs: compute ecov/gcov with ``CC.calc_ecov`` /
                ``CC.calc_gcov``; when False, missing coverage columns are
                filled with 0
        """
        dcode = self.datacode
        sj = en.model('sj', dcode)
        ex = en.model('ex', dcode)
        savesj = False
        saveex = False
        # check support: keep junctions whose donor and acceptor ids both
        # occur among the exons
        if len(sj) > 0:
            dids = set(ex['d_id'].values)
            aids = set(ex['a_id'].values)
            idx = sj['a_id'].isin(aids) & sj['d_id'].isin(dids)
            sj = sj[idx].copy()
            en.sj = sj
        if '_id' not in ex.columns:  # edge case (len(sj)==0)
            ex['_id'] = N.arange(len(ex))
        if '_gidx' not in ex.columns:  # edge case (len(sj)==0)
            ex['_gidx'] = N.arange(len(ex))

        # length
        if 'len' not in sj.columns:
            sj['len'] = sj['ed'] - sj['st']
            savesj = True
        if 'len' not in ex.columns:
            ex['len'] = ex['ed'] - ex['st']
            saveex = True
        # ecov
        if calccovs:
            print('calccov for {0}'.format(en.code))
            ecovname = self.colname('ecov')
            if ecovname not in ex.columns:
                ecov = CC.calc_ecov(
                    expath=en.modelpath('ex'),
                    cipath=en.modelpath('ci'),
                    bwpath=self.bigwig,
                    dstprefix=en.fname2(
                        '', self.datacode),  # cov is data dependent
                    override=False,  # override previous?
                    np=np)
                # reindex replaces the removed DataFrame.ix: rows come back
                # in exon order and ids without coverage yield NaN
                ex[ecovname] = ecov.set_index('eid').reindex(
                    ex['_id'].values)['ecov'].values
                saveex = True
            # gcov, glen
            gcovname = self.colname('gcov')
            if gcovname not in ex.columns:
                gcov = CC.calc_gcov(
                    expath=en.modelpath('ex'),
                    cipath=en.modelpath('ci'),
                    bwpath=self.bigwig,
                    dstprefix=en.fname2('', self.datacode),
                    override=False,  # reuse covci from ecov calc
                    np=np)
                tmp = gcov.set_index('_gidx').reindex(ex['_gidx'].values)
                ex[gcovname] = tmp['gcov'].values
                if 'glen' in tmp:
                    ex['glen'] = tmp[
                        'glen'].values  # glen is only dependent on model not data
                saveex = True
        else:
            ecovname = self.colname('ecov')
            if ecovname not in ex.columns:
                ex[ecovname] = 0
            gcovname = self.colname('gcov')
            if gcovname not in ex.columns:
                ex[gcovname] = 0
        # sjcnt
        ucntname = self.colname('ucnt')
        mcntname = self.colname('mcnt')
        jcntname = self.colname('jcnt')
        sjfile = self.sjfile
        if ucntname not in sj.columns:
            if sjfile.endswith(('.bed', '.bed.gz')):  # no header
                dsj = UT.read_pandas(sjfile,
                                     names=[
                                         'chr', 'st', 'ed', 'name', 'ucnt',
                                         'strand', 'mcnt'
                                     ])
            else:  # assume txt file with header
                dsj = UT.read_pandas(sjfile)
            # locus based matching
            dsj['locus'] = UT.calc_locus_strand(dsj)
            sj['locus'] = UT.calc_locus_strand(sj)
            l2u = UT.df2dict(dsj, 'locus', 'ucnt')
            l2m = UT.df2dict(dsj, 'locus', 'mcnt')
            sj[ucntname] = [l2u.get(x, 0) for x in sj['locus']]
            sj[mcntname] = [l2m.get(x, 0) for x in sj['locus']]
            # jcnt is ucnt unless that is 0, in which case mcnt is used
            sj[jcntname] = [x or y for x, y in sj[[ucntname, mcntname]].values]
            savesj = True
        if saveex and savesjex:
            en.savemodel('ex', dcode, category='output')
        if savesj and savesjex:
            en.savemodel('sj', dcode, category='output')
Exemple #18
0
def read_ucsc_knownGene(path):
    """Load a UCSC knownGene table with its columns named by ``KGCOLS``.

    Args:
        path: file location, handed unchanged to ``UT.read_pandas``

    Returns:
        the object returned by ``UT.read_pandas``
    """
    kg_table = UT.read_pandas(path, names=KGCOLS)
    return kg_table
Exemple #19
0
    def find_match(self):
        """Collect candidate matches between exons/junctions of en1 and en2.

        Exons of both models are written to temporary BED files and
        intersected with ``BT.bedtoolintersect(..., wao=True)``; the overlap
        table is then filtered once per exon category.  Junctions of en1 are
        kept when en2 reports a non-zero count at the same locus.

        Sets on self: e1, e2, ov, ei, e5, e3, es, e5b, e3b, esb, s1, s2, sj
        and the dict ``e`` keyed by 'i', '5', '3', 's', 'j', '5b', '3b', 'sb'.
        Side effect: a column named ``self.colname2('jhit', en2.code)`` is
        added to the 'sj' model of en1.
        """
        en1 = self.en1
        en2 = self.en2
        # en1 (reference) is shared between parallel runs, so its temp file
        # names carry en2.code to keep them unique
        expath1 = en1.fname2('emtmp.ex.bed.gz', en2.code)
        expath2 = en2.fname('emtmp.ex.bed.gz')
        ovlpath = en1.fname2('emtmp.ex.ovl.txt.gz', en2.code)
        exons1 = en1.model('ex')
        self.e1 = exons1
        exons2 = en2.model('ex')
        self.e2 = exons2
        ecovname = self.colname('ecov')
        cols = [
            'chr', 'st', 'ed', 'cat', '_id', ecovname, '_gidx', 'len', 'strand'
        ]
        expath1 = UT.write_pandas(exons1[cols], expath1, '')
        expath2 = UT.write_pandas(exons2[cols], expath2, '')
        ovlpath = BT.bedtoolintersect(expath1, expath2, ovlpath, wao=True)
        ocols = cols + ['b_' + x for x in cols] + ['ovl']
        ov = UT.read_pandas(ovlpath, names=ocols)  # overlaps of exons
        self.ov = ov

        # elementwise comparisons between the en1 side and the en2 ("b_") side
        same_chr = ov['chr'] == ov['b_chr']
        same_strand = ov['strand'] == ov['b_strand']
        plus = (ov['strand'] == '+') & same_strand
        minus = (ov['strand'] == '-') & same_strand
        same_st = ov['st'] == ov['b_st']
        same_ed = ov['ed'] == ov['b_ed']
        same_cat = ov['cat'] == ov['b_cat']
        has_reads = ov[ecovname] > 0  # exons with reads (only reported below)

        LOG.debug(
            '=' * 10 +
            'calculating match between {0} and {1}'.format(en1.code, en2.code))
        LOG.debug('len(ov):{0}'.format(len(ov)))
        mask_report = (
            ('idxchr', same_chr),
            ('idxstrand', same_strand),
            ('idxp', plus),
            ('idxn', minus),
            ('idxst', same_st),
            ('idxed', same_ed),
            ('idxcat', same_cat),
            ('idxcov', has_reads),
        )
        for label, mask in mask_report:
            LOG.debug('#{0}:{1}'.format(label, N.sum(mask)))

        # donor side: (+, ed) or (-, st); acceptor side: (+, st) or (-, ed)
        donor = (plus & same_ed) | (minus & same_st)
        acceptor = (minus & same_ed) | (plus & same_st)
        is_internal = ov['cat'] == 'i'
        is_5 = ov['cat'] == '5'
        is_3 = ov['cat'] == '3'
        is_single = ov['cat'] == 's'

        # internal exons: chr, strand, st, ed and category all agree
        ei = ov[same_chr & same_strand & same_st & same_ed & same_cat &
                is_internal].copy()
        self.ei = ei
        # 5' exons: donor side and category agree
        e5 = ov[same_chr & donor & same_cat & is_5].copy()
        self.e5 = e5
        # 3' exons: acceptor side and category agree
        e3 = ov[same_chr & acceptor & same_cat & is_3].copy()
        self.e3 = e3
        # single exons: chr and category agree
        es = ov[same_chr & is_single & same_cat].copy()
        self.es = es

        # same selections without requiring the categories to agree
        e5b = ov[same_chr & donor & is_5].copy()
        self.e5b = e5b
        e3b = ov[same_chr & acceptor & is_3].copy()
        self.e3b = e3b
        esb = ov[same_chr & is_single].copy()
        self.esb = esb

        # splice junctions: attach en2's count to en1's junctions by locus
        s1 = en1.model('sj')
        self.s1 = s1
        s2 = en2.model('sj')
        self.s2 = s2
        jcntname = self.colname('jcnt')
        locus2cnt = UT.df2dict(s2, 'locus', jcntname)
        jhitname = self.colname2('jhit', en2.code)
        s1[jhitname] = [locus2cnt.get(locus, 0) for locus in s1['locus']]
        sj = s1[s1[jhitname] > 0].copy()  # only consider s2 count > 0
        self.sj = sj

        # for batch processing
        self.e = {
            'i': ei,
            '5': e5,
            '3': e3,
            's': es,
            'j': sj,
            '5b': e5b,
            '3b': e3b,
            'sb': esb
        }
Exemple #20
0
def count_repeats_viz_mp(beddf,
                         rmskvizpath,
                         idcol='_id',
                         np=3,
                         prefix=None,
                         expand=0,
                         col='repnames'):
    """Annotate each interval with the names of overlapping repeats.

    The work is split per chromosome: one target BED per chromosome is
    written, paired with a per-chromosome repeat file, and the pairs are
    handed to ``count_repeats_viz_chr`` through ``UT.process_mp``.

    Args:
        beddf: Pandas DataFrame with chr,st,ed cols, when calculating repeats bp
         for genes, unioned bed should be used (use utils.make_unionex)
        rmskvizpath: path to repeat masker viz BED7 file (created using rmskviz2bed7)
        idcol: colname for unique row id (default _id)
        np: number of CPU to use
        prefix: path prefix for temp files; when None a random prefix next to
         rmskvizpath is used and the temp files are deleted afterwards
        expand: how many bases to expand each interval on both sides (default 0);
         expanded starts are clamped at 0
        col: column name to put in overlapping repeat names

    Returns:
        beddf, with column ``col`` added in place

    """
    cleanup = prefix is None
    if cleanup:
        prefix = os.path.join(os.path.dirname(rmskvizpath),
                              str(uuid.uuid4()) + '_')

    chroms = sorted(beddf['chr'].unique())
    # per-chromosome repeat files are reused when all of them already exist
    splitrmsk = any(
        not os.path.exists(rmskvizpath + '.{0}.bed.gz'.format(chrom))
        for chrom in chroms)
    rmsk = GGB.read_bed(rmskvizpath) if splitrmsk else None

    tgt_paths = []
    out_paths = []
    jobs = []
    for chrom in chroms:
        tgt = prefix + 'tgt.{0}.bed'.format(chrom)  # don't compress
        rep = rmskvizpath + '.{0}.bed.gz'.format(chrom)  # reuse
        out = prefix + 'out.{0}.bed'.format(chrom)

        sub = beddf[beddf['chr'] == chrom]
        if expand > 0:
            sub = sub.copy()
            sub['st'] = sub['st'] - expand
            sub['ed'] = sub['ed'] + expand
            sub.loc[sub['st'] < 0, 'st'] = 0
        UT.write_pandas(sub[['chr', 'st', 'ed', idcol]], tgt, '')
        tgt_paths.append(tgt)

        if splitrmsk:
            rep_rows = rmsk[rmsk['chr'] == chrom]
            UT.write_pandas(
                rep_rows[['chr', 'st', 'ed', 'name', 'strand']], rep, '')

        out_paths.append(out)
        jobs.append([tgt, rep, out])

    UT.process_mp(count_repeats_viz_chr, jobs, np=np, doreduce=False)

    # gather the per-chromosome outputs into one id => repeat names lookup
    pieces = [
        UT.read_pandas(p, names=['name', 'repnames']) for p in out_paths
    ]
    merged = PD.concat(pieces, ignore_index=True)
    merged['name'] = merged['name'].astype(str)
    id2names = UT.df2dict(merged, 'name', 'repnames')
    beddf[col] = [id2names[str(rowid)] for rowid in beddf[idcol]]

    if cleanup:
        for tmp in tgt_paths + out_paths:
            os.unlink(tmp)

    return beddf
Exemple #21
0
def read_ucsc_refGene(path):
    """Load a UCSC refGene table with its columns named by ``RGCOLS``.

    Args:
        path: file location, handed unchanged to ``UT.read_pandas``

    Returns:
        the object returned by ``UT.read_pandas``
    """
    rg_table = UT.read_pandas(path, names=RGCOLS)
    return rg_table
Exemple #22
0
 def read_txt(self, suffix, category='read'):
     """Read the text table located at ``self.txtname(suffix, category)``.

     Returns whatever ``UT.read_pandas`` returns for that path.
     """
     txtpath = self.txtname(suffix, category)
     return UT.read_pandas(txtpath)
Exemple #23
0
def prep_sjpath_chr(j2pres, libsizes, dstpre, chrom):
    """Merge the junction paths of one chromosome across samples into a BED12.

    Paths are keyed by their intron chain (the path name with its first and
    last elements trimmed).  For every chain the smallest st, the largest ed
    and the summed scaled tcov are kept; tst, ted and strand come from the
    last record seen.

    Args:
        j2pres: path prefixes; ``<prefix>.paths.txt.gz`` is read for each
        libsizes: per-sample library sizes, or None.  When given, each
            sample's tcov is scaled by 1e6/libsize and the sum is divided by
            the number of samples; when None the raw tcov values are summed.
        dstpre: prefix of the output file
        chrom: chromosome to process

    Returns:
        ``dstpre + '.sjpath.<chrom>.bed.gz'``.  Nothing is computed when that
        file or ``dstpre + '.sjpath.bed.gz'`` already exists.
    """
    path = dstpre + '.sjpath.{0}.bed.gz'.format(chrom)
    allpath = dstpre + '.sjpath.bed.gz'
    if os.path.exists(allpath) or os.path.exists(path):
        return path

    normalize = libsizes is not None
    if normalize:
        scales = [1e6 / float(x) for x in libsizes]
    else:
        scales = N.ones(len(j2pres))

    # accumulators keyed by intron chain
    chain_st = {}
    chain_ed = {}
    chain_tst = {}
    chain_ted = {}
    chain_strand = {}
    chain_tcov = {}
    fields = ['st', 'ed', 'name', 'strand', 'tst', 'ted', 'tcov']
    for pre, scale in zip(j2pres, scales):
        paths = UT.read_pandas(pre + '.paths.txt.gz', names=A2.PATHCOLS)
        paths = paths[paths['chr'] == chrom]
        for st, ed, name, strand, tst, ted, tcov in paths[fields].values:
            # trim 5'/3' exons => intron chain
            chain = ','.join(name.split(',')[1:-1])
            chain_st[chain] = min(st, chain_st.get(chain, st))
            chain_ed[chain] = max(ed, chain_ed.get(chain, ed))
            chain_tst[chain] = tst
            chain_ted[chain] = ted
            chain_strand[chain] = strand
            chain_tcov[chain] = chain_tcov.get(chain, 0) + scale * tcov

    df = PD.DataFrame({
        'st': chain_st,
        'ed': chain_ed,
        'tst': chain_tst,
        'ted': chain_ted,
        'strand': chain_strand,
        'tcov': chain_tcov
    })
    df['chr'] = chrom
    df.index.name = 'name'
    df = df.reset_index()
    # create bed12: parse name => #exons, esizes, estarts
    df['pc'] = df['name'].copy()
    if normalize:
        df['tcov'] = df['tcov'] / float(len(j2pres))

    # rebuild the full name: st,chain,ed on the plus side, ed,chain,st otherwise
    idxp = df['strand'].isin(['+', '.+'])
    plus_rows = df[idxp][['st', 'pc', 'ed']].values
    df.loc[idxp, 'name'] = [
        '{0},{1},{2}'.format(s, p, e) for s, p, e in plus_rows
    ]
    other_rows = df[~idxp][['st', 'pc', 'ed']].values
    df.loc[~idxp, 'name'] = [
        '{2},{1},{0}'.format(s, p, e) for s, p, e in other_rows
    ]
    df = df.groupby('pc').first()  # get rid of unstranded duplicates

    cmax = 9 + N.log2(N.mean(scales))
    bed = A2.path2bed12(df, cmax)
    # reset sc1 to tcov (from log2(tcov+2)*100)
    bed['sc1'] = bed['tcov']
    GGB.write_bed(bed, path, ncols=12)
    return path
Exemple #24
0
def testsampleinfo(datadir):
    """Read ``bedtools/test-si.txt`` under datadir and add full-path columns.

    ``bw_path`` and ``sjbed_path`` are built by prefixing the ``bwfile`` and
    ``sjbed`` columns with ``datadir + '/'``.
    """
    si_path = os.path.join(datadir, 'bedtools/test-si.txt')
    si = UT.read_pandas(si_path)
    for dst, src in (('bw_path', 'bwfile'), ('sjbed_path', 'sjbed')):
        si[dst] = datadir + '/' + si[src]
    return si
Exemple #25
0
def calc_ecov(expath,
              cipath,
              bwpath,
              dstprefix,
              blocksize=100,
              override=False,
              np=4):
    """Calculate exon coverages.

    Args:
        expath: path to the merged exon table (needs _id and _pid columns)
        cipath: path to the chopped intervals for those exons
        bwpath: bigwig file (sample)
        dstprefix: prefix for outputs (used as is, no separator is inserted)
        blocksize: forwarded to calc_ecov_mp
        override: forwarded to UT.notstale when deciding whether the cached
            covci file can be reused
        np: number of processes, forwarded to calc_cov_mp and calc_ecov_mp

    Outputs:
        1. dstprefix+'covci.txt.gz': coverage for ci (path given to calc_cov_mp)
        2. dstprefix+'ecov.txt.gz' : DataFrame(cols: eid, pid, ecov)

    Returns:
        DataFrame with columns eid, pid, ecov

    """
    covcipath = dstprefix + 'covci.txt.gz'
    ecovpath = dstprefix + 'ecov.txt.gz'
    ex = UT.read_pandas(expath)

    if UT.notstale([expath, cipath], covcipath, override):
        cc = UT.read_pandas(covcipath)
    else:
        # the chopped intervals themselves are never force-recomputed
        if UT.notstale(expath, cipath, False):
            ci = UT.read_pandas(cipath,
                                names=['chr', 'st', 'ed', 'name', 'id'])
        else:
            ci = UT.chopintervals(ex, cipath, idcol='_id')
        cc = calc_cov_mp(ci, bwpath, covcipath, np=np)

    def _to_int_list(text):
        # "1,2,3" => [1, 2, 3]
        return [int(item) for item in text.split(',')]

    if 'id' not in cc.columns:
        cc['id'] = cc['sc1']
    if 'pid' not in cc.columns:
        cc['pid'] = cc['name'].astype(str).apply(_to_int_list)
        cc['name1'] = cc['pid']

    df = ex[['_id', '_pid']].rename(columns={'_id': 'eid', '_pid': 'pid'})
    pid2cov = calc_ecov_mp(cc, None, np, blocksize)  # pid => cov
    df['ecov'] = [pid2cov[pid] for pid in df['pid']]
    UT.save_tsv_nidx_whead(df[['eid', 'pid', 'ecov']], ecovpath)
    return df
Exemple #26
0
def read_ovl(c, acols, bcols=None):
    """Read an overlap table whose columns are acols, 'b_'-prefixed bcols, 'ovl'.

    Args:
        c: path handed to ``UT.read_pandas``
        acols: list of column names for the first side
        bcols: column names for the second side before prefixing with 'b_';
            acols is reused when None
    """
    second = acols if bcols is None else bcols
    names = acols + ['b_' + colname for colname in second] + ['ovl']
    return UT.read_pandas(c, names=names)
Exemple #27
0
def calc_cov_ovl_mp(srcname,
                    bwname,
                    dstname,
                    np=1,
                    covciname=None,
                    ciname=None,
                    colname='cov',
                    override=False):
    """Calculate coverage (from BigWig) over intervals (from srcname).

    Each exon's coverage is sum(cov*len)/sum(len) over the chopped intervals
    that list the exon's _id in their name.

    Args:
        srcname: path to exons tsv, or an already loaded exon table
        bwname: path to bigwig
        dstname: path the resulting table is saved to
        np: number of processors
        covciname: path to covci (coverage for chopped interval dataframe);
            defaults to srcname[:-7] + '.covci.txt.gz' (srcname must be a path)
        ciname: path to ci (chopped interval dataframe);
            defaults to srcname[:-7] + '.ci.txt.gz' (srcname must be a path)
        colname: name for column which contain calculated coverages
        override: when true, recompute ci and covci even if the files exist

    Returns:
        exon dataframe with column (colname) added

    SideEffects:
        the exon table with the new column is written to dstname

    """
    src_is_path = UT.isstring(srcname)
    exons = UT.read_pandas(srcname) if src_is_path else srcname

    # cache locations are derived from the source path when not given
    if covciname is None:
        assert src_is_path
        covciname = srcname[:-7] + '.covci.txt.gz'
    if ciname is None:
        assert src_is_path
        ciname = srcname[:-7] + '.ci.txt.gz'

    if not override and os.path.exists(covciname):
        LOG.debug('loading cached covci...')
        covci = UT.read_pandas(covciname)
    else:
        LOG.debug('calculating covci...')
        started = time.time()
        if not override and os.path.exists(ciname):
            ci = UT.read_pandas(ciname,
                                names=['chr', 'st', 'ed', 'name', 'id'])
            ci['name'] = ci['name'].astype(str)
        else:
            ci = UT.chopintervals(exons, ciname)
        covci = calc_cov_mp(ci, bwname, covciname, np)
        LOG.debug(' time: {0:.3f}s'.format(time.time() - started))
    covci['name'] = covci['name'].astype(str)

    LOG.debug('calculating exon cov...')
    if 'id' not in covci.columns:
        covci['id'] = covci['sc1']

    # reverse the ci => exon ids mapping: exon id => list of ci ids
    exon2cis = {}
    for ci_id, members in covci[['id', 'name']].values:
        for eid in members.split(','):
            exon2cis.setdefault(int(eid), []).append(ci_id)
    covci['len'] = covci['ed'] - covci['st']
    covci['val'] = covci['cov'] * covci['len']

    # unique (ci, exon) pairs for the exons we were asked about
    pairs = {(cid, eid) for eid in exons['_id'] for cid in exon2cis[eid]}
    tmp = PD.DataFrame(list(pairs), columns=['cid', 'eid'])
    c2len = dict(covci[['id', 'len']].values)
    c2val = dict(covci[['id', 'val']].values)
    tmp['val'] = [c2val[cid] for cid in tmp['cid']]
    tmp['len'] = [c2len[cid] for cid in tmp['cid']]

    # exon cov = sum(cicov*cilen)/totlen
    per_exon = tmp.groupby('eid')[['val', 'len']].sum().reset_index()
    per_exon['cov'] = per_exon['val'] / per_exon['len']
    e2cov = dict(per_exon[['eid', 'cov']].values)
    exons[colname] = [e2cov[eid] for eid in exons['_id']]

    UT.save_tsv_nidx_whead(exons, dstname)
    return exons
Exemple #28
0
def _process_mapbed_chr(dstpre, chrom, genome, chromdir, stranded):
    """Build coverage wiggles and a splice-path BED for one chromosome.

    Reads ``dstpre.<chrom>.bed`` (7 columns: chr, st, ed, name, mapq, strand,
    mapid) line by line.  Every record is added to the exon ('.ex') coverage
    arrays.  Consecutive records sharing a mapid are treated as segments of
    one spliced read: each gap between two segments is added to the junction
    ('.sj') arrays and becomes part of that read's splice path.

    Args:
        dstpre: path prefix of the inputs (``.<chrom>.bed``,
            ``.dupitems.txt.gz``) and of every output
        chrom: chromosome name
        genome: genome name given to ``UT.chromdf`` to look up the
            chromosome size
        chromdir: directory given to ``FA.GenomeFASTAChroms``
        stranded: looked up together with the read strand in ``STRANDMAP``

    Outputs:
        ``dstpre{.ex,.sj}{,.uniq}{.p,.n,.u}.<chrom>.wig`` and
        ``dstpre.<chrom>.sjpath.bed``; existing files with these names are
        removed before processing.
    """
    # 1st pass: calc dupdic
    bedpath = dstpre + '.{0}.bed'.format(chrom)
    dupids = UT.read_pandas(dstpre + '.dupitems.txt.gz', index_col=[0]).index
    # 2nd pass make wiggles
    gfc = FA.GenomeFASTAChroms(chromdir)
    chromsize = UT.df2dict(UT.chromdf(genome), 'chr', 'size')[chrom]

    # mqth MAPQ threshold there are ~6% <10
    # Opened here (before any previous output is deleted) so that a missing
    # input still fails early; the handle is closed by the `with fp:` block
    # around the read loop below.
    fp = open(bedpath, 'rb')

    # wigs[kind][strand][suf] => coverage array of length chromsize
    wigs = {}
    wigpaths = {}
    for kind in ['.ex', '.sj']:
        wigs[kind] = {}
        wigpaths[kind] = {}
        for strand in ['.p', '.n', '.u']:
            wigs[kind][strand] = {}
            wigpaths[kind][strand] = {}
            for suf in ['', '.uniq']:
                wigpath = dstpre + kind + suf + strand + '.{0}.wig'.format(chrom)
                if os.path.exists(wigpath):
                    os.unlink(wigpath)
                wigpaths[kind][strand][suf] = wigpath
                wigs[kind][strand][suf] = N.zeros(chromsize, dtype=float)

    sjs = []  # path: (chr, st, ed, pcode, ucnt, strand, acnt)
    # pcode = a(apos)d(dpos) = a(ed)d(st) if strand=='+' else a(st)d(ed)
    # ucnt = unique read counts
    # acnt = multi-read adjusted all counts (=ucnt+Sum(mcnt(i)/dup(i)))
    # delete previous
    sjbed12 = dstpre + '.{0}.sjpath.bed'.format(chrom)
    if os.path.exists(sjbed12):
        os.unlink(sjbed12)

    def _write_arrays():
        # dump every coverage array to its wiggle file
        for kind in ['.ex', '.sj']:
            for strand in ['.p', '.n', '.u']:
                for suf in ['', '.uniq']:
                    cybw.array2wiggle_chr64(wigs[kind][strand][suf], chrom,
                                            wigpaths[kind][strand][suf], 'w')

    def _write_sj(sjs):
        # sjs = [(chr,st,ed,pathcode(name),ureads(sc1),strand,tst,ted,areads(sc2),cse),...]
        # records with the same path code are merged: counts are summed,
        # st/ed widened, remaining fields taken from the first record
        sjdf = PD.DataFrame(sjs, columns=GGB.BEDCOLS[:9] + ['cse'])
        sjdfgr = sjdf.groupby('name')
        sj = sjdfgr.first()
        sj['sc1'] = sjdfgr['sc1'].sum().astype(int)  # ucnt
        sj['sc2'] = sjdfgr['sc2'].sum().astype(int)  # jcnt=ucnt+mcnt
        sj['st'] = sjdfgr['st'].min()
        sj['ed'] = sjdfgr['ed'].max()
        sj['#exons'] = sj['cse'].apply(len) + 1
        # block starts/ends relative to st, derived from the junction list
        sj['ests'] = [[0] + [z[1] - st for z in cse]
                      for st, cse in sj[['st', 'cse']].values]
        sj['eeds'] = [[z[0] - st for z in cse] + [ed - st]
                      for st, ed, cse in sj[['st', 'ed', 'cse']].values]
        esizes = [[u - v for u, v in zip(x, y)]
                  for x, y in sj[['eeds', 'ests']].values]
        sj['estarts'] = ['{0},'.format(','.join([str(y) for y in x]))
                         for x in sj['ests']]
        sj['esizes'] = ['{0},'.format(','.join([str(y) for y in x]))
                        for x in esizes]
        sj['name'] = sj.index
        with open(sjbed12, 'w') as f:
            sj[GGB.BEDCOLS].to_csv(f, index=False, header=False, sep='\t',
                                   quoting=csv.QUOTE_NONE)

    def _append_sj(cse, css, csj, chrom, ureads, areads):
        if len(cse) > 0:  # spits out splice rec
            # chr,st,ed,pathcode,ureads,strand,tst,ted,areads
            tst = cse[0][0]
            ted = cse[-1][1]
            if len(css) > 0:
                # majority vote over the strands inferred per junction
                strand = Counter(css).most_common()[0][0]
            else:
                strand = '.'
            name = pathcode(cse, strand)
            st = int(csj[0][1])  # first segment start
            ed = int(csj[-1][2])  # last segment end
            sjs.append((chrom, st, ed, name, ureads, strand, tst, ted, areads, cse))

    def _add_to_ex_arrays(st, ed, dup, strand):
        kind = '.ex'
        strand = STRANDMAP[(strand, stranded)]
        dic = wigs[kind][strand]
        dic[''][st:ed] += 1
        if not dup:
            dic['.uniq'][st:ed] += 1

    def _add_to_sj_arrays(sst, sed, dup, strand):
        kind = '.sj'
        s = {'+': '.p', '-': '.n', '.': '.u'}[strand]
        dic = wigs[kind][s]
        # add to the arrays
        dic[''][sst:sed] += 1
        if not dup:
            dic['.uniq'][sst:sed] += 1
            ureads, areads = 1, 1
        else:
            ureads, areads = 0, 1
        return ureads, areads

    csj = []  # current collection of spliced reads
    css = []  # current strands
    cse = []  # current (sst,sed)
    csn = 0  # current segment number
    ureads, areads = 1, 1  # uniq, total reads it's either 1,1 or 0,1
    pmid = None  # previous map id common to spliced segments
    with fp:  # guarantees the BED handle is closed, even if a record fails
        for line in fp:
            rec = line.strip().split(b'\t')
            # 7 column bed: chr(0), st(1), ed(2), name(3), mapq(4), strand(5), mapid(6)
            st, ed = int(rec[1]), int(rec[2])
            # NOTE(review): rec[3] is bytes (file is read in 'rb' mode);
            # confirm dupids holds bytes too, otherwise this never matches
            dup = rec[3] in dupids
            # NOTE(review): estrand is bytes as well; confirm STRANDMAP keys
            estrand = rec[5]
            _add_to_ex_arrays(st, ed, dup, estrand)
            # process splice
            if pmid != rec[6]:  # new map
                _append_sj(cse, css, csj, chrom, ureads, areads)
                csj, css, cse, csn = [rec], [], [], 0  # reset running params
            else:  # add segments
                csj.append(rec)
                prec = csj[-2]  # previous rec
                sst = int(prec[2])  # ed of previous segment
                sed = int(rec[1])  # st of current segment
                cse.append((sst, sed))
                # find strand from the two bases at each end of the gap
                sted = gfc.get(chrom, sst, sst + 2) + gfc.get(chrom, sed - 2, sed)
                strand = STED2STRAND.get(sted, '.')
                if strand != '.':
                    css.append(strand)
                ureads, areads = _add_to_sj_arrays(sst, sed, dup, strand)
            pmid = rec[6]

    # flush the last read, which has no following record to trigger it
    _append_sj(cse, css, csj, chrom, ureads, areads)

    _write_arrays()
    _write_sj(sjs)