Esempio n. 1
0
def calc_ovlratio(aname, bname, tname, nacol, nbcol, idcol=None, returnbcols=False):
    """Calculate overlapped portion of b onto a.

    The result file (tname) acts as a cache: when UT.notstale reports it as
    up to date with respect to both input files it is loaded and returned
    without recomputation.

    Args:
        aname (str): bed file name 1
        bname (str): bed file name 2
        tname (str): result file name
        nacol (int): number of columns in file 1. The value 12 switches to
            BED12 handling: the intersect is run with split=True and the
            length of an entry is the sum of its exon sizes ('esizes').
        nbcol (int): number of columns in file 2

    Optional:
        idcol (list of str): columns which specify unique entry
            (default ['chr', 'st', 'ed'])
        returnbcols (bool): if True, also keep the 'b_'-prefixed columns of
            the first intersect row of each entry

    Returns:
        A Pandas DataFrame which contains overlap info. For a fresh
        calculation it holds the columns of a plus
            len: length of the entry
            ovl: summed overlap with b
            ovlratio: ovl/len
            notcovbp: len - ovl
    """
    if idcol is None:
        idcol = ['chr', 'st', 'ed']
    # requirement: no overlap within b
    # cache?
    if UT.notstale([aname, bname], tname):
        return UT.read_pandas(tname)
    # calculate bedtools intersect into a temporary file next to aname
    tmpsuf = '.ovlbed.txt'
    cname = aname + tmpsuf
    if nacol == 12:
        cname = bedtoolintersect(aname, bname, cname, wao=True, split=True)
    else:
        cname = bedtoolintersect(aname, bname, cname, wao=True)
    # read tmp file
    acols = GGB.BEDCOLS[:nacol]
    bcols = ['b_' + x for x in GGB.BEDCOLS[:nbcol]]
    cols = acols + bcols + ['ovl']
    df = UT.read_pandas(cname, names=cols)
    dfg = df.groupby(idcol)
    if returnbcols:
        dfa = dfg.first().reset_index()[acols + bcols]
    else:
        dfa = dfg.first().reset_index()[acols]
    if nacol == 12:
        # Sum of exon sizes. Built-in sum over a generator works on both
        # Python 2 and 3 (numpy's sum does not consume a Python 3 map object).
        # Empty fields are skipped so the list may or may not end with ','.
        dfa['len'] = [sum(int(s) for s in str(x).split(',') if s)
                      for x in dfa['esizes']]
    else:
        dfa['len'] = dfa['ed'] - dfa['st']
    # since b does not overlap by itself total overlap of an element of a to b is
    # sum of overlap to individual b
    # (group order of sum() matches the order of first() used to build dfa)
    dfa['ovl'] = dfg['ovl'].sum().values
    dfa['ovlratio'] = dfa['ovl'].astype(float) / dfa['len']
    dfa['notcovbp'] = dfa['len'] - dfa['ovl']
    # clean up the temporary intersect output
    os.unlink(cname)
    # save
    UT.save_tsv_nidx_whead(dfa, tname)
    return dfa
Esempio n. 2
0
def trim_ex(expath, dstpath, dstcipath, length=1000, gidfld='_gidx', np=7):
    """Generate trimmed version of genes for calculating coverage to avoid length bias.

    Args:
        expath (str): path exon tsv
        dstpath (str): path to trimmed exon
        dstcipath (str): path to ci (chopped interval)
        length (pos int): length to trim from 3' end in base pair (default 1000 bp)
        gidfld (str): column name for gene id (default _gidx)
        np (pos int): number of CPU to use; 1 runs trim_ex_worker in this
            process, any other value distributes chromosomes over a
            multiprocessing pool of that size

    Generates:
        Two files (dstpath, dstcipath).

    Returns:
        a dataframe containing trimmed exons
    """
    ex = UT.read_pandas(expath)
    if 'len' not in ex.columns:
        ex['len'] = ex['ed'] - ex['st']
    if np == 1:
        recs = trim_ex_worker((ex, length, gidfld))
    else:
        chroms = sorted(ex['chr'].unique())
        data = [(ex[ex['chr'] == c], length, gidfld) for c in chroms]
        recs = []
        # Create the pool before entering the try block: if construction
        # fails (e.g. np < 1) there is nothing to close, and the original
        # error must not be masked by an unbound name in the finally clause.
        p = multiprocessing.Pool(np)
        try:
            for v in p.map(trim_ex_worker, data):
                recs += v
        finally:
            p.close()
    cols = list(ex.columns.values)
    nex = PD.DataFrame(recs, columns=cols)
    nex['len'] = nex['ed'] - nex['st']
    # edge case: zero-length records get ed = st + 1
    # NOTE: 'len' was computed above, so these records keep len == 0
    nex.loc[nex['st'] == nex['ed'], 'ed'] = nex['st'] + 1
    UT.save_tsv_nidx_whead(nex, dstpath)
    UT.chopintervals(nex, dstcipath)

    return nex
Esempio n. 3
0
def calc_ecov_mp(covci, fname, np, blocksize=100):
    """Run calc_ecov_chrom for each chromosome and merge the results.

    WARNING: this assumes _id is assigned according to sorted (chr,st,ed)

    Args:
        covci: dataframe of chopped intervals with coverage. Columns 'chr'
            and 'name' (comma separated integer ids) are read here; 'st' and
            'ed' are read when fname is given. The helper columns 'name1'
            (list of int ids), 'eidmax' and 'eidmin' are added to covci in
            place when missing.
        fname: path for the result table, or None to skip building/saving it
        np: number of processes; 1 runs everything in this process
        blocksize: forwarded to calc_ecov_chrom

    Returns:
        When fname is None: the dict obtained by merging the per chromosome
        results of calc_ecov_chrom.
        Otherwise: a dataframe with columns eid, ecov, chr, st, ed (one row
        per id), which is also saved to fname.
    """
    LOG.debug('calc_ecov...')
    chroms = sorted(covci['chr'].unique())
    if 'name1' not in covci.columns:
        covci['name1'] = covci['name'].astype(str).apply(
            lambda x: [int(y) for y in x.split(',')])
    if 'eidmax' not in covci.columns:
        covci['eidmax'] = covci['name1'].apply(lambda x: max(x))
    if 'eidmin' not in covci.columns:
        covci['eidmin'] = covci['name1'].apply(lambda x: min(x))
    args = [(covci[covci['chr'] == c].copy(), blocksize) for c in chroms]
    e2cs = {}
    if np == 1:
        for arg in args:
            e2cs.update(calc_ecov_chrom(*arg))
    else:
        # Create the pool before entering the try block so that a failure
        # in the constructor is not masked by an unbound name in finally.
        p = multiprocessing.Pool(np)
        try:
            rslts = p.map(mp_worker, zip(repeat(calc_ecov_chrom), args))
        finally:
            LOG.debug('closing pool')
            p.close()
        for x in rslts:
            e2cs.update(x)
    LOG.debug('writing rslts...')
    if fname is None:
        return e2cs
    # one row per (interval, id) so that positions can be aggregated per id
    ccf = UT.flattendf(covci, 'name1')
    ccfg = ccf.groupby('name1')
    e2chr = dict(
        UT.izipcols(ccfg['chr'].first().reset_index(), ['name1', 'chr']))
    e2st = dict(UT.izipcols(ccfg['st'].min().reset_index(), ['name1', 'st']))
    e2ed = dict(UT.izipcols(ccfg['ed'].max().reset_index(), ['name1', 'ed']))
    df = PD.DataFrame(e2cs, index=['ecov']).T
    df.index.name = 'eid'
    df = df.reset_index()
    df['chr'] = [e2chr[x] for x in df['eid']]
    df['st'] = [e2st[x] for x in df['eid']]
    df['ed'] = [e2ed[x] for x in df['eid']]
    UT.save_tsv_nidx_whead(df[['eid', 'chr', 'st', 'ed', 'ecov']], fname)
    return df
Esempio n. 4
0
def calc_cov_mp(bed, bwname, fname, np, which='cov'):
    """Apply a per chromosome worker to intervals and collect one table.

    Args:
        bed: dataframe of intervals (needs a 'chr' column), or a path which
            is loaded with GGB.read_bed
        bwname: path to bigwig, forwarded to the worker
        fname: path the resulting table is saved to
        np: number of processes; 1 runs the worker in this process
        which: 'cov' selects worker_cov, 'max' selects worker_max. It is
            also the name of the column appended to the bed columns.

    Returns:
        dataframe built from the worker records, with the columns of bed
        followed by a column named after `which`

    Raises:
        ValueError: if `which` is neither 'cov' nor 'max'
    """
    # Validate first: previously an unsupported value left `worker` unbound
    # and only failed later with an UnboundLocalError.
    if which == 'cov':
        worker = worker_cov
    elif which == 'max':
        worker = worker_max
    else:
        raise ValueError(
            "which must be 'cov' or 'max', got {0!r}".format(which))

    if UT.isstring(bed):
        bed = GGB.read_bed(bed)
    cols = list(bed.columns) + [which]
    chroms = bed['chr'].unique()
    # directory of this module is handed to every worker call
    cdir = os.path.dirname(__file__)
    data = [(bed[bed['chr'] == c].copy(), bwname, c, cdir) for c in chroms]
    recs = []
    if np == 1:
        for arg in data:
            # arg[-2] is the chromosome name
            LOG.debug('cov calculation: processing {0}...'.format(arg[-2]))
            recs += worker(*arg)
    else:
        LOG.debug('{1} calculation: np={0}'.format(np, which))
        # Create the pool before entering the try block so that a failure
        # in the constructor is not masked by an unbound name in finally.
        p = multiprocessing.Pool(np)
        try:
            a = zip(repeat(worker), data)
            rslts = p.map(mp_worker, a)
            for v in rslts:
                recs += v
            LOG.debug('done {1} calculation: np={0}'.format(np, which))
        finally:
            LOG.debug('closing pool')
            p.close()
    LOG.debug('writing rslts...')
    df = PD.DataFrame(recs, columns=cols)
    UT.save_tsv_nidx_whead(df, fname)
    return df
Esempio n. 5
0
def calc_cov_ovl_mp(srcname,
                    bwname,
                    dstname,
                    np=1,
                    covciname=None,
                    ciname=None,
                    colname='cov',
                    override=False):
    """Calculate coverage (from BigWig) over intervals (from srcname).

    Coverage is obtained for chopped intervals (ci) first. The coverage of
    an exon is then sum(ci cov * ci length) / sum(ci length) over the
    chopped intervals whose 'name' lists the exon id. The values are stored
    in column `colname` of the exon dataframe, which is saved to dstname.

    Args:
        srcname: path to exons tsv, or an already loaded exon dataframe
            (must be a path when covciname or ciname is left as None)
        bwname: path to bigwig
        dstname: path for result
        np: number of processors
        covciname: path to covci (coverage for chopped interval dataframe);
            default is srcname without its last 7 characters + '.covci.txt.gz'
        ciname: path to ci (chopped interval dataframe);
            default is srcname without its last 7 characters + '.ci.txt.gz'
        colname: name for column which contain calculated coverages
        override: if True, ci and covci are recalculated even when their
            files exist

    Returns:
        exon dataframe with column `colname` added

    SideEffects:
        dstname is written. A dataframe passed as srcname is modified in
        place (the new column is added to it).

    """
    exons = UT.read_pandas(srcname) if UT.isstring(srcname) else srcname

    # default cache locations are derived from the source path
    if covciname is None:
        assert (UT.isstring(srcname))
        covciname = srcname[:-7] + '.covci.txt.gz'
    if ciname is None:
        assert (UT.isstring(srcname))
        ciname = srcname[:-7] + '.ci.txt.gz'

    if not override and os.path.exists(covciname):
        LOG.debug('loading cached covci...')
        covci = UT.read_pandas(covciname)
    else:
        LOG.debug('calculating covci...')
        started = time.time()
        if not override and os.path.exists(ciname):
            ci = UT.read_pandas(ciname,
                                names=['chr', 'st', 'ed', 'name', 'id'])
            ci['name'] = ci['name'].astype(str)
        else:
            ci = UT.chopintervals(exons, ciname)
        covci = calc_cov_mp(ci, bwname, covciname, np)
        LOG.debug(' time: {0:.3f}s'.format(time.time() - started))
    covci['name'] = covci['name'].astype(str)

    LOG.debug('calculating exon cov...')
    if 'id' not in covci.columns:
        covci['id'] = covci['sc1']

    # timestamp is taken again here as before; its value is not read afterwards
    started = time.time()

    # exon id => ids of the chopped intervals which list that exon
    exon_to_cids = {}
    for cid, names in covci[['id', 'name']].values:
        for token in names.split(','):
            exon_to_cids.setdefault(int(token), []).append(cid)
    covci['len'] = covci['ed'] - covci['st']
    covci['val'] = covci['cov'] * covci['len']

    # unique (chopped interval, exon) combinations for the exons at hand
    combos = [(cid, eid)
              for eid in exons['_id']
              for cid in exon_to_cids[eid]]
    pairs = PD.DataFrame(list(set(combos)), columns=['cid', 'eid'])
    len_by_cid = dict(covci[['id', 'len']].values)
    val_by_cid = dict(covci[['id', 'val']].values)
    pairs['val'] = [val_by_cid[c] for c in pairs['cid']]
    pairs['len'] = [len_by_cid[c] for c in pairs['cid']]

    # length weighted mean of the interval coverages per exon
    per_exon = pairs.groupby('eid')[['val', 'len']].sum().reset_index()
    per_exon['cov'] = per_exon['val'] / per_exon['len']
    cov_by_exon = dict(per_exon[['eid', 'cov']].values)
    exons[colname] = [cov_by_exon[e] for e in exons['_id']]

    UT.save_tsv_nidx_whead(exons, dstname)
    return exons
Esempio n. 6
0
def calc_gcov(expath, cipath, bwpath, dstprefix, override=False, np=4):
    """Calculate gene coverages.

    The coverage of a gene is sum(ci cov * ci length) / sum(ci length) over
    the chopped intervals (ci) which belong to exons of the gene, each
    interval being counted once per gene.

    Args:
        expath: merged ex (needs columns _id and _gidx)
        cipath: chopped interval for ex
        bwpath: bigwig file (sample)
        dstprefix: prefix for outputs; file names are appended to it
            verbatim, so it has to carry its own separator
        override: forwarded to UT.notstale when checking the covci cache
        np: number of processes forwarded to calc_cov_mp

    Outputs:
        1. dstprefix+'covci.txt.gz': coverage for ci (via calc_cov_mp,
           only when the cached file is not used)
        2. dstprefix+'gcov.txt.gz' : DataFrame(cols: _gidx, gcov)

    Returns:
        DataFrame with columns _gidx and gcov
    """
    ex = UT.read_pandas(expath)
    covcipath = dstprefix + 'covci.txt.gz'
    gcovpath = dstprefix + 'gcov.txt.gz'

    if UT.notstale([expath, cipath], covcipath, override):
        cc = UT.read_pandas(covcipath)
    else:
        # ci itself is never force-recalculated by `override`
        if UT.notstale(expath, cipath, False):
            ci = UT.read_pandas(cipath,
                                names=['chr', 'st', 'ed', 'name', 'id'])
        else:
            ci = UT.chopintervals(ex, cipath, idcol='_id')
        cc = calc_cov_mp(ci, bwpath, covcipath, np=np)

    if 'id' not in cc.columns:
        cc['id'] = cc['sc1']
    if 'eid' not in cc.columns:
        cc['eid'] = cc['name'].astype(str).apply(
            lambda x: [int(y) for y in x.split(',')])
    cc['len'] = cc['ed'] - cc['st']
    cc['val'] = cc['cov'] * cc['len']
    ccf = UT.flattendf(cc[['id', 'eid', 'len', 'val', 'st', 'ed']], 'eid')
    e2g = dict(UT.izipcols(ex, ['_id', '_gidx']))
    ccf['_gidx'] = [e2g[x] for x in ccf['eid']]

    # take unique combination of (gid, id) (id=cid) so that an interval
    # shared by several exons of one gene is counted once
    ccf1 = ccf.groupby(['_gidx', 'id']).first().reset_index()
    df = ccf1.groupby('_gidx')[['len', 'val']].sum()
    df['gcov'] = df['val'] / df['len']

    # NOTE: only _gidx and gcov have ever been written/returned; the extra
    # per gene statistics (positions, len2, gcov2, cids, ...) that used to
    # be computed here were discarded by the final column selection, so
    # they are no longer calculated.
    df = df.reset_index()[['_gidx', 'gcov']]
    UT.save_tsv_nidx_whead(df, gcovpath)
    return df
Esempio n. 7
0
def calc_ecov(expath,
              cipath,
              bwpath,
              dstprefix,
              blocksize=100,
              override=False,
              np=4):
    """Calculate exon coverages.

    Args:
        expath: merged ex (needs columns _id and _pid)
        cipath: chopped interval for ex
        bwpath: bigwig file (sample)
        dstprefix: prefix for outputs; file names are appended to it verbatim
        blocksize: forwarded to calc_ecov_mp
        override: forwarded to UT.notstale when checking the covci cache
        np: number of processes forwarded to calc_cov_mp and calc_ecov_mp

    Outputs:
        1. dstprefix+'covci.txt.gz': coverage for ci (via calc_cov_mp,
           only when the cached file is not used)
        2. dstprefix+'ecov.txt.gz' : DataFrame(cols: eid, pid, ecov)

    Returns:
        DataFrame with columns eid, pid, ecov

    """
    covcipath = dstprefix + 'covci.txt.gz'
    ecovpath = dstprefix + 'ecov.txt.gz'
    ex = UT.read_pandas(expath)

    if not UT.notstale([expath, cipath], covcipath, override):
        # covci has to be (re)calculated; ci itself is reused whenever it is
        # not stale -- you do not want to override ci
        if not UT.notstale(expath, cipath, False):
            ci = UT.chopintervals(ex, cipath, idcol='_id')
        else:
            ci = UT.read_pandas(cipath,
                                names=['chr', 'st', 'ed', 'name', 'id'])
        cc = calc_cov_mp(ci, bwpath, covcipath, np=np)
    else:
        cc = UT.read_pandas(covcipath)

    present = cc.columns
    if 'id' not in present:
        cc['id'] = cc['sc1']
    if 'pid' not in cc.columns:
        # 'name' holds comma separated integer ids
        cc['pid'] = cc['name'].astype(str).apply(
            lambda names: list(map(int, names.split(','))))
        cc['name1'] = cc['pid']

    id_pairs = ex[['_id', '_pid']]
    df = id_pairs.rename(columns={'_id': 'eid', '_pid': 'pid'})
    # coverage lookup; per the original note it is keyed by pid
    cov_by_pid = calc_ecov_mp(cc, None, np, blocksize)
    df['ecov'] = [cov_by_pid[pid] for pid in df['pid']]
    UT.save_tsv_nidx_whead(df[['eid', 'pid', 'ecov']], ecovpath)
    return df