Ejemplo n.º 1
0
def as2exsj(dstpre, np=7):
    """Build sj/ex/ci model files from the assembler tables stored under ``dstpre``.

    Reads ``dstpre`` + '.exdf.txt.gz', '.sjdf.txt.gz', '.sedf.txt.gz' and
    '.paths.txt.gz', stacks the two exon tables, runs ``UT.set_info`` /
    ``UT.set_exon_category`` and ``GP.find_genes4`` on them, then writes
    ``dstpre`` + '.ex.txt.gz', '.sj.txt.gz' and '.ci.txt.gz'.

    Args:
        dstpre (str): common path prefix of the input and output files
        np (int): forwarded to ``GP.find_genes4`` as ``np``

    Returns:
        tuple ``(sj, ex)``: the junction dataframe and the merged exon dataframe
    """
    def _load(suffix, colnames):
        # every input table is read with explicit column names
        return UT.read_pandas(dstpre + suffix, names=colnames)

    exdf = _load('.exdf.txt.gz', A2.EXDFCOLS)
    sj = _load('.sjdf.txt.gz', A2.SJDFCOLS)
    sedf = _load('.sedf.txt.gz', A2.EXDFCOLS)
    # the paths table is still read (a missing file fails here) but is not used below
    _load('.paths.txt.gz', A2.PATHCOLS)

    # shift junction start by one
    sj['st'] = sj['st'] + 1
    excols = A2.EXDFCOLS
    ex = PD.concat([exdf[excols], sedf[excols]], ignore_index=True)
    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components), sets '_gidx';
    # a unique absolute prefix is needed for parallel processing
    unique_prefix = os.path.abspath(dstpre + str(uuid.uuid4()) + '_')
    GP.find_genes4(sj, ex,
                   filepre=unique_prefix,
                   np=np,
                   override=False,
                   separatese=True)

    # exons of kind '3' / '5' get the same value as their category
    for kind in ('3', '5'):
        ex.loc[ex['kind'] == kind, 'cat'] = kind

    for table, suffix in ((ex, '.ex.txt.gz'), (sj, '.sj.txt.gz')):
        UT.write_pandas(table, dstpre + suffix, 'h')
    UT.chopintervals(ex, dstpre + '.ci.txt.gz')
    return sj, ex
Ejemplo n.º 2
0
    def __init__(self, sjexpre, code, chromdir, rmskviz, outdir, **kw):
        """Load the sj/ex model stored under ``sjexpre`` and prepare derived tables.

        Side effects on disk (all relative to ``sjexpre``):
            * '.ci.txt.gz' is created when 'glen'/'tlen' are missing from the
              exon table and the ci file does not exist yet
            * '.ex.txt.gz' is rewritten after 'glen'/'tlen' have been set
            * '.unionex.txt.gz' is created when it does not exist

        Args:
            sjexpre (str): path prefix of the '.ex.txt.gz' / '.sj.txt.gz' files
            code (str): joined with ``outdir`` to form the output prefix
            chromdir: passed to ``FA.GenomeFASTAChroms``
            rmskviz: stored as ``self.rmskviz``
            outdir (str): output directory
            **kw: overrides applied on top of a copy of ``RMSKPARAMS``
        """
        self.sjexpre = sjexpre
        self.prefix = prefix = os.path.join(outdir, code)
        self.fnobj = FN.FileNamesBase(prefix)
        self.chromdir = chromdir
        self.rmskviz = rmskviz
        self.gfc = FA.GenomeFASTAChroms(chromdir)

        # copy so that the module-level defaults are never mutated
        self.params = RMSKPARAMS.copy()
        self.params.update(kw)

        self.ex = UT.read_pandas(sjexpre + '.ex.txt.gz')
        self.sj = UT.read_pandas(sjexpre + '.sj.txt.gz')
        if 'glen' not in self.ex or 'tlen' not in self.ex:
            cipath = sjexpre + '.ci.txt.gz'
            if not os.path.exists(cipath):
                # must be self.ex: there is no local named `ex` in this scope
                ci = UT.chopintervals(self.ex, cipath)
            else:
                ci = UT.read_ci(cipath)
            UT.set_glen_tlen(self.ex, ci, gidx='_gidx')
            # persist the added columns so the next run skips this branch
            UT.write_pandas(self.ex, sjexpre + '.ex.txt.gz', 'h')
        uexpath = sjexpre + '.unionex.txt.gz'
        if os.path.exists(uexpath):
            self.uex = UT.read_pandas(uexpath)
        else:
            LOG.info('making union exons...saving to {0}'.format(uexpath))
            self.uex = UT.make_unionex(self.ex, '_gidx')
            UT.write_pandas(self.uex, uexpath, 'h')
Ejemplo n.º 3
0
    def model(self, which, code2=None):
        """Return a model dataframe (junction / exon / chopped intervals).

        The dataframe is cached on the instance as an attribute named
        ``which``; a cached value is returned without looking at ``code2``.

        Args:
            which: one of 'sj', 'ex', 'ci'
            code2: forwarded to ``self.modelpath`` to select the file

        Returns:
            the requested dataframe

        Raises:
            RuntimeError: if the file does not exist (for 'ci': if neither
                the ci file nor the exon file it would be built from exists)
        """
        if hasattr(self, which):  # cached
            return getattr(self, which)

        path = self.modelpath(which, code2)
        if os.path.exists(path):
            if which == 'ci':
                df = GGB.read_bed(path)
            else:
                df = UT.read_pandas(path)
            setattr(self, which, df)
            return df

        # file does not exist; only ci can be rebuilt (from ex)
        if which != 'ci':
            raise RuntimeError('file {0} does not exist'.format(path))
        expath = self.modelpath('ex', code2)
        if not os.path.exists(expath):
            raise RuntimeError('file {0} does not exist'.format(expath))
        # pass code2 on so the exon table that was just checked is the one used
        self.ci = UT.chopintervals(self.model('ex', code2), path)
        # return the freshly built table instead of falling through with None
        return self.ci
Ejemplo n.º 4
0
def trim_ex(expath, dstpath, dstcipath, length=1000, gidfld='_gidx', np=7):
    """Generate trimmed version of genes for calculating coverage to avoid length bias.

    Args:
        expath (str): path exon tsv
        dstpath (str): path to trimmed exon
        dstcipath (str): path to ci (chopped interval)
        length (pos int): length to trim from 3' end in base pair (default 1000 bp)
        gidfld (str): column name for gene id (default _gidx)
        np (pos int): number of CPU to use; 1 runs the worker in-process,
            any other value runs one task per chromosome in a process pool

    Generates:
        Two files (dstpath, dstcipath).

    Returns:
        a dataframe containing trimmed exons
    """
    ex = UT.read_pandas(expath)
    if 'len' not in ex.columns:
        ex['len'] = ex['ed'] - ex['st']
    if np == 1:
        recs = trim_ex_worker((ex, length, gidfld))
    else:
        chroms = sorted(ex['chr'].unique())
        data = [(ex[ex['chr'] == c], length, gidfld) for c in chroms]
        recs = []
        # Create the pool before entering `try`: if construction fails there is
        # nothing to close, and the original error must not be masked by an
        # unbound-name error raised from the `finally` clause.
        pool = multiprocessing.Pool(np)
        try:
            for v in pool.map(trim_ex_worker, data):
                recs += v
        finally:
            pool.close()
            # pool.join() is intentionally not called (kept as in the original)
    cols = list(ex.columns.values)
    nex = PD.DataFrame(recs, columns=cols)
    nex['len'] = nex['ed'] - nex['st']
    # edge case: zero-length exons get ed = st + 1
    # ('len' was computed above, before this adjustment)
    nex.loc[nex['st'] == nex['ed'], 'ed'] = nex['st'] + 1
    UT.save_tsv_nidx_whead(nex, dstpath)
    UT.chopintervals(nex, dstcipath)

    return nex
Ejemplo n.º 5
0
 def ci(self):
     """Return the chopped-interval (ci) dataframe.

     If the file at ``self.cipath()`` exists it is read and returned.
     Otherwise the exons from ``self.sjex()`` are passed to
     ``UT.chopintervals`` together with that path, and the result is returned.

     Raises:
         RuntimeError: if neither the ci file nor ``self.gtfpath`` exists
     """
     cached = self.cipath()
     if not os.path.exists(cached):
         # nothing cached: the GTF must be present before building
         if not os.path.exists(self.gtfpath):
             raise RuntimeError('file {0} does not exist'.format(self.gtfpath))
         LOG.info('making ci..')
         _, exons = self.sjex()
         return UT.chopintervals(exons, cached)

     LOG.info('reading ci({0}) from cache...'.format(cached))
     return UT.read_pandas(cached, names=['chr', 'st', 'ed', 'name', 'id'])
Ejemplo n.º 6
0
def make_sjex(gtfpath, dstpre, np=12):
    """Create sj/ex/ci files from a GTF and return the sj and ex dataframes.

    Args:
        gtfpath: either a path (read with ``GGB.read_gtf``) or an already
            loaded GTF dataframe, which is used as is
        dstpre (str): prefix for the '.ex.txt.gz', '.sj.txt.gz' and
            '.ci.txt.gz' outputs
        np (int): forwarded to ``gtf2exonsj``

    Returns:
        dict with keys 'sj' and 'ex'
    """
    gtf = GGB.read_gtf(gtfpath) if UT.isstring(gtfpath) else gtfpath
    sj, ex = gtf2exonsj(gtf, np=np)
    # show the kind/cat breakdown before 'cat' is overwritten below
    print(ex.groupby(['kind','cat']).size())
    for kind in ('5', '3'):
        ex.loc[ex['kind'] == kind, 'cat'] = kind

    for table, suffix in ((ex, '.ex.txt.gz'), (sj, '.sj.txt.gz')):
        UT.write_pandas(table, dstpre + suffix, 'h')
    # make ci
    UT.chopintervals(ex, dstpre + '.ci.txt.gz')
    return dict(sj=sj, ex=ex)
Ejemplo n.º 7
0
def as3exsj(dstpre, minelen=150, np=7):
    """Build sj/ex/ci model files from the assembler tables stored under ``dstpre``.

    Reads ``dstpre`` + '.exdf.txt.gz', '.sjdf.txt.gz', '.sedf.txt.gz' and
    '.paths.txt.gz', stacks the two exon tables, assigns genes with
    ``GP.find_genes3``, drops genes whose summed exon length is below
    ``minelen`` and writes ``dstpre`` + '.ex.txt.gz', '.sj.txt.gz' and
    '.ci.txt.gz'.

    Args:
        dstpre (str): common path prefix of the input and output files
        minelen (int): genes whose exon lengths sum to less than this are removed
        np (int): forwarded to ``GP.find_genes3`` as ``np``

    Returns:
        tuple ``(sj2, ex2)``: the filtered junction and exon dataframes
    """
    ex = UT.read_pandas(dstpre+'.exdf.txt.gz', names=A3.EXDFCOLS)
    sj = UT.read_pandas(dstpre+'.sjdf.txt.gz', names=A3.SJDFCOLS)
    se = UT.read_pandas(dstpre+'.sedf.txt.gz', names=A3.EXDFCOLS)
    # the paths table is still read (a missing file fails here) but is not used below
    UT.read_pandas(dstpre+'.paths.txt.gz', names=A3.PATHCOLS)
    sj['st'] = sj['st']+1
    cols = A3.EXDFCOLS
    ex = PD.concat([ex[cols], se[cols]], ignore_index=True)
    UT.set_info(sj, ex)
    UT.set_exon_category(sj, ex)

    # find genes (connected components), sets '_gidx';
    # find_genes3: don't use exon overlap as connection
    GP.find_genes3(sj, ex,
                   np=np,
                   override=False)
    ex.loc[ex['kind']=='3', 'cat'] = '3'
    ex.loc[ex['kind']=='5', 'cat'] = '5'

    # remove genes whose total exon length is smaller than minelen
    ex['len'] = ex['ed']-ex['st']
    exsiz = ex.groupby('_gidx')['len'].sum()
    rgidx = exsiz[exsiz<minelen].index.values
    ex2 = ex[~ex['_gidx'].isin(rgidx)]
    sj2 = sj[~sj['_gidx'].isin(rgidx)]
    # report real exon counts (rgidx holds gene ids, not exons)
    LOG.info('minelen filter #ex {0}=>{1}'.format(len(ex), len(ex2)))
    # write
    UT.write_pandas(ex2, dstpre+'.ex.txt.gz', 'h')
    UT.write_pandas(sj2, dstpre+'.sj.txt.gz', 'h')
    UT.chopintervals(ex2, dstpre+'.ci.txt.gz')
    return sj2, ex2
Ejemplo n.º 8
0
def make_sjexci(path, np):
    """Create sj, ex and ci files next to an annotation file.

    The file type is taken from the extension (an optional trailing '.gz'
    is ignored): '.gtf', '.bed' (bed12) or '.txt' (UCSC knownGene / refGene
    download, recognised by 'knownGene' or 'refGene' appearing in ``path``).
    Outputs are written to the input path with the extension replaced by
    '.sj.txt.gz', '.ex.txt.gz' and '.ci.txt.gz'.

    Args:
        path (str): path to the annotation file
        np (int): forwarded to the ``*2exonsj`` converter

    Returns:
        tuple ``(sj, ex)`` of junction and exon dataframes

    Raises:
        ValueError: unknown extension, missing file, or a '.txt' file that is
            neither a knownGene nor a refGene table
    """
    bpath = path[:-3] if path[-3:] == '.gz' else path
    ext = bpath[-4:]
    if ext not in ['.gtf', '.bed', '.txt']:
        raise ValueError('unknown filetype {0}, should be either .gtf,.bed (bed12),.txt (ucsc knownGene)'.format(ext))
    pathprefix = bpath[:-4]

    if not os.path.exists(path):
        # report the offending path, not just its extension
        raise ValueError('{0} file does not exists'.format(path))

    if ext == '.gtf':
        df = GGB.read_gtf(path).sort_values(['chr',])
        sj, ex = gtf2exonsj(df, np=np)
    elif ext == '.bed':
        df = GGB.read_bed(path)
        sj, ex = bed2exonsj(df, np=np)
    else:  # '.txt': UCSC download
        if 'knownGene' in path:
            df = GGB.read_ucsc_knownGene(path)
            sj, ex = kg2exonsj(df, np=np)
        elif 'refGene' in path:
            df = GGB.read_ucsc_refGene(path)
            sj, ex = kg2exonsj(df, np=np)  # same as kg
        else:
            # without this branch sj/ex stay unbound and the save step
            # below fails with an unrelated NameError
            raise ValueError('unknown UCSC table {0}, should be either knownGene or refGene'.format(path))

    # save
    LOG.info('saving sj to {0}'.format(pathprefix+'.sj.txt.gz'))
    UT.write_pandas(sj, pathprefix+'.sj.txt.gz', 'h')
    LOG.info('saving ex to {0}'.format(pathprefix+'.ex.txt.gz'))
    UT.write_pandas(ex, pathprefix+'.ex.txt.gz', 'h')

    # make ci
    UT.chopintervals(ex, pathprefix+'.ci.txt.gz')
    return sj, ex
Ejemplo n.º 9
0
def calc_cov_ovl_mp(srcname,
                    bwname,
                    dstname,
                    np=1,
                    covciname=None,
                    ciname=None,
                    colname='cov',
                    override=False):
    """Compute per-exon coverage from a BigWig via chopped intervals (ci).

    Coverage is first obtained per chopped interval (``calc_cov_mp``), then
    each exon gets sum(ci cov * ci length) / sum(ci length) over the distinct
    intervals listed for it. The value is stored in column ``colname`` and
    the exon table is saved to ``dstname``.

    Args:
        srcname: path to exons tsv, or an already loaded exon dataframe
        bwname: path to bigwig
        dstname: path the exon table with the new column is written to
        np: number of processors, forwarded to ``calc_cov_mp``
        covciname: path to covci (coverage per chopped interval); when None
            it is ``srcname[:-7] + '.covci.txt.gz'`` (``srcname`` must be a path)
        ciname: path to ci (chopped intervals); when None it is
            ``srcname[:-7] + '.ci.txt.gz'`` (``srcname`` must be a path)
        colname: name of the column that receives the exon coverage
        override: when true, ci and covci are recomputed even if their files exist

    Returns:
        the exon dataframe with column ``colname`` added
    """
    src_is_path = UT.isstring(srcname)
    exons = UT.read_pandas(srcname) if src_is_path else srcname

    # default cache locations are derived from the source path
    if covciname is None:
        assert src_is_path
        covciname = srcname[:-7] + '.covci.txt.gz'
    if ciname is None:
        assert src_is_path
        ciname = srcname[:-7] + '.ci.txt.gz'

    if override or not os.path.exists(covciname):
        LOG.debug('calculating covci...')
        started = time.time()
        if override or not os.path.exists(ciname):
            ci = UT.chopintervals(exons, ciname)
        else:
            ci = UT.read_pandas(ciname,
                                names=['chr', 'st', 'ed', 'name', 'id'])
            ci['name'] = ci['name'].astype(str)
        covci = calc_cov_mp(ci, bwname, covciname, np)
        LOG.debug(' time: {0:.3f}s'.format(time.time() - started))
    else:
        LOG.debug('loading cached covci...')
        covci = UT.read_pandas(covciname)
    covci['name'] = covci['name'].astype(str)

    # covci 'name' holds comma-joined exon ids; invert it to exon id -> ci ids
    # exon cov = sum(cicov*cilen)/totlen
    LOG.debug('calculating exon cov...')
    if 'id' not in covci.columns:
        covci['id'] = covci['sc1']

    exon2cis = {}
    for cid, names in covci[['id', 'name']].values:
        for token in names.split(','):
            exon2cis.setdefault(int(token), []).append(cid)
    covci['len'] = covci['ed'] - covci['st']
    covci['val'] = covci['cov'] * covci['len']

    def _pairs():
        # (ci id, exon id) for every exon in the table
        for eid in exons['_id']:
            for cid in exon2cis[eid]:
                yield (cid, eid)

    # the set removes duplicated (ci, exon) pairs
    tmp = PD.DataFrame(list(set(_pairs())), columns=['cid', 'eid'])
    ci_len = dict(covci[['id', 'len']].values)
    ci_val = dict(covci[['id', 'val']].values)
    tmp['val'] = [ci_val[cid] for cid in tmp['cid']]
    tmp['len'] = [ci_len[cid] for cid in tmp['cid']]
    per_exon = tmp.groupby('eid')[['val', 'len']].sum().reset_index()
    per_exon['cov'] = per_exon['val'] / per_exon['len']
    exon_cov = dict(per_exon[['eid', 'cov']].values)
    exons[colname] = [exon_cov[eid] for eid in exons['_id']]

    UT.save_tsv_nidx_whead(exons, dstname)
    return exons
Ejemplo n.º 10
0
def calc_gcov(expath, cipath, bwpath, dstprefix, override=False, np=4):
    """Calculate gene coverages.

    Args:
        expath: merged ex
        cipath: chopped interval for ex
        bwpath: bigwig file (sample)
        dstprefix: prefix for outputs; the suffixes below are appended
            directly, no '.' is inserted
        override: passed to ``UT.notstale`` for the covci cache check
        np: passed to ``calc_cov_mp``

    Outputs:
        1. dstprefix+'covci.txt.gz'
        2. dstprefix+'gcov.txt.gz' : DataFrame(col: _gidx, gcov)

    Returns:
        the dataframe written to the gcov file (columns ``_gidx`` and ``gcov``)

    Note:
        Additional per-gene values are computed below (len, val, st, ed, glen
        and, from chopped intervals with val > 0, len2, gcov2, cids, st2, ed2,
        glen2) but they are dropped because ``cols`` is reassigned to
        ``['_gidx', 'gcov']`` right before the final column selection.
            len2: calculate length from ci with cov > 0
            (normal length = use entire ci's belonging to the gene)
            gcov2 = val/len2
            cids: cid with cov > 0 for the gene ','.joined
    """
    ex = UT.read_pandas(expath)
    covcipath = dstprefix + 'covci.txt.gz'
    gcovpath = dstprefix + 'gcov.txt.gz'

    # reuse cached covci when UT.notstale accepts it
    # (presumably a freshness check of the target against its sources -- confirm in UT)
    if UT.notstale([expath, cipath], covcipath, override):
        cc = UT.read_pandas(covcipath)
    else:
        # override is deliberately not applied to the ci file itself
        if UT.notstale(expath, cipath, False):
            ci = UT.read_pandas(cipath,
                                names=['chr', 'st', 'ed', 'name', 'id'])
        else:
            ci = UT.chopintervals(ex, cipath, idcol='_id')
        cc = calc_cov_mp(ci, bwpath, covcipath, np=np)

    # earlier os.path.exists based version of the caching logic above,
    # kept for reference:
    # if override or (not os.path.exists(covcipath)):
    #     # calc covci
    #     if not os.path.exists(cipath):
    #         ci = UT.chopintervals(ex, cipath, idcol='_id')
    #     else:
    #         ci = UT.read_pandas(cipath, names=['chr','st','ed','name','id'])
    #     cc = calc_cov_mp(ci, bwpath, covcipath, np=np)
    # else:
    #     cc = UT.read_pandas(covcipath)

    # fall back to the 'sc1' column when covci has no 'id' column
    if 'id' not in cc.columns:
        cc['id'] = cc['sc1']
    # 'name' is a comma-joined string of ints; they are used as exon ids below
    if 'eid' not in cc.columns:
        cc['eid'] = cc['name'].astype(str).apply(
            lambda x: [int(y) for y in x.split(',')])
    cc['len'] = cc['ed'] - cc['st']
    cc['val'] = cc['cov'] * cc['len']
    # presumably one row per (chopped interval, exon id) pair -- confirm in UT.flattendf
    ccf = UT.flattendf(cc[['id', 'eid', 'len', 'val', 'st', 'ed']], 'eid')
    # exon id -> gene id
    e2g = dict(UT.izipcols(ex, ['_id', '_gidx']))
    ccf['_gidx'] = [e2g[x] for x in ccf['eid']]
    # for normal gcov: take unique combination of (gid, id) (id=cid)
    # for gcov2 : first select ccf with val>0
    ccf2 = ccf[ccf['val'] > 0].groupby(['_gidx', 'id']).first().reset_index()
    ccf2g = ccf2.groupby('_gidx')
    df2 = ccf2g[['len', 'val']].sum()
    df2['gcov2'] = df2['val'] / df2['len']
    df2['cids'] = ccf2g['id'].apply(lambda x: ','.join([str(y) for y in x]))
    df2['gst2'] = ccf2g['st'].min()
    df2['ged2'] = ccf2g['ed'].max()
    df2['glen2'] = df2['ged2'] - df2['gst2']

    df2 = df2.reset_index()

    # normal gcov: every chopped interval of the gene counted once
    ccf1 = ccf.groupby(['_gidx', 'id']).first().reset_index()
    ccf1g = ccf1.groupby('_gidx')
    df = ccf1g[['len', 'val']].sum()
    df['gcov'] = df['val'] / df['len']
    df['st'] = ccf1g['st'].min()
    df['ed'] = ccf1g['ed'].max()
    df['glen'] = df['ed'] - df['st']
    df = df.reset_index()
    g2chr = dict(UT.izipcols(ex, ['_gidx', 'chr']))
    df['chr'] = [g2chr[x] for x in df['_gidx']]

    def _set_df2prop(src, tgt, default):
        # copy column `src` of df2 into df[tgt]; genes absent from df2 get `default`
        dic = dict(UT.izipcols(df2, ['_gidx', src]))
        df[tgt] = [dic.get(x, default) for x in df['_gidx']]

    _set_df2prop('gcov2', 'gcov2', 0)
    _set_df2prop('len', 'len2', 0)
    _set_df2prop('cids', 'cids', '')
    _set_df2prop('gst2', 'st2', -1)
    _set_df2prop('ged2', 'ed2', -1)
    _set_df2prop('glen2', 'glen2', 0)

    # full column list; overridden by the assignment that follows
    cols = [
        '_gidx', 'chr', 'st', 'ed', 'len', 'val', 'gcov', 'glen', 'len2',
        'gcov2', 'cids', 'st2', 'ed2', 'glen2'
    ]
    # only these two columns are actually saved and returned
    cols = ['_gidx', 'gcov']
    df = df[cols]
    UT.save_tsv_nidx_whead(df, gcovpath)
    return df
Ejemplo n.º 11
0
def calc_ecov(expath,
              cipath,
              bwpath,
              dstprefix,
              blocksize=100,
              override=False,
              np=4):
    """Calculate exon coverages.

    Args:
        expath: merged ex
        cipath: chopped interval for ex
        bwpath: bigwig file (sample)
        dstprefix: prefix for outputs; suffixes are appended directly,
            no '.' is inserted
        blocksize: forwarded to ``calc_ecov_mp``
        override: passed to ``UT.notstale`` for the covci cache check
        np: forwarded to ``calc_cov_mp`` and ``calc_ecov_mp``

    Outputs:
        1. dstprefix+'covci.txt.gz': coverage for ci
        2. dstprefix+'ecov.txt.gz' : DataFrame(cols: eid, pid, ecov)

    Returns:
        dataframe with columns eid, pid, ecov
    """
    covcipath = dstprefix + 'covci.txt.gz'
    ecovpath = dstprefix + 'ecov.txt.gz'
    ex = UT.read_pandas(expath)

    if not UT.notstale([expath, cipath], covcipath, override):
        # covci has to be (re)computed; override is not applied to ci itself
        if UT.notstale(expath, cipath, False):
            ci = UT.read_pandas(cipath,
                                names=['chr', 'st', 'ed', 'name', 'id'])
        else:
            ci = UT.chopintervals(ex, cipath, idcol='_id')
        cc = calc_cov_mp(ci, bwpath, covcipath, np=np)
    else:
        cc = UT.read_pandas(covcipath)

    def _split_ids(joined):
        # 'name' is a comma-joined string of integer ids
        return [int(tok) for tok in joined.split(',')]

    if 'id' not in cc.columns:
        cc['id'] = cc['sc1']
    if 'pid' not in cc.columns:
        cc['pid'] = cc['name'].astype(str).apply(_split_ids)
        cc['name1'] = cc['pid']

    renames = {'_id': 'eid', '_pid': 'pid'}
    df = ex[['_id', '_pid']].rename(columns=renames)
    pid2cov = calc_ecov_mp(cc, None, np, blocksize)  # pid => cov
    df['ecov'] = [pid2cov[pid] for pid in df['pid']]
    UT.save_tsv_nidx_whead(df[['eid', 'pid', 'ecov']], ecovpath)
    return df
Ejemplo n.º 12
0
    def filter(self, **kw):
        """Split genes into kept and removed sets by repeat overlap and write both.

        A gene is removed when all of these hold for its row in ``self.ugb``:
            'rep%'    >= th_bp_ovl  (base pair repeat overlap %)
            'rviz%'   >  th_ex_ovl  (exon repeat overlap %)
            '#junc' is not null     (i.e. single exons are never removed)
            '#uexons' <  th_uexon   (number of union exons)
        The thresholds come from ``self.params``, which is updated in place
        with ``kw`` (the earlier notes on this method give the defaults as
        50, 50 and 4 -- not verifiable from here).

        Kept tables are stored as ``ugb2/ex2/sj2/uex2/gbed2`` and written under
        the 'output' category; removed ones are stored as
        ``ugb3/ex3/sj3/uex3/gbed3`` and written with a 'removed.' name prefix.
        """
        stats = self.ugb
        params = self.params
        names = self.fnobj
        params.update(kw)

        def _txt(tag):
            return names.txtname(tag, category='output')

        def _bed(tag):
            return names.bedname(tag, category='output')

        repeat_heavy = (stats['rep%'] >= params['th_bp_ovl']) & (stats['rviz%'] > params['th_ex_ovl'])
        spliced_small = (stats['#junc'].notnull()) & (stats['#uexons'] < params['th_uexon'])
        keep = ~(repeat_heavy & spliced_small)
        self.ugb2 = ugb2 = stats[keep]  # filtered
        self.ugb3 = ugb3 = stats[~keep]

        gids = ugb2.index.values
        ex0, sj0, uex = self.ex, self.sj, self.uex
        by_pos = ['chr', 'st', 'ed']

        # kept genes: ex, sj, unionex
        self.ex2 = ex2 = ex0[ex0['_gidx'].isin(gids)].sort_values(by_pos)
        self.sj2 = sj2 = sj0[sj0['_gidx'].isin(gids)].sort_values(by_pos)
        self.uex2 = uex2 = uex[uex['_gidx'].isin(gids)].sort_values(by_pos)
        if params['datacode']:
            gcovfld = 'gcov_' + params['datacode']
        else:
            gcovfld = 'gcov'
        self.gbed2 = gbed2 = GGB.unionex2bed12(uex2,
                                               name=params['gname'],
                                               sc2=gcovfld,
                                               sc1='tlen')
        gbed2['sc2'] = gbed2['sc2'].astype(int)
        # write out filtered ex,sj,ci,unionex,gbed
        UT.write_pandas(ex2, _txt('ex'), 'h')
        UT.write_pandas(sj2, _txt('sj'), 'h')
        UT.chopintervals(ex2, _txt('ci'))

        GGB.write_bed(ex2, _bed('ex'))
        GGB.write_bed(sj2, _bed('sj'))

        UT.write_pandas(uex2, _txt('unionex'), 'h')
        UT.write_pandas(ugb2, _txt('genes.stats'), 'h')
        UT.write_pandas(gbed2, _bed('genes'), '')  # BED12

        # also write filtered out genes
        self.ex3 = ex3 = ex0[~ex0['_gidx'].isin(gids)].sort_values(by_pos)
        self.sj3 = sj3 = sj0[~sj0['_gidx'].isin(gids)].sort_values(by_pos)
        self.uex3 = uex3 = uex[~uex['_gidx'].isin(gids)].sort_values(by_pos)
        self.gbed3 = gbed3 = GGB.unionex2bed12(uex3,
                                               name=params['gname'],
                                               sc2=gcovfld,
                                               sc1='tlen')
        gbed3['sc2'] = gbed3['sc2'].astype(int)
        # write out removed ex,sj,ci,unionex,gbed
        UT.write_pandas(ex3, _txt('removed.ex'), 'h')
        UT.write_pandas(sj3, _txt('removed.sj'), 'h')
        UT.chopintervals(ex3, _txt('removed.ci'))
        UT.write_pandas(uex3, _txt('removed.unionex'), 'h')
        UT.write_pandas(ugb3, _txt('removed.genes.stats'), 'h')
        UT.write_pandas(gbed3, _bed('removed.genes'), '')  # BED12