def __init__(self, modelpre, bwpre, chrom, st, ed, dstpre, tcovth):
    """Set up the assembler and load the model tables for one region.

    After delegating to ``A2.LocalAssembler.__init__`` (with
    ``refcode=None``), four tables are read and restricted to rows on
    ``chrom`` whose start is >= ``st`` and whose end is <= ``ed``:

    * ``self.paths``    from ``modelpre + '.paths.withse.bed.gz'`` (tst/ted)
    * ``self.sjpaths0`` from ``bwpre + '.sjpath.bed.gz'`` (tst/ted)
    * ``self.sjdf``     from ``modelpre + '.sjdf.txt.gz'`` (st/ed)
    * ``self.exdf``     from ``modelpre + '.exdf.txt.gz'`` (st/ed)

    ``A2.set_ad_pos`` is then applied to ``self.sjdf`` and ``self.exdf``.
    """
    self.modelpre = modelpre
    self.tcovth = tcovth
    A2.LocalAssembler.__init__(self, bwpre, chrom, st, ed, dstpre, refcode=None)

    def _in_region(df, stcol, edcol):
        # rows on this chromosome that lie fully inside [st, ed]
        mask = (df['chr'] == chrom) & (df[stcol] >= st) & (df[edcol] <= ed)
        return df[mask].copy()

    all_paths = GGB.read_bed(modelpre + '.paths.withse.bed.gz')
    self.paths = _in_region(all_paths, 'tst', 'ted')

    all_sjpaths = GGB.read_bed(bwpre + '.sjpath.bed.gz')
    self.sjpaths0 = _in_region(all_sjpaths, 'tst', 'ted')

    # junction / exon tables are both read before either is filtered
    all_sjdf = UT.read_pandas(modelpre + '.sjdf.txt.gz', names=A2.SJDFCOLS)
    all_exdf = UT.read_pandas(modelpre + '.exdf.txt.gz', names=A2.EXDFCOLS)
    self.sjdf = _in_region(all_sjdf, 'st', 'ed')
    self.exdf = _in_region(all_exdf, 'st', 'ed')

    A2.set_ad_pos(self.sjdf, 'sj')
    A2.set_ad_pos(self.exdf, 'ex')
def filter_sj(bwsjpre, statspath, chrom, csize, params):
    """Filter the junction paths of one chromosome and write them as BED12.

    Args:
        bwsjpre: path prefix of the sjpath BED files and of the bigwigs
            opened through ``A2.SjExBigWigs``.
        statspath: junction statistics table (read with ``UT.read_pandas``).
        chrom: chromosome to process.
        csize: upper coordinate passed to the bigwig ``get`` calls.
        params: dict holding the thresholds ``th_detected``, ``th_maxcnt``,
            ``th_maxoverhang``, ``th_minedgeexon`` and ``th_sjratio2``.

    The surviving rows are written to
    ``bwsjpre + '.sjpath.<chrom>.filtered.bed.gz'``; nothing is returned.

    NOTE(review): the first row is inspected to detect the name format, so
    an empty junction table for ``chrom`` raises IndexError.
    """
    stat_fields = ['detected', 'maxcnt', 'maxoverhang']

    # junction statistics, restricted to this chromosome
    stats = UT.read_pandas(statspath)
    if 'chr' not in stats:
        stats['chr'] = [locus.split(':')[0] for locus in stats['locus']]
    if '#detected' in stats:
        stats.rename(columns={'#detected': 'detected'}, inplace=True)
    stats = stats[stats['chr'] == chrom].copy()
    if 'pc' not in stats:
        stats['pc'] = [locus2pc(locus) for locus in stats['locus']]
    lookup = {}
    for fld in stat_fields:
        lookup[fld] = UT.df2dict(stats, 'pc', fld)

    # junction paths: prefer the per-chromosome file, else slice the full one
    chr_src = bwsjpre + '.sjpath.{0}.bed.gz'.format(chrom)
    dstpath = bwsjpre + '.sjpath.{0}.filtered.bed.gz'.format(chrom)
    if os.path.exists(chr_src):
        sj = GGB.read_bed(chr_src)
    else:
        sj = GGB.read_bed(bwsjpre + '.sjpath.bed.gz')
        sj = sj[sj['chr'] == chrom].copy()

    # more ','-separated than '|'-separated parts: drop first and last part
    first_name = sj.iloc[0]['name']
    if len(first_name.split('|')) < len(first_name.split(',')):
        sj['name'] = [','.join(nm.split(',')[1:-1]) for nm in sj['name']]

    # keep stranded paths only
    stranded = sj['strand'].isin(['+', '-'])
    sj = sj[stranded].copy()

    # per statistic: minimum over the path's junctions must exceed threshold
    for fld in stat_fields:
        table = lookup[fld]
        sj[fld] = [
            N.min([table.get(pc, 0) for pc in nm.split(',')])
            for nm in sj['name']
        ]
        sj = sj[sj[fld] > params['th_' + fld]].copy()

    # first / last exon sizes ([-2] because of the trailing comma,
    # presumably -- verify against the esizes format)
    sj['eflen'] = [int(sizes.split(',')[0]) for sizes in sj['esizes']]
    sj['ellen'] = [int(sizes.split(',')[-2]) for sizes in sj['esizes']]
    min_edge = params['th_minedgeexon']
    sj = sj[(sj['eflen'] > min_edge) & (sj['ellen'] > min_edge)].copy()

    # sjratio2 = sc1 / mean(sj + ex signal over [tst, ted))
    sjexbw = A2.SjExBigWigs(bwsjpre, mixunstranded=False)
    for strand in ['+', '-']:
        on_strand = sj['strand'] == strand
        with sjexbw:
            sj_signal = sjexbw.bws['sj'][strand].get(chrom, 0, csize)
            ex_signal = sjexbw.bws['ex'][strand].get(chrom, 0, csize)
        total = sj_signal + ex_signal
        sj.loc[on_strand, 'sjratio2'] = [
            cnt / N.mean(total[int(lo):int(hi)])
            for cnt, lo, hi in sj[on_strand][['sc1', 'tst', 'ted']].values
        ]
    sj = sj[sj['sjratio2'] > params['th_sjratio2']]
    GGB.write_bed(sj, dstpath, ncols=12)
def model(self, which, code2=None):
    """Return a model dataframe (junction / exon / chopped intervals).

    Args:
        which: one of 'sj', 'ex', 'ci'.
        code2: forwarded to ``self.modelpath`` when resolving file paths.

    Returns:
        The dataframe, cached as attribute ``which`` on ``self`` after the
        first successful load.

    Raises:
        RuntimeError: if the requested file does not exist (for 'ci': if
            neither the ci file nor the ex file it is derived from exists).
    """
    if hasattr(self, which):  # cached
        return getattr(self, which)

    path = self.modelpath(which, code2)
    if os.path.exists(path):
        if which == 'ci':
            df = GGB.read_bed(path)
        else:
            df = UT.read_pandas(path)
        setattr(self, which, df)
        return df

    # file is missing; only 'ci' can be rebuilt (from 'ex')
    if which != 'ci':
        raise RuntimeError('file {0} does not exist'.format(path))
    expath = self.modelpath('ex', code2)
    if not os.path.exists(expath):
        raise RuntimeError('file {0} does not exist'.format(expath))
    # Load 'ex' with the same code2 that was used for the existence check,
    # and return the result (previously this branch fell through and the
    # first call returned None).
    self.ci = UT.chopintervals(self.model('ex', code2), path)
    return self.ci
def filter_paths(mdstpre, rdstpre):
    """Copy path files from ``mdstpre`` to ``rdstpre``, keeping known exons only.

    Exons are read from ``rdstpre + '.ex.txt.gz'``.  For each of the suffixes
    '.paths.withse.bed.gz' and '.paths.txt.gz', the file under ``mdstpre`` is
    read with ``GGB.read_bed``, reduced to the paths whose '|'-separated name
    parts are all exon names on the same chromosome, and written (12 columns)
    under ``rdstpre``.
    """
    ex = UT.read_pandas(rdstpre + '.ex.txt.gz')

    def _keep_known(paths):
        # per chromosome: keep a path only if every name part is an exon name
        kept = []
        for chrom in paths['chr'].unique():
            chr_paths = paths[paths['chr'] == chrom]
            known = set(ex[ex['chr'] == chrom]['name'].values)
            mask = [
                all(part in known for part in name.split('|'))
                for name in chr_paths['name']
            ]
            kept.append(chr_paths[mask])
        return PD.concat(kept, ignore_index=True)

    for suffix in ('.paths.withse.bed.gz', '.paths.txt.gz'):
        source = GGB.read_bed(mdstpre + suffix)
        GGB.write_bed(_keep_known(source), rdstpre + suffix, ncols=12)
def test_sjtab2sjbed(sampleinfo, datadir, outdir):
    """Check that ``GGB.sjtab2sjbed`` writes a BED that round-trips.

    Uses the first record of ``sampleinfo``: converts its junction table,
    checks the output file exists, that the row count matches the input
    table, and that re-reading the written BED reproduces the first seven
    BED columns of the returned dataframe.
    """
    rec = sampleinfo.iloc[0]
    sjtab = os.path.join(datadir, 'SJ', rec['sjtab'])
    sjbed = os.path.join(outdir, rec['sjbed'])
    aligned = rec['aligned']
    sj = GGB.sjtab2sjbed(sjtab, sjbed, aligned)
    assert os.path.exists(sjbed)
    SJCOLS = [
        'chr', 'st', 'ed', 'strand2', 'motif', 'annotated', 'ureads',
        'mreads', 'maxoverhang'
    ]
    sji = PD.read_table(sjtab, names=SJCOLS)
    assert len(sj) == len(sji)
    sjo = GGB.read_bed(sjbed)
    # all(DataFrame) iterates the column labels (always truthy), which made
    # the old assertion vacuous; reduce the element-wise comparison instead.
    assert (sj[GGB.BEDCOLS[:7]] == sjo).all().all()
def calc_glen(ex, cipath):
    """Return a dict mapping each ``_gidx`` to its summed interval length.

    Args:
        ex: exon dataframe with ``_gidx`` and ``_id`` columns.  If it has no
            ``cid`` column one is added in place: for each exon, the list of
            ids of the intervals whose comma-separated ``name`` contains it.
        cipath: BED file of intervals (``name`` holds exon ids, ``sc1`` the
            interval id).

    Each (interval, ``_gidx``) pair is counted once, so an interval shared by
    several exons of the same ``_gidx`` contributes its length a single time.
    """
    ci = GGB.read_bed(cipath)
    ci['len'] = ci['ed'] - ci['st']
    ci['cid'] = ci['sc1']
    cid2len = dict(UT.izipcols(ci, ['cid', 'len']))

    if 'cid' not in ex.columns:
        # invert interval -> exon ids into exon id -> interval ids
        exon2cids = {}
        for cid, members in ci[['cid', 'name']].values:
            for eid in members.split(','):
                exon2cids.setdefault(int(eid), []).append(cid)
        ex['cid'] = [exon2cids[eid] for eid in ex['_id']]

    # unique (cid, _gidx) pairs
    pairs = {
        (cid, gidx)
        for gidx, cids in UT.izipcols(ex, ['_gidx', 'cid'])
        for cid in cids
    }
    df = PD.DataFrame(list(pairs), columns=['cid', '_gidx'])
    df['len'] = [cid2len[cid] for cid in df['cid']]
    totals = df.groupby('_gidx')['len'].sum()
    return dict(zip(totals.index, totals.values))
def calc_cov_mp(bed, bwname, fname, np, which='cov'):
    """Compute a per-record value chromosome by chromosome and save it.

    Args:
        bed: dataframe with a ``chr`` column, or a path readable by
            ``GGB.read_bed``.
        bwname: passed through to the worker.
        fname: destination path given to ``UT.save_tsv_nidx_whead``.
        np: number of processes; 1 runs the workers in-process.
        which: 'cov' (uses ``worker_cov``) or 'max' (uses ``worker_max``);
            also the name of the column appended to the output.

    Returns:
        Dataframe built from the worker records, with the columns of ``bed``
        plus ``which``.

    Raises:
        ValueError: if ``which`` is not 'cov' or 'max'.
    """
    # validate first: previously an unknown value left `worker` unbound
    if which == 'cov':
        worker = worker_cov
    elif which == 'max':
        worker = worker_max
    else:
        raise ValueError(
            "unknown which {0!r}, should be either 'cov' or 'max'".format(which))

    if UT.isstring(bed):
        bed = GGB.read_bed(bed)
    cols = list(bed.columns) + [which]
    chroms = bed['chr'].unique()
    cdir = os.path.dirname(__file__)
    data = [(bed[bed['chr'] == c].copy(), bwname, c, cdir) for c in chroms]

    recs = []
    if np == 1:
        for arg in data:
            LOG.debug('{0} calculation: processing {1}...'.format(which, arg[-2]))
            recs += worker(*arg)
    else:
        LOG.debug('{1} calculation: np={0}'.format(np, which))
        # create the pool outside the try so a failed creation is not masked
        # by a NameError from p.close() in the finally clause
        p = multiprocessing.Pool(np)
        try:
            rslts = p.map(mp_worker, zip(repeat(worker), data))
            for v in rslts:
                recs += v
            LOG.debug('done {1} calculation: np={0}'.format(np, which))
        finally:
            LOG.debug('closing pool')
            p.close()

    LOG.debug('writing rslts...')
    df = PD.DataFrame(recs, columns=cols)
    UT.save_tsv_nidx_whead(df, fname)
    return df
def estimatecovs(modelpre, bwpre, dstpre, genome, tcovth=1, np=6):
    """Estimate coverages in bundles of contiguous regions, then concatenate.

    Paths are read from ``modelpre + '.paths.withse.bed.gz'``.  Per
    chromosome, the contiguous unions of the path intervals are grouped into
    batches of up to 1000; each batch becomes one bundle spanning from the
    first union's start - 100 (floored at 0) to the last union's end + 100
    (capped at the chromosome size).  Bundles are processed with
    ``bundle_estimator`` through ``UT.process_mp`` and merged with
    ``concatenate_bundles``.

    Args:
        modelpre: model path prefix.
        bwpre: bigwig path prefix (forwarded to the estimator).
        dstpre: destination path prefix.
        genome: genome name used to look up chromosome sizes.
        tcovth: threshold forwarded to the estimator.
        np: number of processes.
    """
    batch = 1000  # unions per bundle
    bed = GGB.read_bed(modelpre + '.paths.withse.bed.gz')
    chroms = bed['chr'].unique()
    csizedic = UT.df2dict(UT.chromdf(genome), 'chr', 'size')
    bundles = []
    args = []
    for chrom in chroms:
        sub = bed[bed['chr'] == chrom]
        uc = UT.union_contiguous(sub[['chr', 'st', 'ed']], returndf=True)
        n = len(uc)
        for sti in range(0, n, batch):
            # last index of this batch, inclusive.  The previous
            # min(1000*(i+1), n-1) pointed at the first union of the *next*
            # batch, so neighbouring bundles shared a union and n = 1001
            # produced a fully redundant second bundle.
            edi = min(sti + batch, n) - 1
            st = max(uc.iloc[sti]['st'] - 100, 0)
            ed = min(uc.iloc[edi]['ed'] + 100, csizedic[chrom])
            args.append([modelpre, bwpre, chrom, st, ed, dstpre, tcovth])
            bundles.append((chrom, st, ed))
    UT.process_mp(bundle_estimator, args, np=np, doreduce=False)
    concatenate_bundles(bundles, dstpre)
def make_sjexci(path, np):
    """Build and save junction (sj), exon (ex) and chopped-interval (ci) files.

    The output prefix is ``path`` without its optional '.gz' and without its
    4-character extension; '.sj.txt.gz', '.ex.txt.gz' and '.ci.txt.gz' are
    written next to the input.

    Args:
        path: annotation file: '.gtf', '.bed' (bed12) or '.txt' (UCSC
            download whose path contains 'knownGene' or 'refGene'),
            optionally gzipped.
        np: number of processes forwarded to the converters.

    Returns:
        Tuple ``(sj, ex)`` of dataframes.

    Raises:
        ValueError: unknown extension, missing file, or a '.txt' path that
            is neither a knownGene nor a refGene table.
    """
    if path[-3:] == '.gz':
        bpath = path[:-3]
    else:
        bpath = path
    ext = bpath[-4:]
    if ext not in ['.gtf', '.bed', '.txt']:
        raise ValueError('unknown filetype {0}, should be either .gtf,.bed (bed12),.txt (ucsc knownGene)'.format(ext))
    pathprefix = bpath[:-4]
    if not os.path.exists(path):
        # report the path itself (the old message only showed the extension)
        raise ValueError('file {0} does not exist'.format(path))

    if ext == '.gtf':
        df = GGB.read_gtf(path).sort_values(['chr',])
        sj, ex = gtf2exonsj(df, np=np)
    elif ext == '.bed':
        df = GGB.read_bed(path)
        sj, ex = bed2exonsj(df, np=np)
    else:  # '.txt': UCSC download
        if 'knownGene' in path:
            df = GGB.read_ucsc_knownGene(path)
        elif 'refGene' in path:
            df = GGB.read_ucsc_refGene(path)
        else:
            # previously fell through and failed later on unbound sj/ex
            raise ValueError(
                'unknown .txt table {0}, path should contain knownGene or refGene'.format(path))
        sj, ex = kg2exonsj(df, np=np)  # refGene is handled like knownGene

    # save
    LOG.info('saving sj to {0}'.format(pathprefix + '.sj.txt.gz'))
    UT.write_pandas(sj, pathprefix + '.sj.txt.gz', 'h')
    LOG.info('saving ex to {0}'.format(pathprefix + '.ex.txt.gz'))
    UT.write_pandas(ex, pathprefix + '.ex.txt.gz', 'h')
    # make ci (called for the file it writes; the return value is not needed)
    UT.chopintervals(ex, pathprefix + '.ci.txt.gz')
    return sj, ex
def gtf_from_bed12(modelpre, dstpath=None, source='.'):
    """Convert the BED12 paths of a model into a GTF of exon records.

    Reads ``modelpre + '.paths.withse.bed.gz'`` and
    ``modelpre + '.ex.txt.gz'``.  Each path gets the gene name of its first
    ('|'-separated) exon, looked up by 'chr:name', and a transcript name
    '<gname>.<k>' where k counts the paths of that gene from 1.  One 'exon'
    GTF row is emitted per block of each path, with a 1-based start.

    Also writes ``modelpre + '.idmap.txt.gz'`` holding the columns
    id, chr, name, tname, gname.

    Args:
        modelpre: model path prefix.
        dstpath: output GTF path; defaults to
            ``modelpre + '.paths.withse.gtf.gz'``.
        source: value for the GTF source column.

    Returns:
        The GTF dataframe (columns ``GGB.GTFCOLS``).
    """
    paths = GGB.read_bed(modelpre + '.paths.withse.bed.gz')
    ex = UT.read_pandas(modelpre + '.ex.txt.gz')
    # key by chr:name since the same name may occur on different chromosomes
    ex['id'] = ex['chr'] + ':' + ex['name']
    n2gn = UT.df2dict(ex, 'id', 'gname')
    paths['id'] = paths['chr'] + ':' + paths['name']
    paths['id0'] = paths['chr'] + ':' + paths['name'].str.split('|').str[0]
    paths['gname'] = [n2gn[x] for x in paths['id0']]

    # number the transcripts of each gene: gname.1, gname.2, ...
    g2cnt = {}
    tnames = []
    for gname in paths['gname']:
        i = g2cnt.get(gname, 1)
        tnames.append('{0}.{1}'.format(gname, i))
        g2cnt[gname] = i + 1
    paths['tname'] = tnames

    txt = 'gene_id "{0}"; transcript_id "{1}"; exon_number "{2}";'

    def _gen():
        cols = ['chr', 'st', 'ed', 'gname', 'tname', 'esizes', 'estarts', 'strand']
        for c, s, e, gn, tn, esi, esta, strand in paths[cols].values:
            # the last ','-separated field is dropped before parsing
            esizes = [int(x) for x in esi.split(',')[:-1]]
            estarts = [int(x) for x in esta.split(',')[:-1]]
            for i, (size, offset) in enumerate(zip(esizes, estarts)):
                exst = s + offset
                exed = exst + size
                extra = txt.format(gn, tn, i + 1)
                yield (c, source, 'exon', exst + 1, exed, '.', strand, '.', extra)

    df = PD.DataFrame(list(_gen()), columns=GGB.GTFCOLS)
    if dstpath is None:
        # the old default referenced an undefined name (bedpath)
        dstpath = modelpre + '.paths.withse.gtf.gz'
    GGB.write_gtf(df, dstpath)

    idf = paths[['id', 'chr', 'name', 'tname', 'gname']]
    UT.write_pandas(idf, modelpre + '.idmap.txt.gz', 'h')
    return df
def test_bed2exonsj(testbed12):
    """Smoke test: run ``CV.bed2exonsj`` on a BED12 file and print the heads.

    Prints the first ten rows of the junction table, then of the exon table;
    there are no assertions.
    """
    bed = GGB.read_bed(testbed12)
    junctions, exons = CV.bed2exonsj(bed)
    for table in (junctions, exons):
        print(table.iloc[:10])
def sj(sjbed):
    """Read the junction BED file and return its first 5000 rows."""
    return GGB.read_bed(sjbed).iloc[:5000]
def read_bed(self, suffix, category='read'):
    """Read the BED file whose path ``self.bedname(suffix, category)`` gives."""
    bedpath = self.bedname(suffix, category)
    return GGB.read_bed(bedpath)
def count_repeats_viz_mp(beddf, rmskvizpath, idcol='_id', np=3, prefix=None, expand=0, col='repnames'):
    """Annotate each interval with the names of the repeats it overlaps.

    The work is done chromosome by chromosome: intervals and repeat-masker
    records are written to per-chromosome files, which are handed to
    ``count_repeats_viz_chr`` through ``UT.process_mp``.

    Args:
        beddf: Pandas DataFrame with chr, st, ed cols.  When computing
            repeats for genes, a unioned bed should be used
            (see utils.make_unionex).
        rmskvizpath: path to the repeat masker viz BED7 file (created using
            rmskviz2bed7).  Per-chromosome copies
            ``<rmskvizpath>.<chrom>.bed.gz`` are reused when all of them
            exist; otherwise they are (re)written for every chromosome.
        idcol: colname for the unique row id (default _id).
        np: number of CPU to use.
        prefix: path prefix for temp files; if not None the temp files are
            kept.  (default None: a random prefix next to ``rmskvizpath`` is
            used and the temp files are removed afterwards.)
        expand: how many bases to expand each interval on both sides
            (default 0); starts are clipped at 0.
        col: column to receive the overlapping repeat names.

    Returns:
        ``beddf`` itself, with column ``col`` set.
    """
    cleanup = prefix is None
    if cleanup:
        prefix = os.path.join(os.path.dirname(rmskvizpath), str(uuid.uuid4()) + '_')

    # chrom-wise
    chroms = sorted(beddf['chr'].unique())

    def _rmsk_chr_path(chrom):
        # per-chromosome repeat file, reused between runs
        return rmskvizpath + '.{0}.bed.gz'.format(chrom)

    # split the repeat file if any per-chromosome piece is missing
    splitrmsk = any(not os.path.exists(_rmsk_chr_path(c)) for c in chroms)
    if splitrmsk:
        rmsk = GGB.read_bed(rmskvizpath)

    args, bfiles, ofiles = [], [], []
    for chrom in chroms:
        bpath = prefix + 'tgt.{0}.bed'.format(chrom)  # don't compress
        rpath = _rmsk_chr_path(chrom)
        opath = prefix + 'out.{0}.bed'.format(chrom)

        bchr = beddf[beddf['chr'] == chrom]
        if expand > 0:
            bchr = bchr.copy()
            bchr['st'] = bchr['st'] - expand
            bchr['ed'] = bchr['ed'] + expand
            bchr.loc[bchr['st'] < 0, 'st'] = 0
        UT.write_pandas(bchr[['chr', 'st', 'ed', idcol]], bpath, '')
        bfiles.append(bpath)

        if splitrmsk:
            rchr = rmsk[rmsk['chr'] == chrom]
            UT.write_pandas(rchr[['chr', 'st', 'ed', 'name', 'strand']], rpath, '')

        ofiles.append(opath)
        args.append([bpath, rpath, opath])

    UT.process_mp(count_repeats_viz_chr, args, np=np, doreduce=False)

    # gather outputs: id -> repeat names
    cols = ['name', 'repnames']
    merged = PD.concat(
        [UT.read_pandas(opath, names=cols) for opath in ofiles],
        ignore_index=True)
    merged['name'] = merged['name'].astype(str)
    id2names = UT.df2dict(merged, 'name', 'repnames')
    beddf[col] = [id2names[str(rowid)] for rowid in beddf[idcol]]

    # cleanup
    if cleanup:
        for tmp in bfiles + ofiles:
            os.unlink(tmp)
    return beddf