def run_gatk():
    """
    Merge the input VCFs with GATK CombineVariants

    Reads the module-level `params`, `outfile`, `ref`, `nthread` and `invcfs`.
    `genotypemergeoption` defaults to `UNIQUIFY` unless already set in `params`.
    """
    params.T = 'CombineVariants'
    # GATK writes the merged VCF itself through -o, so stdout must NOT be
    # redirected onto the same file (it would clobber/interleave the result).
    params.o = outfile
    params.R = ref
    params.nt = nthread
    # a list here is expected to be expanded to one `-variant` per input file
    params.variant = invcfs
    params.genotypemergeoption = params.get('genotypemergeoption', 'UNIQUIFY')
    # NOTE(review): keyword spelled `duplistkey`, matching the other
    # shell.Shell(...) call in this code base -- confirm against shell.Shell
    shell.Shell(equal=' ', duplistkey=True).gatk(**params).run()
def run_bedtools():
    """
    Sort `infile` into `outfile` using `bedtools sort`

    Reads the module-level `params`, `infile`, `outfile` and `unique`.
    The header lines (starting with `#`) are written to `outfile` first,
    then the sorted records follow.
    """
    params.i = infile
    # header lines go first
    shell.grep('^#', infile, _stdout=outfile)
    sorter = shell.Shell(dash='-', equal=' ', subcmd=True).bedtools

    if not unique:
        # `__stdout` (double underscore) is used after the header was written
        # with `_stdout` -- presumably it appends instead of truncating
        params.__stdout = outfile
        sorter.sort(**params).run()
        return

    # drop duplicated records by piping the sorted output through uniq
    sorter.sort(**params).pipe().uniq(__stdout=outfile).run()
def _createBamIndex(bam, index, samtools, nthread):
    """Create `index` for `bam` with samtools, or raise ValueError if samtools is not given"""
    if not samtools:
        raise ValueError('Index not found: {}'.format(bam))
    indexer = shell.Shell({'samtools': samtools}, subcmd=True).samtools
    # samtools index [-b] [-@ INT] <in.bam> [out.index]
    # the bam file and the index path are positional arguments, and the
    # command has to be run explicitly
    indexer.index(b=True, _=[bam, index], **{'@': nthread}).run()


def bamIndex(bam, ext='.bam.bai', samtools='samtools', nthread=1):
    """
    Index bam files
    If bam file is a link, try to find the index file in its original directory or its realpath directory
    If nothing found, try to create the index file using samtools
    @params:
        `bam`: The bam file to be indexed
        `ext`: The expected extension of index file. Default: `.bam.bai`
            - Some tools require `XXX.bai` without `.bam`
        `samtools`: The path to samtools. Default: `samtools`
            - If it's None, then an exception will be raised instead of creating the index file
        `nthread`: The # threads used to create the index file. Default: `1`
    @raises:
        `ValueError`: If no index could be found and `samtools` is not given
    """
    if not ext.startswith('.'):
        ext = '.' + ext
    # /path/to/some.bam -> /path/to/
    dname = path.dirname(bam)
    # /path/to/some.bam -> some.bam -> some
    fname = path.splitext(path.basename(bam))[0]
    # some -> some
    # [1]some -> some
    rname = fname.split(']', 1)[1] if fname.startswith('[') else fname

    # /path/to/some.bam.bai
    expectedIndex = path.join(dname, rname + ext)
    if path.isfile(expectedIndex):
        return

    # if bam is not a link, there is nowhere else to find index, create it using samtools
    if not path.islink(bam):
        _createBamIndex(bam, expectedIndex, samtools, nthread)
        return

    # find the index in original directory
    origbam = readlink(bam)
    if not path.isabs(origbam):
        # a relative link target is relative to the directory of the link
        origbam = path.join(dname, origbam)
    # then in realpath directory
    realbam = path.realpath(bam)
    for candidate in (origbam, realbam):
        foundIndex = path.splitext(candidate)[0] + ext
        if path.isfile(foundIndex):
            shell.ln_s(foundIndex, expectedIndex)
            return

    # if all failed, create it
    _createBamIndex(bam, expectedIndex, samtools, nthread)
def run_bedops():
    """
    Sort `infile` into `outfile` using bedops

    Reads the module-level `params`, `argsmem`, `tmpdir`, `infile`, `outfile`
    and `unique`. The header lines (starting with `#`) are written to
    `outfile` first, then the sorted records follow.
    """
    # NOTE(review): the original read `args.mem`; `args` looks like a template
    # variable rather than a Python name in the rendered script, and the
    # sibling run_sort() uses `argsmem` for the same setting -- confirm.
    params['max-mem'] = argsmem
    params.tmpdir = tmpdir
    params._ = infile
    # header lines go first
    shell.grep('^#', infile, _stdout=outfile)
    c = shell.Shell(equal=' ')
    if unique:
        # drop duplicated records by piping the sorted output through uniq
        c.bedops(**params).pipe().uniq(__stdout=outfile).run()
    else:
        # `__stdout` is used after the header was written with `_stdout`
        # -- presumably it appends instead of truncating
        params.__stdout = outfile
        c.bedops(**params).run()
def run_sort():
    """
    Sort `infile` into `outfile` using the system `sort`

    Reads the module-level `params`, `tmpdir`, `argsmem`, `unique`, `sortby`,
    `infile` and `outfile`. Records are sorted by coordinates (columns 1 and 2)
    when `sortby` is `coord`, otherwise by column 4. The header lines
    (starting with `#`) are written to `outfile` first.
    """
    params.T = tmpdir
    params.S = argsmem
    params.u = unique
    if sortby == 'coord':
        params.k = ['1,1', '2,2n']
    else:
        params.k = '4'
    # header lines go first
    shell.grep('^#', infile, _stdout=outfile)
    # `__stdout` is used after the header was written with `_stdout`
    # -- presumably it appends instead of truncating.
    # It is set on params (as the other sorters do) because a keyword after
    # `**params` is a syntax error on Python 2.
    params.__stdout = outfile
    # - the input file is passed via `_`: a positional argument after a
    #   keyword argument is a syntax error
    # - short options of sort take their value as a separate word
    #   (`-k 1,1`), `-k=1,1` is rejected
    shell.Shell().grep(v='^#', _=infile).pipe(dash='-', equal=' ').sort(**params).run()
def ceQTL_filter(args):
    """
    Build and run the pipeline that filters a ceQTL result file

    The snp, gene, tf, reg and row filters are set up in that order.
    With `args.connect == 'and'` each filter takes the previous filter's
    process as its upstream and the last one writes `args.outfile`;
    otherwise every filter takes the input file process as upstream and the
    results are merged, sorted (unique) and written to `args.outfile`.
    @params:
        `args`: The arguments; `cefile`, `connect` and `outfile` are read here,
            the object is also handed to each `ceQTL_filter_*` helper
    """
    # check if AdjPval is in the header
    headcheck = shell.Shell().head(n=1, _=args.cefile).pipe().grep('AdjPval').run(
        raiseExc=False, logger=False)
    hasAdjPval = headcheck.rc == 0

    pCefile = pFile2Proc.copy()
    pCefile.input = [args.cefile]
    starts = [pCefile]

    def upstream(previous):
        # chain the filters for 'and', otherwise always start from the input file
        return previous if args.connect == 'and' else pCefile

    pCefile_snps = ceQTL_filter_snps(args, pCefile, hasAdjPval, starts)
    pCefile_genes = ceQTL_filter_genes(
        args, upstream(pCefile_snps), hasAdjPval, starts)
    pCefile_tfs = ceQTL_filter_tfs(
        args, upstream(pCefile_genes), hasAdjPval, starts)
    pCefile_regs = ceQTL_filter_regs(
        args, upstream(pCefile_tfs), hasAdjPval, starts)
    pCefile_row = ceQTL_filter_row(
        args, upstream(pCefile_regs), hasAdjPval, starts)

    if args.connect != 'and':
        # merge the results of all filters, leaving out the input file process itself
        candidates = {
            pCefile_snps, pCefile_genes, pCefile_tfs, pCefile_regs, pCefile_row
        }
        procs = list({proc for proc in candidates if proc is not pCefile})
        pTsvMerge.depends = procs
        pTsvMerge.input = lambda *chs: [sum((ch.flatten() for ch in chs), [])]
        pTsvMerge.args.inopts.cnames = True
        pSortMerged = pSort.copy()
        pSortMerged.depends = pTsvMerge
        pSortMerged.args.unique = True
        setOutfile(pSortMerged, args.outfile)
    else:
        setOutfile(pCefile_row, args.outfile)

    PyPPL().start(starts).run()
def vcfIndex(vcf, tabix='tabix'):
    """
    Make sure a tabix index exists for a vcf file
    If the expected index exists, nothing is done. For a bgzipped link, the
    index is looked up next to the link target and the realpath first.
    Otherwise the index is created using tabix (after bgzipping if needed).
    @params:
        `vcf`: The vcf file, could be plain, gzipped or bgzipped
        `tabix`: The path to tabix. Default: `tabix`
    @returns:
        The path to the indexed vcf file: `vcf` itself, or `vcf + '.gz'` if a
        plain vcf file had to be bgzipped
    """
    # os.path has no readlink(), it lives in os
    from os import readlink

    # /path/to/some.vcf    -> some.vcf
    # /path/to/some.vcf.gz -> some.vcf
    bname = path.basename(vcf[:-3] if vcf.endswith('.gz') else vcf)
    # /path/to/some.vcf -> /path/to/
    dname = path.dirname(vcf)
    # some.vcf -> some
    fname = path.splitext(bname)[0]
    # some -> some
    # [1]some -> some
    rname = fname.split(']', 1)[1] if fname.startswith('[') else fname

    expectedIndex = path.join(dname, rname + '.vcf.gz.tbi')
    if path.isfile(expectedIndex):
        return vcf

    tabix = shell.Shell({'tabix': tabix}).tabix
    gt = gztype(vcf)
    if gt == 'bgzip':
        # if vcf is not a link, there is nowhere else to find index
        if path.islink(vcf):
            linkvcf = readlink(vcf)
            if not path.isabs(linkvcf):
                # a relative link target is relative to the directory of the link
                linkvcf = path.join(dname, linkvcf)
            if path.isfile(linkvcf + '.tbi'):
                shell.ln_s(linkvcf + '.tbi', expectedIndex)
                return vcf
            realvcf = path.realpath(vcf)
            if path.isfile(realvcf + '.tbi'):
                shell.ln_s(realvcf + '.tbi', expectedIndex)
                return vcf
        tabix(p='vcf', _=vcf).run()
        return vcf

    if gt == 'gzip':
        # tabix needs bgzip rather than gzip: recompress into a temporary file and index that
        # NOTE(review): the index is built from the recompressed copy while the
        # original gzipped `vcf` is returned -- confirm this is intended
        tmpvcf = path.join(dname, bname + '.tmp.vcf')
        shell.gunzip_to(vcf, tmpvcf)
        shell.bgzip(tmpvcf)
        tabix(p='vcf', _=tmpvcf + '.gz').run()
        shell.mv(tmpvcf + '.gz.tbi', expectedIndex)
        return vcf

    # plain vcf: bgzip it next to the original and index the compressed file
    shell.bgzip(vcf, c=True, _stdout=vcf + '.gz')
    tabix(p='vcf', _=vcf + '.gz').run()
    return vcf + '.gz'
from pyppl import Box
from bioprocs.utils import shell

# values rendered from the process template
infile = {{i.infile | quote}}
outfile = {{o.outfile | quote}}
bedtools = {{args.bedtools | quote}}
params = {{args.params | repr}}
ref = {{args.ref | quote}}

shell.TOOLS.bedtools = bedtools

# bedtools getfasta -fi <ref> -bed <infile> > <outfile>
params['fi'] = ref
params['bed'] = infile
params['_stdout'] = outfile

getfasta = shell.Shell(subcmd=True, dash='-', equal=' ').bedtools.getfasta
getfasta(**params).run()
shell.TOOLS.Rscript = Rscript if isinstance(cutoff, dict): if cutoff['by'] == 'p': cutoff['by'] = 'Pval' if cutoff['by'] == 'q': cutoff['by'] = 'AdjPval' reader = TsvReader(infile, **inopts) genes = [r[genecol] for r in reader] en = Enrichr(cutoff = cutoff, top = top, Rscript = Rscript) en.addList(genes, description = path.basename(infile)) para = Parallel(nthread = nthread) runPathview = lambda r, hsa: shell.Shell().Rscript(r, hsa).run() for db in dbs: outfile = path.join(outdir, prefix + '.' + db + '.txt') en.enrich(db) en.export(outfile, top = 100) if plot: plotfile = path.join(outdir, prefix + '.' + db + '.png') en.plot(plotfile, res = devpars.res, width = devpars.width, height = devpars.height) if pathview and 'KEGG' in db: pathviewRDir = path.join(outdir, prefix + '.' + db + '.pathview') pathviewRfile = path.join(pathviewRDir, 'pathview.R') shell.mkdir(pathviewRDir) with open(pathviewRfile, 'w') as f: f.write(""" {rimport}('__init__.r') library(pathview)
import re
import urllib2
import testly
import yaml
from contextlib import closing
from os import path
from bioprocs.utils import shell
from bioprocs.utils.tsvio2 import TsvReader
from tempfile import gettempdir

# command wrapper used to launch processes through the `bioprocs` executable
shbioprocs = shell.Shell(subcmd=True, dash='-', equal=' ', duplistkey=False).bioprocs

PROCDATADIR = path.join(path.dirname(path.abspath(__file__)), 'procdata')
DATAFILE = path.join(PROCDATADIR, 'data.yml')
TMPDIR = gettempdir()
CACHED = {}


def runBioprocs(proc, args):
    """
    Run a process via the bioprocs command line
    @params:
        `proc`: The name of the process (used as the sub-command)
        `args`: The arguments for the process, passed as command line options
            - `config._log.shortpath:py` is set to `'False'` on it in place
    @returns:
        Whatever `.run(save='same', uselogger=False)` of the command returns
    """
    args['config._log.shortpath:py'] = 'False'
    return shbioprocs[proc](**args).run(save='same', uselogger=False)


def download(url, savedir=TMPDIR):
    """
    Download a file
    @params:
        `url`: The URL of the file
        `savedir`: The directory to save the file in. Default: the system temporary directory
    @returns:
        The path to the downloaded file, named after the basename of the URL
    """
    bname = path.basename(url)
    destfile = path.join(savedir, bname)
    # urllib2 offers urlopen(), there is no urllib2.open(); the response is
    # closed as soon as it has been written to disk
    with closing(urllib2.urlopen(url)) as filedata, open(destfile, 'wb') as f:
        f.write(filedata.read())
    return destfile
','.join([reffreq] + list(allfreqs.values())) ]) writer.close() else: # snps snplist = path.join(jobindir, path.basename(snpfile) + '.list') reader = TsvReader(snpfile, cnames=False) writer = TsvWriter(snplist) for r in reader: writer.write([r[snpcol]]) reader.close() writer.close() shell.TOOLS.vcftools = vcftools vcftools = shell.Shell(equal=' ', dash='--').vcftools params = Box() params.snps = snplist params.recode = True params.out = path.join(joboutdir, 'tmp') if dbsnp.endswith('.gz'): params.gzvcf = dbsnp elif not path.isfile(dbsnp): raise ValueError('dbsnp file (args.dbsnp) is required by tool "local"') else: params.vcf = dbsnp vcftools(**params).run() reader = TsvReader(params.out + '.recode.vcf', cnames=False) outfiletmp = outfile + '.tmp'
from pyppl import Box
from bioprocs.utils import shell

params = {{args.params | repr}}

# the two files to intersect
params['a'] = {{i.afile | quote}}
params['b'] = {{i.bfile | quote}}
# switched on unless already given in args.params
for flag in ('wao', 'nonamecheck'):
    params[flag] = params.get(flag, True)
params['_stdout'] = {{o.outfile | quote}}

shell.TOOLS.bedtools = {{args.bedtools | quote}}
intersect = shell.Shell(subcmd=True, dash='-', equal=' ').bedtools.intersect
intersect(**params).run()
# values rendered from the process template
ref = {{args.ref | repr}}
params = {{args.params | repr}}
prefix = {{i.infiles | fs2name | quote}}
outdir = {{job.outdir | quote}}
cnvkit = {{args.cnvkit | quote}}
nthread = {{args.nthread | repr}}

# make sure each input file has its index
for infile in infiles:
    bamIndex(infile)

shell.TOOLS['cnvkit'] = cnvkit
# restrict the threads used by the numeric libraries; values are passed as strings
envs = {
    envname: str(nthread)
    for envname in (
        'OPENBLAS_NUM_THREADS',
        'OMP_NUM_THREADS',
        'NUMEXPR_NUM_THREADS',
        'MKL_NUM_THREADS',
    )
}
ckshell = shell.Shell(subcmd=True, equal=' ', envs=envs, cwd=outdir).cnvkit

# generate target file
params_t = params.target
params_t['o'] = path.join(outdir, prefix + '.bed')
ckshell.target(exbaits, **params_t).run()

# generate access file, only when it is not given
if not accfile:
    accfile = path.join(outdir, prefix + '.access.bed')
    params_a = params.access
    params_a['o'] = accfile
    ckshell.access(ref, **params_a).run()

# autobin
params_b = params.autobin
from os import remove
from pyppl import Box
from bioprocs.utils import shell

infiles = {{i.infile | repr}}
# `outfile` is used below but was never defined
# NOTE(review): assumes the output is declared as `outfile`, as in the other scripts -- confirm
outfile = {{o.outfile | quote}}
params = {{args.params | repr}}

shell.TOOLS.bedtools = {{args.bedtools | quote}}

mergedtmp = '{{job.outdir}}/mergedtmp.bed'
mergedtmpsorted = '{{job.outdir}}/mergedtmp.sorted.bed'

# concatenate the records (without header lines) of all input files
shell.touch(mergedtmp)
for infile in infiles:
    shell.grep(v='^#', _=infile, __stdout=mergedtmp)

# the concatenated records are sorted by coordinates before merging
shell.sort(k=['1,1', '2,2n'], _=mergedtmp, _stdout=mergedtmpsorted)

params.i = mergedtmpsorted
params._stdout = outfile
shell.Shell(subcmd=True, dash='-', equal=' ').bedtools.merge(**params).run()

# clean up the intermediate files
shell.rm_rf(mergedtmp)
shell.rm_rf(mergedtmpsorted)
def run_vcftools():
    """
    Run vcftools on the input vcf files and save its stdout to `outfile`

    Reads the module-level `params`, `invcfs` and `outfile`.
    The options `d` and `t` are switched on unless already set in `params`.
    """
    for flag in ('d', 't'):
        params[flag] = params.get(flag, True)
    # input files are the positional arguments, the result is taken from stdout
    params['_'] = invcfs
    params['_stdout'] = outfile
    vcftools = shell.Shell(equal=' ').vcftools
    vcftools(**params).run()
from pyppl import Box
from bioprocs.utils import shell
from bioprocs.utils.tsvio2 import TsvReader, TsvWriter

infile = {{ i.infile | quote}}
outfile = {{ o.outfile | quote}}
extend = {{ args.extend | bool}}
gsize = {{ args.gsize | quote}}
params = {{ args.params | repr}}
bedtools = {{ args.bedtools | quote}}

shell.TOOLS.bedtools = bedtools
bedtools = shell.Shell(subcmd = True, dash = '-', equal = ' ').bedtools

params['g'] = gsize
params['i'] = infile

# At least one of l, r and b has to be given.
# (`not 'l' and ...` tested the truthiness of the string literals, so the
# check could never fire.)
if 'l' not in params and 'r' not in params and 'b' not in params:
    raise ValueError('You have to define a length to flank (args.params.l, args.params.r or params.b')

# `args` is a template variable, the rendered value is `extend`
if extend:
    # `b` is the fallback for both sides
    left = params.get('l', params.get('b', 0))
    right = params.get('r', params.get('b', 0))
    stdns = params.get('s', False)
    reader = TsvReader(infile, cnames = False)
    writer = TsvWriter(outfile)
    for r in reader:
        # swap the sides unless strand is ignored or the 6th column is '+'
        if not stdns or r[5] == '+':
            left2, right2 = left, right
        else:
            left2, right2 = right, left
from pyppl import Box
from bioprocs.utils import shell
from bioprocs.utils.reference import vcfIndex

# values rendered from the process template
vcffile = {{i.vcffile | quote}}
regfile = {{i.regfile | quote}}
outfile = {{o.outfile | quote}}
tabix = {{args.tabix | quote}}
params = {{args.params | repr}}

shell.TOOLS.tabix = tabix

# vcfIndex returns the path of the indexed file, which may differ from the input
vcffile = vcfIndex(vcffile, tabix)

# positional arguments and the file receiving stdout
params['_'] = [vcffile, regfile]
params['_stdout'] = outfile

query = shell.Shell().tabix(**params)
query.run()
fq1 = {{ i.fqfile1 | quote}}
fq2 = {{ i.fqfile2 | quote}}
outfile = {{ o.outfile | quote}}
outdir = {{ o.outdir | quote}}
params = {{ args.params | repr}}
idxfile = {{ args.idxfile | quote}}
kallisto = {{ args.kallisto | quote}}
nthread = {{ args.nthread | repr}}

shell.TOOLS.kallisto = kallisto

params.i = idxfile
params.o = outdir
params.t = nthread
params._ = [fq1, fq2]

kallisto = shell.Shell(subcmd = True).kallisto
kallisto.quant(**params).run()

# reduce abundance.tsv in the output directory to target id and rounded counts
imfile = path.join(outdir, 'abundance.tsv')
reader = TsvReader(imfile)
writer = TsvWriter(outfile)
writer.cnames = ['target_id', 'est_counts']
writer.writeHead()
for r in reader:
    # keep the part of the target id before the first '::'
    r.target_id = r.target_id.split('::')[0]
    try:
        r.est_counts = int(round(float(r.est_counts)))
    except (TypeError, ValueError, OverflowError):
        # TypeError: missing value; ValueError: non-numeric text or NaN;
        # OverflowError: infinity. All of them fall back to 0.
        r.est_counts = 0
    writer.write(r)
from bioprocs.utils import shell

# values rendered from the process template
infile = {{i.infile | repr}}
outfile = {{o.outfile | repr}}
umfile = {{o.umfile | repr}}
params = {{args.params | repr}}
chain = {{args.lochain | repr}}
liftover = {{args.liftover | repr}}

shell.TOOLS.liftover = liftover

# liftOver oldFile map.chain newFile unMapped
# The file for the unmapped records is a required positional argument;
# `umfile` was rendered above but never passed to the command.
shell.Shell(dash='-', equal='=').liftover(infile, chain, outfile, umfile, **params).run()
from pyppl import Box
from bioprocs.utils import shell

# values rendered from the process template
cnvkit = {{args.cnvkit | quote}}
infile = {{i.cnsfile | quote}}
outfile = {{o.outfile | quote}}
params = {{args.params}}
nthread = {{args.nthread | repr}}

shell.TOOLS['cnvkit'] = cnvkit

# Restrict the threads used by the numeric libraries.
# Environment values have to be strings, while `nthread` is rendered with
# `repr` (i.e. typically an integer).
threads = str(nthread)
envs = dict(OPENBLAS_NUM_THREADS=threads,
            OMP_NUM_THREADS=threads,
            NUMEXPR_NUM_THREADS=threads,
            MKL_NUM_THREADS=threads)
ckshell = shell.Shell(subcmd=True, equal=' ', envs=envs).cnvkit

# cnvkit export vcf <infile> -o <outfile> ...
params.o = outfile
ckshell.export('vcf', infile, **params).run()
from pyppl import Box
from bioprocs.utils import shell

# values rendered from the process template
l = {{i.l | repr}}
n = {{i.n | repr}}
seed = {{args.seed | repr}}
bedtools = {{args.bedtools | quote}}
gsize = {{args.gsize | quote}}
outfile = {{o.outfile | quote}}

shell.TOOLS.bedtools = bedtools

# bedtools random -l <l> -n <n> -seed <seed> -g <gsize> > <outfile>
# - option and value are separated by a space, as in every other bedtools call
# - the command object has to be run explicitly, building it alone executes nothing
shell.Shell(subcmd=True, dash='-', equal=' ').bedtools.random(
    l=l, n=n, seed=seed, g=gsize, _stdout=outfile).run()
from pyppl import Box
from bioprocs.utils import shell

# values rendered from the process template
cnvkit = {{args.cnvkit | quote}}
infile = {{i.infile | quote}}
outfile = {{o.outfile | quote}}
params = {{args.params}}

shell.TOOLS['cnvkit'] = cnvkit

# No thread-limiting environment is set for this command:
# envs = dict(
#     OPENBLAS_NUM_THREADS = nthread,
#     OMP_NUM_THREADS = nthread,
#     NUMEXPR_NUM_THREADS = nthread,
#     MKL_NUM_THREADS = nthread
# )

# cnvkit call <infile> -o <outfile> ...
params['o'] = outfile
call = shell.Shell(subcmd=True, equal=' ').cnvkit.call
call(infile, **params).run()
{% python from os import path %} cnsfile = {{i.cnsfile | quote}} cnnfile = {{i.cnnfile | quote}} outfile = {{o.outfile | quote}} nthread = {{args.nthread | quote}} params = {{args.params | repr}} cnvkit = {{args.cnvkit | quote}} shell.TOOLS['cnvkit'] = cnvkit envs = dict( OPENBLAS_NUM_THREADS = nthread, OMP_NUM_THREADS = nthread, NUMEXPR_NUM_THREADS = nthread, MKL_NUM_THREADS = nthread ) ckshell = shell.Shell(subcmd = True, equal = ' ', envs = envs, cwd = path.dirname(outfile)).cnvkit # region cnvkit export example # cnvkit.py export theta Sample_T.cns reference.cnn -v Sample_Paired.vcf # cnvkit.py export theta Sample_Tumor.cns Sample_Normal.cnr -o Sample.theta2.interval_count # cnvkit.py export theta Sample_Tumor.cns -o Sample.theta2.interval_count # endregion # region cnvkit export theta usage # usage: cnvkit.py export theta [-h] [-r REFERENCE] [-o OUTPUT] [-v VCF] # [-i SAMPLE_ID] [-n NORMAL_ID] # [-m MIN_VARIANT_DEPTH] [-z [ALT_FREQ]] # tumor_segment # positional arguments: # tumor_segment Tumor-sample segmentation file from CNVkit (.cns).