Exemple #1
0
def run_gatk():
    """
    Run GATK `CombineVariants` through the shell wrapper.

    Fills `params` (taken from the enclosing scope) with the tool name,
    output, reference, thread count and input variants, then executes
    `gatk`. Reads `outfile`, `ref`, `nthread` and `invcfs` from the
    enclosing scope as well.
    """
    params.T = 'CombineVariants'
    params.o = outfile
    params.R = ref
    params.nt = nthread
    params.variant = invcfs
    # NOTE(review): `_stdout` points at the same path as `o`; presumably a
    # stdout redirect -- confirm it does not clobber the file written via `o`
    params._stdout = outfile
    # keep a caller-supplied merge option, otherwise fall back to UNIQUIFY
    params.genotypemergeoption = params.get('genotypemergeoption', 'UNIQUIFY')
    shell.Shell(equal=' ', duplistkeys=True).gatk(**params).run()
Exemple #2
0
def run_bedtools():
    """
    Sort `infile` with `bedtools sort`, keeping the header lines.

    Lines starting with `#` are written to `outfile` first; the sorted
    records are then appended to it, piped through `uniq` when `unique` is
    set. `params`, `infile`, `outfile` and `unique` come from the enclosing
    scope.
    """
    params.i = infile
    # header lines go to the output first
    shell.grep('^#', infile, _stdout=outfile)
    sorter = shell.Shell(dash='-', equal=' ', subcmd=True).bedtools
    if not unique:
        # no de-duplication: let the sort command append to the output itself
        params.__stdout = outfile
        sorter.sort(**params).run()
        return
    sorting = sorter.sort(**params)
    sorting.pipe().uniq(__stdout=outfile).run()
Exemple #3
0
def bamIndex(bam, ext='.bam.bai', samtools='samtools', nthread=1):
    """
    Make sure an index file exists next to a bam file.

    The index is looked up, in order:
      1. next to `bam` itself (a leading `[...]` prefix in the file name is
         ignored when building the index name),
      2. if `bam` is a symbolic link, next to the link target and next to
         the fully resolved real path; a found index is symlinked next to
         `bam`,
      3. otherwise the index is created with `samtools index`.

    @params:
        `bam`: The path to the bam file
        `ext`: The expected extension of index file. Default: `.bam.bai`
            - Some tools require `XXX.bai` without `.bam`
        `samtools`: The path to samtools. Default: `samtools`
            - If it's None, then an exception will be raised instead of
              creating the index file
        `nthread`: The # threads used to create the index file. Default: `1`
    @raises:
        `ValueError`: if no index can be found and `samtools` is not given
    """
    if not ext.startswith('.'):
        ext = '.' + ext
    # /path/to/some.bam -> some.bam
    bname = path.basename(bam)
    # /path/to/some.bam -> /path/to/
    dname = path.dirname(bam)
    # some.bam -> some
    fname = path.splitext(bname)[0]
    # some -> some
    # [1]some -> some
    rname = fname.split(']', 1)[1] if fname.startswith('[') else fname

    # /path/to/some.bam.bai
    expectedIndex = path.join(dname, rname + ext)
    if path.isfile(expectedIndex):
        return

    def createIndex():
        """Build the index with samtools, or fail if samtools is disabled."""
        if not samtools:
            raise ValueError('Index not found: {}'.format(bam))
        indexer = shell.Shell({'samtools': samtools}, subcmd=True).samtools
        # `samtools index` takes the bam and the output index as positional
        # arguments; it does not write the index to stdout
        indexer.index(b=True, _=[bam, expectedIndex], **{'@': nthread}).run()

    # if bam is not a link, there is nowhere else to find the index
    if not path.islink(bam):
        createIndex()
        return

    # the link target may be relative to the directory holding the link
    origbam = readlink(bam)
    if not path.isabs(origbam):
        origbam = path.join(dname, origbam)

    # look next to the direct link target first, then next to the real path
    for target in (origbam, path.realpath(bam)):
        foundIndex = path.splitext(target)[0] + ext
        if path.isfile(foundIndex):
            shell.ln_s(foundIndex, expectedIndex)
            return

    # if all failed, create it
    createIndex()
Exemple #4
0
def run_bedops():
    """
    Sort `infile` with the `bedops` command, keeping the header lines.

    Lines starting with `#` are written to `outfile` first; the command
    output is then appended, piped through `uniq` when `unique` is set.
    `params`, `args`, `tmpdir`, `infile`, `outfile` and `unique` come from
    the enclosing scope.
    """
    params['max-mem'] = args.mem
    params.tmpdir = tmpdir
    params._ = infile
    # header lines go to the output first
    shell.grep('^#', infile, _stdout=outfile)
    runner = shell.Shell(equal=' ')
    if not unique:
        # no de-duplication: let the command append to the output itself
        params.__stdout = outfile
        runner.bedops(**params).run()
        return
    sorting = runner.bedops(**params)
    sorting.pipe().uniq(__stdout=outfile).run()
Exemple #5
0
def run_sort():
    """
    Sort `infile` with the system `sort`, keeping the header lines.

    Lines starting with `#` are written to `outfile` first. The remaining
    lines are piped into `sort` and appended to `outfile`. Sorting is by
    columns 1 and 2 (numeric) when `sortby` is `'coord'`, otherwise by
    column 4. `params`, `tmpdir`, `argsmem`, `unique`, `sortby`, `infile`
    and `outfile` come from the enclosing scope.
    """
    params.T = tmpdir
    params.S = argsmem
    params.u = unique
    if sortby == 'coord':
        params.k = ['1,1', '2,2n']
    else:
        params.k = '4'
    # header lines go to the output first
    shell.grep('^#', infile, _stdout=outfile)
    # append the sorted records; set on params (as the sibling runners do)
    # instead of mixing `**params` with an extra keyword in the call
    params.__stdout = outfile
    # the input file is passed as `_` so that no positional argument follows
    # a keyword argument (that was a SyntaxError)
    nonheader = shell.Shell().grep(v='^#', _=infile)
    nonheader.pipe(dash='-', equal='=').sort(**params).run()
Exemple #6
0
def ceQTL_filter(args):
    """
    Assemble and run the ceQTL filtering pipeline.

    The snp, gene, tf, reg and row filters are built in that order. When
    `args.connect` is `'and'` each filter takes the previous filter as its
    source and the last one writes `args.outfile`. Otherwise every filter
    takes the input file process as its source, and their results are merged
    and sorted uniquely into `args.outfile`.
    """
    # check if AdjPval is in the header
    headcheck = shell.Shell().head(n=1, _=args.cefile).pipe().grep(
        'AdjPval').run(raiseExc=False, logger=False)
    hasAdjPval = headcheck.rc == 0

    pCefile = pFile2Proc.copy()
    pCefile.input = [args.cefile]
    starts = [pCefile]

    def source(previous):
        """Chain onto the previous filter for 'and', else use the input."""
        if args.connect == 'and':
            return previous
        return pCefile

    pCefile_snps = ceQTL_filter_snps(args, pCefile, hasAdjPval, starts)
    pCefile_genes = ceQTL_filter_genes(args, source(pCefile_snps), hasAdjPval,
                                       starts)
    pCefile_tfs = ceQTL_filter_tfs(args, source(pCefile_genes), hasAdjPval,
                                   starts)
    pCefile_regs = ceQTL_filter_regs(args, source(pCefile_tfs), hasAdjPval,
                                     starts)
    pCefile_row = ceQTL_filter_row(args, source(pCefile_regs), hasAdjPval,
                                   starts)

    if args.connect == 'and':
        setOutfile(pCefile_row, args.outfile)
    else:
        # distinct filter processes, leaving out the plain input process
        filters = {
            pCefile_snps, pCefile_genes, pCefile_tfs, pCefile_regs,
            pCefile_row
        }
        pTsvMerge.depends = list(
            {proc for proc in filters if proc is not pCefile})
        pTsvMerge.input = lambda *chs: [sum((ch.flatten() for ch in chs), [])]
        pTsvMerge.args.inopts.cnames = True

        pSortMerged = pSort.copy()
        pSortMerged.depends = pTsvMerge
        pSortMerged.args.unique = True

        setOutfile(pSortMerged, args.outfile)

    PyPPL().start(starts).run()
Exemple #7
0
def vcfIndex(vcf, tabix='tabix'):
    """
    Make sure a tabix index exists for a vcf file.

    @params:
        `vcf`: The path to the vcf file (plain, gzipped or bgzipped)
        `tabix`: The path to tabix. Default: `tabix`
    @returns:
        The path of the vcf file to use: `vcf` itself when it is already
        compressed, or `vcf + '.gz'` when a plain-text vcf had to be
        bgzipped first.
    """
    # os.path has no `readlink`; it lives in `os`
    from os import readlink

    # /path/to/some.vcf -> some.vcf
    # /path/to/some.vcf.gz -> some.vcf
    bname = path.basename(
        vcf[:-3]) if vcf.endswith('.gz') else path.basename(vcf)
    # /path/to/some.vcf -> /path/to/
    dname = path.dirname(vcf)
    # some.vcf -> some
    # some.vcf.gz -> some
    fname = path.splitext(bname)[0]
    # some -> some
    # [1]some -> some
    rname = fname.split(']', 1)[1] if fname.startswith('[') else fname

    expectedIndex = path.join(dname, rname + '.vcf.gz.tbi')
    if path.isfile(expectedIndex):
        return vcf

    tabix = shell.Shell({'tabix': tabix}).tabix
    gt = gztype(vcf)
    if gt == 'bgzip':
        if path.islink(vcf):
            # try to reuse an index sitting next to the link target ...
            linkvcf = readlink(vcf)
            if not path.isabs(linkvcf):
                # a relative target is relative to the link's directory
                linkvcf = path.join(dname, linkvcf)
            if path.isfile(linkvcf + '.tbi'):
                shell.ln_s(linkvcf + '.tbi', expectedIndex)
                return vcf
            # ... or next to the fully resolved file
            realvcf = path.realpath(vcf)
            if path.isfile(realvcf + '.tbi'):
                shell.ln_s(realvcf + '.tbi', expectedIndex)
                return vcf
        # not a link, or no index found elsewhere: create it using tabix
        tabix(p='vcf', _=vcf).run()
        return vcf
    if gt == 'gzip':
        # plain gzip cannot be indexed: re-compress a copy with bgzip
        tmpvcf = path.join(dname, bname + '.tmp.vcf')
        shell.gunzip_to(vcf, tmpvcf)
        shell.bgzip(tmpvcf)
        tabix(p='vcf', _=tmpvcf + '.gz').run()
        shell.mv(tmpvcf + '.gz.tbi', expectedIndex)
        # NOTE(review): the index was built from the bgzipped copy while the
        # gzipped original is returned -- confirm this pairing is usable
        return vcf
    # plain text: compress to `vcf + '.gz'` and index that file
    shell.bgzip(vcf, c=True, _stdout=vcf + '.gz')
    tabix(p='vcf', _=vcf + '.gz').run()
    return vcf + '.gz'
Exemple #8
0
from pyppl import Box
from bioprocs.utils import shell

# Values filled in by the template before the script is executed.
infile = {{i.infile | quote}}
outfile = {{o.outfile | quote}}
bedtools = {{args.bedtools | quote}}
params = {{args.params | repr}}
ref = {{args.ref | quote}}

# register the configured bedtools executable with the shell wrapper
shell.TOOLS.bedtools = bedtools

# bedtools getfasta: reference passed as `fi`, regions as `bed`
params.fi = ref
params.bed = infile
# presumably redirects the command's stdout to the output file
params._stdout = outfile
shell.Shell(subcmd=True, dash='-', equal=' ').bedtools.getfasta(**params).run()
Exemple #9
0
# register the configured Rscript executable with the shell wrapper
shell.TOOLS.Rscript = Rscript
# translate the shorthand cutoff keys 'p'/'q' into 'Pval'/'AdjPval'
if isinstance(cutoff, dict):
	if cutoff['by'] == 'p':
		cutoff['by'] = 'Pval'
	if cutoff['by'] == 'q':
		cutoff['by'] = 'AdjPval'

# collect the gene column of every input row
reader = TsvReader(infile, **inopts)
genes  = [r[genecol] for r in reader]

en = Enrichr(cutoff = cutoff, top = top, Rscript = Rscript)
# the gene list is described by the input file name
en.addList(genes, description = path.basename(infile))

para = Parallel(nthread = nthread)
# runs an R script through Rscript with `hsa` as its argument
runPathview = lambda r, hsa: shell.Shell().Rscript(r, hsa).run()
for db in dbs:
	outfile = path.join(outdir, prefix + '.' + db + '.txt')
	en.enrich(db)
	en.export(outfile, top = 100)
	if plot:
		plotfile = path.join(outdir, prefix + '.' + db + '.png')
		en.plot(plotfile, res = devpars.res, width = devpars.width, height = devpars.height)
	if pathview and 'KEGG' in db:
		pathviewRDir  = path.join(outdir, prefix + '.' + db + '.pathview')
		pathviewRfile = path.join(pathviewRDir, 'pathview.R')
		shell.mkdir(pathviewRDir)
		with open(pathviewRfile, 'w') as f:
			f.write("""
			{rimport}('__init__.r')
			library(pathview)
Exemple #10
0
import re
import urllib2
import testly
import yaml
from os import path
from bioprocs.utils import shell
from bioprocs.utils.tsvio2 import TsvReader
from tempfile import gettempdir

# Shell wrapper around the `bioprocs` command; processes are addressed as
# sub-commands of it (see runBioprocs).
shbioprocs = shell.Shell(subcmd=True, dash='-', equal=' ',
                         duplistkey=False).bioprocs

# directory holding the test data, located next to this file
PROCDATADIR = path.join(path.dirname(path.abspath(__file__)), 'procdata')
DATAFILE = path.join(PROCDATADIR, 'data.yml')
# system temporary directory; default target directory of download()
TMPDIR = gettempdir()
CACHED = {}


def runBioprocs(proc, args):
    """
    Run one bioprocs process through the `bioprocs` command.

    `proc` selects the sub-command and `args` holds its arguments; the
    short-path log option is switched off in `args` (which is modified in
    place) before the command is executed. Returns whatever `run` returns.
    """
    args.update({'config._log.shortpath:py': 'False'})
    command = shbioprocs[proc]
    return command(**args).run(save='same', uselogger=False)


def download(url, savedir=TMPDIR):
    """
    Download `url` into `savedir` and return the path of the saved file.

    The file keeps the last path component of the URL as its name.

    @params:
        `url`: The URL to fetch
        `savedir`: The directory to save the file in. Default: `TMPDIR`
    @returns:
        The path to the downloaded file
    """
    bname = path.basename(url)
    destfile = path.join(savedir, bname)
    # urllib2 exposes `urlopen`, not `open`
    filedata = urllib2.urlopen(url)
    try:
        with open(destfile, 'wb') as f:
            f.write(filedata.read())
    finally:
        # always release the connection, even if writing fails
        filedata.close()
    return destfile
Exemple #11
0
                ','.join([reffreq] + list(allfreqs.values()))
            ])
    writer.close()

else:
    # snps
    snplist = path.join(jobindir, path.basename(snpfile) + '.list')
    reader = TsvReader(snpfile, cnames=False)
    writer = TsvWriter(snplist)
    for r in reader:
        writer.write([r[snpcol]])
    reader.close()
    writer.close()

    shell.TOOLS.vcftools = vcftools
    vcftools = shell.Shell(equal=' ', dash='--').vcftools

    params = Box()
    params.snps = snplist
    params.recode = True
    params.out = path.join(joboutdir, 'tmp')
    if dbsnp.endswith('.gz'):
        params.gzvcf = dbsnp
    elif not path.isfile(dbsnp):
        raise ValueError('dbsnp file (args.dbsnp) is required by tool "local"')
    else:
        params.vcf = dbsnp
    vcftools(**params).run()

    reader = TsvReader(params.out + '.recode.vcf', cnames=False)
    outfiletmp = outfile + '.tmp'
Exemple #12
0
from pyppl import Box
from bioprocs.utils import shell

# Values filled in by the template before the script is executed.
params = {{args.params | repr}}

# the two files to intersect
params['a'] = {{i.afile | quote}}
params['b'] = {{i.bfile | quote}}
# switch `wao` and `nonamecheck` on unless the caller set them explicitly
params['wao'] = params.get('wao', True)
params['nonamecheck'] = params.get('nonamecheck', True)
# presumably redirects the command's stdout to the output file
params['_stdout'] = {{o.outfile | quote}}

# register the configured bedtools executable with the shell wrapper
shell.TOOLS.bedtools = {{args.bedtools | quote}}
shell.Shell(subcmd=True, dash='-',
            equal=' ').bedtools.intersect(**params).run()
Exemple #13
0
# Values filled in by the template before the script is executed.
ref = {{args.ref | repr}}
params = {{args.params | repr}}
prefix = {{i.infiles | fs2name | quote}}
outdir = {{job.outdir | quote}}
cnvkit = {{args.cnvkit | quote}}
nthread = {{args.nthread | repr}}

# NOTE(review): `infiles`, `exbaits`, `accfile`, `bamIndex`, `path` and
# `shell` are not defined in the visible part of this script -- presumably
# they are set up above; confirm.
# make sure every input bam file has an index
for infile in infiles:
    bamIndex(infile)

# register the configured cnvkit executable with the shell wrapper
shell.TOOLS['cnvkit'] = cnvkit
# thread-count variables handed to the cnvkit process, as strings
envs = dict(OPENBLAS_NUM_THREADS=str(nthread),
            OMP_NUM_THREADS=str(nthread),
            NUMEXPR_NUM_THREADS=str(nthread),
            MKL_NUM_THREADS=str(nthread))
ckshell = shell.Shell(subcmd=True, equal=' ', envs=envs, cwd=outdir).cnvkit

# generate target file
params_t = params.target
params_t.o = path.join(outdir, prefix + '.bed')
ckshell.target(exbaits, **params_t).run()

# generate access file, only when none was supplied
if not accfile:
    accfile = path.join(outdir, prefix + '.access.bed')
    params_a = params.access
    params_a.o = accfile
    ckshell.access(ref, **params_a).run()

# autobin
params_b = params.autobin
Exemple #14
0
from os import remove
from pyppl import Box
from bioprocs.utils import shell

# Values filled in by the template before the script is executed.
infiles = {{i.infile | repr}}
# `outfile` is used below and therefore has to be defined here
outfile = {{o.outfile | quote}}
params = {{args.params | repr}}

# register the configured bedtools executable with the shell wrapper
shell.TOOLS.bedtools = {{args.bedtools | quote}}

# intermediate files: concatenated inputs and their sorted version
mergedtmp = '{{job.outdir}}/mergedtmp.bed'
mergedtmpsorted = '{{job.outdir}}/mergedtmp.sorted.bed'

shell.touch(mergedtmp)

# append the non-header lines of every input to the concatenated file
for infile in infiles:
    shell.grep(v='^#', _=infile, __stdout=mergedtmp)

# bedtools merge needs its input sorted by chromosome, then start
shell.sort(k=['1,1', '2,2n'], _=mergedtmp, _stdout=mergedtmpsorted)

params.i = mergedtmpsorted
params._stdout = outfile
shell.Shell(subcmd=True, dash='-', equal=' ').bedtools.merge(**params).run()

# remove the intermediate files
shell.rm_rf(mergedtmp)
shell.rm_rf(mergedtmpsorted)
Exemple #15
0
def run_vcftools():
    """
    Run the `vcftools` command on the input vcf files.

    Uses `params`, `invcfs` and `outfile` from the enclosing scope.
    """
    # switch `d` and `t` on unless the caller set them explicitly
    params.d = params.get('d', True)
    params.t = params.get('t', True)
    # input files are passed as positional arguments
    params._ = invcfs
    # presumably redirects the command's stdout to the output file
    params._stdout = outfile
    shell.Shell(equal=' ').vcftools(**params).run()
Exemple #16
0
from pyppl import Box
from bioprocs.utils import shell
from bioprocs.utils.tsvio2 import TsvReader, TsvWriter

infile   = {{ i.infile | quote}}
outfile  = {{ o.outfile | quote}}
extend   = {{ args.extend | bool}}
gsize    = {{ args.gsize | quote}}
params   = {{ args.params | repr}}
bedtools = {{ args.bedtools | quote}}

# register the configured bedtools executable with the shell wrapper
shell.TOOLS.bedtools = bedtools
bedtools = shell.Shell(subcmd = True, dash = '-', equal = ' ').bedtools

params['g']   = gsize
params['i']   = infile

# at least one of l, r or b has to be given
# (`not 'l' and ...` was always False, so the check never fired)
if not any(key in params for key in ('l', 'r', 'b')):
	raise ValueError('You have to define a length to flank (args.params.l, args.params.r or args.params.b)')

# `extend` is the script variable rendered from args.extend; `args` itself
# is not defined in this script
if extend:
	# `b` is the fallback for whichever of `l` / `r` is missing
	left   = params.get('l', params.get('b', 0))
	right  = params.get('r', params.get('b', 0))
	stdns  = params.get('s', False)
	reader = TsvReader(infile, cnames = False)
	writer = TsvWriter(outfile)
	for r in reader:
		# with `s` set, left and right are swapped for records whose
		# 6th column is not '+'
		if not stdns or r[5] == '+':
			left2, right2 = left, right
		else:
			left2, right2 = right, left
Exemple #17
0
from pyppl import Box
from bioprocs.utils import shell
from bioprocs.utils.reference import vcfIndex

vcffile = {{i.vcffile | quote}}
regfile = {{i.regfile | quote}}
outfile = {{o.outfile | quote}}
tabix = {{args.tabix | quote}}
params = {{args.params | repr}}

# register the configured tabix executable with the shell wrapper
shell.TOOLS.tabix = tabix
# make sure the vcf file is indexed; vcfIndex returns the path to use
vcffile = vcfIndex(vcffile, tabix)

# NOTE(review): the region file is passed as a positional argument after the
# vcf file -- confirm tabix accepts a file there rather than via an option
params._ = [vcffile, regfile]
# presumably redirects the command's stdout to the output file
params._stdout = outfile
shell.Shell().tabix(**params).run()
Exemple #18
0
fq1     = {{ i.fqfile1 | quote}}
fq2     = {{ i.fqfile2 | quote}}
outfile = {{ o.outfile | quote}}
outdir  = {{ o.outdir | quote}}
params = {{ args.params | repr}}
idxfile = {{ args.idxfile | quote}}
kallisto = {{ args.kallisto | quote}}
nthread = {{ args.nthread | repr}}

# register the configured kallisto executable with the shell wrapper
shell.TOOLS.kallisto = kallisto
params.i = idxfile
params.o = outdir
params.t = nthread
# the fastq files are passed as positional arguments
params._ = [fq1, fq2]

kallisto = shell.Shell(subcmd = True).kallisto
kallisto.quant(**params).run()

# reduce the abundance table to two columns: target id and integer count
imfile        = path.join(outdir, 'abundance.tsv')
reader        = TsvReader(imfile)
writer        = TsvWriter(outfile)
writer.cnames = ['target_id', 'est_counts']
writer.writeHead()

for r in reader:
	# keep only the part of the target id before the first '::'
	r.target_id = r.target_id.split('::')[0]
	try:
		r.est_counts = int(round(float(r.est_counts)))
	except (TypeError, ValueError, OverflowError):
		# a missing, non-numeric, NaN or infinite count becomes 0; a bad
		# string raises ValueError, which TypeError alone did not cover
		r.est_counts = 0
	writer.write(r)
Exemple #19
0
from bioprocs.utils import shell

infile = {{i.infile | repr}}
outfile = {{o.outfile | repr}}
umfile = {{o.umfile | repr}}
params = {{args.params | repr}}
chain = {{args.lochain | repr}}
liftover = {{args.liftover | repr}}

# register the configured liftOver executable with the shell wrapper
shell.TOOLS.liftover = liftover
# liftOver takes four positional arguments: input, chain file, output and
# the file receiving the unmapped records (`umfile` was never passed)
shell.Shell(dash='-', equal='=').liftover(infile, chain, outfile, umfile,
                                          **params).run()
Exemple #20
0
from pyppl import Box
from bioprocs.utils import shell

cnvkit = {{args.cnvkit | quote}}
infile = {{i.cnsfile | quote}}
outfile = {{o.outfile | quote}}
params = {{args.params}}
nthread = {{args.nthread | repr}}

# register the configured cnvkit executable with the shell wrapper
shell.TOOLS['cnvkit'] = cnvkit
# environment values have to be strings, while `nthread` is rendered with
# `repr` and is typically an integer
envs = dict(OPENBLAS_NUM_THREADS=str(nthread),
            OMP_NUM_THREADS=str(nthread),
            NUMEXPR_NUM_THREADS=str(nthread),
            MKL_NUM_THREADS=str(nthread))
ckshell = shell.Shell(subcmd=True, equal=' ', envs=envs).cnvkit

# cnvkit export vcf <infile> -o <outfile>
params.o = outfile
ckshell.export('vcf', infile, **params).run()
Exemple #21
0
from pyppl import Box
from bioprocs.utils import shell

l = {{i.l | repr}}
n = {{i.n | repr}}
seed = {{args.seed | repr}}
bedtools = {{args.bedtools | quote}}
gsize = {{args.gsize | quote}}
outfile = {{o.outfile | quote}}

# register the configured bedtools executable with the shell wrapper
shell.TOOLS.bedtools = bedtools
# Options are separated from their values by a space, as in the other
# bedtools calls, and the command has to be executed with run() -- building
# it alone does not execute anything.
shell.Shell(subcmd=True, dash='-', equal=' ').bedtools.random(
    l=l,
    n=n,
    seed=seed,
    g=gsize,
    _stdout=outfile).run()
Exemple #22
0
from pyppl import Box
from bioprocs.utils import shell

cnvkit = {{args.cnvkit | quote}}
infile = {{i.infile | quote}}
outfile = {{o.outfile | quote}}
params = {{args.params}}

# register the configured cnvkit executable with the shell wrapper
shell.TOOLS['cnvkit'] = cnvkit
# Thread-limiting environment variables are disabled for this command.
# envs = dict(
# 	OPENBLAS_NUM_THREADS = nthread,
# 	OMP_NUM_THREADS      = nthread,
# 	NUMEXPR_NUM_THREADS  = nthread,
# 	MKL_NUM_THREADS      = nthread
# )
ckshell = shell.Shell(subcmd=True, equal=' ').cnvkit

# cnvkit call <infile> -o <outfile>
params.o = outfile
ckshell.call(infile, **params).run()
Exemple #23
0
{% python from os import path %}
cnsfile  = {{i.cnsfile | quote}}
cnnfile  = {{i.cnnfile | quote}}
outfile  = {{o.outfile | quote}}
nthread  = {{args.nthread | quote}}
params   = {{args.params | repr}}
cnvkit   = {{args.cnvkit | quote}}

# register the configured cnvkit executable with the shell wrapper
shell.TOOLS['cnvkit'] = cnvkit
# thread-count variables handed to the cnvkit process
# (`nthread` is rendered as a quoted string above)
envs = dict(
	OPENBLAS_NUM_THREADS = nthread,
	OMP_NUM_THREADS      = nthread,
	NUMEXPR_NUM_THREADS  = nthread,
	MKL_NUM_THREADS      = nthread
)
# the command runs in the directory of the output file
# NOTE(review): `path` is only imported in the template directive above --
# confirm it is also available to the rendered script
ckshell = shell.Shell(subcmd = True, equal = ' ', envs = envs, cwd = path.dirname(outfile)).cnvkit

# region cnvkit export example
# cnvkit.py export theta Sample_T.cns reference.cnn -v Sample_Paired.vcf
# cnvkit.py export theta Sample_Tumor.cns Sample_Normal.cnr -o Sample.theta2.interval_count
# cnvkit.py export theta Sample_Tumor.cns -o Sample.theta2.interval_count
# endregion

# region cnvkit export theta usage
# usage: cnvkit.py export theta [-h] [-r REFERENCE] [-o OUTPUT] [-v VCF]
#                               [-i SAMPLE_ID] [-n NORMAL_ID]
#                               [-m MIN_VARIANT_DEPTH] [-z [ALT_FREQ]]
#                               tumor_segment

# positional arguments:
#   tumor_segment         Tumor-sample segmentation file from CNVkit (.cns).