# Call en.plot for plotfile with the device parameters (resolution, width, height).
en.plot(plotfile, res = devpars.res, width = devpars.width, height = devpars.height)

# Pathview output is only produced when it was requested and the library name
# contains 'KEGG'.
if pathview and 'KEGG' in db:
    # One directory per prefix/library, holding the generated R script.
    pathviewRDir = path.join(outdir, prefix + '.' + db + '.pathview')
    pathviewRfile = path.join(pathviewRDir, 'pathview.R')
    shell.mkdir(pathviewRDir)
    with open(pathviewRfile, 'w') as f:
        # The R script is built with str.format: single-brace fields (rimport,
        # pathviewRDir, infile, genecol) are filled from the keyword arguments
        # below, and doubled braces come out as literal braces in the R code.
        # NOTE(review): the double-brace "args.* | R" placeholders and the
        # raw/endraw tags are presumably handled by the pipeline's template
        # engine before this Python code runs -- confirm.
        # The script reads the input table, takes the gene column, optionally
        # turns it into a named vector of values from pvargs$fccol, and calls
        # pathview() for the pathway id given as the first command-line argument.
        f.write("""
        {rimport}('__init__.r')
        library(pathview)
        args = commandArgs(trailingOnly = TRUE)
        setwd({pathviewRDir!r})
        inopts = {{args.inopts | R}}
        inopts$rnames = FALSE
        indata = read.table.inopts({infile!r}, inopts)
        genes = as.vector(indata[, {genecol}, drop = TRUE])
        pvargs = {{args.pathview | R}}
        {% raw %}
        if (!is.null(pvargs$fccol)) {{
            fcdata = as.vector(indata[, pvargs$fccol, drop = TRUE])
            names(fcdata) = genes
            genes = fcdata
        }}
        {% endraw %}
        pathview(gene.data = genes, pathway.id = args[1], species = 'hsa', gene.idtype="SYMBOL")
        """.format(
            rimport = rimport,
            # An integer column index is incremented by one (presumably to
            # convert a 0-based index to R's 1-based indexing); any other
            # value is inserted into the R code as-is.
            # NOTE(review): a string column name would be interpolated
            # unquoted -- confirm genecol arrives in a form R can evaluate.
            genecol = genecol + 1 if isinstance(genecol, int) else genecol,
            infile = infile,
            pathviewRDir = pathviewRDir)
        )
    # One runPathview task per entry among the first `top` results; the second
    # element of each task is the text after the last underscore of term.Term
    # (presumably the pathway id the script reads as args[1]).
    para.run(runPathview, [(pathviewRfile, term.Term.split('_')[-1]) for term in en.results[:top]])
samplevcf = "{{job.outdir}}/{{i.infile | fn}}-%s.vcf" % sample cmd = '{{args.vcftools}} %s {{i.infile | quote}} > "%s"' % (cmdargs(vtparams), samplevcf) # vcf2maf.pl --input-vcf ZYYP-ZYYB.vcf --output-maf ZYYP-ZYYB.snpEff.maf --tumor-id ZXLT-ZXLB_TUMOR --normal-id ZXLT-ZXLB_NORMAL --vep-data /path/to/vep/cache/ --filter-vcf /path/to/vep/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz --ref-fasta /path/to/hs37d5/phase2_reference_assembly_sequence/hs37d5.fa --vep-path /path/to/miniconda2/bin params['input-vcf'] = samplevcf params['output-maf'] = "{{job.outdir}}/{{i.infile | fn}}-%s.maf" % sample params['vep-data'] = {{args.vepDb | quote}} params['vep-forks'] = {{args.nthread}} params['filter-vcf'] = {{args.filtervcf | quote}} params['ref-fasta'] = {{args.ref | quote}} params['vep-path'] = path.dirname(vep) cmd = cmd + '; {{args.vcf2maf}} --tumor-id %s %s' % (sample, cmdargs(params, equal=' ')) cmds.append(cmd) {% if args.nthread == 1 %} for cmd in cmds: runcmd(cmd) {% else %} # Note the threads may be hanging on here. p = Parallel({{args.nthread}}) p.run('{}', [(cmd,) for cmd in cmds]) {% endif %} for i, sample in enumerate(samples): singlemaf = "{{job.outdir}}/{{i.infile | fn}}-%s.maf" % sample if i == 0: runcmd('cat "%s" > {{o.outfile | quote}}' % singlemaf) else: runcmd('egrep -v "^#|^Hugo_Symbol" "%s" >> {{o.outfile | quote}}' % singlemaf) {% endif %} {% endif %}
# Job inputs, outputs and tool settings. The right-hand sides are template
# placeholders; presumably the pipeline replaces them with Python literals
# before this script is executed.
invcfs = {{i.infiles | repr}}
outfile = {{o.outfile | quote}}
nthread = {{args.nthread | int}}
joboutdir = {{job.outdir | quote}}
vcftools = {{args.vcftools | quote}}
gatk = {{args.gatk | quote}}
tabix = {{args.tabix | quote}}
ref = {{args.ref | quote}}
params = {{args.params | repr}}
tool = {{args.tool | quote}}

# Register the configured executables on shell.TOOLS so they can be invoked
# through the shell wrapper (as run_vcftools does below).
shell.TOOLS.vcftools = vcftools
shell.TOOLS.gatk = gatk

# Run vcfIndex on every (vcf, tabix) pair in parallel; invcfs is rebound to
# whatever para.run returns (presumably the indexed file paths -- confirm).
para = Parallel(nthread, raiseExc=True)
invcfs = para.run(vcfIndex, [(vcf, tabix) for vcf in invcfs])

def run_vcftools():
    """Run the configured vcftools executable over the input VCFs.

    Mutates the module-level ``params``: ``d`` and ``t`` default to True
    unless already present, ``_`` is set to the input VCFs and ``_stdout``
    to the output file, then everything is passed to the shell wrapper.
    """
    params.d = params.get('d', True)
    params.t = params.get('t', True)
    # NOTE(review): `_` and `_stdout` look like the wrapper's conventions for
    # positional arguments and stdout redirection -- confirm in shell.Shell.
    params._ = invcfs
    params._stdout = outfile
    shell.Shell(equal=' ').vcftools(**params).run()

# Only the beginning of this function is visible in this excerpt; the visible
# statements set the T, o, R and nt entries of the module-level params.
def run_gatk():
    params.T = 'CombineVariants'
    params.o = outfile
    params.R = ref
    params.nt = nthread
# Map every file name listed in the metadata to its extension and to a sample
# id (the first 15 characters of the first associated entity's submitter id).
exts = dict()
for sam in sam_meta:
    parts = sam['file_name'].split('.')
    ext = '.' + parts[-1]
    # Keep compound extensions such as ".vcf.gz" together. The length guard
    # avoids an IndexError on a name with no dot and avoids treating the stem
    # of a plain "name.gz" as part of the extension.
    if ext == '.gz' and len(parts) > 2:
        ext = '.' + parts[-2] + ext
    exts[sam['file_name']] = ext
    sample_ids[sam['file_name']] = sam['associated_entities'][0]['entity_submitter_id'][:15]

# Collect candidate files for every extension seen in the metadata.
samfiles = []
for ext in set(exts.values()):
    # files placed directly in the input directory
    samfiles += glob.glob(os.path.join(os.path.abspath(indir), "*" + ext))
    # or direct dir from TCGA download (one sub-directory per file)
    samfiles += glob.glob(os.path.join(os.path.abspath(indir), "*", "*" + ext))

lock = Lock()

def single(samfile):
    """Link or copy one file into outdir under its sample-id based name.

    Files whose base name is not listed in the metadata are skipped. The
    target is replaced if it already exists. Nothing is created when
    ``method`` neither contains 'link' nor equals 'copy'.
    """
    bn = os.path.basename(samfile)
    if bn not in sample_ids:
        return
    newfile = os.path.join(outdir, sample_ids[bn] + exts[bn])
    # Several inputs may map to the same target name, so removal and
    # creation are serialized.
    with lock:
        # lexists, not exists: a dangling symlink left by an earlier run must
        # be removed too, otherwise os.symlink fails because the name is taken
        # (and copyfile would write through the stale link).
        if os.path.lexists(newfile):
            os.remove(newfile)
        if 'link' in method:
            os.symlink(samfile, newfile)
        elif method == 'copy':
            copyfile(samfile, newfile)

p = Parallel(nthread = nthread, backend = 'threading', raiseExc = True)
p.run(single, [(samfile,) for samfile in samfiles])
runcmd( cmd.format(bedtools=bedtools, params=params, outfile=outfile)) remove(outfile1) remove(outfile2) return outfile infile1 = tabindex(infile1, outdir) infile2 = tabindex(infile2, outdir) chroms = [ chr.strip() for chr in check_output([tabix, '-l', infile1]).splitlines() ] if nthread > 1: p = Parallel(nthread, raiseExc=True) outfiles = p.run(runChrom, [(infile1, infile2, chrom) for chrom in chroms]) # make sure it's in the right order outfiles = sorted(outfiles, key=lambda x: chroms.index(x.split('.')[-2])) else: outfiles = [] for chrom in chroms: outfiles.append(runChrom(infile1, infile2, chrom)) with open(outfile, 'a+') as fout: for of in outfiles: with open(of) as f: fout.write(f.read()) elif tool == 'pyvcf': import vcf
# Create the working directory for the per-thread files.
makedirs(thdir)
# Base name of the SNP file up to its first dot; used as the chunk file stem.
asbname = path.basename(affysnps).split('.')[0]
# Split the SNP records into one chunk file per entry of dists: chunk i
# receives the next `dist` records pulled from `reader`.
# NOTE(review): `reader` is created outside this excerpt; presumably it
# iterates over affysnps -- confirm.
for i, dist in enumerate(dists):
    writer = TsvWriter(path.join(thdir, '{bname}.thread{i}.snp'.format(
        bname = asbname, i = i
    )))
    for _ in range(dist):
        writer.write(next(reader))
    writer.close()

# Run getAlleleCount once per thread index with (tumour BAM, chunk SNP file,
# per-thread .bamrc output path).
# NOTE(review): this iterates range(nthread) while the chunk files were written
# for enumerate(dists); presumably len(dists) == nthread -- confirm.
para = Parallel(nthread, raiseExc = True)
para.run(getAlleleCount, [
    (tumbam, path.join(
        thdir, '{bname}.thread{i}.snp'.format(bname = asbname, i = i)
    ), path.join(
        thdir, '{tumbn}.thread{i}.bamrc'.format(tumbn = path.basename(tumbam), i = i)
    )) for i in range(nthread)
])
# Merge the per-thread results into tumsnp: write a '#'-prefixed, tab-joined
# header line, then append every record of each .bamrc file in thread order.
writer = TsvWriter(tumsnp)
writer.cnames = ['Chrm', 'pos', 'A', 'C', 'G', 'T', 'Total', 'refCount', 'mutCount']
writer.writeHead(lambda cn: "#" + "\t".join(cn))
for i in range(nthread):
    subrc = path.join(
        thdir, '{tumbn}.thread{i}.bamrc'.format(tumbn = path.basename(tumbam), i = i)
    )
    # `reader` is rebound here; the object it referred to above is not closed
    # within this excerpt.
    reader = TsvReader(subrc, cnames = False)
    for r in reader:
        writer.write(r.values())
    reader.close()