pathview = {{args.pathview | repr}} shell.TOOLS.Rscript = Rscript if isinstance(cutoff, dict): if cutoff['by'] == 'p': cutoff['by'] = 'Pval' if cutoff['by'] == 'q': cutoff['by'] = 'AdjPval' reader = TsvReader(infile, **inopts) genes = [r[genecol] for r in reader] en = Enrichr(cutoff = cutoff, top = top, Rscript = Rscript) en.addList(genes, description = path.basename(infile)) para = Parallel(nthread = nthread) runPathview = lambda r, hsa: shell.Shell().Rscript(r, hsa).run() for db in dbs: outfile = path.join(outdir, prefix + '.' + db + '.txt') en.enrich(db) en.export(outfile, top = 100) if plot: plotfile = path.join(outdir, prefix + '.' + db + '.png') en.plot(plotfile, res = devpars.res, width = devpars.width, height = devpars.height) if pathview and 'KEGG' in db: pathviewRDir = path.join(outdir, prefix + '.' + db + '.pathview') pathviewRfile = path.join(pathviewRDir, 'pathview.R') shell.mkdir(pathviewRDir) with open(pathviewRfile, 'w') as f: f.write(""" {rimport}('__init__.r')
writer = MemeWriter(qfile) writer.meta = reader.meta writer.writeMeta() for _ in range(joblist[i]): try: writer.write(reader.next()) except StopIteration: break writer.close() thparams = params.copy() thparams[""] = [qfile, mfile2] thparams.thresh = qval thparams.oc = ocdir cmdps.append((tomtom, cmdargs(thparams, dash='-', equal=' '))) reader.close() Parallel(nthread, raiseExc=True).run('{} {}', cmdps) writer = TsvWriter(outfile) reader = TsvReader(path.join(ocdirs[0], 'tomtom.txt'), comment='##', cnames=lambda header: header[1:].strip().split("\t")) writer.cnames = reader.cnames writer.writeHead(lambda cnames: "#" + "\t".join(cnames)) reader.close() for ocdir in ocdirs: reader = TsvReader( path.join(ocdir, 'tomtom.txt'), comment='##', cnames=lambda header: header[1:].strip().split("\t")) for r in reader: writer.write(r)
samplevcf = "{{job.outdir}}/{{i.infile | fn}}-%s.vcf" % sample cmd = '{{args.vcftools}} %s {{i.infile | quote}} > "%s"' % (cmdargs(vtparams), samplevcf) # vcf2maf.pl --input-vcf ZYYP-ZYYB.vcf --output-maf ZYYP-ZYYB.snpEff.maf --tumor-id ZXLT-ZXLB_TUMOR --normal-id ZXLT-ZXLB_NORMAL --vep-data /path/to/vep/cache/ --filter-vcf /path/to/vep/ExAC_nonTCGA.r0.3.1.sites.vep.vcf.gz --ref-fasta /path/to/hs37d5/phase2_reference_assembly_sequence/hs37d5.fa --vep-path /path/to/miniconda2/bin params['input-vcf'] = samplevcf params['output-maf'] = "{{job.outdir}}/{{i.infile | fn}}-%s.maf" % sample params['vep-data'] = {{args.vepDb | quote}} params['vep-forks'] = {{args.nthread}} params['filter-vcf'] = {{args.filtervcf | quote}} params['ref-fasta'] = {{args.ref | quote}} params['vep-path'] = path.dirname(vep) cmd = cmd + '; {{args.vcf2maf}} --tumor-id %s %s' % (sample, cmdargs(params, equal=' ')) cmds.append(cmd) {% if args.nthread == 1 %} for cmd in cmds: runcmd(cmd) {% else %} # Note the threads may be hanging on here. p = Parallel({{args.nthread}}) p.run('{}', [(cmd,) for cmd in cmds]) {% endif %} for i, sample in enumerate(samples): singlemaf = "{{job.outdir}}/{{i.infile | fn}}-%s.maf" % sample if i == 0: runcmd('cat "%s" > {{o.outfile | quote}}' % singlemaf) else: runcmd('egrep -v "^#|^Hugo_Symbol" "%s" >> {{o.outfile | quote}}' % singlemaf) {% endif %} {% endif %}
# ---- template-supplied inputs, outputs and arguments ----
invcfs = {{i.infiles | repr}}
outfile = {{o.outfile | quote}}
nthread = {{args.nthread | int}}
joboutdir = {{job.outdir | quote}}
vcftools = {{args.vcftools | quote}}
gatk = {{args.gatk | quote}}
tabix = {{args.tabix | quote}}
ref = {{args.ref | quote}}
params = {{args.params | repr}}
tool = {{args.tool | quote}}

# Register the executables with the shell wrapper.
shell.TOOLS.vcftools = vcftools
shell.TOOLS.gatk = gatk

# Pass every input VCF through vcfIndex in parallel and continue with
# whatever it returns for each of them.
para = Parallel(nthread, raiseExc=True)
invcfs = para.run(vcfIndex, [(invcf, tabix) for invcf in invcfs])


def run_vcftools():
    """Merge the input VCFs with the `vcftools` executable into `outfile`.

    `d` and `t` default to True unless already present in `params`; the
    inputs are passed positionally and stdout is redirected to `outfile`.
    """
    params.d = params.get('d', True)
    params.t = params.get('t', True)
    params._ = invcfs
    params._stdout = outfile
    shell.Shell(equal=' ').vcftools(**params).run()


def run_gatk():
    # Configures GATK's CombineVariants walker.
    # NOTE(review): this function continues past the end of the excerpt.
    params.T = 'CombineVariants'
    params.o = outfile
    params.R = ref
# Map every file name in the metadata to its extension and its sample id.
exts = {}
for sam in sam_meta:
    fname = sam['file_name']
    parts = fname.split('.')
    ext = '.' + parts[-1]
    if ext == '.gz':
        # keep the real extension in front of the compression suffix,
        # e.g. ".txt.gz" instead of ".gz"
        ext = '.' + parts[-2] + ext
    exts[fname] = ext
    # Only the first 15 characters of the submitter id are kept
    # (presumably the sample-level part of the barcode; confirm).
    sample_ids[fname] = sam['associated_entities'][0]['entity_submitter_id'][:15]

# Collect candidate files for every extension seen in the metadata.
indir_abs = os.path.abspath(indir)
samfiles = []
for ext in set(exts.values()):
    samfiles += glob.glob(os.path.join(indir_abs, "*" + ext))
    # or direct dir from TCGA download
    samfiles += glob.glob(os.path.join(indir_abs, "*", "*" + ext))

lock = Lock()


def single(samfile):
    """Link or copy one input file into `outdir` under its sample-id name.

    Files whose base name is not listed in the metadata are skipped. The
    target is `<outdir>/<sample id><extension>`; anything already at that
    path is replaced.
    """
    bn = os.path.basename(samfile)
    if bn not in sample_ids:
        return
    newfile = os.path.join(outdir, sample_ids[bn] + exts[bn])
    # Remove-then-create is serialized so that two inputs mapping to the same
    # target cannot interleave.
    with lock:
        # lexists() rather than exists(): exists() follows symlinks and so
        # reports False for a dangling link left by an earlier run, after
        # which os.symlink() would fail because the path is still taken.
        if os.path.lexists(newfile):
            os.remove(newfile)
        if 'link' in method:
            os.symlink(samfile, newfile)
        elif method == 'copy':
            copyfile(samfile, newfile)


p = Parallel(nthread = nthread, backend = 'threading', raiseExc = True)
p.run(single, [(samfile,) for samfile in samfiles])
params = cmdargs(params, dash='-', equal=' ') runcmd( cmd.format(bedtools=bedtools, params=params, outfile=outfile)) remove(outfile1) remove(outfile2) return outfile infile1 = tabindex(infile1, outdir) infile2 = tabindex(infile2, outdir) chroms = [ chr.strip() for chr in check_output([tabix, '-l', infile1]).splitlines() ] if nthread > 1: p = Parallel(nthread, raiseExc=True) outfiles = p.run(runChrom, [(infile1, infile2, chrom) for chrom in chroms]) # make sure it's in the right order outfiles = sorted(outfiles, key=lambda x: chroms.index(x.split('.')[-2])) else: outfiles = [] for chrom in chroms: outfiles.append(runChrom(infile1, infile2, chrom)) with open(outfile, 'a+') as fout: for of in outfiles: with open(of) as f: fout.write(f.read())
reader = TsvReader(affysnps, cnames = False)

# dir to save the split file and result file
thdir = path.join(outdir, 'bamrc.nthreads')
if not path.exists(thdir):
    makedirs(thdir)

asbname = path.basename(affysnps).split('.')[0]


def _thread_snp(i):
    """Path of the SNP chunk handed to worker `i`."""
    return path.join(
        thdir,
        '{bname}.thread{i}.snp'.format(bname = asbname, i = i))


def _thread_bamrc(i):
    """Path of the result file belonging to worker `i`."""
    return path.join(
        thdir,
        '{tumbn}.thread{i}.bamrc'.format(tumbn = path.basename(tumbam), i = i))


# Split the SNP list: chunk i receives the next dists[i] records.
for i, dist in enumerate(dists):
    writer = TsvWriter(_thread_snp(i))
    for _ in range(dist):
        writer.write(next(reader))
    writer.close()

# One getAlleleCount call per chunk: (bam, chunk file, result file).
para = Parallel(nthread, raiseExc = True)
para.run(getAlleleCount, [
    (tumbam, _thread_snp(i), _thread_bamrc(i))
    for i in range(nthread)
])

# merge to tumsnp
writer = TsvWriter(tumsnp)
writer.cnames = ['Chrm', 'pos', 'A', 'C', 'G', 'T', 'Total', 'refCount', 'mutCount']
writer.writeHead(lambda cn: "#" + "\t".join(cn))
# NOTE(review): the body of this loop continues past the end of the excerpt.
for i in range(nthread):
    subrc = _thread_bamrc(i)