def testRunOutErrByFile(self):
    """Run the stdout/stderr shell command with each stream sent to its own
    open file handle, then check both output files with diffExpected."""
    stdoutPath = self.getOutputFile(".stdout")
    stderrPath = self.getOutputFile(".stderr")
    with open(stdoutPath, "w") as stdoutFh:
        with open(stderrPath, "w") as stderrFh:
            procOps.runProc(self.shOutErrCmd, stdout=stdoutFh, stderr=stderrFh)
    for ext in (".stdout", ".stderr"):
        self.diffExpected(ext)
def run_cmd(self, cmd):
    """
    Run an external command that writes the output for this task to stdout,
    capturing that stream into the task's output target.

    :param cmd: command (or pipeline) in the form accepted by runProc.
    """
    # luigi localTargets guarantee atomicity if used as a handle; the `with`
    # block leaves closing the handle to the target's own context manager
    # instead of a manual close() that is skipped when runProc raises.
    with self.output().open('w') as out_h:
        runProc(cmd, stdout=out_h)
def run_htseq(target, genome, out_path, reference):
    """Run htseq-count on the coordinate-sorted STAR alignment in out_path.

    The annotation is <reference>/<genome>/<genome>.gtf and the command's
    stdout is written to <out_path>/htseq.counts.  `target` is unused here.
    """
    gtf = os.path.join(reference, genome, genome + '.gtf')
    sorted_bam = os.path.join(out_path, 'Aligned.sortedByCoord.out.bam')
    counts_file = os.path.join(out_path, 'htseq.counts')
    htseq_opts = ['-m', 'union', '-r', 'pos', '-i', 'gene_id', '-a', '10',
                  '--stranded=no', '-f', 'bam']
    runProc(['htseq-count'] + htseq_opts + [sorted_bam, gtf], stdout=counts_file)
def run_bwa(target, ref_idx, tmp_fasta, out_dir, num_threads):
    """Pipe `bwa mem` through `samtools view -b` and `samtools sort`, writing
    the pipeline's stdout to a fresh temp file created under out_dir.

    Nothing is returned; `target` is unused in this function.
    """
    sort_prefix = tmpFileGet()
    bam_path = tmpFileGet(tmpDir=out_dir)
    align = ['bwa', 'mem', '-t', num_threads, ref_idx, tmp_fasta]
    to_bam = ['samtools', 'view', '-b', '-']
    sort_bam = ['samtools', 'sort', '-O', 'bam', '-T', sort_prefix, '-']
    runProc([align, to_bam, sort_bam], stdout=bam_path)
def run_single_star(target, genome, institute, tissue, reference, out_dir, experiment, fastq_path, num_threads,
                    ref_genome):
    """Run STAR on a single fastq, delete the fastq, then schedule
    run_feature_counts (unpaired) as the follow-on target."""
    # STAR is handed a fresh, randomly named path below the local temp dir
    star_tmp = os.path.join(target.getLocalTempDir(), "tmp_" + getRandomAlphaNumericString())
    out_path = build_out_dirs(out_dir, genome, institute, tissue, experiment) + "/"
    star_cmd = [
        'STAR',
        '--genomeDir', reference,
        '--readFilesIn', fastq_path,
        '--outFileNamePrefix', out_path,
        '--outTmpDir', star_tmp,
        '--runThreadN', num_threads,
    ]
    runProc(star_cmd + star_flags)
    os.remove(fastq_path)
    follow_on_args = (genome, out_path, reference, ref_genome, num_threads, False)
    target.setFollowOnTargetFn(run_feature_counts, args=follow_on_args)
def run_rsem(target, genome, out_path, reference, is_paired):
    """Run rsem-calculate-expression on the transcriptome-aligned BAM in
    out_path, using <reference>/<genome> as the reference name and
    <out_path>/RSEM as the output name.  '--paired-end' is appended only
    when is_paired is exactly True."""
    rsem_ref = os.path.join(reference, genome)
    tx_bam = os.path.join(out_path, 'Aligned.toTranscriptome.out.bam')
    rsem_out = os.path.join(out_path, 'RSEM')
    base_cmd = ['rsem-calculate-expression', '--bam', tx_bam,
                '--temporary-folder', target.getLocalTempDir(), rsem_ref, rsem_out]
    paired_flag = ['--paired-end'] if is_paired is True else []
    runProc(base_cmd + paired_flag)
def remap_reads(tmp_reads, index, out_bam):
    """Align tmp_reads with `bwa mem -p`, convert and sort with samtools,
    write the result to out_bam, then run `samtools index` on it."""
    sort_prefix = tmpFileGet()
    align = ['bwa', 'mem', '-p', index, tmp_reads]
    to_bam = ['samtools', 'view', '-b', '-']
    sort_bam = ['samtools', 'sort', '-T', sort_prefix, '-O', 'bam', '-']
    with open(out_bam, 'w') as bam_h:
        runProc([align, to_bam, sort_bam], stdout=bam_h)
    runProc(['samtools', 'index', out_bam])
def pipelineCompress(cmds, outFile):
    """Execute the pipeline commands, which must write to stdout, appending
    the command returned by getCompressCmd(outFile) as the final stage
    (presumably chosen from the extension of outFile).  cmds can be a single
    command as a list or a list of lists.  outFile is created atomically."""
    if isinstance(cmds[0], str):
        cmds = [cmds]
    outFileTmp = fileOps.atomicTmpFile(outFile)
    # list() so that a tuple of commands can also be extended with the extra
    # stage; tuple + list would raise TypeError
    procOps.runProc(list(cmds) + [[getCompressCmd(outFile)]], stdout=outFileTmp)
    fileOps.atomicInstall(outFileTmp, outFile)
def build_attributes(database, gencode_version, name, out_dir):
    """Dump the wgEncodeGencodeAttrs<gencode_version> table to <out_dir>/<name>.tsv.

    A tab-separated header line is written first, followed by the stdout of
    an `hgsql -Ne` query against `database`.
    """
    header = '\t'.join(['GeneId', 'GeneName', 'GeneType', 'TranscriptId', 'TranscriptType']) + '\n'
    # the keyword was previously misspelled "SELET", which makes the query fail
    query = ('SELECT geneId,geneName,geneType,transcriptId,transcriptType FROM '
             'wgEncodeGencodeAttrs{}'.format(gencode_version))
    cmd = ['hgsql', '-Ne', query, database]
    with open(os.path.join(out_dir, name + '.tsv'), 'w', buffering=-1) as outf:
        outf.write(header)
        # flush before the handle is handed to the child process so the
        # buffered header is not emitted after the query output
        outf.flush()
        runProc(cmd, stdout=outf)
def cat(target, args):
    """Merge every regular file in the target's global temp dir into
    args.outBam via `samtools merge -b <file-of-file-names>`."""
    fofn = tmpFileGet()
    candidates = (os.path.join(target.getGlobalTempDir(), entry)
                  for entry in os.listdir(target.getGlobalTempDir()))
    files = [path for path in candidates if os.path.isfile(path)]
    assert len(files) > 0
    with open(fofn, 'w') as outf:
        outf.writelines(path + "\n" for path in files)
    runProc(['samtools', 'merge', '-b', fofn, args.outBam])
def testRead(self):
    """Gzip simple1.txt, read it back through a `gzip -dc` read Pipeline,
    copy the pipeline contents to .out and check it with diffExpected."""
    srcPath = self.getInputFile("simple1.txt")
    gzPath = self.getOutputFile(".txt.gz")
    gzipCmd = ("gzip", "-c", srcPath)
    procOps.runProc(gzipCmd, stdout=gzPath)
    reader = Pipeline(("gzip", "-dc"), "r", otherEnd=gzPath)
    self.cpPlToFile(reader, ".out")
    reader.wait()
    self.diffExpected(".out")
def testWrite(self):
    """Write simple1.txt through a `gzip -1` write Pipeline, decompress the
    result with zcat into .out and check it with diffExpected."""
    plainPath = self.getOutputFile(".out")
    gzPath = self.getOutputFile(".out.gz")
    writer = Pipeline(("gzip", "-1"), "w", otherEnd=gzPath)
    self.cpFileToPl("simple1.txt", writer)
    writer.wait()
    zcatCmd = ("zcat", gzPath)
    procOps.runProc(zcatCmd, stdout=plainPath)
    self.diffExpected(".out")
def run_single_star(target, genome, institute, tissue, reference, out_dir, experiment, fastq_path, num_threads,
                    rsem, htseq):
    """Run STAR on a single fastq, then schedule run_rsem and/or run_htseq
    as follow-on targets when the matching flag is exactly True."""
    # STAR is handed a fresh, randomly named path below the local temp dir
    star_tmp = os.path.join(target.getLocalTempDir(), "tmp_" + getRandomAlphaNumericString())
    out_path = build_out_dirs(out_dir, genome, institute, tissue, experiment) + "/"
    star_cmd = [
        'STAR',
        '--genomeDir', reference,
        '--readFilesIn', fastq_path,
        '--outFileNamePrefix', out_path,
        '--outTmpDir', star_tmp,
        '--runThreadN', num_threads,
    ]
    runProc(star_cmd + star_flags)
    # NOTE(review): when both flags are True, setFollowOnTargetFn is called
    # twice -- confirm the second call does not replace the first.
    follow_ons = (
        (rsem, run_rsem, (genome, out_path, reference, False)),
        (htseq, run_htseq, (genome, out_path, reference)),
    )
    for enabled, fn, fn_args in follow_ons:
        if enabled is True:
            target.setFollowOnTargetFn(fn, args=fn_args, cpu=1)
def run_paired_star(target, genome, institute, tissue, reference, out_dir, experiment, fwd_fastq_path,
                    rev_fastq_path, num_threads, ref_genome):
    """Run STAR on a forward/reverse fastq pair, delete both fastqs, then
    schedule run_feature_counts (paired) as the follow-on target."""
    # STAR wants a temp dir that doesn't exist, so we have to give it a fresh
    # path because jobTree makes localTempDir()
    star_tmp = os.path.join(target.getLocalTempDir(), "tmp_" + getRandomAlphaNumericString())
    out_path = build_out_dirs(out_dir, genome, institute, tissue, experiment) + "/"
    star_cmd = [
        'STAR',
        '--genomeDir', reference,
        '--readFilesIn', fwd_fastq_path, rev_fastq_path,
        '--outFileNamePrefix', out_path,
        '--outTmpDir', star_tmp,
        '--runThreadN', num_threads,
    ]
    runProc(star_cmd + star_flags)
    for used_fastq in (fwd_fastq_path, rev_fastq_path):
        os.remove(used_fastq)
    follow_on_args = (genome, out_path, reference, ref_genome, num_threads, True)
    target.setFollowOnTargetFn(run_feature_counts, args=follow_on_args)
def extract_reads(bam, offset=50000):
    """Extract reads overlapping the module-level `regions` from `bam`.

    Each region is widened by `offset` bases on both sides, the selected
    reads are passed through `samtools bamshuf` and `samtools bam2fq`, and
    the pipeline's stdout is written to a temp file.

    :param bam: path of the BAM file to read.
    :param offset: number of bases added on each side of every region.
    :return: path of the temp file the pipeline output was written to.
    """
    tmp_reads = tmpFileGet(suffix='reads.fq')
    tmp_shuf = tmpFileGet()
    # NOTE(review): start - offset can go negative for regions near the
    # start of a sequence -- confirm this cannot happen for `regions`.
    region_strs = ['{}:{}-{}'.format(chrom, start - offset, stop + offset)
                   for chrom, start, stop, para in regions]
    view_cmd = ['samtools', 'view', '-b', bam]
    view_cmd.extend(region_strs)
    cmd = [view_cmd,
           ['samtools', 'bamshuf', '-Ou', '-', tmp_shuf],
           ['samtools', 'bam2fq', '-']]
    # stdout is given as a path, so the Python-side file handle that used to
    # be opened around this call was never used and has been dropped
    runProc(cmd, stdout=tmp_reads)
    return tmp_reads
def XXtestPassWrite(self):
    "using FIFO to pass pipe to another process for writing"
    # FIXME: should this be supported somehow
    # (the XX prefix keeps this out of normal unittest discovery)
    srcPath = self.getInputFile("simple1.txt")
    sortedPath = self.getOutputFile(".out")
    fifoPath = self.getOutputFile(".fifo")
    sorter = Pipeline(("sort", "-r"), "w", otherEnd=sortedPath, pipePath=fifoPath)
    catCmd = ["cat"]
    procOps.runProc(catCmd, stdin=srcPath, stdout=sorter.pipePath)
    sorter.wait()
    self.diffExpected(".out")
def cat(target, args):
    """Merge every regular file in the target's global temp dir into
    args.outBam via `samtools merge -b <file-of-file-names>`."""
    fofn = tmpFileGet()
    files = []
    for entry in os.listdir(target.getGlobalTempDir()):
        path = os.path.join(target.getGlobalTempDir(), entry)
        if os.path.isfile(path):
            files.append(path)
    assert len(files) > 0
    with open(fofn, 'w') as outf:
        outf.write("".join(path + "\n" for path in files))
    merge_cmd = ['samtools', 'merge', '-b', fofn, args.outBam]
    runProc(merge_cmd)
def XXtestPassRead(self):
    "using FIFO to pass pipe to another process for reading"
    # FIXME: should this be supported somehow
    # (the XX prefix keeps this out of normal unittest discovery)
    srcPath = self.getInputFile("simple1.txt")
    gzPath = self.getOutputFile(".txt.gz")
    copyPath = self.getOutputFile(".out")
    gzipCmd = ("gzip", "-c", srcPath)
    procOps.runProc(gzipCmd, stdout=gzPath)
    reader = Pipeline(("gzip", "-dc"), "r", otherEnd=gzPath)
    procOps.runProc(["cat"], stdin=reader.pipePath, stdout=copyPath)
    reader.wait()
    self.diffExpected(".out")
def run_feature_counts(target, genome, out_path, reference, ref_genome, num_threads, is_paired=False):
    """Run featureCounts (CDS features, grouped by gene_id) on
    <out_path>/Aligned.out.bam.

    The GTF is <reference>/<ref_genome>.gtf, falling back to `genome` when
    ref_genome is None.  Both the GTF and the BAM must exist.  '-p' is added
    only when is_paired is exactly True.
    """
    gtf_name = genome if ref_genome is None else ref_genome
    ref = os.path.join(reference, gtf_name + '.gtf')
    assert os.path.exists(ref), ref
    bam = os.path.join(out_path, 'Aligned.out.bam')
    assert os.path.exists(bam), bam
    counts_file = os.path.join(out_path, 'Aligned.out.bam.counts.cds')
    base_cmd = ['featureCounts', '-T', num_threads, '-t', 'CDS', '-g', 'gene_id',
                '-a', ref, '-o', counts_file, bam]
    paired_flag = ['-p'] if is_paired is True else []
    runProc(base_cmd + paired_flag)
def extract_fastq(target, genome, institute, tissue, reference, out_dir, experiment, bam_path, num_threads,
                  ref_genome):
    """Convert bam_path to fastq with `samtools fastq` and hand the result
    to run_single_star or run_paired_star, depending on what
    is_paired_sequencing reports for the BAM."""
    if not is_paired_sequencing(bam_path):
        fastq_path = tmpFileGet(prefix=experiment)
        runProc(['samtools', 'fastq', '-0', fastq_path, bam_path])
        run_single_star(target, genome, institute, tissue, reference, out_dir, experiment, fastq_path,
                        num_threads, ref_genome)
        return
    fwd_fastq_path = tmpFileGet(prefix=experiment, suffix='fwd.fastq')
    rev_fastq_path = tmpFileGet(prefix=experiment, suffix='rev.fastq')
    runProc(['samtools', 'fastq', '-1', fwd_fastq_path, '-2', rev_fastq_path, bam_path])
    run_paired_star(target, genome, institute, tissue, reference, out_dir, experiment, fwd_fastq_path,
                    rev_fastq_path, num_threads, ref_genome)
def run_cmd(self, cmd, tmp_files):
    """
    Run an external command that produces this task's outputs as several
    files, then install each temporary file onto the matching output target
    with atomicInstall.

    tmp_files entries are paired positionally with self.output(); each may
    be a luigi.LocalTarget or a path string.  Any other type raises
    NotImplementedError.
    """
    runProc(cmd)
    for tmp_f, final in zip(tmp_files, self.output()):
        final.makedirs()
        if isinstance(tmp_f, luigi.LocalTarget):
            src_path = tmp_f.path
        elif isinstance(tmp_f, str):
            src_path = tmp_f
        else:
            raise NotImplementedError
        atomicInstall(src_path, final.path)
def get_genes(database, name, out_dir, include_chroms, convert_ucsc):
    """Dump the ensGene table of `database` to <out_dir>/<name>.gp.

    :param include_chroms: None for all rows, otherwise a non-empty sequence
        of chrom names; rows matching any of them are kept.
    :param convert_ucsc: when exactly True, the output is additionally piped
        through bin/ucscToEnsemblChrom.
    :raises ValueError: if include_chroms is an empty sequence.
    """
    query = 'select * from ensGene'
    if include_chroms is not None:
        if not include_chroms:
            raise ValueError('include_chroms must be None or a non-empty sequence of chromosome names')
        # A row has a single chrom value, so alternatives have to be OR-ed;
        # AND-ing them matches nothing as soon as two names are given.
        # NOTE(review): names are interpolated into the SQL unescaped --
        # assumes they come from trusted configuration.
        query += ' where ' + ' or '.join('chrom = "{}"'.format(c) for c in include_chroms)
    cmd = [['hgsql', '-Ne', query, database],
           ['cut', '-f', '2-']]  # strip bin name
    if convert_ucsc is True:
        # append as ONE pipeline stage; `cmd += [flat list]` would splice
        # each argument in as if it were a separate command
        cmd.append(['bin/ucscToEnsemblChrom', '-v', 'chromCol=2', '/dev/stdin'])
    with open(os.path.join(out_dir, name + '.gp'), 'w') as outf:
        runProc(cmd, stdout=outf)
def main():
    """For every BAM pair returned by get_files(), filter both BAMs using
    the names returned by find_not_unique_reads and run featureCounts on
    each filtered BAM.

    The GTF is mm10_gtf when 'mm10' occurs in the BAM path, else strain_gtf.
    Counts are written next to the filtered BAM as <bam>.filtered.bam.counts.cds.
    """
    file_map = get_files()
    # the experiment key is not needed, only the BAM pair
    for _experiment, (bam1, bam2) in file_map.iteritems():
        # pairing is decided from the first BAM only
        is_paired = is_paired_sequencing(bam1)
        unique_names = find_not_unique_reads(bam1, bam2, is_paired)
        for b in [bam1, bam2]:
            out_b = b + '.filtered.bam'
            filter_bam(b, out_b, unique_names)
            ref = mm10_gtf if 'mm10' in b else strain_gtf
            out_counts = out_b + '.counts.cds'
            cmd = ['featureCounts', '-T', '1', '-t', 'CDS', '-g', 'gene_id', '--primary', '--ignoreDup',
                   '-Q', '30', '-a', ref, '-o', out_counts, out_b]
            if is_paired is True:
                cmd.append('-p')
            runProc(cmd)
def extract_reads(bam, offset=50000):
    """Extract reads overlapping the module-level `regions` from `bam`.

    Each region is widened by `offset` bases on both sides, the selected
    reads are passed through `samtools bamshuf` and `samtools bam2fq`, and
    the pipeline's stdout is written to a temp file.

    :param bam: path of the BAM file to read.
    :param offset: number of bases added on each side of every region.
    :return: path of the temp file the pipeline output was written to.
    """
    tmp_reads = tmpFileGet(suffix='reads.fq')
    tmp_shuf = tmpFileGet()
    # NOTE(review): start - offset can go negative for regions near the
    # start of a sequence -- confirm this cannot happen for `regions`.
    region_strs = [
        '{}:{}-{}'.format(chrom, start - offset, stop + offset)
        for chrom, start, stop, para in regions
    ]
    view_cmd = ['samtools', 'view', '-b', bam]
    view_cmd.extend(region_strs)
    cmd = [
        view_cmd,
        ['samtools', 'bamshuf', '-Ou', '-', tmp_shuf],
        ['samtools', 'bam2fq', '-'],
    ]
    # stdout is given as a path, so the Python-side file handle that used to
    # be opened around this call was never used and has been dropped
    runProc(cmd, stdout=tmp_reads)
    return tmp_reads
def testRunInOut(self):
    """Run `sort` with stdin and stdout given as file names, require a
    return value of 0 and check the output with diffExpected."""
    outf = self.getOutputFile(".txt")
    ret = procOps.runProc(["sort"], stdin=self.getInputFile("simple1.txt"), stdout=outf)
    # assertEqual replaces the deprecated failUnlessEqual alias
    self.assertEqual(ret, 0)
    self.diffExpected(".txt")
def main():
    """Load reference and target data for one experiment, build the counts
    tables, run run_DEseq.R once per genome and generate the plots."""
    args = parse_args()
    data = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    ref_path = os.path.join(args.ref_dir, args.experiment)
    tgt_path = os.path.join(args.tgt_dir, args.experiment)
    if args.genomes is None:
        # default to every entry found in the reference directory
        args.genomes = os.listdir(ref_path)
    common_name_map = get_common_name_map(args.attrs)
    for label, path in (('ref', ref_path), ('tgt', tgt_path)):
        load_dir(data, label, path, args.genomes, common_name_map)
    base_counts_dir = os.path.join(args.counts_dir, args.experiment)
    construct_counts_tables(data, base_counts_dir)
    deseq_path = os.path.join(args.deseq_dir, args.experiment)
    ensureDir(deseq_path)
    for genome in args.genomes:
        deseq_cmd = ['Rscript', 'run_DEseq.R', genome, deseq_path, base_counts_dir]
        runProc(deseq_cmd)
    out_dir = os.path.join(args.out_dir, args.experiment)
    generate_plots(args.genomes, deseq_path, out_dir)
def get_genes(database, gencode_version, gencode_set, name, out_dir, include_chroms, filter_chroms, convert_ucsc):
    """Dump wgEncodeGencode<gencode_set><gencode_version> from `database`
    to <out_dir>/<name>.gp.

    :param include_chroms: None for no restriction, otherwise a non-empty
        sequence of chrom names; rows matching any of them are kept.
    :param filter_chroms: None or a sequence of chrom names to exclude.
    :param convert_ucsc: when exactly True, the output is additionally piped
        through bin/ucscToEnsemblChrom.
    :raises ValueError: if include_chroms is an empty sequence.
    """
    query = 'SELECT * FROM wgEncodeGencode{}{}'.format(gencode_set, gencode_version)
    # NOTE(review): names are interpolated into the SQL unescaped -- assumes
    # they come from trusted configuration.
    conditions = []
    if include_chroms is not None:
        if not include_chroms:
            raise ValueError('include_chroms must be None or a non-empty sequence of chromosome names')
        # A row has a single chrom value, so included names are OR-ed (and
        # parenthesised); AND-ing them matches nothing for two or more names.
        included = ' OR '.join('chrom = "{}"'.format(c) for c in include_chroms)
        conditions.append('(' + included + ')')
    if filter_chroms:
        conditions.extend('chrom != "{}"'.format(c) for c in filter_chroms)
    if conditions:
        # joining here also supplies the connector that was missing when
        # both include_chroms and filter_chroms were given
        query += ' WHERE ' + ' AND '.join(conditions)
    cmd = [['hgsql', '-Ne', query, database],
           ['cut', '-f', '2-']]  # strip bin name
    if convert_ucsc is True:
        # append as ONE pipeline stage; `cmd += [flat list]` would splice
        # each argument in as if it were a separate command
        cmd.append(['bin/ucscToEnsemblChrom', '-v', 'chromCol=2', '/dev/stdin'])
    with open(os.path.join(out_dir, name + '.gp'), 'w') as outf:
        runProc(cmd, stdout=outf)
def map_to_tgt(ref_fa, ref_sizes, tgt_fa, coding_transcripts, coding_genes, chain, tx_dict):
    """Write the intervals from load_tx_intervals to a temp file, then run
    bedToGenePred, genePredToFakePsl, pslMap and a sort | pslCDnaFilter
    pipeline over it.

    Intermediate temp files are removed; the path of the filtered output is
    returned.  ref_fa and tgt_fa are not used in this function.
    """
    bed = tmpFileGet()
    gp = tmpFileGet()
    fake_psl = tmpFileGet()
    fwd_unfiltered = tmpFileGet()
    fwd_filtered = tmpFileGet()
    r_intervals = load_tx_intervals(tx_dict, coding_genes, coding_transcripts)
    with open(bed, 'w') as bed_h:
        for interval in r_intervals.itervalues():
            bed_h.write(interval)
    steps = [
        ['bedToGenePred', bed, gp],
        ['genePredToFakePsl', '-chromSize={}'.format(ref_sizes), 'na', gp, fake_psl, '/dev/null'],
        ['pslMap', '-chainMapFile', fake_psl, chain, fwd_unfiltered],
        # final step is a two-stage pipeline: sort, then filter reading stdin
        [['sort', '-k10,10', fwd_unfiltered],
         ['pslCDnaFilter', '-localNearBest=0.05', '-filterWeirdOverlapped', '-decayMinCover',
          '/dev/stdin', fwd_filtered]],
    ]
    for step in steps:
        runProc(step)
    for scratch in (bed, gp, fake_psl, fwd_unfiltered):
        os.remove(scratch)
    return fwd_filtered
def testRunErr(self):
    """Running `false` must raise pipeline.ProcException with the expected
    exit-status message."""
    failingCmd = ["false"]
    expectedMsg = 'process exited 1: false'
    with self.assertRaises(pipeline.ProcException) as raised:
        procOps.runProc(failingCmd, stdin=self.getInputFile("simple1.txt"))
    actualMsg = str(raised.exception)
    self.assertEqual(actualMsg, expectedMsg)
def testRunOutErrSameByFile(self):
    """Send stdout and stderr of the shell command to one shared open file
    handle and check the combined output with diffExpected."""
    # same file handle for stdout/stderr; make sure it's not closed too soon
    combinedPath = self.getOutputFile(".stdouterr")
    with open(combinedPath, "w") as combinedFh:
        streams = dict(stdout=combinedFh, stderr=combinedFh)
        procOps.runProc(self.shOutErrCmd, **streams)
    self.diffExpected(".stdouterr")
def testRunOutErrByName(self):
    """Run the stdout/stderr shell command with each stream given as a file
    name, then check both output files with diffExpected."""
    stdoutPath = self.getOutputFile(".stdout")
    stderrPath = self.getOutputFile(".stderr")
    procOps.runProc(self.shOutErrCmd, stdout=stdoutPath, stderr=stderrPath)
    for ext in (".stdout", ".stderr"):
        self.diffExpected(ext)
def testRunInOut(self):
    """Run `sort` with stdin and stdout given as file names and check the
    output with diffExpected."""
    sortedPath = self.getOutputFile(".txt")
    sortCmd = ["sort"]
    procOps.runProc(sortCmd, stdin=self.getInputFile("simple1.txt"), stdout=sortedPath)
    self.diffExpected(".txt")
def BROKEN_testRunFileOut(self):
    """Run `sort <input>` with stdout given as an open file handle and check
    the output with diffExpected.  The BROKEN_ prefix keeps this out of
    normal unittest discovery."""
    sortedPath = self.getOutputFile(".txt")
    with open(sortedPath, "w") as sortedFh:
        sortCmd = ["sort", self.getInputFile("simple1.txt")]
        procOps.runProc(sortCmd, stdout=sortedFh)
    self.diffExpected(".txt")
def testRunErr(self):
    """Running `false` (non-zero exit) through runProc must raise.

    The previous version caught the exception and asserted nothing, so it
    passed even when no error was raised; it also used the Python 2-only
    `except Exception, ex` syntax.
    """
    # resolved outside the assertRaises block so that a failure to locate the
    # input file cannot be mistaken for the expected process error
    inf = self.getInputFile("simple1.txt")
    with self.assertRaises(Exception):
        procOps.runProc(["false"], stdin=inf)
def testReadBzip2(self):
    """Compress mrna1.tsv with bzip2 into an output file and pass that file
    to readMRna1."""
    bz2Path = self.getOutputFile("tsv.bz2")
    bzipCmd = ["bzip2", "-c", self.getInputFile("mrna1.tsv")]
    procOps.runProc(bzipCmd, stdout=bz2Path)
    self.readMRna1(bz2Path)
def testReadGzip(self):
    """Compress mrna1.tsv with gzip into an output file and pass that file
    to readMRna1."""
    gzPath = self.getOutputFile("tsv.gz")
    gzipCmd = ["gzip", "-c", self.getInputFile("mrna1.tsv")]
    procOps.runProc(gzipCmd, stdout=gzPath)
    self.readMRna1(gzPath)