Esempio n. 1
0
 def testRunOutErrByFile(self):
     """Run the stdout/stderr shell command with both streams redirected to
     open file handles, then compare each captured file with its expected
     output."""
     stdoutPath = self.getOutputFile(".stdout")
     stderrPath = self.getOutputFile(".stderr")
     with open(stdoutPath, "w") as stdoutFh:
         with open(stderrPath, "w") as stderrFh:
             procOps.runProc(self.shOutErrCmd, stdout=stdoutFh, stderr=stderrFh)
     # both handles are closed before the comparisons run
     for ext in (".stdout", ".stderr"):
         self.diffExpected(ext)
 def run_cmd(self, cmd):
     """
     Run an external command whose stdout is the output file of this task,
     capturing that stream into the handle opened on the task's output.
     """
     # luigi localTargets guarantee atomicity if used as a handle
     target_handle = self.output().open('w')
     runProc(cmd, stdout=target_handle)
     # reached only when runProc returned normally
     target_handle.close()
def run_htseq(target, genome, out_path, reference):
    """Run htseq-count on the coordinate-sorted BAM found in out_path.

    The annotation is <reference>/<genome>/<genome>.gtf and the command's
    stdout is sent to <out_path>/htseq.counts.  ``target`` is accepted but not
    used in this function.
    """
    annotation_gtf = os.path.join(reference, genome, genome + '.gtf')
    sorted_bam = os.path.join(out_path, 'Aligned.sortedByCoord.out.bam')
    counts_file = os.path.join(out_path, 'htseq.counts')
    htseq_options = ['-m', 'union', '-r', 'pos', '-i', 'gene_id', '-a', '10', '--stranded=no', '-f', 'bam']
    runProc(['htseq-count'] + htseq_options + [sorted_bam, annotation_gtf], stdout=counts_file)
def run_bwa(target, ref_idx, tmp_fasta, out_dir, num_threads):
    """Pipe ``bwa mem`` output through ``samtools view`` and ``samtools sort``.

    The sorted BAM goes to a path obtained from tmpFileGet(tmpDir=out_dir).
    ``target`` is not used here and the output path is not returned.
    """
    sort_prefix = tmpFileGet()
    sorted_bam = tmpFileGet(tmpDir=out_dir)
    align_stage = ['bwa', 'mem', '-t', num_threads, ref_idx, tmp_fasta]
    to_bam_stage = ['samtools', 'view', '-b', '-']
    sort_stage = ['samtools', 'sort', '-O', 'bam', '-T', sort_prefix, '-']
    runProc([align_stage, to_bam_stage, sort_stage], stdout=sorted_bam)
def run_bwa(target, ref_idx, tmp_fasta, out_dir, num_threads):
    """Align tmp_fasta against ref_idx with bwa mem and sort the result.

    Three stages run as one pipeline: bwa mem, samtools view -b, samtools
    sort.  The pipeline's stdout is a path from tmpFileGet(tmpDir=out_dir);
    that path is not returned, and ``target`` is unused in this function.
    """
    samtools_sort_tmp = tmpFileGet()
    bam_out = tmpFileGet(tmpDir=out_dir)
    pipeline = [
        ['bwa', 'mem', '-t', num_threads, ref_idx, tmp_fasta],
        ['samtools', 'view', '-b', '-'],
        ['samtools', 'sort', '-O', 'bam', '-T', samtools_sort_tmp, '-'],
    ]
    runProc(pipeline, stdout=bam_out)
def run_single_star(target, genome, institute, tissue, reference, out_dir, experiment, fastq_path, num_threads, ref_genome):
    """Run STAR on one FASTQ file, delete that FASTQ, then register
    run_feature_counts as the follow-on with is_paired=False."""
    star_tmp = os.path.join(target.getLocalTempDir(), "tmp_" + getRandomAlphaNumericString())
    out_path = build_out_dirs(out_dir, genome, institute, tissue, experiment) + "/"
    io_args = ['--genomeDir', reference, '--readFilesIn', fastq_path, '--outFileNamePrefix', out_path]
    run_args = ['--outTmpDir', star_tmp, '--runThreadN', num_threads]
    runProc(['STAR'] + io_args + run_args + star_flags)
    # the FASTQ is only removed once STAR has returned without raising
    os.remove(fastq_path)
    follow_on_args = (genome, out_path, reference, ref_genome, num_threads, False)
    target.setFollowOnTargetFn(run_feature_counts, args=follow_on_args)
def run_rsem(target, genome, out_path, reference, is_paired):
    """Run rsem-calculate-expression on the transcriptome BAM in out_path.

    The reference is <reference>/<genome>, the output prefix is
    <out_path>/RSEM, and the target's local temp dir is passed as
    --temporary-folder.  ``--paired-end`` is appended when is_paired is True.
    """
    rsem_ref = os.path.join(reference, genome)
    tx_bam = os.path.join(out_path, 'Aligned.toTranscriptome.out.bam')
    out_prefix = os.path.join(out_path, 'RSEM')
    rsem_cmd = ['rsem-calculate-expression', '--bam', tx_bam,
                '--temporary-folder', target.getLocalTempDir(), rsem_ref, out_prefix]
    # only the literal True enables the flag, not merely truthy values
    paired_flag = ['--paired-end'] if is_paired is True else []
    runProc(rsem_cmd + paired_flag)
def remap_reads(tmp_reads, index, out_bam):
    """Remap tmp_reads with ``bwa mem -p`` against index, write the sorted BAM
    to out_bam, then run ``samtools index`` on it."""
    sort_prefix = tmpFileGet()
    pipeline = [
        ['bwa', 'mem', '-p', index, tmp_reads],
        ['samtools', 'view', '-b', '-'],
        ['samtools', 'sort', '-T', sort_prefix, '-O', 'bam', '-'],
    ]
    with open(out_bam, 'w') as bam_handle:
        runProc(pipeline, stdout=bam_handle)
    # indexing starts only after the BAM handle has been closed
    runProc(['samtools', 'index', out_bam])
def remap_reads(tmp_reads, index, out_bam):
    """Align reads to index and produce a sorted, indexed BAM at out_bam.

    The alignment pipeline is bwa mem -p | samtools view -b | samtools sort,
    with its stdout captured in out_bam; ``samtools index`` runs afterwards.
    """
    samtools_tmp = tmpFileGet()
    align_stage = ['bwa', 'mem', '-p', index, tmp_reads]
    convert_stage = ['samtools', 'view', '-b', '-']
    sort_stage = ['samtools', 'sort', '-T', samtools_tmp, '-O', 'bam', '-']
    with open(out_bam, 'w') as out_handle:
        runProc([align_stage, convert_stage, sort_stage], stdout=out_handle)
    index_cmd = ['samtools', 'index', out_bam]
    runProc(index_cmd)
Esempio n. 10
0
def pipelineCompress(cmds, outFile):
    """Run a pipeline of commands that write to stdout, with the command
    named by getCompressCmd(outFile) appended as the final stage.  cmds may
    be a single command (a sequence of strings) or a list of such commands.
    Output goes to a temporary file that is then installed as outFile."""
    # a lone command is detected by its first element being a string
    pipeline = [cmds] if isinstance(cmds[0], str) else cmds
    tmpOut = fileOps.atomicTmpFile(outFile)
    compressStage = [getCompressCmd(outFile)]
    procOps.runProc(pipeline + [compressStage], stdout=tmpOut)
    fileOps.atomicInstall(tmpOut, outFile)
def build_attributes(database, gencode_version, name, out_dir):
    """Dump the GENCODE attributes table to <out_dir>/<name>.tsv.

    A header line (GeneId, GeneName, GeneType, TranscriptId, TranscriptType)
    is written first, followed by the output of an hgsql query against the
    table wgEncodeGencodeAttrs<gencode_version> in ``database``.

    :param database: database name passed to hgsql.
    :param gencode_version: suffix appended to the table name.
    :param name: base name of the output file (``.tsv`` is added).
    :param out_dir: directory in which the output file is created.
    """
    header = '\t'.join(['GeneId', 'GeneName', 'GeneType', 'TranscriptId', 'TranscriptType']) + '\n'
    # the keyword was previously misspelled "SELET", which is not valid SQL
    query = ('SELECT geneId,geneName,geneType,transcriptId,transcriptType FROM '
             'wgEncodeGencodeAttrs{}'.format(gencode_version))
    cmd = ['hgsql', '-Ne', query, database]
    with open(os.path.join(out_dir, name + '.tsv'), 'w', buffering=-1) as outf:
        outf.write(header)
        # push the header out of Python's buffer before the child process
        # starts writing to the same file
        outf.flush()
        runProc(cmd, stdout=outf)
def cat(target, args):
    """Merge every regular file in the target's global temp dir into
    args.outBam using ``samtools merge -b`` with a file-of-file-names."""
    fofn = tmpFileGet()
    candidates = (os.path.join(target.getGlobalTempDir(), entry)
                  for entry in os.listdir(target.getGlobalTempDir()))
    # sub-directories and other non-regular entries are ignored
    bam_paths = [path for path in candidates if os.path.isfile(path)]
    assert len(bam_paths) > 0
    with open(fofn, 'w') as list_handle:
        list_handle.writelines(path + "\n" for path in bam_paths)
    runProc(['samtools', 'merge', '-b', fofn, args.outBam])
Esempio n. 13
0
    def testRead(self):
        """Gzip an input file, read it back through a "gzip -dc" Pipeline and
        compare the copied output with the expected file."""
        srcPath = self.getInputFile("simple1.txt")
        gzPath = self.getOutputFile(".txt.gz")
        compressCmd = ("gzip", "-c", srcPath)
        procOps.runProc(compressCmd, stdout=gzPath)

        reader = Pipeline(("gzip", "-dc"), "r", otherEnd=gzPath)
        self.cpPlToFile(reader, ".out")
        reader.wait()

        self.diffExpected(".out")
Esempio n. 14
0
    def testRead(self):
        """Round-trip check: compress simple1.txt with gzip, then read it
        through a decompressing Pipeline opened for reading."""
        plainInput = self.getInputFile("simple1.txt")
        compressedPath = self.getOutputFile(".txt.gz")
        procOps.runProc(("gzip", "-c", plainInput), stdout=compressedPath)

        decompressCmd = ("gzip", "-dc")
        gunzipPl = Pipeline(decompressCmd, "r", otherEnd=compressedPath)
        self.cpPlToFile(gunzipPl, ".out")
        gunzipPl.wait()

        self.diffExpected(".out")
Esempio n. 15
0
    def testWrite(self):
        """Write simple1.txt through a "gzip -1" Pipeline, decompress the
        result with zcat and compare it with the expected output."""
        plainOut = self.getOutputFile(".out")
        gzOut = self.getOutputFile(".out.gz")

        writer = Pipeline(("gzip", "-1"), "w", otherEnd=gzOut)
        self.cpFileToPl("simple1.txt", writer)
        writer.wait()

        zcatCmd = ("zcat", gzOut)
        procOps.runProc(zcatCmd, stdout=plainOut)
        self.diffExpected(".out")
Esempio n. 16
0
    def testWrite(self):
        """Round-trip check: copy simple1.txt into a compressing Pipeline
        opened for writing, then zcat the compressed file for comparison."""
        decompressedPath = self.getOutputFile(".out")
        compressedPath = self.getOutputFile(".out.gz")

        compressCmd = ("gzip", "-1")
        gzipPl = Pipeline(compressCmd, "w", otherEnd=compressedPath)
        self.cpFileToPl("simple1.txt", gzipPl)
        gzipPl.wait()

        procOps.runProc(("zcat", compressedPath), stdout=decompressedPath)
        self.diffExpected(".out")
def run_single_star(target, genome, institute, tissue, reference, out_dir, experiment, fastq_path, num_threads, rsem,
                    htseq):
    """Run STAR on one FASTQ file and register follow-on quantification.

    run_rsem is set as follow-on when ``rsem`` is True and run_htseq when
    ``htseq`` is True, each with cpu=1.
    """
    star_tmp = os.path.join(target.getLocalTempDir(), "tmp_" + getRandomAlphaNumericString())
    out_path = build_out_dirs(out_dir, genome, institute, tissue, experiment) + "/"
    io_args = ['--genomeDir', reference, '--readFilesIn', fastq_path, '--outFileNamePrefix', out_path]
    run_args = ['--outTmpDir', star_tmp, '--runThreadN', num_threads]
    runProc(['STAR'] + io_args + run_args + star_flags)
    # NOTE(review): when both flags are True, setFollowOnTargetFn is called
    # twice on the same target -- confirm the target supports that
    if rsem is True:
        rsem_args = (genome, out_path, reference, False)
        target.setFollowOnTargetFn(run_rsem, args=rsem_args, cpu=1)
    if htseq is True:
        htseq_args = (genome, out_path, reference)
        target.setFollowOnTargetFn(run_htseq, args=htseq_args, cpu=1)
def run_paired_star(target, genome, institute, tissue, reference, out_dir, experiment, fwd_fastq_path, rev_fastq_path,
                    num_threads, ref_genome):
    """Run STAR on a forward/reverse FASTQ pair, delete both FASTQ files,
    then register run_feature_counts as the follow-on with is_paired=True."""
    # STAR wants a temp dir that does not exist yet, so hand it a fresh path
    # underneath the local temp dir rather than the local temp dir itself
    star_tmp = os.path.join(target.getLocalTempDir(), "tmp_" + getRandomAlphaNumericString())
    out_path = build_out_dirs(out_dir, genome, institute, tissue, experiment) + "/"
    io_args = ['--genomeDir', reference, '--readFilesIn', fwd_fastq_path, rev_fastq_path,
               '--outFileNamePrefix', out_path]
    run_args = ['--outTmpDir', star_tmp, '--runThreadN', num_threads]
    runProc(['STAR'] + io_args + run_args + star_flags)
    for used_fastq in (fwd_fastq_path, rev_fastq_path):
        os.remove(used_fastq)
    follow_on_args = (genome, out_path, reference, ref_genome, num_threads, True)
    target.setFollowOnTargetFn(run_feature_counts, args=follow_on_args)
def extract_reads(bam, offset=50000):
    """Extract the reads overlapping the padded ``regions`` into a FASTQ file.

    Each entry of the module-level ``regions`` is unpacked as
    (chrom, start, stop, para) and widened by ``offset`` on both sides.  The
    pipeline is samtools view -b | samtools bamshuf | samtools bam2fq.

    :param bam: BAM file passed to ``samtools view``.
    :param offset: number of bases added on each side of every region.
    :return: path of the temporary FASTQ file holding the extracted reads.
    """
    tmp_reads = tmpFileGet(suffix='reads.fq')
    tmp_shuf = tmpFileGet()
    region_strs = ['{}:{}-{}'.format(chrom, start - offset, stop + offset) for chrom, start, stop, _ in regions]
    view_cmd = ['samtools', 'view', '-b', bam] + region_strs
    cmd = [view_cmd,
           ['samtools', 'bamshuf', '-Ou', '-', tmp_shuf],
           ['samtools', 'bam2fq', '-']]
    # hand the open handle to runProc; previously the file was opened here but
    # the path was passed instead, leaving the handle unused
    with open(tmp_reads, 'w') as reads_h:
        runProc(cmd, stdout=reads_h)
    return tmp_reads
Esempio n. 20
0
    def XXtestPassWrite(self):
        "using FIFO to pass pipe to another process for writing"
        # FIXME: should this be supported somehow
        srcPath = self.getInputFile("simple1.txt")
        sortedPath = self.getOutputFile(".out")
        fifoPath = self.getOutputFile(".fifo")

        sorter = Pipeline(("sort", "-r"), "w", otherEnd=sortedPath, pipePath=fifoPath)
        # cat feeds the input file into the pipeline through its pipe path
        procOps.runProc(["cat"], stdin=srcPath, stdout=sorter.pipePath)
        sorter.wait()

        self.diffExpected(".out")
Esempio n. 21
0
    def XXtestPassWrite(self):
        "using FIFO to pass pipe to another process for writing"
        # FIXME: should this be supported somehow
        inputPath = self.getInputFile("simple1.txt")
        outputPath = self.getOutputFile(".out")
        namedPipe = self.getOutputFile(".fifo")

        sortCmd = ("sort", "-r")
        writePl = Pipeline(sortCmd, "w", otherEnd=outputPath, pipePath=namedPipe)
        catCmd = ["cat"]
        procOps.runProc(catCmd, stdin=inputPath, stdout=writePl.pipePath)
        writePl.wait()

        self.diffExpected(".out")
def cat(target, args):
    """Collect the regular files found in the target's global temp dir, list
    them in a temporary file-of-file-names and merge them into args.outBam
    with ``samtools merge -b``."""
    fofn = tmpFileGet()
    files = []
    for entry in os.listdir(target.getGlobalTempDir()):
        files.append(os.path.join(target.getGlobalTempDir(), entry))
    # keep regular files only
    files = list(filter(os.path.isfile, files))
    assert len(files) > 0
    with open(fofn, 'w') as fofn_handle:
        fofn_handle.write("".join(path + "\n" for path in files))
    merge_cmd = ['samtools', 'merge', '-b', fofn, args.outBam]
    runProc(merge_cmd)
Esempio n. 23
0
    def XXtestPassRead(self):
        "using FIFO to pass pipe to another process for reading"
        # FIXME: should this be supported somehow
        srcPath = self.getInputFile("simple1.txt")
        gzPath = self.getOutputFile(".txt.gz")
        copyPath = self.getOutputFile(".out")
        procOps.runProc(("gzip", "-c", srcPath), stdout=gzPath)

        reader = Pipeline(("gzip", "-dc"), "r", otherEnd=gzPath)
        # cat copies whatever the pipeline exposes on its pipe path
        procOps.runProc(["cat"], stdin=reader.pipePath, stdout=copyPath)
        reader.wait()

        self.diffExpected(".out")
Esempio n. 24
0
    def XXtestPassRead(self):
        "using FIFO to pass pipe to another process for reading"
        # FIXME: should this be supported somehow
        plainInput = self.getInputFile("simple1.txt")
        compressedPath = self.getOutputFile(".txt.gz")
        copiedOutput = self.getOutputFile(".out")
        gzipCmd = ("gzip", "-c", plainInput)
        procOps.runProc(gzipCmd, stdout=compressedPath)

        gunzipPl = Pipeline(("gzip", "-dc"), "r", otherEnd=compressedPath)
        catCmd = ["cat"]
        procOps.runProc(catCmd, stdin=gunzipPl.pipePath, stdout=copiedOutput)
        gunzipPl.wait()

        self.diffExpected(".out")
def run_feature_counts(target, genome, out_path, reference, ref_genome, num_threads, is_paired=False):
    """Count CDS reads per gene_id with featureCounts for Aligned.out.bam.

    The annotation is <reference>/<ref_genome>.gtf, or <reference>/<genome>.gtf
    when ref_genome is None.  Both the annotation and the BAM must exist.
    Counts go to <out_path>/Aligned.out.bam.counts.cds; ``-p`` is appended
    when is_paired is True.
    """
    gtf_basename = genome if ref_genome is None else ref_genome
    ref = os.path.join(reference, gtf_basename + '.gtf')
    assert os.path.exists(ref), ref
    bam = os.path.join(out_path, 'Aligned.out.bam')
    assert os.path.exists(bam), bam
    counts_file = os.path.join(out_path, 'Aligned.out.bam.counts.cds')
    base_cmd = ['featureCounts', '-T', num_threads, '-t', 'CDS', '-g', 'gene_id',
                '-a', ref, '-o', counts_file, bam]
    # only the literal True adds the paired-end flag
    paired_flag = ['-p'] if is_paired is True else []
    runProc(base_cmd + paired_flag)
def extract_fastq(target, genome, institute, tissue, reference, out_dir, experiment, bam_path, num_threads, ref_genome):
    """Convert bam_path to FASTQ with ``samtools fastq`` and hand the result
    to run_paired_star or run_single_star, depending on what
    is_paired_sequencing(bam_path) reports."""
    if not is_paired_sequencing(bam_path):
        single_fastq = tmpFileGet(prefix=experiment)
        runProc(['samtools', 'fastq', '-0', single_fastq, bam_path])
        run_single_star(target, genome, institute, tissue, reference, out_dir, experiment, single_fastq,
                        num_threads, ref_genome)
        return
    fwd_fastq = tmpFileGet(prefix=experiment, suffix='fwd.fastq')
    rev_fastq = tmpFileGet(prefix=experiment, suffix='rev.fastq')
    runProc(['samtools', 'fastq', '-1', fwd_fastq, '-2', rev_fastq, bam_path])
    run_paired_star(target, genome, institute, tissue, reference, out_dir, experiment, fwd_fastq,
                    rev_fastq, num_threads, ref_genome)
 def run_cmd(self, cmd, tmp_files):
     """
     Run an external command that produces this task's outputs as several files,
     then atomically install each temporary file over its matching output target.
     Entries of tmp_files may be luigi.LocalTarget objects or plain path strings.
     """
     runProc(cmd)
     for tmp_f, out_target in zip(tmp_files, self.output()):
         out_target.makedirs()
         if isinstance(tmp_f, luigi.LocalTarget):
             src_path = tmp_f.path
         elif isinstance(tmp_f, str):
             src_path = tmp_f
         else:
             # any other kind of temporary-file entry is unsupported
             raise NotImplementedError
         atomicInstall(src_path, out_target.path)
def get_genes(database, name, out_dir, include_chroms, convert_ucsc):
    """Dump the ensGene table to <out_dir>/<name>.gp via hgsql.

    :param database: database name passed to hgsql.
    :param name: base name of the output file (``.gp`` is added).
    :param out_dir: directory in which the output file is created.
    :param include_chroms: None to keep every row, otherwise a non-empty
        sequence of chromosome names; rows on any of them are kept.
    :param convert_ucsc: when True, bin/ucscToEnsemblChrom is appended as a
        final pipeline stage.
    :raises ValueError: if include_chroms is an empty sequence.
    """
    query = 'select * from ensGene'
    if include_chroms is not None:
        if len(include_chroms) == 0:
            raise ValueError('include_chroms must be None or a non-empty sequence of chromosome names')
        # a row has exactly one chrom value, so the alternatives must be
        # combined with "or"; "and" can never match more than one name
        query += ' where ' + ' or '.join('chrom = "{}"'.format(c) for c in include_chroms)
    pipeline = [['hgsql', '-Ne', query, database],
                ['cut', '-f', '2-']]  # strip bin name
    if convert_ucsc is True:
        # append the converter as its own stage (a list), not as loose strings
        pipeline.append(['bin/ucscToEnsemblChrom', '-v', 'chromCol=2', '/dev/stdin'])
    with open(os.path.join(out_dir, name + '.gp'), 'w') as outf:
        runProc(pipeline, stdout=outf)
def main():
    """For every experiment returned by get_files(), filter both BAMs by the
    read names from find_not_unique_reads and run featureCounts on each
    filtered BAM against mm10_gtf or strain_gtf."""
    file_map = get_files()
    names = defaultdict(set)
    for _experiment, bam_pair in file_map.iteritems():
        bam1, bam2 = bam_pair
        is_paired = is_paired_sequencing(bam1)
        unique_names = find_not_unique_reads(bam1, bam2, is_paired)
        for bam in (bam1, bam2):
            filtered_bam = bam + '.filtered.bam'
            filter_bam(bam, filtered_bam, unique_names)
            # the annotation is chosen by looking for 'mm10' in the BAM path
            if 'mm10' in bam:
                ref = mm10_gtf
            else:
                ref = strain_gtf
            counts_file = filtered_bam + '.counts.cds'
            count_cmd = ['featureCounts', '-T', '1', '-t', 'CDS', '-g', 'gene_id', '--primary', '--ignoreDup',
                         '-Q', '30', '-a', ref, '-o', counts_file, filtered_bam]
            paired_flag = ['-p'] if is_paired is True else []
            runProc(count_cmd + paired_flag)
def extract_reads(bam, offset=50000):
    """Write the reads overlapping the padded ``regions`` to a temporary FASTQ.

    Every (chrom, start, stop, para) entry of the module-level ``regions`` is
    extended by ``offset`` on each side and passed to ``samtools view``; the
    result is piped through ``samtools bamshuf`` and ``samtools bam2fq``.

    :param bam: BAM file to extract reads from.
    :param offset: padding added before start and after stop of each region.
    :return: path of the temporary FASTQ file.
    """
    tmp_reads = tmpFileGet(suffix='reads.fq')
    tmp_shuf = tmpFileGet()
    region_strs = [
        '{}:{}-{}'.format(chrom, start - offset, stop + offset)
        for chrom, start, stop, _ in regions
    ]
    view_cmd = ['samtools', 'view', '-b', bam]
    view_cmd.extend(region_strs)
    cmd = [
        view_cmd,
        ['samtools', 'bamshuf', '-Ou', '-', tmp_shuf],
        ['samtools', 'bam2fq', '-'],
    ]
    # pass the handle opened here to runProc; before, the path was passed and
    # the opened handle was never used
    with open(tmp_reads, 'w') as fastq_h:
        runProc(cmd, stdout=fastq_h)
    return tmp_reads
Esempio n. 31
0
 def testRunInOut(self):
     """Sort simple1.txt via runProc with stdin and stdout given as file
     names, check the returned value is 0 and diff the output."""
     outf = self.getOutputFile(".txt")
     ret = procOps.runProc(["sort"],
                           stdin=self.getInputFile("simple1.txt"),
                           stdout=outf)
     # assertEqual replaces the deprecated failUnlessEqual alias
     self.assertEqual(ret, 0)
     self.diffExpected(".txt")
def main():
    """Load reference and target data for the experiment, build the count
    tables, run run_DEseq.R once per genome and generate the plots."""
    args = parse_args()
    data = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    ref_path = os.path.join(args.ref_dir, args.experiment)
    tgt_path = os.path.join(args.tgt_dir, args.experiment)
    if args.genomes is None:
        # default to every entry found under the reference directory
        args.genomes = os.listdir(ref_path)
    common_name_map = get_common_name_map(args.attrs)
    for label, source_path in (('ref', ref_path), ('tgt', tgt_path)):
        load_dir(data, label, source_path, args.genomes, common_name_map)
    base_counts_dir = os.path.join(args.counts_dir, args.experiment)
    construct_counts_tables(data, base_counts_dir)
    deseq_path = os.path.join(args.deseq_dir, args.experiment)
    ensureDir(deseq_path)
    for genome in args.genomes:
        deseq_cmd = ['Rscript', 'run_DEseq.R', genome, deseq_path, base_counts_dir]
        runProc(deseq_cmd)
    plots_dir = os.path.join(args.out_dir, args.experiment)
    generate_plots(args.genomes, deseq_path, plots_dir)
def get_genes(database, gencode_version, gencode_set, name, out_dir, include_chroms, filter_chroms, convert_ucsc):
    """Dump a GENCODE gene table to <out_dir>/<name>.gp via hgsql.

    The table queried is wgEncodeGencode<gencode_set><gencode_version>.

    :param database: database name passed to hgsql.
    :param gencode_version: version suffix of the table name.
    :param gencode_set: set name inserted into the table name.
    :param name: base name of the output file (``.gp`` is added).
    :param out_dir: directory in which the output file is created.
    :param include_chroms: None, or a non-empty sequence of chromosome names;
        rows on any of them are kept.
    :param filter_chroms: None, or a sequence of chromosome names whose rows
        are dropped.
    :param convert_ucsc: when True, bin/ucscToEnsemblChrom is appended as a
        final pipeline stage.
    :raises ValueError: if include_chroms is an empty sequence.
    """
    query = 'SELECT * FROM wgEncodeGencode{}{}'.format(gencode_set, gencode_version)
    conditions = []
    if include_chroms is not None:
        if len(include_chroms) == 0:
            raise ValueError('include_chroms must be None or a non-empty sequence of chromosome names')
        # a row has one chrom value, so included names are alternatives (OR);
        # parenthesised so the group combines correctly with the exclusions
        conditions.append('(' + ' OR '.join('chrom = "{}"'.format(c) for c in include_chroms) + ')')
    if filter_chroms is not None:
        # every excluded name must be absent, hence AND between them
        conditions.extend('chrom != "{}"'.format(c) for c in filter_chroms)
    if conditions:
        query += ' WHERE ' + ' AND '.join(conditions)
    pipeline = [['hgsql', '-Ne', query, database],
                ['cut', '-f', '2-']]  # strip bin name
    if convert_ucsc is True:
        # append the converter as its own stage (a list), not as loose strings
        pipeline.append(['bin/ucscToEnsemblChrom', '-v', 'chromCol=2', '/dev/stdin'])
    with open(os.path.join(out_dir, name + '.gp'), 'w') as outf:
        runProc(pipeline, stdout=outf)
def map_to_tgt(ref_fa, ref_sizes, tgt_fa, coding_transcripts, coding_genes, chain, tx_dict):
    """Map transcript intervals through ``chain`` and return the filtered PSL path.

    Steps: write the intervals from load_tx_intervals to a BED file, convert
    with bedToGenePred and genePredToFakePsl, map with pslMap, then sort and
    filter with pslCDnaFilter.  Intermediate files are removed afterwards;
    ref_fa and tgt_fa are not used in this function.
    """
    bed, gp, fake_psl, fwd_unfiltered, fwd_filtered = [tmpFileGet() for _ in range(5)]
    r_intervals = load_tx_intervals(tx_dict, coding_genes, coding_transcripts)
    with open(bed, 'w') as bed_handle:
        for interval in r_intervals.itervalues():
            bed_handle.write(interval)
    filter_pipeline = [
        ['sort', '-k10,10', fwd_unfiltered],
        ['pslCDnaFilter', '-localNearBest=0.05', '-filterWeirdOverlapped', '-decayMinCover', '/dev/stdin',
         fwd_filtered],
    ]
    steps = [
        ['bedToGenePred', bed, gp],
        ['genePredToFakePsl', '-chromSize={}'.format(ref_sizes), 'na', gp, fake_psl, '/dev/null'],
        ['pslMap', '-chainMapFile', fake_psl, chain, fwd_unfiltered],
        filter_pipeline,
    ]
    for step in steps:
        runProc(step)
    # intermediates are only removed when every step succeeded
    for intermediate in (bed, gp, fake_psl, fwd_unfiltered):
        os.remove(intermediate)
    return fwd_filtered
Esempio n. 35
0
 def testRunErr(self):
     """runProc on "false" must raise pipeline.ProcException carrying the
     exit-status message."""
     expectedMsg = 'process exited 1: false'
     with self.assertRaises(pipeline.ProcException) as raised:
         procOps.runProc(["false"], stdin=self.getInputFile("simple1.txt"))
     actualMsg = str(raised.exception)
     self.assertEqual(actualMsg, expectedMsg)
Esempio n. 36
0
 def testRunOutErrSameByFile(self):
     """Send stdout and stderr of the shell command to one shared file handle
     and diff the combined output."""
     # same file handle for stdout/stderr; make sure it's not closed too soon
     combinedPath = self.getOutputFile(".stdouterr")
     with open(combinedPath, "w") as combinedFh:
         streams = {"stdout": combinedFh, "stderr": combinedFh}
         procOps.runProc(self.shOutErrCmd, **streams)
     self.diffExpected(".stdouterr")
Esempio n. 37
0
 def testRunOutErrByName(self):
     """Run the shell command with stdout and stderr given as file names and
     diff each captured file against its expected output."""
     stdoutPath = self.getOutputFile(".stdout")
     stderrPath = self.getOutputFile(".stderr")
     procOps.runProc(self.shOutErrCmd, stdout=stdoutPath, stderr=stderrPath)
     for ext in (".stdout", ".stderr"):
         self.diffExpected(ext)
Esempio n. 38
0
 def testRunInOut(self):
     """Sort simple1.txt via runProc with stdin and stdout given as file
     names, then diff the sorted output."""
     ext = ".txt"
     sortedPath = self.getOutputFile(ext)
     inputPath = self.getInputFile("simple1.txt")
     procOps.runProc(["sort"], stdin=inputPath, stdout=sortedPath)
     self.diffExpected(ext)
Esempio n. 39
0
 def BROKEN_testRunFileOut(self):
     """Sort simple1.txt with stdout sent to an open file handle and diff the
     result.  The BROKEN_ prefix keeps the default unittest loader (which
     collects test* names) from picking it up."""
     outPath = self.getOutputFile(".txt")
     with open(outPath, "w") as sortedFh:
         sortCmd = ["sort", self.getInputFile("simple1.txt")]
         procOps.runProc(sortCmd, stdout=sortedFh)
         # the comparison runs while the handle is still open
         self.diffExpected(".txt")
Esempio n. 40
0
 def testRunErr(self):
     """runProc on "false" must raise an exception.

     The earlier form used the Python-2-only ``except Exception, ex`` syntax
     and swallowed the outcome without asserting anything.
     """
     inputPath = self.getInputFile("simple1.txt")
     # callable form of assertRaises, so it also works on interpreters without
     # the context-manager form
     self.assertRaises(Exception, procOps.runProc, ["false"], stdin=inputPath)
Esempio n. 41
0
 def testReadBzip2(self):
     """Compress mrna1.tsv with bzip2 and pass the compressed file to
     readMRna1."""
     compressedTsv = self.getOutputFile("tsv.bz2")
     bzipCmd = ["bzip2", "-c", self.getInputFile("mrna1.tsv")]
     procOps.runProc(bzipCmd, stdout=compressedTsv)
     self.readMRna1(compressedTsv)
Esempio n. 42
0
 def testReadGzip(self):
     """Compress mrna1.tsv with gzip and pass the compressed file to
     readMRna1."""
     compressedTsv = self.getOutputFile("tsv.gz")
     gzipCmd = ["gzip", "-c", self.getInputFile("mrna1.tsv")]
     procOps.runProc(gzipCmd, stdout=compressedTsv)
     self.readMRna1(compressedTsv)