def testEchoList(self):
    """
    We should be able to call echo using a list command.
    """
    e = Executor()
    result = e.execute(['echo', 'hello'])
    self.assertEqual('hello', result.stdout.strip())
    self.assertTrue('$ echo hello' in e.log)
def testPipe(self):
    """
    We should be able to pipe echo into wc -c.
    """
    e = Executor()
    result = e.execute('echo hello | wc -c')
    self.assertEqual('6', result.stdout.strip())
    self.assertTrue('$ echo hello | wc -c' in e.log)
def testDryRunDefaultOverride(self):
    """
    It must be possible to override the default dryRun setting by passing a
    value to C{execute}.
    """
    e = Executor(dryRun=False)
    result = e.execute('date', dryRun=True)
    self.assertIsNone(result)
    self.assertEqual('$ date', e.log[-1])
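# A minimal usage sketch (not part of the tests above) showing the Executor
# behaviour those tests rely on: execute() accepts a string or a list,
# honours dryRun, and appends each command to .log. The import path below
# follows the dark-matter package layout and is an assumption here.

from dark.process import Executor

def demoExecutor():
    e = Executor()
    result = e.execute('echo hello')   # Run via the shell.
    print(result.stdout.strip())       # Prints 'hello'.

    e.execute(['echo', 'hello'])       # Run without a shell.

    e.execute('date', dryRun=True)     # Log only; returns None.
    print('\n'.join(e.log))            # Logged commands are prefixed with '$ '.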
def setUpClass(cls):
    """
    Make a tiny p-value table in a temporary directory.
    """
    e = Executor()
    tmpDir = mkdtemp()
    tableFile = join(tmpDir, 'table')
    e.execute('3seq -g %s 2' % tableFile)
    cls._tableFile = tableFile
    cls._tmpDir = tmpDir
def needle(reads, verbose=False, options=None):
    """
    Run a Needleman-Wunsch alignment and return the two sequences.

    @param reads: An iterable of two reads.
    @param verbose: If C{True} print progress info to sys.stderr.
    @param options: Additional options to pass to needle.
    @return: A C{Reads} instance with the two aligned sequences.
    """
    tempdir = mkdtemp()

    file1 = join(tempdir, 'file1.fasta')
    with open(file1, 'w') as fp:
        print(reads[0].toString('fasta'), end='', file=fp)

    file2 = join(tempdir, 'file2.fasta')
    with open(file2, 'w') as fp:
        print(reads[1].toString('fasta'), end='', file=fp)

    out = join(tempdir, 'result.fasta')

    def useStderr(e):
        return "Sequences too big. Try 'stretcher'" not in e.stderr

    if verbose:
        print('Running needle.', file=sys.stderr)

    try:
        Executor().execute(
            "needle -asequence '%s' -bsequence '%s' %s "
            "-outfile '%s' -aformat fasta" %
            (file1, file2, options or '', out), useStderr=useStderr)
    except CalledProcessError as e:
        if useStderr(e):
            raise
        else:
            if verbose:
                print('Sequences too long for needle. Falling back to '
                      'stretcher. Be patient!', file=sys.stderr)
            Executor().execute(
                "stretcher -asequence '%s' -bsequence '%s' "
                "-auto "
                "-outfile '%s' -aformat fasta" % (file1, file2, out))

    # Use 'list' in the following to force reading the FASTA from disk.
    result = Reads(list(FastaReads(out)))

    rmtree(tempdir)

    return result
def main(args):
    ex = Executor(args.dryRun)
    checkFastqSuffixes(args)
    readCounts = getReadCounts(args, ex)
    getMd5s(args, ex)
    commonCounts, duplicates = getCommonCounts(args)
    saveDuplicateIds(args, duplicates)
    saveCommonCounts(args, commonCounts)
    if args.csvfile:
        saveCommonCountsCsv(args, commonCounts, readCounts)

    for filename1 in sorted(commonCounts, key=_key):
        first = True
        for filename2 in sorted(commonCounts[filename1], key=_key):
            common = commonCounts[filename1][filename2]
            if common:
                count1 = readCounts[filename1]
                count2 = readCounts[filename2]
                if first:
                    print('%s (%d reads)' %
                          (shortFilename(filename1), count1))
                    first = False
                print(' %6d common reads (%5.2f%%, %5.2f%%): %s (%d reads)' %
                      (common, common / count1 * 100.0,
                       common / count2 * 100.0,
                       shortFilename(filename2), count2))

    if args.dryRun:
        print('\n'.join(ex.log))
def mafft(reads, verbose=False, options=None, threads=None):
    """
    Run a MAFFT alignment and return the sequences.

    @param reads: An iterable of multiple reads.
    @param verbose: If C{True} print progress info to sys.stderr.
    @param options: A C{str} of options to pass to mafft.
    @param threads: The C{int} number of threads for mafft to use, or
        C{None} to use the mafft default.
    @return: A C{Reads} instance with the aligned sequences.
    """
    tempdir = mkdtemp()

    infile = join(tempdir, 'input.fasta')
    out = join(tempdir, 'result.fasta')

    Reads(reads).save(infile)

    if verbose:
        print('Running mafft.', file=sys.stderr)

    Executor().execute(
        "mafft %s %s '%s' > '%s'" % (
            ('' if threads is None else '--thread %d' % threads),
            options or '', infile, out))

    # Use 'list' in the following to force reading the FASTA from disk.
    result = Reads(list(FastaReads(out)))

    rmtree(tempdir)

    return result
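# A minimal usage sketch for the mafft() wrapper above. The DNARead import
# path follows the dark-matter conventions used elsewhere in these snippets
# and is an assumption; the sequences are illustrative. The point is that
# mafft() takes an iterable of reads and returns a Reads instance of aligned
# (gapped) sequences.

from dark.reads import DNARead

def demoMafft():
    reads = [DNARead('id1', 'ACGTACGTAC'),
             DNARead('id2', 'ACGTCGTAC'),
             DNARead('id3', 'ACGTACGTC')]
    aligned = mafft(reads, verbose=True, threads=2)
    for read in aligned:
        print(read.id, read.sequence)  # Aligned sequences may contain '-'.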
def needle(reads):
    """
    Run a Needleman-Wunsch alignment and return the two sequences.

    @param reads: An iterable of two reads.
    @return: A C{Reads} instance with the two aligned sequences.
    """
    from tempfile import mkdtemp
    from shutil import rmtree

    dir = mkdtemp()

    file1 = join(dir, 'file1.fasta')
    with open(file1, 'w') as fp:
        print(reads[0].toString('fasta'), end='', file=fp)

    file2 = join(dir, 'file2.fasta')
    with open(file2, 'w') as fp:
        print(reads[1].toString('fasta'), end='', file=fp)

    out = join(dir, 'result.fasta')

    Executor().execute("needle -asequence '%s' -bsequence '%s' -auto "
                       "-outfile '%s' -aformat fasta" % (file1, file2, out))

    # Use 'list' in the following to force reading the FASTA from disk.
    result = Reads(list(FastaReads(out)))

    rmtree(dir)

    return result
def _writeAlignmentFileSummary(self, alignmentFile, outputDir):
    """
    Write a summary of alignments.

    @param alignmentFile: The C{str} name of an alignment file.
    @param outputDir: The C{str} name of the output directory.
    """
    shortAlignmentFilename = self.shortAlignmentFilename[alignmentFile]

    filename = join(outputDir, shortAlignmentFilename + '.stats')
    self.report(' Writing alignment statistics to', filename)
    e = Executor()
    e.execute('sam-reference-read-counts.py "%s" > %s' %
              (alignmentFile, filename))
    if self.verbose > 1:
        for line in e.log:
            print(' ', line)
def diamondInstalled():
    """
    Test if DIAMOND is installed.

    @return: A C{bool}, which is C{True} if DIAMOND seems to be installed.
    """
    try:
        Executor().execute('diamond help')
    except CalledProcessError:
        return False
    else:
        return True
def writeBcftoolsConsensus(self, referenceId, alignmentFile, outputDir):
    """
    Write a reference consensus using bcftools.

    @param referenceId: The C{str} id of the reference sequence.
    @param alignmentFile: The C{str} name of an alignment file.
    @param outputDir: A C{str} directory path.
    """
    filename = join(outputDir, 'reference-consensus-samtools.fasta')
    self.report(' Saving samtools reference consensus to', filename)

    referenceFilename = join(outputDir, 'reference.fasta')

    e = Executor()
    e.execute('samtools mpileup -u -f %s %s 2>/dev/null | '
              'bcftools call -c | vcfutils.pl vcf2fq | '
              'filter-fasta.py --fastq --saveAs fasta --quiet '
              '--idLambda \'lambda _: "consensus-%s-samtools"\' > %s' %
              (referenceFilename, alignmentFile, referenceId, filename))

    if self.verbose > 1:
        for line in e.log:
            print(' ', line)
def testUnknownCommand(self):
    """
    An unknown command must raise CalledProcessError.
    """
    e = Executor()
    # Presumably there will not be an executable with this name!
    command = '/'.join(['dev', 'non-existent', '@' * 20])
    error = r"^Command '%s' returned non-zero exit status 127%s$" % (
        command, '' if sys.version_info < (3, 6) else r'\.')
    assertRaisesRegex(self, CalledProcessError, error, e.execute, command,
                      useStderr=False)
def fastaIdentityTable(filename, outputFilename, verbose, filename2=None):
    """
    Call fasta-identity-table.py to produce an HTML identity table for one
    or two FASTA files.

    @param filename: A C{str} file name containing FASTA.
    @param outputFilename: A C{str} file name to store the HTML output into.
    @param verbose: The C{int} verbosity level.
    @param filename2: An optional second C{str} file name containing FASTA.
    """
    colors = cl.scales['9']['seq']['GnBu']
    colorArgs = []
    for i in range(7):
        colorArgs.append('--color "%.2f %s"' % (0.65 + 0.05 * i, colors[i]))

    file2arg = ('--fastaFile2 "%s"' % filename2) if filename2 else ''

    e = Executor()
    e.execute('fasta-identity-table.py --showGaps --showLengths --footer '
              '--removeDescriptions %s %s < %s > %s' %
              (' '.join(colorArgs), file2arg, filename, outputFilename))

    if verbose > 1:
        for line in e.log:
            print(' ', line)
def __init__(self, executor=None, threads=None, verboseFp=None,
             dryRun=False, reference=None, tempdir=None, tmpChmod=None):
    self._executor = executor or Executor(dryRun)
    if dryRun:
        self.tempdir = tempdir or '/tmp/xxx'
    else:
        self.tempdir = mkdtemp(prefix='bt2-', dir=tempdir)
        if tmpChmod:
            self._executor.execute(f'chmod {tmpChmod} {self.tempdir}')
    self._samFile = join(self.tempdir, 'result.sam')
    self._bamFile = join(self.tempdir, 'result.bam')
    self._indexFile = join(self.tempdir, 'index')
    self._verboseFp = verboseFp
    self._indexCalled = self._samExists = self._bamExists = False
    self.nThreads = threads or multiprocessing.cpu_count()
    self._reference = reference
def main(args, logfp):
    """
    Create genomes and reads for a multiple infection detection experiment.

    @param args: A namespace instance, as returned by parse_args.
    @param logfp: A file object to write log information to.
    """
    print('Invocation arguments', args, file=logfp)

    qOutputDir = quote(args.outputDir)
    genome1 = join(qOutputDir, 'genome-1.fasta')
    genome2 = join(qOutputDir, 'genome-2.fasta')
    genome2locations = join(qOutputDir, 'genome-2.locations')
    reads1 = join(qOutputDir, 'reads-1.fastq')
    reads2 = join(qOutputDir, 'reads-2.fastq')
    reads12 = join(qOutputDir, 'reads-12.fastq')

    executor = Executor(args.dryRun)

    if args.genome1Filename:
        executor.execute('ln -s %s %s' %
                         (quote(args.genome1Filename), genome1))
    else:
        if args.genomeLength < 1:
            print('Random initial genome length must be > 0.',
                  file=sys.stderr)
            sys.exit(3)
        print('Writing random starting genome of length %d to %s' %
              (args.genomeLength, genome1), file=logfp)
        if not args.dryRun:
            sequence = ''.join(
                [choice('ACGT') for _ in range(args.genomeLength)])
            with open(genome1, 'w') as fp:
                print('>genome-1\n%s' % sequence, file=fp)

    if args.genome2Filename:
        executor.execute('ln -s %s %s' %
                         (quote(args.genome2Filename), genome2))
    else:
        # Make a second genome using the given mutation rate. Print its
        # mutated locations to a file.
        (genome1read,) = list(FastaReads(genome1))
        offsets = mutateRead(genome1read, args.genome2MutationRate)
        with open(genome2locations, 'w') as fp:
            print('\n'.join(str(offset + 1) for offset in sorted(offsets)),
                  file=fp)
        genome1read.id = 'genome-2'
        Reads([genome1read]).save(genome2)

    cmdPrefix = ('create-reads.py --maxReadLength %d --minReadLength %d '
                 '--meanLength %d --sdLength %d --rate %f ' %
                 (args.maxReadLength, args.minReadLength,
                  args.meanReadLength, args.sdReadLength,
                  args.readMutationRate))

    for info in [
            {
                'reads': reads1,
                'fasta': genome1,
                'number': 1,
                'count': args.genome1ReadCount or args.readCount,
            },
            {
                'reads': reads2,
                'fasta': genome2,
                'number': 2,
                'count': args.genome2ReadCount or args.readCount,
            }]:
        executor.execute(cmdPrefix +
                         ('--idPrefix genome-%(number)d-read- '
                          '--count %(count)d < %(fasta)s > %(reads)s' % info))

    executor.execute('cat %s %s > %s' % (reads1, reads2, reads12))

    print('\n'.join(executor.log), file=logfp)
def testDryRunFalse(self):
    """
    The dryRun attribute must be False when dryRun=False is passed.
    """
    e = Executor(dryRun=False)
    self.assertFalse(e.dryRun)
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=('Run bowtie2 on a FASTA file. Optionally convert the '
                     'result to BAM, sorting and indexing it.'))

    parser.add_argument(
        '--index',
        help=('Either: an accession number, a filename or the name of a '
              'pre-existing bowtie2 index (created with bowtie2-build). If '
              'not given and --reference is used, the reference will be '
              'used to build a bowtie2 index.'))

    parser.add_argument(
        '--ignoreIndex', action='append', dest='ignoredIndices',
        help=('Either: an accession number, a filename or the name of a '
              'pre-existing bowtie2 index (created with bowtie2-build). '
              'Reads matching this index will be ignored. May be repeated.'))

    parser.add_argument(
        '--fastq1', '-1',
        help=('The FASTQ reads to match against the bowtie2 index given by '
              '--index. Also use --fastq2 if you have paired reads. '
              'If not given, single-end FASTQ reads will be read from '
              'standard input.'))

    parser.add_argument(
        '--fastq2', '-2',
        help=('The FASTQ reads to match against the bowtie2 index given by '
              '--index. Use this with --fastq1 to specify the mate '
              'file for paired-end reads.'))

    parser.add_argument(
        '--bowtie2Args', default='--no-unal',
        help=('Extra arguments to be passed to Bowtie2 (use --threads to '
              'specify a thread count).'))

    parser.add_argument(
        '--samtoolsViewArgs',
        default='-F %d -q 30' % DEFAULT_SAMTOOLS_VIEW_FLAGS,
        help='Arguments to be passed to samtools view to create the BAM '
             'file.')

    parser.add_argument(
        '--tempdir',
        help=('The temporary directory to use. If not specified, the value '
              'of the TMPDIR environment variable (if any) is used, or else '
              '/tmp.'))

    parser.add_argument(
        '--out', '-o',
        help=('The output file name. If not given, the resulting SAM or BAM '
              'will be written to standard output.'))

    parser.add_argument(
        '--reference',
        help=('The reference FASTA file for use with --callHaplotypesGATK '
              'and --callHaplotypesBcftools. This will be used to build a '
              'Bowtie2 index if --index is not given.'))

    parser.add_argument(
        '--vcfFile',
        help=('The file to write VCF info to if --callHaplotypesGATK or '
              '--callHaplotypesBcftools are used.'))

    parser.add_argument(
        '--markDuplicatesGATK', default=False, action='store_true',
        help=('Use GATK to mark duplicates. See '
              'https://gatk.broadinstitute.org for details on GATK.'))

    parser.add_argument(
        '--markDuplicatesPicard', default=False, action='store_true',
        help=('Use Picard to mark duplicates. See '
              'https://github.com/broadinstitute/picard for details on '
              'Picard.'))

    parser.add_argument(
        '--picardJar',
        help=('The path to the Picard jar file. See '
              'https://github.com/broadinstitute/picard for details on '
              'Picard.'))

    parser.add_argument(
        '--removeDuplicates', default=False, action='store_true',
        help=('Remove duplicates from the resulting SAM/BAM file. Best used '
              'in combination with an option that marks duplicates, such as '
              '--markDuplicatesGATK.'))

    parser.add_argument(
        '--verbose', default=False, action='store_true',
        help=('Print a description of commands as they are (or would be, if '
              '--dryRun is used) executed.'))

    parser.add_argument(
        '--log', default=False, action='store_true',
        help=('Show a log of commands that were (or would be, if --dryRun '
              'is used) executed.'))

    parser.add_argument(
        '--threads', type=int,
        help='The number of threads to use when running bowtie2 commands.')

    parser.add_argument(
        '--noAlign', default=True, action='store_false', dest='align',
        help='Do not align with Bowtie2, just build an index.')

    parser.add_argument(
        '--noBAM', default=True, action='store_false', dest='bam',
        help='Do not convert SAM to BAM.')

    parser.add_argument(
        '--noSort', default=True, action='store_false', dest='sort',
        help='Do not sort the BAM.')

    parser.add_argument(
        '--noIndexBAM', default=True, action='store_false', dest='indexBAM',
        help='Do not index the BAM file.')

    parser.add_argument(
        '--noClean', default=True, action='store_false', dest='clean',
        help='Do not remove intermediate files or the temporary directory.')

    parser.add_argument(
        '--force', default=False, action='store_true',
        help='Overwrite pre-existing output file.')

    parser.add_argument(
        '--dryRun', default=False, action='store_true',
        help='Do not run commands, just print what would be done.')

    haplotypeCaller = parser.add_mutually_exclusive_group()

    haplotypeCaller.add_argument(
        '--callHaplotypesGATK', default=False, action='store_true',
        help=('Use GATK to call haplotypes. See '
              'https://gatk.broadinstitute.org for details on GATK.'))

    haplotypeCaller.add_argument(
        '--callHaplotypesBcftools', default=False, action='store_true',
        help='Use bcftools call to call haplotypes.')

    args = parser.parse_args()

    if args.indexBAM and not args.bam:
        print('Indexing the BAM file only makes sense if you do not use '
              '--noBAM. Use --noIndexBAM to skip indexing.', file=sys.stderr)
        sys.exit(1)

    e = Executor(args.dryRun)

    if args.tempdir is None:
        args.tempdir = os.environ.get('TMPDIR', '/tmp')

    if args.ignoredIndices:
        ignoresDir = processIgnores(args, e)

    processMatch(args, e)

    if args.ignoredIndices:
        if args.clean:
            e.execute("rm -r '%s'" % ignoresDir)
        else:
            print('Temporary directory with non-ignored inputs %r.' %
                  ignoresDir, file=sys.stderr)

    if args.dryRun or args.log:
        print('\n'.join(e.log), file=sys.stderr)
def makeBAM(template, bamReferences=None, fastaReferences=None):
    """
    A context manager decorator to make a simple BAM file from a template.
    Note that this code invokes samtools.

    @param template: An iterable of C{str} sequences. The first will be
        treated as the reference, and then subsequent pairs (if any) will be
        treated as read and quality strings. Reads and quality strings can
        be indented with spaces to show where the read aligns with the
        reference.
    @param bamReferences: An optional iterable of reads to use as the BAM
        references (the first must have the same sequence as the template
        reference). If not given, a single reference is made from the
        template.
    @param fastaReferences: An optional iterable of reads to save to the
        reference FASTA file. If not given, the BAM references are used.
    @return: A context manager that produces a 2-tuple containing the
        C{Path} of the reference FASTA file and the C{Path} of the BAM file.
    """
    if len(template) % 2 != 1:
        raise ValueError(
            'The template must have an odd number of strings, specifying '
            'the reference sequence, then zero or more read/quality pairs.')

    leftPaddedReference = template[0]
    templateSequence = leftPaddedReference.lstrip().replace('-', '')

    if bamReferences is None:
        matchedReference = DNARead(REF_ID, templateSequence)
        bamReferences = Reads([matchedReference])
    else:
        matchedReference = bamReferences[0]
        # Sanity check: The first BAM reference must have the same sequence
        # as the template.
        assert matchedReference.sequence == templateSequence
        bamReferences = Reads(bamReferences)

    fastaReferences = Reads(
        bamReferences if fastaReferences is None else fastaReferences)

    nSeqs = (len(template) - 1) >> 1
    dirname = mkdtemp(prefix='test-consensus-')
    e = Executor()

    try:
        fastaFile = Path(dirname) / 'references.fasta'
        samFile = Path(dirname) / 'file.sam'
        bamFile = Path(dirname) / 'file.bam'

        fastaReferences.save(fastaFile)

        with open(samFile, 'w') as fp:
            for reference in bamReferences:
                print(f'@SQ\tSN:{reference.id}\tLN:{len(reference)}',
                      file=fp)

            for count in range(nSeqs):
                leftPaddedQuery = template[count * 2 + 1].rstrip()
                leftPaddedQuality = template[count * 2 + 2].rstrip()
                assert len(leftPaddedQuery) == len(leftPaddedQuality)
                query = leftPaddedQuery.lstrip()
                quality = leftPaddedQuality.lstrip()
                queryNoGaps = qualityNoGaps = ''
                for queryBase, qualityBase in zip(query, quality):
                    if queryBase != '-':
                        queryNoGaps += queryBase
                        qualityNoGaps += qualityBase

                print(
                    '\t'.join(
                        map(
                            str,
                            (
                                f'read{count}',  # QNAME (query name)
                                0,  # FLAGS
                                matchedReference.id,  # RNAME (reference name)
                                # POS (1-based leftmost mapping position)
                                matchOffset(leftPaddedReference,
                                            leftPaddedQuery) + 1,
                                30,  # MAPQ (mapping quality)
                                # CIGAR
                                makeCigar(leftPaddedReference,
                                          leftPaddedQuery),
                                '*',  # MRNM (mate reference name)
                                0,  # MPOS (mate position)
                                0,  # ISIZE (insert size)
                                queryNoGaps,  # SEQ
                                qualityNoGaps,  # QUAL
                            ))),
                    file=fp)

        e.execute(f'samtools sort -O BAM --write-index -o {str(bamFile)!r} '
                  f'{str(samFile)!r}')

        yield (fastaFile, bamFile)
    finally:
        # import sys; print(f'{samFile}', file=sys.stderr)
        e.execute(f'rm -fr {dirname!r}')
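# A minimal usage sketch for makeBAM() above, assuming it is wrapped with
# contextlib.contextmanager (its docstring calls it a context manager and
# its body uses yield) and that samtools is installed. The template strings
# are illustrative: the first is the reference, followed by read/quality
# pairs, indented to show where each read aligns with the reference.

def demoMakeBAM():
    template = (
        'ACGTTCCG',   # Reference sequence.
        '  GTTC',     # A read aligned at offset 2.
        '  ????',     # Its quality string (same length as the read).
    )
    with makeBAM(template) as (fastaFile, bamFile):
        # fastaFile and bamFile are pathlib.Path instances that exist only
        # inside this block; the temporary directory is removed on exit.
        print('Reference FASTA:', fastaFile)
        print('BAM:', bamFile)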
class RecombinationAnalysis(object):
    """
    Perform a 3seq recombination analysis.

    @param pValueFile: The C{str} file name containing precomputed p-values
        (as generated by 3seq -g). See Steps 3a/b of Section 3 of the 3seq
        manual (mentioned in ../README.md) for how to generate or obtain a
        p-value file.
    @param dryRun: If C{True} do not execute any 3seq commands, just log
        what would have been run (see self.executor.log for details).
    """
    def __init__(self, pValueFile, dryRun=False):
        self.pValueFile = pValueFile
        self.tmpDir = None
        self.executor = Executor(dryRun=dryRun)

    def check(self):
        """
        Use the 3seq -check option to make sure the p-value table can be
        read correctly.

        @return: A C{subprocess.CompletedProcess} instance.
        """
        return self.executor.execute('3seq -check "%s"' % self.pValueFile)

    def run(self, reads, t=0.05):
        """
        Run 3seq on some reads. Sets self.tmpDir as a side-effect.

        @param reads: Either a C{dark.reads.Reads} instance or a C{str}
            filename.
        @param t: A C{str} or C{float} error threshold, e.g. 0.01, '1e-6'
            that will be passed on the command line to 3seq. See section
            7.10 of the 3seq manual for details.
        @return: A C{subprocess.CompletedProcess} instance.
        """
        self.tmpDir = mkdtemp()

        if isinstance(reads, six.string_types):
            inputFile = reads
        else:
            inputFile = join(self.tmpDir, 'input.fasta')
            reads.save(inputFile, format_='fasta')

        # Note that the 3seq manual (as of 2018-12-29) says you can use
        # '-fullrun' but that doesn't work. The source code looks for
        # either -f or -full. But -f seems ambiguous in the manual (it
        # also means 'first') so I'm going with -full.
        return self.executor.execute(
            'echo y | 3seq -full "%s" -ptable "%s" -id "%s" -t%s' %
            (inputFile, self.pValueFile, join(self.tmpDir, _OUTPUT_PREFIX),
             str(t)))

    def recombinantFile(self):
        """
        Get the name of the main 3seq recombination output file.

        @raise RuntimeError: If no analysis has been run.
        @return: A C{str} path to the output file.
        """
        if self.tmpDir is None:
            raise RuntimeError('No analysis has been run yet')
        else:
            # The string in the following is always used by 3seq.
            return join(self.tmpDir, _OUTPUT_PREFIX + '.3s.rec')

    def removeOutput(self):
        """
        Remove 3seq output files.

        @raise RuntimeError: if no analysis has been run.
        """
        if self.tmpDir is None:
            raise RuntimeError('No analysis has been run yet')
        else:
            shutil.rmtree(self.tmpDir)
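# A minimal usage sketch for RecombinationAnalysis above, assuming 3seq is
# installed and a p-value table is available (e.g., one made as in the
# setUpClass snippet earlier with '3seq -g'). The file names used here are
# illustrative.

def demoRecombinationAnalysis():
    analysis = RecombinationAnalysis('p-value-table')
    analysis.check()                          # Verify the p-value table.
    analysis.run('sequences.fasta', t=0.01)   # Run 3seq on a FASTA file.
    with open(analysis.recombinantFile()) as fp:
        print(fp.read())                      # The *.3s.rec output.
    analysis.removeOutput()                   # Clean up the temp directory.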
def testDryRunDefault(self):
    """
    execute must honor the dryRun value given when the Executor was created:
    it must not run the command, must return None, and must log the command.
    """
    e = Executor(dryRun=True)
    result = e.execute('date')
    self.assertIsNone(result)
    self.assertEqual('$ date', e.log[-1])
def testDryRunTrue(self):
    """
    The dryRun attribute must be True when dryRun=True is passed.
    """
    e = Executor(dryRun=True)
    self.assertTrue(e.dryRun)
    help=('The integer verbosity level (0 = no output, 1 = some output, '
          '2 = maximal output).'))

parser.add_argument(
    '--force', default=False, action='store_true',
    help='If given, overwrite pre-existing files.')

parser.add_argument(
    '--outputDir', metavar='DIRNAME',
    help='The directory to save result files to.')

args = parser.parse_args()

outputDir = makeOuputDir(args.outputDir, args.force)

executor = Executor()

pileuppers = (('samtools-mpileup', samtoolsMpileup),
              ('bcftools-mpileup', bcftoolsMpileup))

callers = (('bcftools-c', bcftoolsCallConsensus),
           ('bcftools-m', bcftoolsCallMulti))

consensusers = (('bcftools-consensus', bcftoolsConsensus),
                ('vcfutils-vcf2fq', vcfutilsConsensus))

consensusFiles = []

for pileupName, pileupFunc in pileuppers:
    pileupMiddle = pileupName
    pileupFile = join(outputDir, 'pileup-' + pileupMiddle + '.vcf')
bamFile = base + '.bam'
sortedBamFile = base + '-sorted.bam'

if not (args.force or args.dryRun):
    existing = []
    for filename in samFile, bamFile, sortedBamFile:
        if exists(filename):
            existing.append(filename)
    if existing:
        print('Will not overwrite pre-existing file%s %s. '
              'Use --force to make me.' %
              ('' if len(existing) == 1 else 's', ', '.join(existing)),
              file=sys.stderr)
        sys.exit(2)

e = Executor(args.dryRun)

e.execute("bwa %s '%s' '%s' > '%s'" %
          (args.bwaArgs, args.bwaIndex, args.fastaFile, samFile))

if not args.noBAM:
    e.execute("samtools view -b < '%s' > '%s'" % (samFile, bamFile))

    if not args.noClean:
        e.execute("rm '%s'" % samFile)

    if not args.noSort:
        e.execute("samtools sort '%s' > '%s'" % (bamFile, sortedBamFile))

        if not args.noClean:
            e.execute("rm '%s'" % bamFile)
class DiamondExecutor(object):
    """
    Run DIAMOND searches against a set of subject sequences, using a
    database built in a temporary directory.

    @param dryRun: If C{True} do not actually execute the DIAMOND commands.
    """
    SUBJECTS_FILENAME = 'subjects.fasta'
    QUERIES_FILENAME = 'queries.fasta'
    OUTPUT_FILENAME = 'diamond.tsv'

    def __init__(self, dryRun=False):
        self._dirty = False
        self._dir = mkdtemp()
        self._subjectsFp = None
        self._subjectsExist = False
        self._executor = Executor(dryRun)

    def addSubject(self, subject):
        """
        Add a subject sequence to the database.

        @param subject: A C{dark.reads.Read} instance.
        """
        if self._subjectsFp is None:
            if six.PY3:
                self._subjectsFp = open(
                    join(self._dir, self.SUBJECTS_FILENAME), 'a',
                    encoding='utf-8')
            else:
                self._subjectsFp = open(
                    join(self._dir, self.SUBJECTS_FILENAME), 'a')

        print(subject.toString('fasta'), end='', file=self._subjectsFp)
        self._subjectsExist = self._dirty = True

    def cleanup(self):
        """
        Remove the temporary directory we made.
        """
        if self._subjectsFp:
            self._subjectsFp.close()
            self._subjectsFp = None
        rmtree(self._dir)

    def search(self, reads, fieldNames=None):
        """
        Match reads against the database.

        @param reads: An instance of C{dark.reads.Reads}.
        @param fieldNames: An iterable of C{str} field names for DIAMOND
            tabular output (format 6). See diamond help for the names of
            all available fields.
        @return: A generator that yields C{dict}s with keys as in
            C{fieldNames}.
        """
        if not self._subjectsExist:
            raise ValueError('No subject sequences in the database')

        with cd(self._dir):
            if self._dirty:
                self._subjectsFp.close()
                self._subjectsFp = None
                self._executor.execute(
                    'diamond makedb --db database --in %s' %
                    self.SUBJECTS_FILENAME)

            with open(self.QUERIES_FILENAME, 'w') as fp:
                count = reads.save(fp, format_='fastq')

            if count == 0:
                raise ValueError('No query sequences were passed')

            fieldNames = fieldNames or FIELDS.split()

            self._executor.execute(
                'diamond blastx --db database --query %s --outfmt 6 %s > %s'
                % (self.QUERIES_FILENAME, ' '.join(fieldNames),
                   self.OUTPUT_FILENAME))

            dtf = DiamondTabularFormat(fieldNames)

            for diamondDict in dtf.diamondTabularFormatToDicts(
                    self.OUTPUT_FILENAME):
                yield diamondDict

    def __enter__(self):
        return self

    def __exit__(self, excType, excValue, traceback):
        self.cleanup()
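# A minimal usage sketch for DiamondExecutor above, assuming DIAMOND is
# installed (see diamondInstalled() earlier) and that dark.reads provides
# AARead, DNARead, and Reads as used in these snippets. The sequences, the
# query quality string, and the 'qseqid'/'bitscore' field names (taken from
# the module-level FIELDS default) are assumptions made for illustration.

from dark.reads import AARead, DNARead, Reads

def demoDiamondExecutor():
    with DiamondExecutor() as de:
        # Add a protein subject, then search it with a nucleotide query
        # (queries are saved as FASTQ, so a quality string is supplied).
        de.addSubject(AARead('subject-1', 'MVHLTPEEKSAVTALWGKV'))
        queries = Reads([DNARead('query-1',
                                 'ATGGTGCATCTGACTCCTGAGGAGAAGTCT',
                                 '?' * 30)])
        for match in de.search(queries):
            print(match['qseqid'], match['bitscore'])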
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Make a consensus sequence.')

    parser.add_argument(
        '--reference', required=True,
        help='The reference FASTA file.')

    parser.add_argument(
        '--bam',
        help=('The BAM file from which the consensus should be made. '
              'Required if --maskLowCoverage is used. If no BAM file is '
              'given, a VCF file must be provided. If both a BAM and a VCF '
              'file are given, the VCF file will take precedence.'))

    parser.add_argument(
        '--vcfFile',
        help=('The VCF file. If omitted, bcftools will be used to make a '
              'VCF file from the BAM file.'))

    group = parser.add_mutually_exclusive_group()

    group.add_argument(
        '--id',
        help=('The id to use in the consensus sequence in the output FASTA. '
              'If not given, the reference sequence id will be used.'))

    group.add_argument(
        '--idLambda', metavar='LAMBDA-FUNCTION',
        help=('A one-argument function taking and returning a read id. '
              'This can be used to set the id of the consensus sequence '
              'based on the id of the reference sequence (the function '
              'will be called with the id of the reference sequence). '
              'E.g., --idLambda "lambda id: id.split(\'_\')[0]" or '
              '--idLambda "lambda id: id[:10] + \'-consensus\'".'))

    parser.add_argument(
        '--sample',
        help=('The name of the sample (from the @RG SM tag in the original '
              'alignment BAM file) for which a consensus should be made. '
              'If not given, the first sample name (from the #CHROM header) '
              'in the VCF file will be used.'))

    parser.add_argument(
        '--dryRun', action='store_true',
        help='Do not run commands, just print what would be done.')

    parser.add_argument(
        '--maskLowCoverage', default=0, type=int,
        help=('Put an N into sites where the coverage is below the '
              'specified cutoff. If you specify a negative number, masking '
              'will be turned off. Requires --bam.'))

    parser.add_argument(
        '--log', action='store_true',
        help=('Show a log of commands that were (or would be, if --dryRun '
              'is used) executed.'))

    parser.add_argument(
        '--noClean', action='store_false', dest='clean',
        help='Do not remove intermediate files or the temporary directory.')

    parser.add_argument(
        '--callHaplotypesGATK', action='store_true',
        help=('Use GATK to call haplotypes. See '
              'https://gatk.broadinstitute.org for details on GATK.'))

    parser.add_argument(
        '--picardJar',
        help=('The path to the Picard jar file. See '
              'https://github.com/broadinstitute/picard for details on '
              'Picard.'))

    parser.add_argument(
        '--ivar', action='store_true',
        help='If given, ivar will be used to call the consensus.')

    parser.add_argument(
        '--ivarFrequencyThreshold', type=float,
        help=(f'The frequency threshold used by ivar when calling the '
              f'consensus. If the frequency of the most-common nucleotide '
              f'at a site meets this threshold, the nucleotide will be '
              f'called. Otherwise, an ambiguous nucleotide code will be '
              f'produced, based on the smallest set of most-frequent '
              f'nucleotides whose summed frequencies meet the threshold. '
              f'See {IVAR_DOCS} for more information. If not given, '
              f'{IVAR_FREQUENCY_THRESHOLD_DEFAULT} is used. Can only be '
              f'used if --ivar is also specified.'))

    parser.add_argument(
        '--ivarBedFile',
        help=('If ivar should trim primers, a BED file of the primer '
              'positions.'))

    args = parser.parse_args()

    if not (args.bam or args.vcfFile):
        print('At least one of --bam or --vcfFile must be given.',
              file=sys.stderr)
        sys.exit(1)

    if args.maskLowCoverage and not args.bam:
        print('If --maskLowCoverage is used, --bam must be too.',
              file=sys.stderr)
        sys.exit(1)

    if args.ivar and not args.bam:
        print('If --ivar is used, --bam must be too.', file=sys.stderr)
        sys.exit(1)

    if args.ivarFrequencyThreshold is not None and not args.ivar:
        print('If --ivarFrequencyThreshold is used, --ivar must be too.',
              file=sys.stderr)
        sys.exit(1)

    if args.ivar and args.ivarFrequencyThreshold is None:
        args.ivarFrequencyThreshold = IVAR_FREQUENCY_THRESHOLD_DEFAULT

    e = Executor(args.dryRun)

    tempdir = mkdtemp(prefix='consensus-')

    if args.vcfFile:
        vcfFile = args.vcfFile
    else:
        # No VCF file provided, so make one.
        vcfFile = join(tempdir, 'vcf.gz')
        if args.callHaplotypesGATK:
            e.execute("samtools index '%s'" % args.bam)

            if args.picardJar:
                picardJar = args.picardJar
            else:
                try:
                    picardJar = os.environ['PICARD_JAR']
                except KeyError:
                    print('If you use --callHaplotypesGATK, you must give a '
                          'Picard JAR file with --picardJar or else set '
                          'PICARD_JAR in your environment.', file=sys.stderr)
                    sys.exit(1)

            indexFile = args.reference + '.fai'
            if os.path.exists(indexFile):
                removeIndex = False
            else:
                removeIndex = True
                e.execute("samtools faidx '%s'" % args.reference)

            if args.reference.lower().endswith('.fasta'):
                dictFile = args.reference[:-len('.fasta')] + '.dict'
            else:
                dictFile = args.reference + '.dict'

            if os.path.exists(dictFile):
                removeDict = False
            else:
                removeDict = True
                e.execute(
                    "java -jar '%s' CreateSequenceDictionary R='%s' O='%s'"
                    % (picardJar, args.reference, dictFile))

            e.execute('gatk --java-options -Xmx4g HaplotypeCaller '
                      "--reference '%s' "
                      "--input '%s' "
                      "--output '%s' "
                      "--sample-ploidy 1 "
                      '-ERC GVCF' % (args.reference, args.bam, vcfFile))

            if removeIndex:
                e.execute("rm '%s'" % indexFile)

            if removeDict:
                e.execute("rm '%s'" % dictFile)
        else:
            e.execute(
                "bcftools mpileup --max-depth 5000 -Ou -f '%s' '%s' | "
                "bcftools call --ploidy 1 -mv -Oz -o '%s'" %
                (args.reference, args.bam, vcfFile))

        e.execute("bcftools index '%s'" % vcfFile)

    if args.maskLowCoverage >= 0:
        # Make a BED file of sites whose coverage is below the cutoff.
        bedFile = join(tempdir, 'mask.bed')
        # The doubled-% below are so that Python doesn't try to fill in the
        # values and instead just generates a single % that awk sees.
        e.execute("samtools depth -a '%s' | "
                  "awk '$3 < %d {printf \"%%s\\t%%d\\t%%d\\n\", "
                  "$1, $2 - 1, $2}' > '%s'" %
                  (args.bam, args.maskLowCoverage, bedFile))
        maskArg = '--mask ' + bedFile
    else:
        maskArg = ''

    if args.sample:
        sample = args.sample
    else:
        result = e.execute(
            "gunzip -c '%s' | egrep -m 1 '^#CHROM' | cut -f10" % vcfFile)
        sample = 'SAMPLE-NAME' if args.dryRun else result.stdout.strip()

    consensusFile = join(tempdir, 'consensus.fasta')

    if args.ivar:
        if args.ivarBedFile:
            tempBamFile = join(tempdir, basename(args.bam) + '-trimmed')
            result = e.execute("ivar trim -i %r -b %r -p %r -e" %
                               (args.bam, args.ivarBedFile, tempBamFile))
            ivarTempBamFile = tempBamFile + '.bam'
            sortedIvarTempBamFile = tempBamFile + '-trimmed-sorted.bam'
            result = e.execute("samtools sort %r -o %r" %
                               (ivarTempBamFile, sortedIvarTempBamFile))
            bamFile = sortedIvarTempBamFile
        else:
            bamFile = args.bam

        ivarConsensusFile = join(tempdir, 'temporary-consensus')
        result = e.execute("samtools mpileup -A -Q 0 %r | "
                           "ivar consensus -p %r -q 20 -t %r -m %r" %
                           (bamFile, ivarConsensusFile,
                            args.ivarFrequencyThreshold,
                            args.maskLowCoverage))
        result = e.execute("mv %s %s" % (ivarConsensusFile + '.fa',
                                         consensusFile))
    else:
        result = e.execute(
            "bcftools consensus --sample '%s' --iupac-codes %s --fasta-ref "
            "'%s' '%s' > '%s'" %
            (sample, maskArg, args.reference, vcfFile, consensusFile))

    if result.stderr:
        print(result.stderr, end='', file=sys.stderr)

    if not args.dryRun:
        consensus = list(FastaReads(consensusFile))[0]
        if args.id is not None:
            consensus.id = args.id
        elif args.idLambda is not None:
            idLambda = eval(args.idLambda)
            consensus.id = idLambda(consensus.id)

        print(consensus.toString('fasta'), end='')

    if args.dryRun or args.log:
        print('\n'.join(e.log), file=sys.stderr)

    if tempdir:
        if args.clean:
            e.execute("rm -r '%s'" % tempdir)
        else:
            print('Temporary directory %r.' % tempdir, file=sys.stderr)