コード例 #1
0
 def setUpClass(cls):
     """
     Generate a tiny 3seq p-value table inside a new temporary directory.

     The table path and the directory are stored on the class as
     C{_tableFile} and C{_tmpDir}.
     """
     executor = Executor()
     directory = mkdtemp()
     table = join(directory, 'table')
     executor.execute('3seq -g %s 2' % table)
     # Only publish the paths once the table has been generated.
     cls._tableFile, cls._tmpDir = table, directory
コード例 #2
0
ファイル: analysis.py プロジェクト: acorg/midtools
    def _writeAlignmentFileSummary(self, alignmentFile, outputDir):
        """
        Save per-reference read count statistics for an alignment file.

        The statistics are produced by C{sam-reference-read-counts.py} and
        written to a C{.stats} file (named after the short alignment file
        name) in C{outputDir}.

        @param alignmentFile: The C{str} name of an alignment file.
        @param outputDir: The C{str} name of the output directory.
        """
        statsFile = join(
            outputDir, self.shortAlignmentFilename[alignmentFile] + '.stats')
        self.report('  Writing alignment statistics to', statsFile)

        executor = Executor()
        command = 'sam-reference-read-counts.py "%s" > %s' % (
            alignmentFile, statsFile)
        executor.execute(command)

        # The command log is only shown at the higher verbosity levels.
        if not self.verbose > 1:
            return
        for entry in executor.log:
            print('    ', entry)
コード例 #3
0
ファイル: test_process.py プロジェクト: TaliVeith/dark-matter
 def testEchoList(self):
     """
     We should be able to call echo using a list command, and the command
     must be recorded in the executor's log.
     """
     e = Executor()
     result = e.execute(['echo', 'hello'])
     self.assertEqual('hello', result.stdout.strip())
     # assertIn reports the log contents on failure, unlike assertTrue.
     self.assertIn('$ echo hello', e.log)
コード例 #4
0
ファイル: test_process.py プロジェクト: TaliVeith/dark-matter
 def testPipe(self):
     """
     We should be able to pipe echo into wc -c, and the full pipeline must
     be recorded in the executor's log.
     """
     e = Executor()
     result = e.execute('echo hello | wc -c')
     # 'hello' plus the newline added by echo is six characters.
     self.assertEqual('6', result.stdout.strip())
     # assertIn reports the log contents on failure, unlike assertTrue.
     self.assertIn('$ echo hello | wc -c', e.log)
コード例 #5
0
ファイル: test_process.py プロジェクト: TaliVeith/dark-matter
 def testDryRunDefaultOverride(self):
     """
     A C{dryRun} value passed to C{execute} must take precedence over the
     default given when the C{Executor} was created.
     """
     executor = Executor(dryRun=False)
     outcome = executor.execute('date', dryRun=True)
     # Nothing was run, so there is no result, but the command is logged.
     self.assertIsNone(outcome)
     lastLogged = executor.log[-1]
     self.assertEqual('$ date', lastLogged)
コード例 #6
0
ファイル: analysis.py プロジェクト: acorg/midtools
    def writeBcftoolsConsensus(self, referenceId, alignmentFile, outputDir):
        """
        Build a reference consensus with samtools / bcftools and save it.

        A shell pipeline (samtools mpileup, bcftools call, vcfutils.pl
        vcf2fq, filter-fasta.py) is run and its FASTA output is written to
        C{reference-consensus-samtools.fasta} in C{outputDir}.

        @param referenceId: The C{str} id of the reference sequence. It is
            embedded in the id given to the consensus sequence.
        @param alignmentFile: The C{str} name of an alignment file.
        @param outputDir: A C{str} directory path. The file
            C{reference.fasta} in this directory is passed to samtools
            mpileup as the reference.
        """
        referenceFilename = join(outputDir, 'reference.fasta')
        consensusFilename = join(outputDir,
                                 'reference-consensus-samtools.fasta')
        self.report('    Saving samtools reference consensus to',
                    consensusFilename)

        pipeline = (
            'samtools mpileup -u -f %s %s 2>/dev/null | '
            'bcftools call -c | vcfutils.pl vcf2fq | '
            'filter-fasta.py --fastq --saveAs fasta --quiet '
            '--idLambda \'lambda _: "consensus-%s-samtools"\' > %s')

        executor = Executor()
        executor.execute(pipeline % (referenceFilename, alignmentFile,
                                     referenceId, consensusFilename))

        # The command log is only shown at the higher verbosity levels.
        if not self.verbose > 1:
            return
        for entry in executor.log:
            print('    ', entry)
コード例 #7
0
def fastaIdentityTable(filename, outputFilename, verbose, filename2=None):
    """
    Run fasta-identity-table.py to make an HTML identity table for one or
    two FASTA files.

    Seven C{--color} thresholds (0.65 to 0.95 in steps of 0.05) are passed,
    each paired with a color from the 'GnBu' sequential scale.

    @param filename: A C{str} file name containing FASTA.
    @param outputFilename: A C{str} file name to store the HTML output into.
    @param verbose: The C{int} verbosity level. If greater than one, the log
        of executed commands is printed.
    @param filename2: An optional second C{str} file name containing FASTA.
    """
    palette = cl.scales['9']['seq']['GnBu']
    colorOptions = ' '.join(
        '--color "%.2f %s"' % (0.65 + 0.05 * index, palette[index])
        for index in range(7))

    if filename2:
        secondFileOption = '--fastaFile2 "%s"' % filename2
    else:
        secondFileOption = ''

    executor = Executor()
    command = ('fasta-identity-table.py --showGaps --showLengths --footer '
               '--removeDescriptions %s %s < %s > %s' %
               (colorOptions, secondFileOption, filename, outputFilename))
    executor.execute(command)

    if not verbose > 1:
        return
    for entry in executor.log:
        print('       ', entry)
コード例 #8
0
def main():
    """
    Parse the command line and run the bowtie2 pipeline.

    After argument parsing, reads matching any --ignoreIndex index are
    filtered out (via C{processIgnores}), the remaining reads are matched
    (via C{processMatch}), and the temporary directory made for the ignore
    step is removed (or reported, if --noClean was given). The command log
    is printed to standard error if --dryRun or --log was given.

    Exits with status 1 if --noBAM is used while BAM indexing is still
    enabled.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=('Run bowtie2 on a FASTA file. Optionally convert the '
                     'result to BAM, sorting, and indexing.'))

    parser.add_argument(
        '--index',
        help=('Either: an accession number, a filename or the name of a '
              'pre-existing bowtie2 index (created with bowtie2-build). If '
              'not given and --reference is used, the reference will be '
              'used to build a bowtie2 index.'))

    parser.add_argument(
        '--ignoreIndex',
        action='append',
        dest='ignoredIndices',
        help=('Either: an accession number, a filename or the name of a '
              'pre-existing bowtie2 index (created with bowtie2-build). '
              'Reads matching this index will be ignored. May be repeated.'))

    parser.add_argument(
        '--fastq1',
        '-1',
        help=('The FASTQ reads to match against the bowtie2 index given by '
              '--index. Also use --fastq2 if you have paired reads. '
              'If not given, single-end FASTQ reads will be read from '
              'standard input.'))

    parser.add_argument(
        '--fastq2',
        '-2',
        help=('The FASTQ reads to match against the bowtie2 index given by '
              '--index. Use this with --fastq1 to specify the mate '
              'file for paired-end reads.'))

    parser.add_argument(
        '--bowtie2Args',
        default='--no-unal',
        help=('Extra arguments to be passed to Bowtie2 (use --threads to '
              'specify a thread count).'))

    parser.add_argument(
        '--samtoolsViewArgs',
        default='-F %d -q 30' % DEFAULT_SAMTOOLS_VIEW_FLAGS,
        help='Arguments to be passed to samtools view to create the BAM file.')

    parser.add_argument(
        '--tempdir',
        help=('The temporary directory to use. If not specified, the value '
              'of the TMPDIR environment variable (if any) is used, or else '
              '/tmp.'))

    parser.add_argument(
        '--out',
        '-o',
        help=('The output file name. If not given, the resulting SAM or BAM '
              'will be written to standard output.'))

    parser.add_argument(
        '--reference',
        help=('The reference FASTA file for use with --callHaplotypesGATK and '
              '--callHaplotypesBcftools. This will be used to build a Bowtie2 '
              'index if --index is not given.'))

    parser.add_argument(
        '--vcfFile',
        help=('The file to write VCF info to if --callHaplotypesGATK or '
              '--callHaplotypesBcftools are used.'))

    parser.add_argument(
        '--markDuplicatesGATK',
        default=False,
        action='store_true',
        help=('Use GATK to mark duplicates. See '
              'https://gatk.broadinstitute.org for details on GATK.'))

    parser.add_argument(
        '--markDuplicatesPicard',
        default=False,
        action='store_true',
        help=('Use Picard to mark duplicates. See '
              'https://github.com/broadinstitute/picard for details on '
              'Picard.'))

    parser.add_argument(
        '--picardJar',
        help=('The path to the Picard jar file. See '
              'https://github.com/broadinstitute/picard for details on '
              'Picard.'))

    parser.add_argument(
        '--removeDuplicates',
        default=False,
        action='store_true',
        help=('Remove duplicates from the resulting SAM/BAM file. Best used '
              'in combination with an option that marks duplicates, such as '
              '--markDuplicatesGATK.'))

    parser.add_argument(
        '--verbose',
        default=False,
        action='store_true',
        help=('Print a description of commands as they are (or would be, if '
              '--dryRun is used) executed.'))

    parser.add_argument(
        '--log',
        default=False,
        action='store_true',
        help=('Show a log of commands that were (or would be, if --dryRun is '
              'used) executed.'))

    parser.add_argument(
        '--threads',
        type=int,
        help='The number of threads to use when running bowtie2 commands.')

    # The --noXXX options below are stored inverted (store_false), so the
    # corresponding attribute is True unless the option is given.
    parser.add_argument('--noAlign',
                        default=True,
                        action='store_false',
                        dest='align',
                        help='Do not align with Bowtie2, just build an index.')

    parser.add_argument('--noBAM',
                        default=True,
                        action='store_false',
                        dest='bam',
                        help='Do not convert SAM to BAM.')

    parser.add_argument('--noSort',
                        default=True,
                        action='store_false',
                        dest='sort',
                        help='Do not sort the BAM.')

    parser.add_argument('--noIndexBAM',
                        default=True,
                        action='store_false',
                        dest='indexBAM',
                        help='Do not index the BAM file.')

    parser.add_argument(
        '--noClean',
        default=True,
        action='store_false',
        dest='clean',
        help='Do not remove intermediate files or the temporary directory.')

    parser.add_argument('--force',
                        default=False,
                        action='store_true',
                        help='Overwrite pre-existing output file.')

    parser.add_argument(
        '--dryRun',
        default=False,
        action='store_true',
        help='Do not run commands, just print what would be done.')

    haplotypeCaller = parser.add_mutually_exclusive_group()

    haplotypeCaller.add_argument(
        '--callHaplotypesGATK',
        default=False,
        action='store_true',
        help=('Use GATK to call haplotypes. See '
              'https://gatk.broadinstitute.org for details on GATK.'))

    haplotypeCaller.add_argument('--callHaplotypesBcftools',
                                 default=False,
                                 action='store_true',
                                 help='Use bcftools call to call haplotypes.')

    args = parser.parse_args()

    # BAM indexing is on by default, so --noBAM on its own is contradictory.
    # The message names --noIndexBAM because that is the only option that
    # controls indexing (there is no --indexBAM option).
    if args.indexBAM and not args.bam:
        print(
            'BAM indexing (on by default) only makes sense if you do not '
            'use --noBAM. Pass --noIndexBAM along with --noBAM.',
            file=sys.stderr)
        sys.exit(1)

    e = Executor(args.dryRun)

    if args.tempdir is None:
        args.tempdir = os.environ.get('TMPDIR', '/tmp')

    if args.ignoredIndices:
        ignoresDir = processIgnores(args, e)

    processMatch(args, e)

    # ignoresDir is only bound when ignored indices were given, hence the
    # repeated test.
    if args.ignoredIndices:
        if args.clean:
            e.execute("rm -r '%s'" % ignoresDir)
        else:
            print('Temporary directory with non-ignored inputs %r.' %
                  ignoresDir,
                  file=sys.stderr)

    if args.dryRun or args.log:
        print('\n'.join(e.log), file=sys.stderr)
コード例 #9
0
class RecombinationAnalysis(object):
    """
    Run a recombination analysis using 3seq.

    @param pValueFile: The C{str} file name containing precomputed p-values
        (as generated by 3seq -g). See Steps 3a/b of Section 3 of the 3seq
        manual (mentioned in ../README.md) for how to generate or obtain a
        p-value file.
    @param dryRun: If C{True} do not execute any 3seq commands, just log what
        would have been run (see self.executor.log for details).
    """
    def __init__(self, pValueFile, dryRun=False):
        self.pValueFile = pValueFile
        # No analysis has been run until self.run sets this.
        self.tmpDir = None
        self.executor = Executor(dryRun=dryRun)

    def check(self):
        """
        Ask 3seq (via its -check option) to check the p-value table.

        @return: A C{subprocess.CompletedProcess} instance.
        """
        command = '3seq -check "%s"' % self.pValueFile
        return self.executor.execute(command)

    def run(self, reads, t=0.05):
        """
        Run 3seq on some reads. A new temporary directory is made for the
        output and stored in self.tmpDir as a side-effect.

        @param reads: Either a C{dark.reads.Reads} instance or a C{str}
            filename. Reads that are not given as a filename are first saved
            as FASTA into the temporary directory.
        @param t: A C{str} or C{float} error threshold, e.g. 0.01, '1e-6'
            that will be passed on the command line to 3seq. See section
            7.10 of the 3seq manual for details.
        @return: A C{subprocess.CompletedProcess} instance.
        """
        self.tmpDir = workDir = mkdtemp()

        if not isinstance(reads, six.string_types):
            inputFile = join(workDir, 'input.fasta')
            reads.save(inputFile, format_='fasta')
        else:
            inputFile = reads

        outputPrefix = join(workDir, _OUTPUT_PREFIX)

        # The 3seq manual (as of 2018-12-29) documents '-fullrun', but that
        # does not work: the source code looks for either -f or -full. As
        # -f is ambiguous in the manual (it also means 'first'), -full is
        # used here.
        command = 'echo y | 3seq -full "%s" -ptable "%s" -id "%s" -t%s' % (
            inputFile, self.pValueFile, outputPrefix, str(t))
        return self.executor.execute(command)

    def recombinantFile(self):
        """
        Get the name of the main 3seq recombination output file.

        @raise RuntimeError: If no analysis has been run.
        @return: A C{str} path to the output file.
        """
        if self.tmpDir is None:
            raise RuntimeError('No analysis has been run yet')

        # The suffix in the following is always used by 3seq.
        return join(self.tmpDir, _OUTPUT_PREFIX + '.3s.rec')

    def removeOutput(self):
        """
        Remove the temporary directory holding the 3seq output files.

        @raise RuntimeError: if no analysis has been run.
        """
        if self.tmpDir is None:
            raise RuntimeError('No analysis has been run yet')

        shutil.rmtree(self.tmpDir)
コード例 #10
0
def makeBAM(template, bamReferences=None, fastaReferences=None):
    """
    Make a simple BAM file (and a FASTA reference file) from a template.

    This is a generator that yields exactly once and removes its temporary
    directory afterwards; the original documentation describes it as a
    context manager (the decorator is not visible here - TODO confirm).
    Note that this code invokes samtools.

    @param template: A sequence of C{str} (its length is taken, so it cannot
        be an arbitrary iterable). The first string is treated as the
        reference, and subsequent pairs (if any) are treated as read and
        quality strings. Reads and quality strings can be indented with
        spaces to show where the read aligns with the reference.
    @param bamReferences: An optional sequence of reads to list as references
        in the SAM header. The first must have the same sequence as the
        template reference. If C{None}, a single reference with id
        C{REF_ID} is made from the template.
    @param fastaReferences: Optional reads to save into the FASTA reference
        file. If C{None}, the BAM references are used.
    @raise ValueError: If C{template} has an even number of strings.
    @return: A generator that yields a 2-tuple containing the C{Path} of
        the FASTA reference file and the C{Path} of the BAM file.
    """
    if len(template) % 2 != 1:
        raise ValueError(
            'The template must have an odd number of strings, specifying the '
            'reference sequence, then zero or more read/quality pairs.')

    leftPaddedReference = template[0]
    # Drop the leading padding and any gap characters to get the actual
    # reference sequence.
    templateSequence = leftPaddedReference.lstrip().replace('-', '')

    if bamReferences is None:
        matchedReference = DNARead(REF_ID, templateSequence)
        bamReferences = Reads([matchedReference])
    else:
        matchedReference = bamReferences[0]
        # Sanity check: The first BAM reference must have the same sequence
        # as the template.
        assert matchedReference.sequence == templateSequence
        bamReferences = Reads(bamReferences)

    fastaReferences = Reads(
        bamReferences if fastaReferences is None else fastaReferences)

    # The number of read/quality pairs following the reference.
    nSeqs = (len(template) - 1) >> 1
    dirname = mkdtemp(prefix='test-consensus-')
    e = Executor()

    try:
        fastaFile = Path(dirname) / 'references.fasta'
        samFile = Path(dirname) / 'file.sam'
        bamFile = Path(dirname) / 'file.bam'

        fastaReferences.save(fastaFile)

        with open(samFile, 'w') as fp:
            # SAM header: one @SQ line per reference.
            for reference in bamReferences:
                print(f'@SQ\tSN:{reference.id}\tLN:{len(reference)}', file=fp)

            for count in range(nSeqs):
                leftPaddedQuery = template[count * 2 + 1].rstrip()
                leftPaddedQuality = template[count * 2 + 2].rstrip()
                assert len(leftPaddedQuery) == len(leftPaddedQuality)
                query = leftPaddedQuery.lstrip()
                quality = leftPaddedQuality.lstrip()
                # Gaps ('-') in the query are not written to the SAM file,
                # and neither are the quality characters at those positions.
                queryNoGaps = qualityNoGaps = ''
                for queryBase, qualityBase in zip(query, quality):
                    if queryBase != '-':
                        queryNoGaps += queryBase
                        qualityNoGaps += qualityBase

                # One tab-separated alignment line per read. All reads are
                # aligned against the first (matched) reference.
                print(
                    '\t'.join(
                        map(
                            str,
                            (
                                f'read{count}',  # QNAME (query name)
                                0,  # FLAGS
                                matchedReference.id,  # RNAME (reference name)
                                matchOffset(leftPaddedReference,
                                            leftPaddedQuery) + 1,  # POS
                                30,  # MAPQ (mapping quality)
                                makeCigar(leftPaddedReference,
                                          leftPaddedQuery),  # CIGAR
                                '*',  # MRNM (mate reference name)
                                0,  # MPOS (mate position)
                                0,  # ISIZE (insert size)
                                queryNoGaps,  # SEQ
                                qualityNoGaps,  # QUAL
                            ))),
                    file=fp)

        e.execute(f'samtools sort -O BAM --write-index -o {str(bamFile)!r} '
                  f'{str(samFile)!r}')
        yield (fastaFile, bamFile)
    finally:
        # Always remove the temporary directory, even if an error occurred.
        # import sys; print(f'{samFile}', file=sys.stderr)
        e.execute(f'rm -fr {dirname!r}')
コード例 #11
0
        for callerName, callerFunc in callers:
            callerMiddle = pileupMiddle + '-' + callerName
            callerFile = join(outputDir, 'calls-' + callerMiddle + '.vcf')
            callerFunc(callerFile, pileupFile, executor)

            for consensusName, consensusFunc in consensusers:
                consensusMiddle = callerMiddle + '-' + consensusName
                consensusFile = join(outputDir,
                                     'consensus-' + consensusMiddle + '.fasta')
                consensusFiles.append(consensusFile)
                consensusFunc(consensusFile, callerFile, consensusMiddle,
                              args.referenceFile, executor)

    # Let's assume there's at least one consensus file.
    consensusesFile = join(outputDir, 'consensuses.fasta')
    executor.execute(
        'cat %s > %s' % (' '.join(consensusFiles), consensusesFile))

    htmlFile = join(outputDir, 'consensus-identity.html')
    executor.execute(
        ('fasta-identity-table.py --footer --showGaps --showLengths < %s | '
         "perl -pe 's/-(bcftools|vcfutils)/ $1/g' > %s") %
        (consensusesFile, htmlFile))

    verbose = args.verbose
    if verbose > 0:
        print('The following commands were executed:')
        for line in executor.log:
            if line.startswith('#'):
                if verbose > 1:
                    print(line)
            else:
コード例 #12
0
        for callerName, callerFunc in callers:
            callerMiddle = pileupMiddle + '-' + callerName
            callerFile = join(outputDir, 'calls-' + callerMiddle + '.vcf')
            callerFunc(callerFile, pileupFile, executor)

            for consensusName, consensusFunc in consensusers:
                consensusMiddle = callerMiddle + '-' + consensusName
                consensusFile = join(outputDir,
                                     'consensus-' + consensusMiddle + '.fasta')
                consensusFiles.append(consensusFile)
                consensusFunc(consensusFile, callerFile, consensusMiddle,
                              args.referenceFile, executor)

    # Let's assume there's at least one consensus file.
    consensusesFile = join(outputDir, 'consensuses.fasta')
    executor.execute('cat %s > %s' %
                     (' '.join(consensusFiles), consensusesFile))

    htmlFile = join(outputDir, 'consensus-identity.html')
    executor.execute(
        ('fasta-identity-table.py --footer --showGaps --showLengths < %s | '
         "perl -pe 's/-(bcftools|vcfutils)/ $1/g' > %s") %
        (consensusesFile, htmlFile))

    verbose = args.verbose
    if verbose > 0:
        print('The following commands were executed:')
        for line in executor.log:
            if line.startswith('#'):
                if verbose > 1:
                    print(line)
            else:
コード例 #13
0
# Refuse to clobber earlier output unless forced to (a dry run never writes,
# so it needs no check).
if not args.force and not args.dryRun:
    existing = [filename for filename in (samFile, bamFile, sortedBamFile)
                if exists(filename)]
    if existing:
        plural = '' if len(existing) == 1 else 's'
        print('Will not overwrite pre-existing file%s %s. '
              'Use --force to make me.' % (plural, ', '.join(existing)),
              file=sys.stderr)
        sys.exit(2)

e = Executor(args.dryRun)

e.execute("bwa %s '%s' '%s' > '%s'" %
          (args.bwaArgs, args.bwaIndex, args.fastaFile, samFile))

if not args.noBAM:
    # Collect the post-processing commands in the order they must run:
    # SAM -> BAM -> sorted BAM -> index, removing each intermediate file
    # as soon as it has been consumed (unless --noClean was given).
    commands = ["samtools view -b < '%s' > '%s'" % (samFile, bamFile)]

    if not args.noClean:
        commands.append("rm '%s'" % samFile)

    if not args.noSort:
        commands.append("samtools sort '%s' > '%s'" %
                        (bamFile, sortedBamFile))

        if not args.noClean:
            commands.append("rm '%s'" % bamFile)

        if not args.noIndex:
            commands.append("samtools index '%s'" % sortedBamFile)

    for command in commands:
        e.execute(command)
コード例 #14
0
ファイル: test_process.py プロジェクト: TaliVeith/dark-matter
 def testDryRunDefault(self):
     """
     An C{Executor} created with C{dryRun=True} must not run commands: it
     returns C{None} and just logs what would have been run.
     """
     executor = Executor(dryRun=True)
     outcome = executor.execute('date')
     self.assertIsNone(outcome)
     lastLogged = executor.log[-1]
     self.assertEqual('$ date', lastLogged)
コード例 #15
0
ファイル: run-bwa.py プロジェクト: acorg/dark-matter
# Unless overwriting was requested (or nothing will be written because this
# is a dry run), stop if any output file is already present.
if not args.force and not args.dryRun:
    existing = list(filter(exists, (samFile, bamFile, sortedBamFile)))
    if existing:
        suffix = '' if len(existing) == 1 else 's'
        message = ('Will not overwrite pre-existing file%s %s. '
                   'Use --force to make me.' % (suffix, ', '.join(existing)))
        print(message, file=sys.stderr)
        sys.exit(2)

e = Executor(args.dryRun)

# Align with bwa, writing SAM.
e.execute("bwa %s '%s' '%s' > '%s'" % (
    args.bwaArgs, args.bwaIndex, args.fastaFile, samFile))

makeBam = not args.noBAM

if makeBam:
    removeIntermediates = not args.noClean

    # SAM -> BAM.
    e.execute("samtools view -b < '%s' > '%s'" % (samFile, bamFile))
    if removeIntermediates:
        e.execute("rm '%s'" % samFile)

    if not args.noSort:
        # BAM -> sorted BAM.
        e.execute("samtools sort '%s' > '%s'" % (bamFile, sortedBamFile))
        if removeIntermediates:
            e.execute("rm '%s'" % bamFile)

        # Only a sorted BAM file is indexed.
        if not args.noIndex:
            e.execute("samtools index '%s'" % sortedBamFile)
コード例 #16
0
def main(args, logfp):
    """
    Create genomes and reads for a multiple infection detection experiment.

    File paths are kept unquoted so they can be used directly from Python
    (C{open}, C{FastaReads}, C{Reads.save}) and are shell-quoted only at the
    point where they are interpolated into a command line. Quoting the
    output directory up-front would make the Python file operations fail
    for any directory name that needs quoting (e.g., one with a space).

    @param args: A namespace instance, as returned by parse_args
    @param logfp: A file object to write log information to.
    """
    print('Invocation arguments', args, file=logfp)

    outputDir = args.outputDir
    genome1 = join(outputDir, 'genome-1.fasta')
    genome2 = join(outputDir, 'genome-2.fasta')
    genome2locations = join(outputDir, 'genome-2.locations')
    reads1 = join(outputDir, 'reads-1.fastq')
    reads2 = join(outputDir, 'reads-2.fastq')
    reads12 = join(outputDir, 'reads-12.fastq')

    executor = Executor(args.dryRun)

    if args.genome1Filename:
        executor.execute('ln -s %s %s' %
                         (quote(args.genome1Filename), quote(genome1)))
    else:
        if args.genomeLength < 1:
            print('Random initial genome length must be > 0.', file=sys.stderr)
            sys.exit(3)
        print('Writing random starting genome of length %d to %s' %
              (args.genomeLength, genome1),
              file=logfp)
        if not args.dryRun:
            sequence = ''.join(
                [choice('ACGT') for _ in range(args.genomeLength)])
            with open(genome1, 'w') as fp:
                print('>genome-1\n%s' % sequence, file=fp)

    if args.genome2Filename:
        executor.execute('ln -s %s %s' %
                         (quote(args.genome2Filename), quote(genome2)))
    elif not args.dryRun:
        # Make a second genome using the given mutation rate. Print its
        # mutated locations to a file. This is skipped in a dry run because
        # genome 1 has then not been written or linked (so it cannot be
        # read), and because a dry run should not create files.
        (genome1read, ) = list(FastaReads(genome1))
        offsets = mutateRead(genome1read, args.genome2MutationRate)
        with open(genome2locations, 'w') as fp:
            # Locations are written 1-based, one per line, in sorted order.
            print('\n'.join(str(offset + 1) for offset in sorted(offsets)),
                  file=fp)
        genome1read.id = 'genome-2'
        Reads([genome1read]).save(genome2)

    cmdPrefix = ('create-reads.py --maxReadLength %d --minReadLength %d '
                 '--meanLength %d --sdLength %d --rate %f ' %
                 (args.maxReadLength, args.minReadLength, args.meanReadLength,
                  args.sdReadLength, args.readMutationRate))

    # Make reads from each genome. A per-genome read count takes precedence
    # over the general read count.
    for info in [{
            'reads': quote(reads1),
            'fasta': quote(genome1),
            'number': 1,
            'count': args.genome1ReadCount or args.readCount,
    }, {
            'reads': quote(reads2),
            'fasta': quote(genome2),
            'number': 2,
            'count': args.genome2ReadCount or args.readCount,
    }]:
        executor.execute(cmdPrefix +
                         ('--idPrefix genome-%(number)d-read- '
                          '--count %(count)d < %(fasta)s > %(reads)s' % info))

    executor.execute('cat %s %s > %s' %
                     (quote(reads1), quote(reads2), quote(reads12)))

    print('\n'.join(executor.log), file=logfp)
コード例 #17
0
def main():
    """
    Parse the command line and make a consensus sequence.

    A VCF file is used if given, otherwise one is made from the BAM file
    (with GATK or bcftools). The consensus is then made either by ivar or
    by bcftools consensus and printed as FASTA to standard output (unless
    --dryRun is used). The command log is printed to standard error if
    --dryRun or --log is given, and the temporary directory is removed
    unless --noClean is given.

    Exits with status 1 if the command-line options are inconsistent.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description='Make a consensus sequence.')

    parser.add_argument('--reference',
                        required=True,
                        help='The reference FASTA file.')

    parser.add_argument(
        '--bam',
        help=('The BAM file from which the consensus should be made. '
              'Required if --maskLowCoverage is used. If no BAM file is '
              'given, a VCF file must be provided. If both a BAM and a VCF '
              'file are given, the VCF file will take precedence.'))

    parser.add_argument(
        '--vcfFile',
        help=('The VCF file. If omitted, bcftools will be used to make a VCF '
              'file from the BAM file.'))

    group = parser.add_mutually_exclusive_group()

    group.add_argument(
        '--id',
        help=('The id to use in the consensus sequence in the output FASTA. '
              'If not given, the reference sequence id will be used.'))

    group.add_argument(
        '--idLambda',
        metavar='LAMBDA-FUNCTION',
        help=('A one-argument function taking and returning a read id. '
              'This can be used to set the id of the reference sequence based '
              'on the id of the reference sequence (the function will be '
              'called with the id of the reference sequence). E.g., '
              '--idLambda "lambda id: id.split(\'_\')[0]" or '
              '--idLambda "lambda id: id[:10] + \'-consensus\'".'))

    parser.add_argument(
        '--sample',
        help=('The name of the sample (from the @RG SM tag in the original '
              'alignment BAM file) for which a consensus should be made. '
              'If not given, the first sample name (from the #CHROM header) '
              'in the VCF file will be used.'))

    parser.add_argument(
        '--dryRun',
        action='store_true',
        help='Do not run commands, just print what would be done.')

    parser.add_argument(
        '--maskLowCoverage',
        default=0,
        type=int,
        help=('Put an N into sites where the coverage is below the specified '
              'cutoff. If you specify a negative number, masking will be '
              'turned off. Requires --bam.'))

    parser.add_argument(
        '--log',
        action='store_true',
        help=('Show a log of commands that were (or would be, if --dryRun is '
              'used) executed.'))

    parser.add_argument(
        '--noClean',
        action='store_false',
        dest='clean',
        help=('Do not remove intermediate files or the temporary directory.'))

    parser.add_argument(
        '--callHaplotypesGATK',
        action='store_true',
        help=('Use GATK to call haplotypes. See '
              'https://gatk.broadinstitute.org for details on GATK.'))

    parser.add_argument(
        '--picardJar',
        help=('The path to the Picard jar file. See '
              'https://github.com/broadinstitute/picard for details on '
              'Picard.'))

    parser.add_argument(
        '--ivar',
        action='store_true',
        help='If given, ivar will be used to call the consensus.')

    parser.add_argument(
        '--ivarFrequencyThreshold',
        type=float,
        help=(f'The frequency threshold used by ivar when calling the '
              f'consensus. If the frequency of the most-common nucleotide at '
              f'a site meets this threshold, the nucleotide will be called. '
              f'Otherwise, an ambiguous nucleotide code will be produced, '
              f'based on the smallest set of most-frequent nucleotides whose '
              f'summed frequencies meet the threshold. See {IVAR_DOCS} for '
              f'more information. If not given, '
              f'{IVAR_FREQUENCY_THRESHOLD_DEFAULT} is used. Can only be used '
              f'if --ivar is also specified.'))

    parser.add_argument(
        '--ivarBedFile',
        help=('If ivar should trim primers, a BED file of the primer '
              'positions.'))

    args = parser.parse_args()

    if not (args.bam or args.vcfFile):
        print('At least one of --bam or --vcfFile must be given.',
              file=sys.stderr)
        sys.exit(1)

    if args.maskLowCoverage and not args.bam:
        print('If --maskLowCoverage is used, --bam must be too.',
              file=sys.stderr)
        sys.exit(1)

    if args.ivar and not args.bam:
        print('If --ivar is used, --bam must be too.', file=sys.stderr)
        sys.exit(1)

    if args.ivarFrequencyThreshold is not None and not args.ivar:
        print('If --ivarFrequencyThreshold is used, --ivar must be too.',
              file=sys.stderr)
        sys.exit(1)

    if args.ivar and args.ivarFrequencyThreshold is None:
        args.ivarFrequencyThreshold = IVAR_FREQUENCY_THRESHOLD_DEFAULT

    e = Executor(args.dryRun)

    tempdir = mkdtemp(prefix='consensus-')

    if args.vcfFile:
        vcfFile = args.vcfFile
    else:
        # No VCF file provided, so make one.
        vcfFile = join(tempdir, 'vcf.gz')
        if args.callHaplotypesGATK:
            e.execute("samtools index '%s'" % args.bam)
            if args.picardJar:
                picardJar = args.picardJar
            else:
                try:
                    picardJar = os.environ['PICARD_JAR']
                except KeyError:
                    print(
                        'If you use --callHaplotypesGATK, you must give a '
                        'Picard JAR file with --picardJar or else set '
                        'PICARD_JAR in your environment.',
                        file=sys.stderr)
                    sys.exit(1)

            # Make a FASTA index and a sequence dictionary for the
            # reference, if they do not already exist, remembering whether
            # they were made here so that only those are removed afterwards.
            indexFile = args.reference + '.fai'
            if os.path.exists(indexFile):
                removeIndex = False
            else:
                removeIndex = True
                e.execute("samtools faidx '%s'" % args.reference)

            if args.reference.lower().endswith('.fasta'):
                dictFile = args.reference[:-len('.fasta')] + '.dict'
            else:
                dictFile = args.reference + '.dict'

            if os.path.exists(dictFile):
                removeDict = False
            else:
                removeDict = True
                e.execute(
                    "java -jar '%s' CreateSequenceDictionary R='%s' O='%s'" %
                    (picardJar, args.reference, dictFile))

            e.execute('gatk --java-options -Xmx4g HaplotypeCaller '
                      "--reference '%s' "
                      "--input '%s' "
                      "--output '%s' "
                      "--sample-ploidy 1 "
                      '-ERC GVCF' % (args.reference, args.bam, vcfFile))

            if removeIndex:
                e.execute("rm '%s'" % indexFile)

            if removeDict:
                e.execute("rm '%s'" % dictFile)
        else:
            e.execute("bcftools mpileup --max-depth 5000 -Ou -f '%s' '%s' | "
                      "bcftools call --ploidy 1 -mv -Oz -o '%s'" %
                      (args.reference, args.bam, vcfFile))

            e.execute("bcftools index '%s'" % vcfFile)

    # A coverage mask can only be made from a BAM file. Without one (which
    # is only possible here with the default cutoff of zero, which masks
    # nothing) do not run samtools depth on a non-existent file.
    if args.maskLowCoverage >= 0 and args.bam:
        # Make a BED file.
        bedFile = join(tempdir, 'mask.bed')
        # The doubled-% below are so that Python doesn't try to fill in the
        # values and instead just generates a single % that awk sees.
        e.execute("samtools depth -a '%s' | "
                  "awk '$3 < %d {printf \"%%s\\t%%d\\t%%d\\n\", "
                  "$1, $2 - 1, $2}' > '%s'" %
                  (args.bam, args.maskLowCoverage, bedFile))
        maskArg = '--mask ' + bedFile
    else:
        maskArg = ''

    if args.sample:
        sample = args.sample
    else:
        result = e.execute("gunzip -c '%s' | egrep -m 1 '^#CHROM' | cut -f10" %
                           vcfFile)
        # In a dry run nothing is executed, so a placeholder name is used.
        sample = 'SAMPLE-NAME' if args.dryRun else result.stdout.strip()

    consensusFile = join(tempdir, 'consensus.fasta')

    if args.ivar:
        if args.ivarBedFile:
            # Trim primers, then sort the trimmed BAM file.
            tempBamFile = join(tempdir, basename(args.bam) + '-trimmed')
            e.execute("ivar trim -i %r -b %r -p %r -e" %
                      (args.bam, args.ivarBedFile, tempBamFile))
            ivarTempBamFile = tempBamFile + '.bam'
            sortedIvarTempBamFile = tempBamFile + '-trimmed-sorted.bam'
            e.execute("samtools sort %r -o %r" %
                      (ivarTempBamFile, sortedIvarTempBamFile))
            bamFile = sortedIvarTempBamFile
        else:
            bamFile = args.bam

        ivarConsensusFile = join(tempdir, 'temporary-consensus')
        e.execute("samtools mpileup -A -Q 0 %r | "
                  "ivar consensus -p %r -q 20 -t %r -m %r" %
                  (bamFile, ivarConsensusFile,
                   args.ivarFrequencyThreshold, args.maskLowCoverage))

        e.execute("mv %s %s" % (ivarConsensusFile + '.fa', consensusFile))

    else:
        result = e.execute(
            "bcftools consensus --sample '%s' --iupac-codes %s --fasta-ref "
            "'%s' '%s' > '%s'" %
            (sample, maskArg, args.reference, vcfFile, consensusFile))

        # In a dry run nothing is executed and there is no result to
        # examine.
        if result is not None and result.stderr:
            print(result.stderr, end='', file=sys.stderr)

    if not args.dryRun:
        consensus = list(FastaReads(consensusFile))[0]
        if args.id is not None:
            consensus.id = args.id
        elif args.idLambda is not None:
            # NOTE(review): eval runs arbitrary code taken from the command
            # line. That is acceptable for a tool run by its own user, but
            # --idLambda must never be filled in from untrusted input.
            idLambda = eval(args.idLambda)
            consensus.id = idLambda(consensus.id)

        print(consensus.toString('fasta'), end='')

    if args.dryRun or args.log:
        print('\n'.join(e.log), file=sys.stderr)

    if tempdir:
        if args.clean:
            e.execute("rm -r '%s'" % tempdir)
        else:
            print('Temporary directory %r.' % tempdir, file=sys.stderr)
コード例 #18
0
class DiamondExecutor(object):
    """
    Build a temporary DIAMOND database from subject sequences and run
    C{diamond blastx} queries against it.

    Subjects are appended to a FASTA file in a temporary directory made by
    C{mkdtemp}. The database is (re)built by C{search} whenever subjects
    have been added since the last build. Instances can be used as context
    managers, in which case the temporary directory is removed on exit.

    @param dryRun: If C{True} do not actually execute the DIAMOND commands.
    """
    SUBJECTS_FILENAME = 'subjects.fasta'
    QUERIES_FILENAME = 'queries.fasta'
    OUTPUT_FILENAME = 'diamond.tsv'

    def __init__(self, dryRun=False):
        # True when subjects have been added since the database was built.
        self._dirty = False
        self._dir = mkdtemp()
        # Open file for appending subjects, or None when not open.
        self._subjectsFp = None
        # True once at least one subject has been added.
        self._subjectsExist = False
        self._executor = Executor(dryRun)

    def addSubject(self, subject):
        """
        Add a subject sequence to the database.

        The subject is appended to the subjects FASTA file and the database
        is marked as needing a rebuild on the next call to C{search}.

        @param subject: A C{dark.reads.Read} instance.
        """
        if self._subjectsFp is None:
            filename = join(self._dir, self.SUBJECTS_FILENAME)
            if six.PY3:
                self._subjectsFp = open(filename, 'a', encoding='utf-8')
            else:
                self._subjectsFp = open(filename, 'a')

        print(subject.toString('fasta'), end='', file=self._subjectsFp)
        self._subjectsExist = self._dirty = True

    def cleanup(self):
        """
        Remove the temporary directory we made.
        """
        if self._subjectsFp:
            self._subjectsFp.close()
            self._subjectsFp = None
        rmtree(self._dir)

    def _makeDatabase(self):
        """
        Close the subjects file (if open) and build the DIAMOND database from
        it, but only if subjects have been added since the last build.

        The command uses relative file names, so C{search} calls this from
        inside its C{cd(self._dir)} context.
        """
        if not self._dirty:
            return

        # The file pointer can already be None here (e.g., if a previous
        # build attempt raised after the file was closed), so guard the close.
        if self._subjectsFp is not None:
            self._subjectsFp.close()
            self._subjectsFp = None

        self._executor.execute('diamond makedb --db database --in %s' %
                               self.SUBJECTS_FILENAME)

        # Only mark the database as up to date once the build command has
        # returned, so that a failed build is retried by the next search.
        self._dirty = False

    def search(self, reads, fieldNames=None):
        """
        Match reads against the database.

        This is a generator, so nothing is executed (and no exception is
        raised) until the result is first iterated.

        NOTE(review): the C{cd} context stays active while the caller
        iterates over the results - presumably the working directory remains
        changed until the generator is exhausted or closed; confirm against
        the implementation of C{cd}.

        @param reads: An instance of C{dark.reads.Reads}.
        @param fieldNames: An iterable of C{str} field names for DIAMOND
            tabular output (format 6). See diamond help for the names of all
            available fields. If false, C{FIELDS.split()} is used.
        @raise ValueError: If no subjects have been added or if saving
            C{reads} reports that zero query sequences were written.
        @return: A generator that yields C{dict}s with keys as in
            C{fieldNames}.
        """
        if not self._subjectsExist:
            raise ValueError('No subject sequences in the database')

        with cd(self._dir):
            self._makeDatabase()

            with open(self.QUERIES_FILENAME, 'w') as fp:
                count = reads.save(fp, format_='fastq')

            if count == 0:
                raise ValueError('No query sequences were passed')

            fieldNames = fieldNames or FIELDS.split()

            self._executor.execute(
                'diamond blastx --db database --query %s --outfmt 6 %s > %s' %
                (self.QUERIES_FILENAME, ' '.join(fieldNames),
                 self.OUTPUT_FILENAME))

            dtf = DiamondTabularFormat(fieldNames)

            for diamondDict in dtf.diamondTabularFormatToDicts(
                    self.OUTPUT_FILENAME):
                yield diamondDict

    def __enter__(self):
        return self

    def __exit__(self, excType, excValue, traceback):
        self.cleanup()