Esempio n. 1
0
 def testSaveToFileDescriptor(self):
     """
     When save is given a file-like object instead of a string filename,
     a Reads instance must write its reads to that object.
     """
     reads = Reads()
     for readId, sequence in ('id1', 'AT'), ('id2', 'AC'):
         reads.add(Read(readId, sequence))
     output = StringIO()
     reads.save(output)
     # Both reads must appear in the output, in the order they were added.
     self.assertEqual('>id1\nAT\n>id2\nAC\n', output.getvalue())
Esempio n. 2
0
 def testSaveAsFASTA(self):
     """
     Saving a Reads instance with the 'fasta' format must write one FASTA
     record per read.
     """
     reads = Reads()
     for readId, sequence in ('id1', 'AT'), ('id2', 'AC'):
         reads.add(Read(readId, sequence))
     opener = mockOpen()
     # NOTE(review): '__builtin__' is the Python 2 name of the builtins
     # module; under Python 3 the patch target would be 'builtins.open'.
     with patch('__builtin__.open', opener, create=True):
         reads.save('filename', 'fasta')
     # Calling the mock opener again returns the handle that save wrote to.
     writes = opener().write.call_args_list
     self.assertEqual([call('>id1\nAT\n'), call('>id2\nAC\n')], writes)
Esempio n. 3
0
 def testSaveWithUppercaseFormat(self):
     """
     Passing the format name in upper case ('FASTA') must produce the same
     output as the lower case name.
     """
     reads = Reads()
     for readId, sequence in ('id1', 'AT'), ('id2', 'AC'):
         reads.add(Read(readId, sequence))
     opener = mockOpen()
     # NOTE(review): '__builtin__' is the Python 2 name of the builtins
     # module; under Python 3 the patch target would be 'builtins.open'.
     with patch('__builtin__.open', opener, create=True):
         reads.save('filename', 'FASTA')
     # Calling the mock opener again returns the handle that save wrote to.
     writes = opener().write.call_args_list
     self.assertEqual([call('>id1\nAT\n'), call('>id2\nAC\n')], writes)
Esempio n. 4
0
    def add(self, pathogenName, sampleName):
        """
        Register a (pathogen name, sample name) pair and return the name of
        the FASTA/FASTQ file holding its reads. The file is only written the
        first time a pair is seen; later calls return the remembered file
        name. When the file is written, the number returned by the save is
        stored as the 'uniqueReadCount' for the pair in
        C{self._proteinGrouper}.

        @param pathogenName: A C{str} pathogen name.
        @param sampleName: A C{str} sample name.
        @return: A C{str} giving the FASTA/FASTQ file name holding all the
            reads (without duplicates, by id) from the sample that matched the
            proteins in the given pathogen.
        """
        pathogens = self._pathogens
        samples = self._samples
        # A name seen for the first time is given the next unused index.
        pathogenIndex = pathogens.setdefault(pathogenName, len(pathogens))
        sampleIndex = samples.setdefault(sampleName, len(samples))
        key = (pathogenIndex, sampleIndex)

        try:
            return self._readsFilenames[key]
        except KeyError:
            reads = Reads()
            sampleInfo = self._proteinGrouper.pathogenNames[
                pathogenName][sampleName]
            for proteinMatch in sampleInfo['proteins'].values():
                for read in self._readsClass(proteinMatch['readsFilename']):
                    reads.add(read)
            # NOTE: proteinMatch is whatever the final iteration of the loop
            # above left behind, so this relies on there being at least one
            # protein for the pathogen/sample pair.
            outDir = proteinMatch['outDir']
            basename = 'pathogen-%d-sample-%d.%s' % (
                pathogenIndex, sampleIndex, self._format)
            saveFilename = join(outDir, basename)
            reads.filter(removeDuplicatesById=True)
            # Record the value returned by save as the unique read count.
            sampleInfo['uniqueReadCount'] = reads.save(
                saveFilename, format_=self._format)
            self._readsFilenames[key] = saveFilename
            return saveFilename
Esempio n. 5
0
    def _writeFASTA(self, i, image):
        """
        Collect the reads of every alignment for an image's title and save
        them as a FASTA file named after the image number in the output
        directory.

        @param i: The number of the image in self._images.
        @param image: A member of self._images.
        @return: A C{dark.reads.Reads} instance holding the reads for the
            image title.
        """
        reads = Reads()
        for titleAlignment in self._titlesAlignments[image['title']]:
            reads.add(titleAlignment.read)
        # The file is named <outputDir>/<image number>.fasta
        reads.save('%s/%d.fasta' % (self._outputDir, i), 'fasta')
        return reads
Esempio n. 6
0
    def _writeFASTA(self, i, image):
        """
        Collect the reads of every alignment for an image's title and save
        them as a FASTA file named after the image number in the output
        directory.

        @param i: The number of the image in self._images.
        @param image: A member of self._images.
        @return: A C{dark.reads.Reads} instance holding the reads for the
            image title.
        """
        reads = Reads()
        for titleAlignment in self._titlesAlignments[image['title']]:
            reads.add(titleAlignment.read)
        # The file is named <outputDir>/<image number>.fasta
        reads.save('%s/%d.fasta' % (self._outputDir, i), 'fasta')
        return reads
Esempio n. 7
0
    def add(self, pathogenName, sampleName):
        """
        Register a (pathogen name, sample name) pair and return the name of
        the FASTA/FASTQ file holding its reads. The file is only written the
        first time a pair is seen; later calls return the remembered file
        name. When the file is written, the number returned by the save is
        stored as the 'uniqueReadCount' for the pair in
        C{self._proteinGrouper}.

        @param pathogenName: A C{str} pathogen name.
        @param sampleName: A C{str} sample name.
        @return: A C{str} giving the FASTA/FASTQ file name holding all the
            reads (without duplicates, by id) from the sample that matched the
            proteins in the given pathogen.
        """
        pathogens = self._pathogens
        samples = self._samples
        # A name seen for the first time is given the next unused index.
        pathogenIndex = pathogens.setdefault(pathogenName, len(pathogens))
        sampleIndex = samples.setdefault(sampleName, len(samples))
        key = (pathogenIndex, sampleIndex)

        try:
            return self._readsFilenames[key]
        except KeyError:
            reads = Reads()
            sampleInfo = self._proteinGrouper.pathogenNames[
                pathogenName][sampleName]
            for proteinMatch in sampleInfo['proteins'].values():
                for read in self._readsClass(proteinMatch['readsFilename']):
                    reads.add(read)
            # NOTE: proteinMatch is whatever the final iteration of the loop
            # above left behind, so this relies on there being at least one
            # protein for the pathogen/sample pair.
            outDir = proteinMatch['outDir']
            basename = 'pathogen-%d-sample-%d.%s' % (
                pathogenIndex, sampleIndex, self._format)
            saveFilename = join(outDir, basename)
            reads.filter(removeDuplicatesById=True)
            # Record the value returned by save as the unique read count.
            sampleInfo['uniqueReadCount'] = reads.save(
                saveFilename, format_=self._format)
            self._readsFilenames[key] = saveFilename
            return saveFilename
# Asking for the alignment to be saved implies that alignment must be done.
if args.alignmentFile:
    args.align = True

if args.align:
    # Report the lengths of the two sequences before they are aligned.
    len1, len2 = map(len, reads)
    if len1 == len2:
        print('Pre-alignment, sequence lengths were identical: %s' % len1)
    else:
        print('Pre-alignment, sequence lengths: %d, %d (difference %d)' %
              (len1, len2, abs(len1 - len2)))

    # Align.
    reads = needle(reads)

    if args.alignmentFile:
        # Do the save outside the assert statement. Assertions are removed
        # when Python is run with -O, so a save made inside the assert
        # would silently not happen and no alignment file would be written.
        nSaved = reads.save(args.alignmentFile)
        assert nSaved == 2

# Sites are given 1-based on the command line and converted to 0-based.
offsets = (parseRangeString(args.sites, convertToZeroBased=True)
           if args.sites else None)

read1, read2 = reads
len1, len2 = map(len, reads)
identicalLengths = len1 == len2

# Sanity check: if alignment was done, the lengths must now be the same.
if args.align:
    assert identicalLengths

match = compareDNAReads(read1,
                        read2,
                        matchAmbiguous=(not args.strict),
Esempio n. 9
0
# Asking for the alignment to be saved implies that alignment must be done.
if args.alignmentFile:
    args.align = True

if args.align:
    # Report the lengths of the two sequences before they are aligned.
    len1, len2 = map(len, reads)
    if len1 == len2:
        print('Pre-alignment, sequence lengths were identical: %s' % len1)
    else:
        print('Pre-alignment, sequence lengths: %d, %d (difference %d)' % (
            len1, len2, abs(len1 - len2)))

    # Align.
    reads = needle(reads)

    if args.alignmentFile:
        # Do the save outside the assert statement. Assertions are removed
        # when Python is run with -O, so a save made inside the assert
        # would silently not happen and no alignment file would be written.
        nSaved = reads.save(args.alignmentFile)
        assert nSaved == 2

# Sites are given 1-based on the command line and converted to 0-based.
offsets = (parseRangeString(args.sites, convertToZeroBased=True)
           if args.sites else None)

read1, read2 = reads
len1, len2 = map(len, reads)
identicalLengths = len1 == len2

# Sanity check: if alignment was done, the lengths must now be the same.
if args.align:
    assert identicalLengths

# Ambiguous nucleotide codes are allowed to match unless --strict was given.
match = compareDNAReads(read1, read2, matchAmbiguous=(not args.strict),
                        offsets=offsets)
Esempio n. 10
0
def makeBAM(template, bamReferences=None, fastaReferences=None):
    """
    Make a simple BAM file, and a FASTA file of references, from a template.
    Both files live in a temporary directory that is removed once the
    generator is finished with. Note that this code invokes samtools.

    NOTE(review): this is a generator function; it is presumably wrapped by
    a context manager decorator that is not visible here.

    @param template: A sequence of C{str}s (it is indexed and its length is
        taken). The first will be treated as the reference, and then
        subsequent pairs (if any) will be treated as read and quality
        strings. Reads and quality strings can be indented with spaces to
        show where the read aligns with the reference.
    @param bamReferences: If not C{None}, an indexable collection of reads
        used to write the @SQ header lines of the SAM file. The sequence of
        the first one must equal the template reference with its leading
        whitespace and '-' characters removed. If C{None}, a single
        C{DNARead} with id C{REF_ID} is made from the template reference.
    @param fastaReferences: If not C{None}, the reads to save to the FASTA
        file. If C{None}, the BAM references are saved.
    @raise ValueError: If the template does not contain an odd number of
        strings.
    @return: A generator that yields a 2-tuple containing the C{Path} of the
        references FASTA file and the C{Path} of the BAM file.
    """
    if len(template) % 2 == 0:
        raise ValueError(
            'The template must have an odd number of strings, specifying the '
            'reference sequence, then zero or more read/quality pairs.')

    leftPaddedReference = template[0]
    templateSequence = leftPaddedReference.lstrip().replace('-', '')

    if bamReferences is not None:
        matchedReference = bamReferences[0]
        # Sanity check: The first BAM reference must have the same sequence
        # as the template.
        assert matchedReference.sequence == templateSequence
        bamReferences = Reads(bamReferences)
    else:
        matchedReference = DNARead(REF_ID, templateSequence)
        bamReferences = Reads([matchedReference])

    if fastaReferences is None:
        fastaReferences = bamReferences
    fastaReferences = Reads(fastaReferences)

    # The strings after the reference come in (read, quality) pairs.
    pairCount = (len(template) - 1) // 2
    dirname = mkdtemp(prefix='test-consensus-')
    executor = Executor()

    try:
        tmpDir = Path(dirname)
        fastaFile = tmpDir / 'references.fasta'
        samFile = tmpDir / 'file.sam'
        bamFile = tmpDir / 'file.bam'

        fastaReferences.save(fastaFile)

        with open(samFile, 'w') as fp:
            # One header line per reference.
            for reference in bamReferences:
                print(f'@SQ\tSN:{reference.id}\tLN:{len(reference)}', file=fp)

            for count in range(pairCount):
                queryIndex = 2 * count + 1
                leftPaddedQuery = template[queryIndex].rstrip()
                leftPaddedQuality = template[queryIndex + 1].rstrip()
                assert len(leftPaddedQuery) == len(leftPaddedQuality)

                # Drop gap positions from the query, along with the
                # quality character at the same position.
                kept = [
                    (base, qualityChar)
                    for base, qualityChar in zip(leftPaddedQuery.lstrip(),
                                                 leftPaddedQuality.lstrip())
                    if base != '-'
                ]
                queryNoGaps = ''.join(base for base, _ in kept)
                qualityNoGaps = ''.join(
                    qualityChar for _, qualityChar in kept)

                fields = (
                    f'read{count}',  # QNAME (query name)
                    0,  # FLAGS
                    matchedReference.id,  # RNAME (reference name)
                    # POS is the match offset plus one.
                    matchOffset(leftPaddedReference, leftPaddedQuery) + 1,
                    30,  # MAPQ (mapping quality)
                    makeCigar(leftPaddedReference, leftPaddedQuery),  # CIGAR
                    '*',  # MRNM (mate reference name)
                    0,  # MPOS (mate position)
                    0,  # ISIZE (insert size)
                    queryNoGaps,  # SEQ
                    qualityNoGaps,  # QUAL
                )
                print('\t'.join(map(str, fields)), file=fp)

        sortCommand = (
            f'samtools sort -O BAM --write-index -o {str(bamFile)!r} '
            f'{str(samFile)!r}')
        executor.execute(sortCommand)
        yield (fastaFile, bamFile)
    finally:
        # When debugging, the name of the SAM file can be printed here
        # before the temporary directory is removed.
        executor.execute(f'rm -fr {dirname!r}')