Example #1
0
    def testTwoFiles(self):
        """
        It must be possible to read from two FASTA files.
        """
        class SideEffect(object):
            """
            Stand-in for C{open}. Checks that the two files are opened in
            the expected order and returns canned FASTA lines for each.

            @param test: The C{TestCase} instance used to make assertions
                and to report failures.
            """
            def __init__(self, test):
                self.test = test
                self.count = 0

            def sideEffect(self, filename, **kwargs):
                if self.count == 0:
                    self.test.assertEqual('file1.fasta', filename)
                    self.count += 1
                    return File(['>id1\n', 'ACTG\n'])
                elif self.count == 1:
                    self.test.assertEqual('file2.fasta', filename)
                    self.count += 1
                    return File(['>id2\n', 'CAGT\n'])
                else:
                    # Report through the test case. This helper class has no
                    # 'fail' method of its own, so calling self.fail here
                    # would raise AttributeError instead of failing the test.
                    self.test.fail(
                        'We are only supposed to be called twice!')

        sideEffect = SideEffect(self)
        with patch.object(builtins, 'open') as mockMethod:
            mockMethod.side_effect = sideEffect.sideEffect
            reads = FastaReads(['file1.fasta', 'file2.fasta'])
            self.assertEqual([
                DNARead('id1', 'ACTG'),
                DNARead('id2', 'CAGT'),
            ], list(reads))
Example #2
0
    def testNtSequencesChangesTuple(self):
        """
        Checking nucleotide changes that are given as a tuple of
        (base, offset, base) specifications must return the expected test
        count, error count, and per-change results.
        """
        reference = DNARead('refId', 'ATTC')
        features = Features(
            {'spike': dict(name='spike', sequence='ATTC', start=0, stop=4)},
            reference)
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        # Note: the offsets in the tuples are 0-based.
        changes = (
            ('A', 0, 'A'),
            ('T', 1, 'A'),
            ('A', 2, 'T'),
            ('T', 3, 'T'),
        )
        testCount, errorCount, result = genome.checkFeature(
            'spike', changes, True)

        self.assertEqual(4, testCount)
        self.assertEqual(3, errorCount)

        first, second, third, fourth = changes
        self.assertEqual((True, 'A', True, 'A'), result[first])
        self.assertEqual((True, 'T', False, 'T'), result[second])
        self.assertEqual((False, 'T', True, 'T'), result[third])
        self.assertEqual((False, 'C', False, 'C'), result[fourth])
Example #3
0
    def testCorrectNumberOfAlignments(self):
        """
        A JSONRecordsReader must return the expected number of alignments
        (four). The read ids used here include the text that follows the
        first space.
        """
        idsAndSequences = (
            ('BIOMICS-HISEQTP:140:HJFH5BCXX:1:1101:9489:4234 1:N:0:TGACCA',
             'AGGGCTCGGATGCTGTGGGTGTTTGTGTGGAGTTGGGTGTGTTTTCGGGG'
             'GTGGTTGAGTGGAGGGATTGCTGTTGGATTGTGTGTTTTGTTGTGGTTGCG'),
            ('BIOMICS-HISEQTP:140:HJFH5BCXX:1:1101:19964:6287 1:N:0:TGACCA',
             'TTTTTCTCCTGCGTAGATGAACCTACCCATGGCTTAGTAGGTCCTCTTTC'
             'ACCACGAGTTAAACCATTAACATTATATTTTTCTATAATTATACCACTGGC'),
            ('BIOMICS-HISEQTP:140:HJFH5BCXX:1:1101:11488:7488 1:N:0:TGACCA',
             'ACCTCCGCCTCCCAGGTTCAAGCAATTCTCCTGCCTTAGCCTCCTGAATA'
             'GCTGGGATTACAGGTATGCAGGAGGCTAAGGCAGGAGAATTGCTTGAACCT'),
            ('BIOMICS-HISEQTP:140:HJFH5BCXX:1:1101:14734:7512 1:N:0:TGACCA',
             'GAGGGTGGAGGTAACTGAGGAAGCAAAGGCTTGGAGACAGGGCCCCTCAT'
             'AGCCAGTGAGTGCGCCATTTTCTTTGGAGCAATTGGGTGGGGAGATGGGGC'),
        )
        reads = Reads([DNARead(readId, sequence)
                       for readId, sequence in idsAndSequences])

        # The reader gets the module's canned JSON instead of a real file.
        with patch.object(builtins, 'open', mockOpen(read_data=JSON)):
            reader = JSONRecordsReader('file.json')
            count = len(list(reader.readAlignments(reads)))
            self.assertEqual(4, count)
Example #4
0
    def testDictLookupWithTwoFiles(self):
        """
        Dictionary-style lookup (via __getitem__) must return the expected
        reads when sequences have been added from two files.
        """
        class Open(object):
            """
            Fake C{open}: checks which file each successive call asks for
            and returns that file's FASTA content.
            """
            def __init__(self, test):
                self.test = test
                self.count = 0

            def sideEffect(self, filename, *args, **kwargs):
                callNumber = self.count
                if callNumber in (0, 2, 3):
                    self.test.assertEqual('filename1.fasta', filename)
                    self.count += 1
                    return StringIO('>id1\nACTG\n>id2\nAACCTTGG\n')
                elif callNumber in (1, 4):
                    self.test.assertEqual('filename2.fasta', filename)
                    self.count += 1
                    return StringIO('>seq3\nAAACCC\n')
                else:
                    self.test.fail(
                        'Open called too many times. Filename: %r, Args: %r, '
                        'Keyword args: %r.' % (filename, args, kwargs))

        opener = Open(self)
        with patch.object(builtins, 'open') as mockMethod:
            mockMethod.side_effect = opener.sideEffect
            index = SqliteIndex(':memory:')
            index.addFile('filename1.fasta')
            index.addFile('filename2.fasta')
            self.assertEqual(DNARead('id1', 'ACTG'), index['id1'])
            self.assertEqual(DNARead('id2', 'AACCTTGG'), index['id2'])
            self.assertEqual(DNARead('seq3', 'AAACCC'), index['seq3'])
            index.close()
Example #5
0
    def testCorrectNumberOfAlignmentsWhenReadIdsAreAbbreviated(self):
        """
        A JSONRecordsReader must return the expected number of alignments
        when read ids are truncated at the first space. That is, the BLAST
        output has long query names that contain a space, whereas the reads
        in the FASTA carry only the first part of those names (up to the
        first space).
        """
        idsAndSequences = (
            ('BIOMICS-HISEQTP:140:HJFH5BCXX:1:1101:9489:4234',
             'AGGGCTCGGATGCTGTGGGTGTTTGTGTGGAGTTGGGTGTGTTTTCGGGG'
             'GTGGTTGAGTGGAGGGATTGCTGTTGGATTGTGTGTTTTGTTGTGGTTGCG'),
            ('BIOMICS-HISEQTP:140:HJFH5BCXX:1:1101:19964:6287',
             'TTTTTCTCCTGCGTAGATGAACCTACCCATGGCTTAGTAGGTCCTCTTTC'
             'ACCACGAGTTAAACCATTAACATTATATTTTTCTATAATTATACCACTGGC'),
            ('BIOMICS-HISEQTP:140:HJFH5BCXX:1:1101:11488:7488',
             'ACCTCCGCCTCCCAGGTTCAAGCAATTCTCCTGCCTTAGCCTCCTGAATA'
             'GCTGGGATTACAGGTATGCAGGAGGCTAAGGCAGGAGAATTGCTTGAACCT'),
            ('BIOMICS-HISEQTP:140:HJFH5BCXX:1:1101:14734:7512',
             'GAGGGTGGAGGTAACTGAGGAAGCAAAGGCTTGGAGACAGGGCCCCTCAT'
             'AGCCAGTGAGTGCGCCATTTTCTTTGGAGCAATTGGGTGGGGAGATGGGGC'),
        )
        reads = Reads([DNARead(readId, sequence)
                       for readId, sequence in idsAndSequences])

        # The reader gets the module's canned JSON instead of a real file.
        with patch.object(builtins, 'open', mockOpen(read_data=JSON)):
            reader = JSONRecordsReader('file.json')
            count = len(list(reader.readAlignments(reads)))
            self.assertEqual(4, count)
Example #6
0
    def testNtSequencesChangesIndexErrorIgnore(self):
        """
        When a nucleotide check refers to an out-of-range location and
        onError='ignore' is passed, nothing must be written to the error
        file and the expected error result must be returned.
        """
        reference = DNARead('refId', 'ATTC')
        features = Features(
            {'spike': dict(name='spike', sequence='ATTC', start=0, stop=4)},
            reference)
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        change = 'A100000A'
        errFp = StringIO()
        testCount, errorCount, result = genome.checkFeature(
            'spike', change, nt=True, onError='ignore', errFp=errFp)

        # Nothing may have been printed.
        self.assertEqual('', errFp.getvalue())

        self.assertEqual(1, testCount)
        self.assertEqual(1, errorCount)
        self.assertEqual((False, None, False, None), result[change])
Example #7
0
    def testAaSequencesChangesTranslationErrorIgnore(self):
        """
        When checking AA sequences with onError='ignore', nothing must be
        written to the error file and the expected error result must be
        returned.
        """
        reference = DNARead('refId', 'ATTC')
        orf1ab = dict(name='ORF1ab polyprotein', sequence='ATTC',
                      start=0, stop=4)
        features = Features({'orf1ab': orf1ab}, reference)
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        change = 'A100000A'
        errFp = StringIO()
        testCount, errorCount, result = genome.checkFeature(
            'orf1ab', change, nt=False, onError='ignore', errFp=errFp)

        # Nothing may have been printed.
        self.assertEqual('', errFp.getvalue())

        self.assertEqual(1, testCount)
        self.assertEqual(1, errorCount)
        self.assertEqual((False, None, False, None), result[change])
Example #8
0
    def testNtSequencesChangesString(self):
        """
        Checking nucleotide changes that are given as a space-separated
        string specification must return the expected test count, error
        count, and per-change results.
        """
        reference = DNARead('refId', 'ATTC')
        features = Features(
            {'spike': dict(name='spike', sequence='ATTC', start=0, stop=4)},
            reference)
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        # Note: the locations in the string are 1-based.
        testCount, errorCount, result = genome.checkFeature(
            'spike', 'A1A T2A A3T T4T', True)

        self.assertEqual(4, testCount)
        self.assertEqual(3, errorCount)

        expected = (
            ('A1A', (True, 'A', True, 'A')),
            ('T2A', (True, 'T', False, 'T')),
            ('A3T', (False, 'T', True, 'T')),
            ('T4T', (False, 'C', False, 'C')),
        )
        for change, outcome in expected:
            self.assertEqual(outcome, result[change])
Example #9
0
    def _checkTranslation(self, genome, ranges, protein):
        """
        Check that a protein sequence can be obtained by translating the
        nucleotide ranges it is supposed to occupy in a genome.

        @param genome: A C{dict} with genome information from our sqlite3
            protein/genome database, as returned by
            C{dark.civ.proteins.SqliteIndex.findGenome}.
        @param ranges: A C{list} of (start, stop, forward) nucleotide ranges
            for the protein in the genome.
        @param protein: A C{dict} with protein information from our sqlite3
            protein/genome database, as returned by
            C{dark.civ.proteins.SqliteIndex.findProtein}.
        @raise ValueError: If the selected translation of the joined ranges
            differs from the protein sequence followed by '*'.
        """
        # The translation is compared against the protein with '*' appended.
        expected = protein['sequence'] + '*'

        # Join the nucleotides of all ranges, in the order given.
        pieces = []
        for start, stop, _ in ranges:
            pieces.append(genome['sequence'][start:stop])
        genomeRead = DNARead('id', ''.join(pieces))

        translations = list(genomeRead.translations())

        # Translation 0 is used for a forward protein, translation 3
        # otherwise.
        if protein['forward']:
            frame = 0
        else:
            frame = 3

        if translations[frame].sequence != expected:
            # TODO: improve this error to show what actually went wrong.
            raise ValueError(
                'Could not translate genome range to get protein sequence')
Example #10
0
    def testFindTwoKozakConsensi(self):
        """
        A sequence containing two Kozak consensuses, at different offsets
        and with different qualities, must produce both expected results,
        in order.
        """
        sequence = 'ATTGCCGCCATGGGGGGCCATGG'
        expected = [
            DNAKozakRead(DNARead('id', sequence), 3, 13, 100.0),
            DNAKozakRead(DNARead('id', sequence), 13, 23, 60.0),
        ]
        found = list(findKozakConsensus(DNARead('id', sequence)))
        self.assertEqual(expected, found)
Example #11
0
 def testTwoReads(self):
     """
     A FASTQ file holding two reads must be parsed correctly, with the
     reads returned in file order.
     """
     lines = ['@id1', 'ACGT', '+', '!!!!',
              '@id2', 'TGCA', '+', '????']
     opener = mockOpen(read_data='\n'.join(lines))
     with patch.object(builtins, 'open', opener):
         found = list(FastqReads('filename.fastq'))
         self.assertEqual(2, len(found))
         expected = [
             DNARead('id1', 'ACGT', '!!!!'),
             DNARead('id2', 'TGCA', '????'),
         ]
         self.assertEqual(expected, found)
Example #12
0
    def testDictLookupSequenceCrossesNewlines(self):
        """
        Dictionary-style lookup (via __getitem__) must return the expected
        read when its sequence spans several lines of the input file,
        including lines terminated by LF and by CRLF.
        """
        class Open(object):
            """
            Fake C{open} that serves the same FASTA content for up to two
            opens of the expected file and fails the test on any further
            call.
            """
            def __init__(self, test):
                self.test = test
                self.count = 0

            def sideEffect(self, filename, *args, **kwargs):
                if self.count in (0, 1):
                    self.test.assertEqual('filename.fasta', filename)
                    self.count += 1
                    return StringIO('>id1\nACTG\r\nCCCC\nGGG\n>id2\nAACCTG\n')
                else:
                    self.test.fail(
                        'Open called too many times. Filename: %r, Args: %r, '
                        'Keyword args: %r.' % (filename, args, kwargs))

        opener = Open(self)
        with patch.object(builtins, 'open') as mockMethod:
            mockMethod.side_effect = opener.sideEffect
            index = SqliteIndex(':memory:')
            index.addFile('filename.fasta')
            expected = DNARead('id1', 'ACTGCCCCGGG')
            self.assertEqual(expected, index['id1'])
            index.close()
Example #13
0
    def testDictLookupGzipDataWithBGZsuffix(self):
        """
        Dictionary-style lookup (via __getitem__) must return the expected
        read when the indexed file is in BGZF format and has a .bgz suffix.
        """
        class Open(object):
            """
            Fake C{bgzf.open} that serves freshly BGZF-compressed FASTA for
            up to two opens of the expected file and fails the test on any
            further call.
            """
            def __init__(self, test):
                self.test = test
                self.count = 0

            def sideEffect(self, filename, *args, **kwargs):
                if self.count > 1:
                    self.test.fail(
                        'Open called too many times. Filename: %r, Args: %r, '
                        'Keyword args: %r.' % (filename, args, kwargs))
                else:
                    self.test.assertEqual('filename.fasta.bgz', filename)
                    self.count += 1
                    # Compress the FASTA into an in-memory buffer...
                    compressed = BytesIO()
                    bgzfWriter = bgzf.BgzfWriter(fileobj=compressed)
                    bgzfWriter.write(b'>id0\nAC\n')
                    bgzfWriter.flush()
                    # ...and hand back a reader over a copy of those bytes.
                    source = BytesIO(compressed.getvalue())
                    source.mode = 'rb'
                    return bgzf.BgzfReader(fileobj=source)

        opener = Open(self)
        with patch.object(bgzf, 'open') as mockMethod:
            mockMethod.side_effect = opener.sideEffect
            index = SqliteIndex(':memory:')
            index.addFile('filename.fasta.bgz')
            self.assertEqual(DNARead('id0', 'AC'), index['id0'])
            index.close()
Example #14
0
 def testKozakConsensusAtEndPart(self):
     """
     A sequence that ends with what appears to be an incomplete Kozak
     consensus (i.e., that contains no full consensus) must produce no
     results.
     """
     sequence = 'AAAAAAATTGCCGCCATG'
     found = list(findKozakConsensus(DNARead('id', sequence)))
     self.assertEqual([], found)
Example #15
0
    def testDictLookupWithFastaDirectory(self):
        """
        Dictionary-style lookup (via __getitem__) must return the expected
        read, obtained from the expected file name, when a FASTA base
        directory is specified.
        """
        fastaContent = '>id1\nACTG\r\nCCCC\nGGG\n>id2\nAACCTG\n'

        class Open(object):
            """
            Fake C{open}. The first call must be for the path passed to
            addFile, the second for the same base name inside the FASTA
            directory. Any further call fails the test.
            """
            def __init__(self, test):
                self.test = test
                self.count = 0

            def sideEffect(self, filename, *args, **kwargs):
                if self.count == 0:
                    self.test.assertEqual('/tmp/f.fasta', filename)
                    self.count += 1
                    return StringIO(fastaContent)
                elif self.count == 1:
                    expectedPath = os.path.join('/usr/local/fasta', 'f.fasta')
                    self.test.assertEqual(expectedPath, filename)
                    self.count += 1
                    return StringIO(fastaContent)
                else:
                    self.test.fail(
                        'Open called too many times. Filename: %r, Args: %r, '
                        'Keyword args: %r.' % (filename, args, kwargs))

        opener = Open(self)
        with patch.object(builtins, 'open') as mockMethod:
            mockMethod.side_effect = opener.sideEffect
            index = SqliteIndex(':memory:', fastaDirectory='/usr/local/fasta')
            index.addFile('/tmp/f.fasta')
            self.assertEqual(DNARead('id1', 'ACTGCCCCGGG'), index['id1'])
            index.close()
Example #16
0
 def testPassingRefence(self):
     """
     A reference passed to Features must be available, as the very same
     object, via the 'reference' attribute.
     """
     ref = DNARead('refId', 'ATTC')
     self.assertIs(ref, Features({}, ref).reference)
Example #17
0
    def testTwoReads(self):
        """
        It must be possible to access a FASTA file with two reads like a dict.

        C{open} is replaced by a fake that serves four calls in a fixed
        order (see the comments in C{Open.sideEffect}) so that the index
        written during the test can be captured and checked.
        """

        # Receives whatever is written via the third open call (see below),
        # so the generated index can be inspected at the end of the test.
        pyfaidxIndex = StringIO()

        class Open(object):
            def __init__(self, test, manager):
                self.test = test
                self.manager = manager
                self.count = 0

            def sideEffect(self, filename, *args, **kwargs):
                if self.count == 0:
                    # First open: the FASTA file, served as bytes.
                    self.test.assertEqual('filename.fasta', filename)
                    self.count += 1
                    return BytesIO(b'>id1\nACTG\n>id2\nAACCTTGG\n')
                elif self.count == 1:
                    # Second open: the same FASTA file, served as text.
                    self.test.assertEqual('filename.fasta', filename)
                    self.count += 1
                    return StringIO('>id1\nACTG\n>id2\nAACCTTGG\n')
                elif self.count == 2:
                    # Third open: return the context manager that yields
                    # pyfaidxIndex. Presumably this is the index file being
                    # opened for writing (the file name is not checked).
                    self.count += 1
                    return self.manager
                elif self.count == 3:
                    # Fourth open: serve back what was written to the index.
                    self.count += 1
                    return StringIO(pyfaidxIndex.getvalue())
                else:
                    self.test.fail(
                        'Open called too many times. Filename: %r, Args: %r, '
                        'Keyword args: %r.' % (filename, args, kwargs))

        # A context manager whose 'with' target is the capture buffer.
        @contextmanager
        def manager():
            yield pyfaidxIndex

        sideEffect = Open(self, manager()).sideEffect
        with patch.object(builtins, 'open') as mockMethod:
            mockMethod.side_effect = sideEffect
            reads = FastaFaiReads('filename.fasta')
            self.assertEqual(DNARead('id1', 'ACTG'), reads['id1'])
            self.assertEqual(DNARead('id2', 'AACCTTGG'), reads['id2'])
            # Check that the fai index was built correctly.
            self.assertEqual(pyfaidxIndex.getvalue(),
                             'id1\t4\t5\t4\t5\nid2\t8\t15\t8\t9\n')
Example #18
0
 def testOneRead(self):
     """
     A FASTQ file holding a single read must be parsed correctly.
     """
     lines = ['@id1', 'ACGT', '+', '!!!!']
     opener = mock_open(read_data='\n'.join(lines))
     with patch.object(builtins, 'open', opener):
         found = list(FastqReads('filename.fastq'))
         self.assertEqual([DNARead('id1', 'ACGT', '!!!!')], found)
Example #19
0
 def testInvalidMinLength(self):
     """
     Passing a minLength value that is less than 1 must result in a
     ValueError.
     """
     emptyRead = DNARead('id', '')
     assertRaisesRegex(
         self, ValueError, '^minLength must be at least 1$',
         sequenceCategoryLengths, emptyRead, {}, minLength=0)
Example #20
0
 def testOneKozakConsensus(self):
     """
     A sequence containing one exact Kozak consensus must produce a single
     result with the expected offsets and quality percentage.
     """
     read = DNARead('id', 'ATTGCCGCCATGGGGG')
     expected = DNAKozakRead(read, 3, 13, 100.0)
     found = list(findKozakConsensus(read))
     # The unpacking also verifies that there is exactly one result.
     (only, ) = found
     self.assertEqual(expected, only)
Example #21
0
 def testKozakConsensusAtEnd(self):
     """
     A Kozak consensus located at the very end of a sequence must be
     found, with the expected offsets and quality percentage.
     """
     read = DNARead('id', 'AAAAAAATTGCCGCCATGG')
     expected = DNAKozakRead(read, 9, 19, 100.0)
     found = list(findKozakConsensus(read))
     # The unpacking also verifies that there is exactly one result.
     (only, ) = found
     self.assertEqual(expected, only)
Example #22
0
    def ntSequences(self, featureName):
        """
        Get the aligned nucleotide sequences for a feature.

        The result is stored in C{self._cache['nt']} under the feature name
        and is returned from there on subsequent calls.

        @param featureName: A C{str} feature name.
        @raise ReferenceInsertionError: If the aligned reference region for
            the feature contains a gap ('-') and the feature's name is not
            'surface glycoprotein'.
        @return: A 2-C{tuple} of C{dark.reads.DNARead} instances, holding
            the nucleotides for the feature as located in the reference
            genome and then the corresponding nucleotides from the genome being
            examined.
        """
        try:
            return self._cache['nt'][featureName]
        except KeyError:
            # Not computed yet. Fall through and work it out.
            pass

        feature = self.features[featureName]
        name = feature['name']
        length = feature['stop'] - feature['start']
        offset = self.offsetMap[feature['start']]
        end = alignmentEnd(self.referenceAligned.sequence, offset, length)

        # The same region of the alignment is taken from both sequences and
        # both read ids get the same feature-name suffix.
        region = slice(offset, end)
        suffix = f' ({name})'

        referenceNt = DNARead(self.features.reference.id + suffix,
                              self.referenceAligned.sequence[region])

        # In general, there should not be insertions to the reference. There
        # are lineages with insertions in the Spike (e.g. B.1.214.2) that we
        # can correct for in the downstream processing, therefore the error is
        # not raised for the Spike.
        if name != 'surface glycoprotein' and '-' in referenceNt.sequence:
            raise ReferenceInsertionError(
                f'MAFFT suggests a reference insertion into {featureName!r}.')

        genomeNt = DNARead(self.genome.id + suffix,
                           self.genomeAligned.sequence[region])

        if DEBUG:
            print('NT MATCH:')
            print('ref  nt:', referenceNt.sequence[SLICE])
            print('gen  nt:', genomeNt.sequence[SLICE])

        result = referenceNt, genomeNt
        self._cache['nt'][featureName] = result

        return result
Example #23
0
    def testNtSequencesGenomeGap(self):
        """
        A genome that has a gap relative to the reference must be handled:
        the gap must appear as '-' in the genome's aligned nucleotides and
        must be checkable as a change.
        """
        tail = 'CAAATCGG'
        referenceSequence = 'TGGCGTGGA' + 'T' * 20 + tail
        # The genome's copy of the feature has one T fewer than the
        # reference.
        genomeFeature = 'TGGA' + 'T' * 19 + tail
        genomeSequence = 'CCCGGTGGCG' + genomeFeature + 'CCCCCCC'

        spike = dict(name='spike', sequence=referenceSequence,
                     start=5, stop=len(referenceSequence))
        features = Features({'spike': spike},
                            DNARead('refId', referenceSequence))
        genome = SARS2Genome(DNARead('genId', genomeSequence), features)

        # The genome offset is initialized to None and isn't set until
        # after ntSequences is called.
        # self.assertEqual(None, alignment.genomeOffset)

        referenceNt, genomeNt = genome.ntSequences('spike')

        # self.assertEqual(5, alignment.genomeOffset)

        self.assertEqual('refId (spike)', referenceNt.id)
        self.assertEqual(referenceSequence[5:], referenceNt.sequence)

        self.assertEqual('genId (spike)', genomeNt.id)
        self.assertEqual('TGGA-' + 'T' * 19 + tail, genomeNt.sequence)

        change = 'T5-'
        testCount, errorCount, result = genome.checkFeature(
            'spike', change, True)

        self.assertEqual(1, testCount)
        self.assertEqual(0, errorCount)
        self.assertEqual((True, 'T', True, '-'), result[change])
Example #24
0
    def testDictLookupGzipData(self):
        """
        The __getitem__ method (i.e., dictionary-like lookup) must return the
        expected reads when sequences span multiple lines of the input file,
        and include lines ending in LF and CRLF and have been compressed with
        bgzip, including when sequences are more than 64K bytes into the input
        file.
        """
        class Open(object):
            def __init__(self, test):
                self.test = test
                self.count = 0

            def sideEffect(self, filename, *args, **kwargs):
                # Up to five opens are served. Each one gets a newly
                # compressed copy of the same data. A sixth fails the test.
                if self.count <= 4:
                    self.test.assertEqual('filename.fasta.gz', filename)
                    self.count += 1
                    # Write BGZF-compressed FASTA into an in-memory buffer.
                    # The 70000-base id1 sequence puts id2 and id3 more than
                    # 64K bytes into the uncompressed data.
                    writerIO = BytesIO()
                    writer = bgzf.BgzfWriter(fileobj=writerIO)
                    writer.write(b'>id0\nAC\n' + b'>id1\n' + (b'A' * 70000) +
                                 b'\n' + b'>id2\r\nACTG\r\nCCCC\r\nGGG\r\n' +
                                 b'>id3\nAACCTG\n')
                    writer.flush()
                    # Read back from a copy of the compressed bytes.
                    fileobj = BytesIO(writerIO.getvalue())
                    # NOTE(review): presumably BgzfReader checks the mode of
                    # the file object it is given - confirm.
                    fileobj.mode = 'rb'
                    return bgzf.BgzfReader(fileobj=fileobj)
                else:
                    self.test.fail(
                        'Open called too many times. Filename: %r, Args: %r, '
                        'Keyword args: %r.' % (filename, args, kwargs))

        sideEffect = Open(self).sideEffect
        # Note that it is bgzf.open that is patched here, not builtins.open.
        with patch.object(bgzf, 'open') as mockMethod:
            mockMethod.side_effect = sideEffect
            index = SqliteIndex(':memory:')
            index.addFile('filename.fasta.gz')
            self.assertEqual(DNARead('id0', 'AC'), index['id0'])
            self.assertEqual(DNARead('id1', 'A' * 70000), index['id1'])
            self.assertEqual(DNARead('id2', 'ACTGCCCCGGG'), index['id2'])
            self.assertEqual(DNARead('id3', 'AACCTG'), index['id3'])
            index.close()
Example #25
0
    def testAaSequencesTranslationNoSlipperySequenceRaise(self):
        """
        Calling aaSequences on an 'ORF1ab polyprotein' feature must raise
        NoSlipperySequenceError when no slippery sequence is found.
        """
        featureName = 'ORF1ab polyprotein'
        reference = DNARead('refId', 'ATTC')
        polyprotein = dict(name='ORF1ab polyprotein', sequence='ATTC',
                           start=0, stop=4)
        features = Features({featureName: polyprotein}, reference)
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        error = r'^No slippery sequence found\.$'
        with self.assertRaisesRegex(NoSlipperySequenceError, error):
            genome.aaSequences(featureName)
Example #26
0
    def testUnspecifiedReference(self):
        """
        When no reference is specified and the BAM file mentions more than
        one, UnspecifiedReference must be raised, with a message listing
        the available references.
        """
        template = ('ACGTTCCG', )
        bamReferences = [DNARead('ref-1', template[0]), DNARead('ref-2', 'AA')]
        fastaReferences = [DNARead('ref-3', 'AAA')]
        error = (
            r'^Could not infer a BAM reference. Available references are: '
            r'ref-1, ref-2\.$')

        bam = makeBAM(template, bamReferences=bamReferences,
                      fastaReferences=fastaReferences)
        with bam as filenames:
            fastaFilename, bamFilename = filenames
            with self.assertRaisesRegex(UnspecifiedReference, error):
                consensusFromBAM(bamFilename, referenceFasta=fastaFilename)
Example #27
0
    def testNtSequencesChangesIndexErrorPrint(self):
        """
        When a nucleotide check refers to an out-of-range location and
        onError='print' is passed, an error message must be written to the
        error file and the expected error result must be returned.
        """
        reference = DNARead('refId', 'ATTC')
        features = Features(
            {'spike': dict(name='spike', sequence='ATTC', start=0, stop=4)},
            reference)
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        # Two lines of error output are expected: the first mentions the
        # reference sequence and the second the genome sequence.
        expectedOutput = (
            r"Index 99999 out of range trying to access feature "
            r"'spike' of length 4 sequence 'refId (spike)' via "
            r"expected change specification 'A100000A'."
            "\n"
            r"Index 99999 out of range trying to access feature "
            r"'spike' of length 4 sequence 'genId (spike)' via "
            r"expected change specification 'A100000A'."
            "\n"
        )

        change = 'A100000A'
        errFp = StringIO()
        testCount, errorCount, result = genome.checkFeature(
            'spike', change, nt=True, onError='print', errFp=errFp)

        self.assertEqual(expectedOutput, errFp.getvalue())

        self.assertEqual(1, testCount)
        self.assertEqual(1, errorCount)
        self.assertEqual((False, None, False, None), result[change])
Example #28
0
 def testUnknownCategory(self):
     """
     Bases that have no category must be summarized with C{None} as their
     category.
     """
     # 'C' is deliberately left without a category.
     categories = dict(A='a', G='g', T='t')
     summary = sequenceCategoryLengths(DNARead('id', 'ACCGGTTT'), categories)
     self.assertEqual([('a', 1), (None, 2), ('g', 2), ('t', 3)], summary)
Example #29
0
 def testUnknownCategoryWithDefault(self):
     """
     Bases that have no category must be summarized with the passed
     default category as their category.
     """
     # 'C' is deliberately left without a category.
     categories = dict(A='a', G='g', T='t')
     summary = sequenceCategoryLengths(
         DNARead('id', 'ACCGGTTT'), categories, 'xxx')
     self.assertEqual([('a', 1), ('xxx', 2), ('g', 2), ('t', 3)], summary)
Example #30
0
    def testAaSequencesChangesTranslationErrorRaise(self):
        """
        Checking AA sequences for a feature named 'ORF1ab polyprotein' must
        raise NoSlipperySequenceError when no slippery sequence is found
        (no onError value is passed in this test).
        """
        reference = DNARead('refId', 'ATTC')
        orf1ab = dict(name='ORF1ab polyprotein', sequence='ATTC',
                      start=0, stop=4)
        features = Features({'orf1ab': orf1ab}, reference)
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        error = r"^No slippery sequence found\.$"
        with self.assertRaisesRegex(NoSlipperySequenceError, error):
            genome.checkFeature('orf1ab', 'A100000A', False)