def testAaSequencesChangesTranslationErrorIgnore(self):
    """
    When AA sequences are checked with onError='ignore', nothing must be
    written to the error file pointer and the expected (failed) result
    must still be returned.
    """
    orf1ab = {
        'name': 'ORF1ab polyprotein',
        'sequence': 'ATTC',
        'start': 0,
        'stop': 4,
    }
    reference = DNARead('refId', 'ATTC')
    features = Features({'orf1ab': orf1ab}, reference)
    genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

    errFp = StringIO()
    testCount, errorCount, result = genome.checkFeature(
        'orf1ab', 'A100000A', nt=False, onError='ignore', errFp=errFp)

    # Nothing may have been printed, but the error is still counted.
    self.assertEqual('', errFp.getvalue())
    self.assertEqual(1, testCount)
    self.assertEqual(1, errorCount)
    self.assertEqual((False, None, False, None), result['A100000A'])
def testNtSequencesChangesIndexErrorIgnore(self):
    """
    An out-of-range nucleotide check made with onError='ignore' must print
    nothing and must return the expected error result.
    """
    spike = {
        'name': 'spike',
        'sequence': 'ATTC',
        'start': 0,
        'stop': 4,
    }
    reference = DNARead('refId', 'ATTC')
    features = Features({'spike': spike}, reference)
    genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

    errFp = StringIO()
    testCount, errorCount, result = genome.checkFeature(
        'spike', 'A100000A', nt=True, onError='ignore', errFp=errFp)

    # The error output must be empty, yet the failure is still reported.
    self.assertEqual('', errFp.getvalue())
    self.assertEqual(1, testCount)
    self.assertEqual(1, errorCount)
    self.assertEqual((False, None, False, None), result['A100000A'])
def testNtSequencesChangesString(self):
    """
    Aligned nucleotide sequences must be checkable against changes given
    as a whitespace-separated string specification.
    """
    spike = {
        'name': 'spike',
        'sequence': 'ATTC',
        'start': 0,
        'stop': 4,
    }
    features = Features({'spike': spike}, DNARead('refId', 'ATTC'))
    genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

    # Note: the string form uses 1-based locations.
    testCount, errorCount, result = genome.checkFeature(
        'spike', 'A1A T2A A3T T4T', True)

    self.assertEqual(4, testCount)
    self.assertEqual(3, errorCount)

    expectedResults = (
        ('A1A', (True, 'A', True, 'A')),
        ('T2A', (True, 'T', False, 'T')),
        ('A3T', (False, 'T', True, 'T')),
        ('T4T', (False, 'C', False, 'C')),
    )
    for change, expected in expectedResults:
        self.assertEqual(expected, result[change])
def testPassingRefence(self):
    """
    A reference passed to Features must be available, unchanged, as the
    C{reference} attribute.
    """
    ref = DNARead('refId', 'ATTC')
    self.assertIs(ref, Features({}, ref).reference)
def testNtSequencesChangesTuple(self):
    """
    Aligned nucleotide sequences must be checkable against changes given
    as a tuple of (reference base, offset, genome base) tuples.
    """
    spike = {
        'name': 'spike',
        'sequence': 'ATTC',
        'start': 0,
        'stop': 4,
    }
    features = Features({'spike': spike}, DNARead('refId', 'ATTC'))
    genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

    # Note: the tuple form uses 0-based offsets.
    changes = (('A', 0, 'A'), ('T', 1, 'A'), ('A', 2, 'T'), ('T', 3, 'T'))
    testCount, errorCount, result = genome.checkFeature(
        'spike', changes, True)

    self.assertEqual(4, testCount)
    self.assertEqual(3, errorCount)

    # Expected outcomes, in the same order as the changes above.
    expectedResults = (
        (True, 'A', True, 'A'),
        (True, 'T', False, 'T'),
        (False, 'T', True, 'T'),
        (False, 'C', False, 'C'),
    )
    for change, expected in zip(changes, expectedResults):
        self.assertEqual(expected, result[change])
def testKnownFeatures(self):
    """
    Every feature name mentioned in the changes of a known variant must
    be present in the default features (looking it up gives a dict).
    """
    knownFeatures = Features()
    for variantInfo in VARIANTS.values():
        for featureName in variantInfo['changes']:
            self.assertIsInstance(knownFeatures[featureName], dict)
def main(args):
    """
    Print a description of the SARS-CoV-2 annotations.

    For each feature (in sorted name order) the start, stop and length are
    printed, followed by the product and function when present, the
    nucleotide sequence and, when present, the translation. Sequences
    longer than C{args.maxLen} are truncated and suffixed with '...'.

    @param args: A C{Namespace} instance as returned by argparse with
        values for command-line options.
    """
    def clipped(text):
        # Cut over-long sequences, marking the cut with an ellipsis.
        if len(text) > args.maxLen:
            return text[:args.maxLen] + '...'
        return text

    features = Features(args.gbFile)
    print(f'Features for {features.reference.id}:')

    for featureName, feature in sorted(features.items()):
        print(f'{featureName}:')
        print(' start:', feature['start'])
        print(' stop:', feature['stop'])
        print(' length:', feature['stop'] - feature['start'])

        # These two entries are optional.
        if 'product' in feature:
            print(' product:', feature['product'])
        if 'function' in feature:
            print(' function:', feature['function'])

        sequence = feature['sequence']
        print(f' sequence (len {len(sequence):5d} nt):', clipped(sequence))

        # Some features (e.g., UTR, stem loops) do not have a translation.
        if 'translation' in feature:
            translation = feature['translation']
            print(f' translation (len {len(translation):5d} aa):',
                  clipped(translation))
def testPassingDict(self):
    """
    A Features instance must be constructible from a dict, after which
    the feature can be found and retrieved by name.
    """
    spike = {
        'name': 'spike',
        'sequence': 'ATTC',
        'start': 0,
        'stop': 4,
    }
    features = Features({'spike': spike})
    self.assertIn('spike', features)
    self.assertEqual(spike, features['spike'])
def testNtSequencesGenomeGap(self):
    """
    A genome with a deletion relative to the reference must be aligned
    with a gap, and the gap must be checkable as a change.
    """
    tail = 'CAAATCGG'
    referenceSequence = 'TGGCGTGGA' + ('T' * 20) + tail
    # The genome's feature has one fewer T than the reference.
    genomeFeature = 'TGGA' + ('T' * 19) + tail
    genomeSequence = 'CCCGGTGGCG' + genomeFeature + 'CCCCCCC'

    spike = {
        'name': 'spike',
        'sequence': referenceSequence,
        'start': 5,
        'stop': len(referenceSequence),
    }
    features = Features({'spike': spike},
                        DNARead('refId', referenceSequence))
    genome = SARS2Genome(DNARead('genId', genomeSequence), features)

    referenceNt, genomeNt = genome.ntSequences('spike')

    self.assertEqual(referenceSequence[5:], referenceNt.sequence)
    self.assertEqual('refId (spike)', referenceNt.id)

    # The missing T shows up as a gap in the aligned genome.
    expected = 'TGGA-' + ('T' * 19) + tail
    self.assertEqual(expected, genomeNt.sequence)
    self.assertEqual('genId (spike)', genomeNt.id)

    testCount, errorCount, result = genome.checkFeature(
        'spike', 'T5-', True)

    self.assertEqual(1, testCount)
    self.assertEqual(0, errorCount)
    self.assertEqual((True, 'T', True, '-'), result['T5-'])
def testAaSequencesTranslationNoSlipperySequenceRaise(self):
    """
    aaSequences must raise NoSlipperySequenceError when asked for an
    'ORF1ab polyprotein' whose sequence has no slippery sequence.
    """
    polyprotein = {
        'name': 'ORF1ab polyprotein',
        'sequence': 'ATTC',
        'start': 0,
        'stop': 4,
    }
    features = Features({'ORF1ab polyprotein': polyprotein},
                        DNARead('refId', 'ATTC'))
    genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

    with self.assertRaisesRegex(NoSlipperySequenceError,
                                r'^No slippery sequence found\.$'):
        genome.aaSequences('ORF1ab polyprotein')
def testNtSequencesChangesIndexErrorPrint(self):
    """
    An out-of-range nucleotide check made with onError='print' must write
    an error message to the error file pointer and must return the
    expected error result.
    """
    spike = {
        'name': 'spike',
        'sequence': 'ATTC',
        'start': 0,
        'stop': 4,
    }
    features = Features({'spike': spike}, DNARead('refId', 'ATTC'))
    genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

    # One line of output is expected for the reference and one for the
    # genome.
    expectedOutput = (
        r"Index 99999 out of range trying to access feature "
        r"'spike' of length 4 sequence 'refId (spike)' via "
        r"expected change specification 'A100000A'."
        "\n"
        r"Index 99999 out of range trying to access feature "
        r"'spike' of length 4 sequence 'genId (spike)' via "
        r"expected change specification 'A100000A'."
        "\n"
    )

    errFp = StringIO()
    testCount, errorCount, result = genome.checkFeature(
        'spike', 'A100000A', nt=True, onError='print', errFp=errFp)

    self.assertEqual(expectedOutput, errFp.getvalue())
    self.assertEqual(1, testCount)
    self.assertEqual(1, errorCount)
    self.assertEqual((False, None, False, None), result['A100000A'])
def testAaSequencesChangesTranslationErrorRaise(self):
    """
    Checking AA sequences with the default error handling must raise a
    NoSlipperySequenceError when no slippery sequence can be found.
    """
    orf1ab = {
        'name': 'ORF1ab polyprotein',
        'sequence': 'ATTC',
        'start': 0,
        'stop': 4,
    }
    features = Features({'orf1ab': orf1ab}, DNARead('refId', 'ATTC'))
    genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

    with self.assertRaisesRegex(NoSlipperySequenceError,
                                r"^No slippery sequence found\.$"):
        genome.checkFeature('orf1ab', 'A100000A', False)
def testNtSequencesChangesIndexErrorRaise(self):
    """
    An out-of-range nucleotide check made with the default error handling
    must raise an IndexError describing the failed access.
    """
    spike = {
        'name': 'spike',
        'sequence': 'ATTC',
        'start': 0,
        'stop': 4,
    }
    features = Features({'spike': spike}, DNARead('refId', 'ATTC'))
    genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

    expectedMessage = (
        r"^Index 99999 out of range trying to access feature "
        r"'spike' of length 4 sequence 'refId \(spike\)' via "
        r"expected change specification 'A100000A'\.$")

    with self.assertRaisesRegex(IndexError, expectedMessage):
        genome.checkFeature('spike', 'A100000A', True)
def testNtSequencesGenomeSNP(self):
    """
    A genome with a single-nucleotide difference from the reference must
    be aligned without gaps, and the SNP must be checkable as a change.
    """
    head, tail = 'TGGCGTGGA', 'CAAATCGG'
    referenceSequence = head + ('T' * 20) + tail
    # The genome has an A in place of one of the reference's T's.
    genomeFeature = head + ('T' * 9) + 'A' + ('T' * 10) + tail
    genomeSequence = 'CCCGG' + genomeFeature + 'CCCCCCC'

    spike = {
        'name': 'spike',
        'sequence': referenceSequence,
        'start': 0,
        'stop': len(referenceSequence),
    }
    features = Features({'spike': spike},
                        DNARead('refId', referenceSequence))
    genome = SARS2Genome(DNARead('genId', genomeSequence), features)

    referenceNt, genomeNt = genome.ntSequences('spike')

    expected = head + ('T' * 9) + 'A' + ('T' * 10) + tail
    self.assertEqual(expected, genomeNt.sequence)
    self.assertEqual('genId (spike)', genomeNt.id)

    self.assertEqual(referenceSequence, referenceNt.sequence)
    self.assertEqual('refId (spike)', referenceNt.id)

    testCount, errorCount, result = genome.checkFeature(
        'spike', 'T19A', True)

    self.assertEqual(1, testCount)
    self.assertEqual(0, errorCount)
    self.assertEqual((True, 'T', True, 'A'), result['T19A'])
def testNtSequences(self):
    """
    ntSequences must return the aligned reference and genome nucleotide
    sequences for a feature, each with an id that names the feature.
    """
    spike = {
        'name': 'spike',
        'sequence': 'ATTC',
        'start': 0,
        'stop': 4,
    }
    features = Features({'spike': spike}, DNARead('refId', 'ATTC'))
    genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

    referenceNt, genomeNt = genome.ntSequences('spike')

    self.assertEqual('ATTC', genomeNt.sequence)
    self.assertEqual('genId (spike)', genomeNt.id)

    self.assertEqual('ATTC', referenceNt.sequence)
    self.assertEqual('refId (spike)', referenceNt.id)
from unittest import TestCase from dark.reads import DNARead from sars2seq.features import Features _FEATURES = Features() class TestFeatures(TestCase): """ Test the Features class. """ def testGetFeatures(self): """ The getitem method must return a dict. """ self.assertIsInstance(_FEATURES['spike'], dict) def testUnknownFeature(self): """ If an unknown feature is asked for, a KeyError must be raised. """ self.assertRaisesRegex(KeyError, "^'xx'$", _FEATURES.__getitem__, 'xx') def testPassingDict(self): """ It must be possible to initialize a Features instance via a dict. """ value = { 'name': 'spike',
def main(args):
    """
    Describe a SARS-CoV-2 genome.

    Prints a table of short sequence names, then per-sequence results
    (an optional variant summary plus the wanted features), and finally a
    summary of the named and found change sets across all sequences.

    @param args: A C{Namespace} instance as returned by argparse with
        values for command-line options.
    """
    outDir = args.outDir
    if outDir:
        if not exists(outDir):
            os.makedirs(outDir)

    features = Features(args.gbFile)

    if args.feature:
        if args.canonicalNames:
            # This must be a list, not a bare map() iterator: it is iterated
            # once per genome below, and an iterator would be exhausted
            # after the first genome, silently skipping all feature
            # processing for every subsequent genome.
            wantedFeatures = list(map(features.canonicalName, args.feature))
        else:
            wantedFeatures = args.feature
    else:
        if args.noFeatures:
            wantedFeatures = []
        else:
            wantedFeatures = sorted(features)

    # Both map a match to the list of ids it was seen in, accumulated
    # over all genomes.
    namedMatches = defaultdict(list)
    foundSets = defaultdict(list)

    reads = list(FastaReads(args.genome))

    print('SEQUENCE SHORT NAMES\n')
    maxLen = 0
    nameSummary = []
    for read in reads:
        # The short id is the first whitespace-delimited field of the id.
        shortId = read.id.split()[0]
        if len(shortId) > maxLen:
            maxLen = len(shortId)
        nameSummary.append((shortId, read.id))
        # From here on, the read is known only by its short id.
        read.id = shortId

    for shortId, longId in nameSummary:
        print(f'{shortId:{maxLen}s} = {longId}')

    print('\nPER-SEQUENCE RESULTS\n')

    for read in reads:
        genome = SARS2Genome(read, features)

        if args.checkVariant:
            with genomeFilePointer(read, args, '-variant-summary.txt') as fp:
                # Coverage is the fraction of the genome that is not 'N'.
                nCount = genome.genome.sequence.count('N')
                genomeLen = len(genome.genome)
                nonNCount = genomeLen - nCount
                coverage = nonNCount / genomeLen
                print(f'{read.id} (coverage {nonNCount}/{genomeLen} = '
                      f'{coverage * 100.0:.2f} %)', file=fp)
                theseNamedMatches, theseFoundSets = printVariantSummary(
                    genome, fp, args)

                for match, ids in theseNamedMatches.items():
                    namedMatches[match].extend(ids)

                for match, ids in theseFoundSets.items():
                    foundSets[match].extend(ids)

                print(file=fp)

        for i, featureName in enumerate(wantedFeatures):
            with featureFilePointers(read, featureName, args) as fps:
                processFeature(featureName, features, genome, fps, i, args)

    print('\nSUMMARY\n')

    if namedMatches:
        print('Named change sets:')
        for changeSet in sorted(CHANGE_SETS):
            desc = ', '.join(sorted(CHANGE_SETS[changeSet], key=key))
            print(f' {changeSet}: {desc}')
        print()

        print('Known variant combinations matched (count):')
        for match in sorted(namedMatches):
            print(f' {match} ({len(namedMatches[match])}):')
            for name in sorted(namedMatches[match]):
                print(f' {name}')

        # Separate the two summary sections only when both are present.
        if foundSets:
            print()

    if foundSets:
        print('Sets of changes found (count):')
        for match in sorted(foundSets):
            desc = ', '.join(sorted(match, key=key))
            print(f' {desc} ({len(foundSets[match])}):')
            for name in sorted(foundSets[match]):
                print(f' {name}')
def main(args):
    """
    Describe a SARS-CoV-2 genome.

    Each genome in C{args.genome} is optionally filtered on its coverage
    of the reference, then has its variant summary and/or wanted features
    written out. A count of examined (and ignored) genomes is printed at
    the end.

    @param args: A C{Namespace} instance as returned by argparse with
        values for command-line options.
    @return: An C{int} exit status (0 on success, 1 if there is nothing
        to do).
    """
    outDir = args.outDir
    if outDir:
        if not exists(outDir):
            os.makedirs(outDir)

    features = Features(args.gbFile)

    if args.feature:
        if args.canonicalNames:
            # This must be a list, not a bare map() iterator. It is iterated
            # once per genome below (an iterator would be exhausted after
            # the first genome, so later genomes would have no features
            # processed), and it is truth-tested just below (an iterator
            # object is always true, even when it would yield nothing).
            wantedFeatures = list(map(features.canonicalName, args.feature))
        else:
            wantedFeatures = args.feature
    else:
        if args.noFeatures:
            wantedFeatures = []
        else:
            wantedFeatures = sorted(features)

    if not (args.checkVariant or wantedFeatures):
        print('No action specified - I have nothing to do!', file=sys.stderr)
        return 1

    # count is pre-set so the final message is correct when there are no
    # genomes at all.
    count = ignoredDueToCoverageCount = 0

    for count, read in enumerate(FastaReads(args.genome), start=1):
        if args.minReferenceCoverage is not None:
            # Coverage is the number of non-N bases in the genome relative
            # to the length of the reference.
            coverage = ((len(read) - read.sequence.upper().count('N')) /
                        len(features.reference))
            if coverage < args.minReferenceCoverage:
                ignoredDueToCoverageCount += 1
                print(
                    f'Genome {read.id!r} ignored due to low '
                    f'({coverage * 100.0:.2f}%) coverage of the reference.',
                    file=sys.stderr)
                continue

        genome = SARS2Genome(read, features)

        if args.checkVariant:
            with genomeFilePointer(read, args, '-variant-summary.txt') as fp:
                print(read.id, file=fp)
                printVariantSummary(genome, fp, args)

        for i, featureName in enumerate(wantedFeatures):
            with featureFilePointers(read, featureName, args) as fps:
                processFeature(featureName, genome, fps, i, args)

    print(f'Examined {count} genomes.')

    if args.minReferenceCoverage is not None:
        print(f'Ignored {ignoredDueToCoverageCount} genomes due to low '
              f'coverage.')

    return 0
from unittest import TestCase from os.path import dirname, join from .fasta import getSequence import sars2seq from sars2seq.checker import Checker, AAChecker, NTChecker from sars2seq.features import Features from sars2seq.genome import SARS2Genome DATA_DIR = join(dirname(dirname(sars2seq.__file__)), 'data') REF_GB = join(DATA_DIR, 'NC_045512.2.gb') FEATURES = Features(REF_GB) class Test_EPI_ISL_601443(TestCase): """ Test the EPI_ISL_601433 sequence. This is the variant of concern (VOC 202012/01) referred to in https://www.gov.uk/government/publications/ investigation-of-novel-sars-cov-2-variant-variant-of-concern-20201201 """ genomeRead = getSequence(join(DATA_DIR, 'EPI_ISL_601443.fasta')) genome = SARS2Genome(genomeRead, FEATURES) def testIndexError(self): """ If an check on a non-existent index is attempted, an IndexError must be raised. """ checker = Checker('spike', 'N500001Y', False)