Esempi in Python per Features, esempi in Python per sars2seq.features.Features

Esempio n. 1

0

Mostra file

File: test_genome.py Progetto: VirologyCharite/sars2seq

    def testAaSequencesChangesTranslationErrorIgnore(self):
        """
        With onError='ignore', checking an amino acid change must write
        nothing to the error stream, and the failed check must still be
        reported in the returned counts and result.
        """
        change = 'A100000A'
        orf1ab = {
            'name': 'ORF1ab polyprotein',
            'sequence': 'ATTC',
            'start': 0,
            'stop': 4,
        }
        features = Features({'orf1ab': orf1ab}, DNARead('refId', 'ATTC'))
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        errFp = StringIO()
        nTests, nErrors, outcome = genome.checkFeature(
            'orf1ab', change, nt=False, onError='ignore', errFp=errFp)

        # Nothing may have been written to the error stream.
        self.assertEqual('', errFp.getvalue())

        self.assertEqual(1, nTests)
        self.assertEqual(1, nErrors)
        self.assertEqual((False, None, False, None), outcome[change])

Esempio n. 2

0

Mostra file

File: test_genome.py Progetto: VirologyCharite/sars2seq

    def testNtSequencesChangesIndexErrorIgnore(self):
        """
        With onError='ignore', an out-of-range nucleotide check must
        write nothing to the error stream, and the failed check must
        still be reported in the returned counts and result.
        """
        change = 'A100000A'
        spike = {
            'name': 'spike',
            'sequence': 'ATTC',
            'start': 0,
            'stop': 4,
        }
        features = Features({'spike': spike}, DNARead('refId', 'ATTC'))
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        errFp = StringIO()
        nTests, nErrors, outcome = genome.checkFeature(
            'spike', change, nt=True, onError='ignore', errFp=errFp)

        # Nothing may have been written to the error stream.
        self.assertEqual('', errFp.getvalue())

        self.assertEqual(1, nTests)
        self.assertEqual(1, nErrors)
        self.assertEqual((False, None, False, None), outcome[change])

Esempio n. 3

0

Mostra file

File: test_genome.py Progetto: VirologyCharite/sars2seq

    def testNtSequencesChangesString(self):
        """
        Nucleotide changes given as a string specification must be
        checked against the aligned reference and genome sequences.
        """
        spike = {
            'name': 'spike',
            'sequence': 'ATTC',
            'start': 0,
            'stop': 4,
        }
        features = Features({'spike': spike}, DNARead('refId', 'ATTC'))
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        # Note: the locations in a string specification are 1-based.
        nTests, nErrors, outcome = genome.checkFeature(
            'spike', 'A1A T2A A3T T4T', True)

        self.assertEqual(4, nTests)
        self.assertEqual(3, nErrors)

        expected = (
            ('A1A', (True, 'A', True, 'A')),
            ('T2A', (True, 'T', False, 'T')),
            ('A3T', (False, 'T', True, 'T')),
            ('T4T', (False, 'C', False, 'C')),
        )
        for change, status in expected:
            self.assertEqual(status, outcome[change])

Esempio n. 4

0

Mostra file

 def testPassingRefence(self):
     """
     A reference passed to Features must be available, as the very
     same object, via the reference attribute.
     """
     ref = DNARead('refId', 'ATTC')
     self.assertIs(ref, Features({}, ref).reference)

Esempio n. 5

0

Mostra file

File: test_genome.py Progetto: VirologyCharite/sars2seq

    def testNtSequencesChangesTuple(self):
        """
        Nucleotide changes given as a tuple specification must be
        checked against the aligned reference and genome sequences.
        """
        spike = {
            'name': 'spike',
            'sequence': 'ATTC',
            'start': 0,
            'stop': 4,
        }
        features = Features({'spike': spike}, DNARead('refId', 'ATTC'))
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        # Note: the offsets in a tuple specification are 0-based.
        changes = (('A', 0, 'A'), ('T', 1, 'A'), ('A', 2, 'T'), ('T', 3, 'T'))
        nTests, nErrors, outcome = genome.checkFeature(
            'spike', changes, True)

        self.assertEqual(4, nTests)
        self.assertEqual(3, nErrors)

        # One expected status per change, in the same order as above.
        statuses = (
            (True, 'A', True, 'A'),
            (True, 'T', False, 'T'),
            (False, 'T', True, 'T'),
            (False, 'C', False, 'C'),
        )
        for change, status in zip(changes, statuses):
            self.assertEqual(status, outcome[change])

Esempio n. 6

0

Mostra file

File: test_variants.py Progetto: VirologyCharite/sars2seq

 def testKnownFeatures(self):
     """
     Every feature name used in the changes of a variant must be
     retrievable, as a dict, from a default Features instance.
     """
     knownFeatures = Features()
     for variantName in VARIANTS:
         changes = VARIANTS[variantName]['changes']
         for name in changes:
             self.assertIsInstance(knownFeatures[name], dict)

Esempio n. 7

0

Mostra file

File: describe-features.py Progetto: VirologyCharite/sars2seq

def main(args):
    """
    Describe SARS-CoV-2 annotations.

    For each feature, in sorted order, the start, stop and length are
    printed, followed by the product and function (when present), the
    nucleotide sequence and the translation (when present). Sequences
    longer than C{args.maxLen} are truncated and suffixed with '...'.

    @param args: A C{Namespace} instance as returned by argparse with
        values for command-line options.
    """
    def abbreviate(text):
        # Cut the text at args.maxLen, marking the truncation with '...'.
        if len(text) > args.maxLen:
            return text[:args.maxLen] + '...'
        return text

    features = Features(args.gbFile)

    print(f'Features for {features.reference.id}:')

    for featureName, feature in sorted(features.items()):
        print(f'{featureName}:')
        start = feature['start']
        print('  start:', start)
        stop = feature['stop']
        print('  stop:', stop)
        print('  length:', stop - start)

        # Not every feature has a product or a function.
        for label, field in (('  product:', 'product'),
                             ('  function:', 'function')):
            try:
                value = feature[field]
            except KeyError:
                continue
            print(label, value)

        sequence = feature['sequence']
        print(f'  sequence    (len {len(sequence):5d} nt):',
              abbreviate(sequence))

        try:
            translation = feature['translation']
        except KeyError:
            # Some features (e.g., UTR, stem loops) do not have a translation.
            continue

        print(f'  translation (len {len(translation):5d} aa):',
              abbreviate(translation))

Esempio n. 8

0

Mostra file

    def testPassingDict(self):
        """
        A Features instance built from a dict must contain the dict's
        feature names and return the corresponding feature values.
        """
        spike = {
            'name': 'spike',
            'sequence': 'ATTC',
            'start': 0,
            'stop': 4,
        }
        fromDict = Features({'spike': spike})

        self.assertIn('spike', fromDict)
        self.assertEqual(spike, fromDict['spike'])

Esempio n. 9

0

Mostra file

File: test_genome.py Progetto: VirologyCharite/sars2seq

    def testNtSequencesGenomeGap(self):
        """
        A genome with fewer nucleotides than the reference feature must
        be aligned with a gap, and that gap must be checkable as a change.
        """
        refSeq = 'TGGCGTGGA' + ('T' * 20) + 'CAAATCGG'
        # The genome's copy of the feature has 19 T's where the reference
        # has 20.
        genomeFeature = 'TGGA' + ('T' * 19) + 'CAAATCGG'
        genomeSeq = 'CCCGGTGGCG' + genomeFeature + 'CCCCCCC'

        spike = {
            'name': 'spike',
            'sequence': refSeq,
            'start': 5,
            'stop': len(refSeq),
        }
        features = Features({'spike': spike}, DNARead('refId', refSeq))
        genome = SARS2Genome(DNARead('genId', genomeSeq), features)

        referenceNt, genomeNt = genome.ntSequences('spike')

        # The feature starts at offset 5 in the reference.
        self.assertEqual(refSeq[5:], referenceNt.sequence)
        self.assertEqual('refId (spike)', referenceNt.id)

        # The aligned genome must contain a gap character.
        alignedGenome = 'TGGA-' + ('T' * 19) + 'CAAATCGG'
        self.assertEqual(alignedGenome, genomeNt.sequence)
        self.assertEqual('genId (spike)', genomeNt.id)

        nTests, nErrors, outcome = genome.checkFeature(
            'spike', 'T5-', True)

        self.assertEqual(1, nTests)
        self.assertEqual(0, nErrors)
        self.assertEqual((True, 'T', True, '-'), outcome['T5-'])

Esempio n. 10

0

Mostra file

File: test_genome.py Progetto: VirologyCharite/sars2seq

    def testAaSequencesTranslationNoSlipperySequenceRaise(self):
        """
        Calling aaSequences on an 'ORF1ab polyprotein' feature whose
        sequence has no slippery sequence must raise a
        NoSlipperySequenceError.
        """
        polyprotein = {
            'name': 'ORF1ab polyprotein',
            'sequence': 'ATTC',
            'start': 0,
            'stop': 4,
        }
        features = Features({'ORF1ab polyprotein': polyprotein},
                            DNARead('refId', 'ATTC'))
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        with self.assertRaisesRegex(NoSlipperySequenceError,
                                    r'^No slippery sequence found\.$'):
            genome.aaSequences('ORF1ab polyprotein')

Esempio n. 11

0

Mostra file

File: test_genome.py Progetto: VirologyCharite/sars2seq

    def testNtSequencesChangesIndexErrorPrint(self):
        """
        With onError='print', an out-of-range nucleotide check must write
        one error line for the reference and one for the genome to the
        error stream, and the failed check must be reported in the
        returned counts and result.
        """
        change = 'A100000A'
        spike = {
            'name': 'spike',
            'sequence': 'ATTC',
            'start': 0,
            'stop': 4,
        }
        features = Features({'spike': spike}, DNARead('refId', 'ATTC'))
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        # The line reported for the reference sequence.
        referenceLine = (
            r"Index 99999 out of range trying to access feature "
            r"'spike' of length 4 sequence 'refId (spike)' via "
            r"expected change specification 'A100000A'."
            "\n"
        )
        # The line reported for the genome sequence.
        genomeLine = (
            r"Index 99999 out of range trying to access feature "
            r"'spike' of length 4 sequence 'genId (spike)' via "
            r"expected change specification 'A100000A'."
            "\n"
        )

        errFp = StringIO()
        nTests, nErrors, outcome = genome.checkFeature(
            'spike', change, nt=True, onError='print', errFp=errFp)
        self.assertEqual(referenceLine + genomeLine, errFp.getvalue())

        self.assertEqual(1, nTests)
        self.assertEqual(1, nErrors)
        self.assertEqual((False, None, False, None), outcome[change])

Esempio n. 12

0

Mostra file

File: test_genome.py Progetto: VirologyCharite/sars2seq

    def testAaSequencesChangesTranslationErrorRaise(self):
        """
        Checking amino acid changes on a feature that cannot be
        translated must raise a NoSlipperySequenceError when no onError
        argument is passed.
        """
        orf1ab = {
            'name': 'ORF1ab polyprotein',
            'sequence': 'ATTC',
            'start': 0,
            'stop': 4,
        }
        features = Features({'orf1ab': orf1ab}, DNARead('refId', 'ATTC'))
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        with self.assertRaisesRegex(NoSlipperySequenceError,
                                    r"^No slippery sequence found\.$"):
            genome.checkFeature('orf1ab', 'A100000A', False)

Esempio n. 13

0

Mostra file

File: test_genome.py Progetto: VirologyCharite/sars2seq

    def testNtSequencesChangesIndexErrorRaise(self):
        """
        An out-of-range nucleotide check must raise an IndexError whose
        message names the reference sequence when no onError argument is
        passed.
        """
        spike = {
            'name': 'spike',
            'sequence': 'ATTC',
            'start': 0,
            'stop': 4,
        }
        features = Features({'spike': spike}, DNARead('refId', 'ATTC'))
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        pattern = (r"^Index 99999 out of range trying to access feature "
                   r"'spike' of length 4 sequence 'refId \(spike\)' via "
                   r"expected change specification 'A100000A'\.$")
        with self.assertRaisesRegex(IndexError, pattern):
            genome.checkFeature('spike', 'A100000A', True)

Esempio n. 14

0

Mostra file

File: test_genome.py Progetto: VirologyCharite/sars2seq

    def testNtSequencesGenomeSNP(self):
        """
        A genome that differs from the reference by a single nucleotide
        must be aligned without gaps and the difference must be
        checkable as a change.
        """
        refSeq = 'TGGCGTGGA' + ('T' * 20) + 'CAAATCGG'
        # As the reference, but with an A in place of one of the T's.
        genomeFeature = ('TGGCGTGGA' + ('T' * 9) + 'A' + ('T' * 10) +
                         'CAAATCGG')
        genomeSeq = 'CCCGG' + genomeFeature + 'CCCCCCC'

        spike = {
            'name': 'spike',
            'sequence': refSeq,
            'start': 0,
            'stop': len(refSeq),
        }
        features = Features({'spike': spike}, DNARead('refId', refSeq))
        genome = SARS2Genome(DNARead('genId', genomeSeq), features)

        referenceNt, genomeNt = genome.ntSequences('spike')

        # The aligned genome is exactly the genome's copy of the feature.
        self.assertEqual(genomeFeature, genomeNt.sequence)
        self.assertEqual('genId (spike)', genomeNt.id)

        self.assertEqual(refSeq, referenceNt.sequence)
        self.assertEqual('refId (spike)', referenceNt.id)

        nTests, nErrors, outcome = genome.checkFeature(
            'spike', 'T19A', True)

        self.assertEqual(1, nTests)
        self.assertEqual(0, nErrors)
        self.assertEqual((True, 'T', True, 'A'), outcome['T19A'])

Esempio n. 15

0

Mostra file

File: test_genome.py Progetto: VirologyCharite/sars2seq

    def testNtSequences(self):
        """
        ntSequences must return the aligned reference and genome
        nucleotide sequences for a feature, with the feature name
        appended to each sequence id.
        """
        spike = {
            'name': 'spike',
            'sequence': 'ATTC',
            'start': 0,
            'stop': 4,
        }
        features = Features({'spike': spike}, DNARead('refId', 'ATTC'))
        genome = SARS2Genome(DNARead('genId', 'GGATTCGG'), features)

        refRead, genomeRead = genome.ntSequences('spike')

        self.assertEqual('ATTC', genomeRead.sequence)
        self.assertEqual('genId (spike)', genomeRead.id)

        self.assertEqual('ATTC', refRead.sequence)
        self.assertEqual('refId (spike)', refRead.id)

Esempio n. 16

0

Mostra file

from unittest import TestCase

from dark.reads import DNARead

from sars2seq.features import Features

# Module-level Features instance, constructed with no arguments and shared
# by the tests below. NOTE(review): presumably this loads the package's
# default reference annotations - confirm in sars2seq.features.
_FEATURES = Features()


class TestFeatures(TestCase):
    """
    Test the Features class.
    """
    def testGetFeatures(self):
        """
        Looking up a feature by name must produce a dict.
        """
        spike = _FEATURES['spike']
        self.assertIsInstance(spike, dict)

    def testUnknownFeature(self):
        """
        Looking up an unknown feature name must raise a KeyError that
        mentions the name.
        """
        with self.assertRaisesRegex(KeyError, "^'xx'$"):
            _FEATURES['xx']

    def testPassingDict(self):
        """
        It must be possible to initialize a Features instance via a dict.
        """
        value = {
            'name': 'spike',

Esempio n. 17

0

Mostra file

def main(args):
    """
    Describe a SARS-CoV-2 genome.

    Prints a table mapping short sequence names to full sequence ids,
    then processes each sequence (an optional variant summary plus the
    wanted features), and finally prints a summary of the named change
    sets matched and the sets of changes found.

    Note that the id of each read is replaced by its short form (the
    first whitespace-separated field of the original id).

    @param args: A C{Namespace} instance as returned by argparse with
        values for command-line options.
    """
    outDir = args.outDir
    if outDir and not exists(outDir):
        os.makedirs(outDir)

    features = Features(args.gbFile)

    if args.feature:
        if args.canonicalNames:
            # This must be a list, not a bare map() iterator: the wanted
            # features are iterated once per genome below, and an
            # iterator would be exhausted after the first genome, leaving
            # all later genomes with no features processed.
            wantedFeatures = list(map(features.canonicalName, args.feature))
        else:
            wantedFeatures = args.feature
    elif args.noFeatures:
        wantedFeatures = []
    else:
        wantedFeatures = sorted(features)

    # Accumulated over all genomes, for the final summary.
    namedMatches = defaultdict(list)
    foundSets = defaultdict(list)

    reads = list(FastaReads(args.genome))

    print('SEQUENCE SHORT NAMES\n')
    maxLen = 0
    nameSummary = []
    for read in reads:
        shortId = read.id.split()[0]
        if len(shortId) > maxLen:
            maxLen = len(shortId)
        nameSummary.append((shortId, read.id))
        # From here on the read is known by its short id.
        read.id = shortId

    for shortId, longId in nameSummary:
        print(f'{shortId:{maxLen}s} = {longId}')

    print('\nPER-SEQUENCE RESULTS\n')

    for read in reads:
        genome = SARS2Genome(read, features)

        if args.checkVariant:
            with genomeFilePointer(read, args, '-variant-summary.txt') as fp:
                # Coverage is the fraction of the genome that is not 'N'.
                nCount = genome.genome.sequence.count('N')
                genomeLen = len(genome.genome)
                nonNCount = genomeLen - nCount
                coverage = nonNCount / genomeLen
                print(f'{read.id} (coverage {nonNCount}/{genomeLen} = '
                      f'{coverage * 100.0:.2f} %)', file=fp)

                theseNamedMatches, theseFoundSets = printVariantSummary(
                    genome, fp, args)

                for match, ids in theseNamedMatches.items():
                    namedMatches[match].extend(ids)

                for match, ids in theseFoundSets.items():
                    foundSets[match].extend(ids)

                print(file=fp)

        for i, featureName in enumerate(wantedFeatures):
            with featureFilePointers(read, featureName, args) as fps:
                processFeature(featureName, features, genome, fps, i, args)

    print('\nSUMMARY\n')

    if namedMatches:
        print('Named change sets:')
        for changeSet in sorted(CHANGE_SETS):
            desc = ', '.join(sorted(CHANGE_SETS[changeSet], key=key))
            print(f'  {changeSet}: {desc}')
        print()

        print('Known variant combinations matched (count):')
        for match in sorted(namedMatches):
            print(f'  {match} ({len(namedMatches[match])}):')
            for name in sorted(namedMatches[match]):
                print(f'    {name}')
        if foundSets:
            # Blank line to separate the two summary sections.
            print()

    if foundSets:
        print('Sets of changes found (count):')
        for match in sorted(foundSets):
            desc = ', '.join(sorted(match, key=key))
            print(f'  {desc} ({len(foundSets[match])}):')
            for name in sorted(foundSets[match]):
                print(f'    {name}')

Esempio n. 18

0

Mostra file

File: describe-genome.py Progetto: VirologyCharite/sars2seq

def main(args):
    """
    Describe a SARS-CoV-2 genome.

    Each genome read from C{args.genome} is optionally skipped for low
    coverage of the reference, optionally given a variant summary, and
    then has each wanted feature processed. The count of genomes
    examined that is printed at the end includes any genomes that were
    ignored due to low coverage.

    @param args: A C{Namespace} instance as returned by argparse with
        values for command-line options.
    @return: An C{int} exit status: 1 if there is nothing to do (no
        variant check and no wanted features), else 0.
    """
    outDir = args.outDir
    if outDir and not exists(outDir):
        os.makedirs(outDir)

    features = Features(args.gbFile)

    if args.feature:
        if args.canonicalNames:
            # This must be a list, not a bare map() iterator: the wanted
            # features are iterated once per genome below, and an
            # iterator would be exhausted after the first genome, leaving
            # all later genomes with no features processed.
            wantedFeatures = list(map(features.canonicalName, args.feature))
        else:
            wantedFeatures = args.feature
    elif args.noFeatures:
        wantedFeatures = []
    else:
        wantedFeatures = sorted(features)

    if not (args.checkVariant or wantedFeatures):
        print('No action specified - I have nothing to do!', file=sys.stderr)
        return 1

    # Initialize count so that it is defined if there are no genomes.
    count = ignoredDueToCoverageCount = 0

    for count, read in enumerate(FastaReads(args.genome), start=1):
        if args.minReferenceCoverage is not None:
            # Coverage is the number of non-N nucleotides in the genome
            # as a fraction of the length of the reference.
            coverage = ((len(read) - read.sequence.upper().count('N')) /
                        len(features.reference))
            if coverage < args.minReferenceCoverage:
                ignoredDueToCoverageCount += 1
                print(
                    f'Genome {read.id!r} ignored due to low '
                    f'({coverage * 100.0:.2f}%) coverage of the reference.',
                    file=sys.stderr)
                continue

        genome = SARS2Genome(read, features)

        if args.checkVariant:
            with genomeFilePointer(read, args, '-variant-summary.txt') as fp:
                print(read.id, file=fp)
                printVariantSummary(genome, fp, args)

        for i, featureName in enumerate(wantedFeatures):
            with featureFilePointers(read, featureName, args) as fps:
                processFeature(featureName, genome, fps, i, args)

    print(f'Examined {count} genomes.')

    if args.minReferenceCoverage is not None:
        print(f'Ignored {ignoredDueToCoverageCount} genomes due to low '
              f'coverage.')

    return 0

Esempio n. 19

0

Mostra file

from unittest import TestCase

from os.path import dirname, join

from .fasta import getSequence

import sars2seq
from sars2seq.checker import Checker, AAChecker, NTChecker
from sars2seq.features import Features
from sars2seq.genome import SARS2Genome

# The 'data' directory is a sibling of the directory that contains the
# sars2seq package (two dirname() calls up from the package's __file__).
DATA_DIR = join(dirname(dirname(sars2seq.__file__)), 'data')
# Path to the reference file in the data directory. NOTE(review): the .gb
# suffix suggests a GenBank file for the reference genome - confirm.
REF_GB = join(DATA_DIR, 'NC_045512.2.gb')
# Features built once, at import time, from the reference file.
FEATURES = Features(REF_GB)


class Test_EPI_ISL_601443(TestCase):
    """
    Test the EPI_ISL_601433 sequence. This is the variant of concern
    (VOC 202012/01) referred to in https://www.gov.uk/government/publications/
    investigation-of-novel-sars-cov-2-variant-variant-of-concern-20201201
    """
    genomeRead = getSequence(join(DATA_DIR, 'EPI_ISL_601443.fasta'))
    genome = SARS2Genome(genomeRead, FEATURES)

    def testIndexError(self):
        """
        If an check on a non-existent index is attempted, an IndexError must
        be raised.
        """
        checker = Checker('spike', 'N500001Y', False)