def testPrefixSuffix(self):
    """
    Passing 3 and 3 to trimReads for the read 'agtccgatcg' must give the
    read 'ccga'.
    """
    trimmed = trimReads(3, 3, StringIO('>hey\nagtccgatcg'))
    expected = list(SeqIO.parse(StringIO('>hey\nccga'), 'fasta'))
    self.assertEqual([str(read) for read in tuple(trimmed)],
                     [str(read) for read in tuple(expected)])
Exemple #2
0
    def testThreeFiles(self):
        """
        Subtraction must work with three input files: when the first file
        holds reads one..four and the other two files hold one, two and
        three between them, only read four must remain.
        """
        allReads = '\n'.join([
            '>one',
            'agtcagtcagtc',
            '>two',
            'acctg',
            '>three',
            'atgggtc',
            '>four',
            'atggctattgaactgtatct',
        ])
        justOne = '\n'.join([
            '>one',
            'agtcagtcagtc',
        ])
        twoAndThree = '\n'.join([
            '>two',
            'acctg',
            '>three',
            'atgggtc',
        ])

        handles = [
            StringIO(allReads),
            StringIO(justOne),
            StringIO(twoAndThree),
        ]
        remaining = list(fastaSubtract(handles))
        self.assertEqual(len(remaining), 1)
        survivor = remaining[0]
        self.assertEqual(str(survivor.seq), 'atggctattgaactgtatct')
        self.assertEqual(str(survivor.id), 'four')
Exemple #3
0
 def testRemovalOfSuffix(self):
     """
     When one sequence is a suffix of another, only the longer sequence
     must be kept.
     """
     longer = SeqIO.read(StringIO('>s1\nagtcagtcagtc'), 'fasta')
     suffix = SeqIO.read(StringIO('>s2\ncagtc'), 'fasta')
     kept = list(dePrefixAndSuffixFasta([longer, suffix]))
     self.assertEqual(kept, [longer])
Exemple #4
0
 def testInitializedRead(self):
     """
     A StringIO instance created with an initial value must return that
     value from getvalue.
     """
     preloaded = StringIO('hey')
     contents = preloaded.getvalue()
     self.assertEqual('hey', contents)
Exemple #5
0
 def testInitializedRead(self):
     """
     A StringIO instance created with an initial value must return that
     value from getvalue.
     """
     preloaded = StringIO('hey')
     contents = preloaded.getvalue()
     self.assertEqual('hey', contents)
Exemple #6
0
    def testSubtractNothing(self):
        """
        If the second file has no read ids in common with the first, all
        reads of the first file must survive the subtraction. Note that the
        second file here repeats two sequences of the first file, but under
        different ids.
        """
        first = '\n'.join([
            '>one',
            'agtcagtcagtc',
            '>two',
            'acctg',
            '>three',
            'atgggtc',
            '>four',
            'atggctattgaactgtatct',
        ])
        second = '\n'.join([
            '>five',
            'agtcagtcagtc',
            '>six',
            'acctg',
        ])

        survivors = list(fastaSubtract([StringIO(first), StringIO(second)]))
        survivingIds = sorted(read.id for read in survivors)
        self.assertEqual(['four', 'one', 'three', 'two'], survivingIds)
Exemple #7
0
 def testWriteRead(self):
     """
     Text written to an initially empty StringIO instance must be
     returned by getvalue.
     """
     buf = StringIO()
     buf.write('hey')
     contents = buf.getvalue()
     self.assertEqual('hey', contents)
Exemple #8
0
 def testOrderIndependent(self):
     """
     A sequence that is a prefix of another must be removed even when it
     comes first in the input.
     """
     prefix = SeqIO.read(StringIO('>s1\nagtcag'), 'fasta')
     longer = SeqIO.read(StringIO('>s2\nagtcagtcagtc'), 'fasta')
     kept = list(dePrefixAndSuffixFasta([prefix, longer]))
     self.assertEqual(kept, [longer])
Exemple #9
0
 def testWriteRead(self):
     """
     Text written to an initially empty StringIO instance must be
     returned by getvalue.
     """
     buf = StringIO()
     buf.write('hey')
     contents = buf.getvalue()
     self.assertEqual('hey', contents)
Exemple #10
0
 def testRemovalOfIdenticalSequences(self):
     """
     Two copies of the same sequence must be reduced to a single copy.
     """
     fasta = '>hey\nagtcagtcagtc'
     # Parse the same FASTA text twice to get two distinct, equal records.
     copies = [SeqIO.read(StringIO(fasta), 'fasta') for _ in range(2)]
     kept = list(dePrefixAndSuffixFasta(copies))
     self.assertEqual(kept, [copies[0]])
Exemple #11
0
 def testRemovalOfPrefixSuffixAndDuplicate(self):
     """
     A duplicate (s2), a prefix (s3) and a suffix (s4) of the first
     sequence must all be removed, leaving just the first sequence.
     """
     fastas = (
         '>s1\nagtcagtcagtc',
         '>s2\nagtcagtcagtc',
         '>s3\nagtcagt',
         '>s4\ntcagtc',
     )
     records = [SeqIO.read(StringIO(fasta), 'fasta') for fasta in fastas]
     kept = list(dePrefixAndSuffixFasta(records))
     self.assertEqual(kept, [records[0]])
Exemple #12
0
 def sideEffect(self, filename, **kwargs):
     """
     Replacement for open that serves canned FASTA for exactly two calls.

     The first call must be for 'file1.fasta' and the second for
     'file2.fasta'; any further call fails the test held in self.test.
     """
     if self.count == 0:
         expectedName, fasta = 'file1.fasta', '>id1\nACTG\n'
     elif self.count == 1:
         expectedName, fasta = 'file2.fasta', '>id2\nCAGT\n'
     else:
         self.test.fail('We are only supposed to be called twice!')
         return None

     self.test.assertEqual(expectedName, filename)
     self.count += 1
     return StringIO(fasta)
Exemple #13
0
    def testOpenNotCalledOnRepeatedCall(self):
        """
        A second call to pathogenSampleFiles.add with the same arguments
        must not open any file, because the result of the first call is
        cached.
        """
        class Open(object):
            """
            Supplies a side effect for a mocked open that allows only two
            calls.
            """
            def __init__(self, test, manager):
                self.test = test
                self.manager = manager
                self.count = 0

            def sideEffect(self, filename, *args, **kwargs):
                if self.count == 0:
                    self.test.assertEqual('out/0.fasta', filename)
                    result = StringIO('>id1\nACTG\n')
                elif self.count == 1:
                    self.test.assertEqual('out/pathogen-0-sample-0.fasta',
                                          filename)
                    result = self.manager
                else:
                    self.test.fail(
                        'We are only supposed to be called twice. '
                        'Filename: %r, Args: %r, Keyword args: %r.' %
                        (filename, args, kwargs))
                    return None

                self.count += 1
                return result

        proteinData = StringIO(
            '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'
        )
        written = StringIO()

        @contextmanager
        def manager():
            yield written

        grouper = ProteinGrouper()
        grouper.addFile('filename-1', proteinData)
        pathogenSampleFiles = PathogenSampleFiles(grouper)

        opener = Open(self, manager())
        with patch.object(builtins, 'open') as mockOpen:
            mockOpen.side_effect = opener.sideEffect
            first = pathogenSampleFiles.add('Lausannevirus', 'filename-1')
            self.assertEqual('out/pathogen-0-sample-0.fasta', first)
            self.assertEqual('>id1\nACTG\n', written.getvalue())

            # Repeat the call. The mocked open fails the test if it is
            # called a third time, so this only passes if nothing is opened.
            second = pathogenSampleFiles.add('Lausannevirus', 'filename-1')
            self.assertEqual('out/pathogen-0-sample-0.fasta', second)
Exemple #14
0
 def sideEffect(self, filename, *args, **kwargs):
     """
     Replacement for open that serves canned FASTA for exactly two calls.

     The first call must be for 'filename1.fasta' and the second for
     'filename2.fasta'; any further call fails the test held in self.test.
     """
     if self.count == 0:
         expectedName = 'filename1.fasta'
         fasta = '>id1\nACTG\n>id2\nAACCTTGG\n'
     elif self.count == 1:
         expectedName = 'filename2.fasta'
         fasta = '>sequence3\nAAACCC\n'
     else:
         self.test.fail(
             'Open called too many times. Filename: %r, Args: %r, '
             'Keyword args: %r.' % (filename, args, kwargs))
         return None

     self.test.assertEqual(expectedName, filename)
     self.count += 1
     return StringIO(fasta)
Exemple #15
0
 def testDuplicatePathogenProteinSample(self):
     """
     Giving a protein grouper the same pathogen/protein/sample line a
     second time must raise a ValueError.
     """
     duplicated = StringIO(
         '0.77 46.6 48.1 5 6 74 gi|327|X|I44.6 ubiquitin [Lausannevirus]\n')
     grouper = ProteinGrouper()
     grouper.addFile('sample', duplicated)
     # Rewind so that the second addFile call reads the same line again.
     duplicated.seek(0)
     expectedError = (
         "^Protein 'gi\\|327\\|X\\|I44.6 ubiquitin' already seen for "
         "pathogen 'Lausannevirus' sample 'sample'\\.$")
     assertRaisesRegex(
         self, ValueError, expectedError, grouper.addFile, 'sample',
         duplicated)
Exemple #16
0
    def testIdenticalReadsRemoved(self):
        """
        When the same read matches two proteins of one pathogen, the FASTA
        written for that pathogen must contain the read only once.
        """
        class Open(object):
            """
            Supplies a side effect for a mocked open that accepts each
            expected filename exactly once, in any order.
            """
            def __init__(self, test, manager):
                self.test = test
                self.manager = manager
                self.expectedFilenames = {'out/0.fasta', 'out/1.fasta',
                                          'out/pathogen-0-sample-0.fasta'}

            def sideEffect(self, filename, *args, **kwargs):
                if filename not in self.expectedFilenames:
                    self.test.fail(
                        'Open called with unexpected filename: %r, Args: %r, '
                        'Keyword args: %r.' % (filename, args, kwargs))
                    return None

                self.expectedFilenames.remove(filename)
                if filename == 'out/0.fasta':
                    return StringIO('>id1\nACTG\n')
                if filename == 'out/1.fasta':
                    return StringIO('>id1\nACTG\n>id2\nCAGT\n')
                return self.manager

        proteinData = StringIO(
            '0.63 41.3 44.2 9 9 12 gi|327410| protein 77 [Lausannevirus]\n'
            '0.77 46.6 48.1 5 6 74 gi|327409| ubiquitin [Lausannevirus]\n'
        )
        written = StringIO()

        @contextmanager
        def manager():
            yield written

        grouper = ProteinGrouper()
        grouper.addFile('filename-1', proteinData)
        pathogenSampleFiles = PathogenSampleFiles(grouper)

        opener = Open(self, manager())
        with patch.object(builtins, 'open') as mockOpen:
            mockOpen.side_effect = opener.sideEffect
            outputName = pathogenSampleFiles.add('Lausannevirus', 'filename-1')

        self.assertEqual('out/pathogen-0-sample-0.fasta', outputName)
        self.assertEqual('>id1\nACTG\n>id2\nCAGT\n', written.getvalue())
        # Every expected filename must have been passed to the mocked open.
        self.assertEqual(set(), opener.expectedFilenames)
Exemple #17
0
 def sideEffect(self, filename, *args, **kwargs):
     """
     Replacement for open that serves the same canned FASTA for exactly
     two calls.

     The first call must be for '/tmp/f.fasta' and the second for
     'f.fasta' joined onto '/usr/local/fasta'; any further call fails the
     test held in self.test.
     """
     if self.count == 0:
         expectedName = '/tmp/f.fasta'
     elif self.count == 1:
         expectedName = os.path.join('/usr/local/fasta', 'f.fasta')
     else:
         self.test.fail(
             'Open called too many times. Filename: %r, Args: %r, '
             'Keyword args: %r.' % (filename, args, kwargs))
         return None

     self.test.assertEqual(expectedName, filename)
     self.count += 1
     return StringIO('>id1\nACTG\r\nCCCC\nGGG\n>id2\nAACCTG\n')
Exemple #18
0
    def testSubtractFromNothing(self):
        """
        When the first file is empty, the result should be empty too.
        """
        empty = ''
        nonEmpty = '\n'.join([
            '>five',
            'agtcagtcagtc',
            '>six',
            'acctg',
        ])

        handles = [StringIO(empty), StringIO(nonEmpty)]
        self.assertEqual([], list(fastaSubtract(handles)))
Exemple #19
0
    def testWriteSampleIndex(self):
        """
        writeSampleIndex must write one '<index> <sample name>' line per
        sample; the expected output lists them in increasing index order.
        """
        sampleIndices = {
            'NEO11': 500,
            'NEO33': 24,
            'NEO66': 333,
        }
        pathogenSampleFiles = PathogenSampleFiles(None)
        pathogenSampleFiles._samples = sampleIndices

        out = StringIO()
        pathogenSampleFiles.writeSampleIndex(out)
        self.assertEqual('24 NEO33\n333 NEO66\n500 NEO11\n', out.getvalue())
Exemple #20
0
    def testWritePathogenIndex(self):
        """
        writePathogenIndex must write one '<index> <pathogen name>' line
        per pathogen; the expected output lists them in increasing index
        order.
        """
        pathogenIndices = {
            'virus b': 4,
            'virus a': 3,
            'virus c': 9,
        }
        pathogenSampleFiles = PathogenSampleFiles(None)
        pathogenSampleFiles._pathogens = pathogenIndices

        out = StringIO()
        pathogenSampleFiles.writePathogenIndex(out)
        self.assertEqual('3 virus a\n4 virus b\n9 virus c\n', out.getvalue())
Exemple #21
0
 def testContextManager(self):
     """
     A StringIO instance must be usable in a with statement, and must
     return what was written to it while the with block is active.
     """
     with StringIO() as buf:
         buf.write('hey')
         contents = buf.getvalue()
         self.assertEqual('hey', contents)
Exemple #22
0
 def testLengthOne(self):
     """
     De-duping a list holding a single sequence must give back just that
     sequence.
     """
     only = SeqIO.read(StringIO('>hey\nagtcagtcagtc'), 'fasta')
     kept = list(dePrefixAndSuffixFasta([only]))
     self.assertEqual(kept, [only])
Exemple #23
0
    def testTwoReads(self):
        """
        A FASTA file holding two reads must be accessible like a dict, with
        each read looked up by its id.
        """

        pyfaidxIndex = StringIO()

        class Open(object):
            """
            Supplies a side effect for a mocked open that allows four calls.
            """
            def __init__(self, test, manager):
                self.test = test
                self.manager = manager
                self.count = 0

            def sideEffect(self, filename, *args, **kwargs):
                call = self.count
                if call not in (0, 1, 2, 3):
                    self.test.fail(
                        'Open called too many times. Filename: %r, Args: %r, '
                        'Keyword args: %r.' % (filename, args, kwargs))
                    return None

                # Only the first two calls have their filename checked.
                if call in (0, 1):
                    self.test.assertEqual('filename.fasta', filename)
                self.count += 1

                if call == 0:
                    return BytesIO(b'>id1\nACTG\n>id2\nAACCTTGG\n')
                if call == 1:
                    return StringIO('>id1\nACTG\n>id2\nAACCTTGG\n')
                if call == 2:
                    return self.manager
                # Fourth call: hand back whatever was written via the
                # manager on the third call.
                return StringIO(pyfaidxIndex.getvalue())

        @contextmanager
        def manager():
            yield pyfaidxIndex

        opener = Open(self, manager())
        with patch.object(builtins, 'open') as mockOpen:
            mockOpen.side_effect = opener.sideEffect
            reads = FastaFaiReads('filename.fasta')
            self.assertEqual(DNARead('id1', 'ACTG'), reads['id1'])
            self.assertEqual(DNARead('id2', 'AACCTTGG'), reads['id2'])
            # Check that the fai index was built correctly.
            self.assertEqual(pyfaidxIndex.getvalue(),
                             'id1\t4\t5\t4\t5\nid2\t8\t15\t8\t9\n')
Exemple #24
0
    def testSequencesAreChecked(self):
        """
        If two reads with the same id do not have the same sequence, an
        AssertionError must be raised.
        """
        first = '\n'.join([
            '>one',
            'ag',
        ])
        second = '\n'.join([
            '>one',
            'at',
        ])

        handles = [StringIO(first), StringIO(second)]
        # Only the call itself is made here; the result is not iterated.
        with self.assertRaises(AssertionError):
            fastaSubtract(handles)
Exemple #25
0
    def testTwoReads(self):
        """
        A FASTA file holding two reads must be accessible like a dict, with
        each read looked up by its id.
        """

        pyfaidxIndex = StringIO()

        class Open(object):
            """
            Supplies a side effect for a mocked open that allows four calls.
            """
            def __init__(self, test, manager):
                self.test = test
                self.manager = manager
                self.count = 0

            def sideEffect(self, filename, *args, **kwargs):
                call = self.count
                if call not in (0, 1, 2, 3):
                    self.test.fail(
                        'Open called too many times. Filename: %r, Args: %r, '
                        'Keyword args: %r.' % (filename, args, kwargs))
                    return None

                # Only the first two calls have their filename checked.
                if call in (0, 1):
                    self.test.assertEqual('filename.fasta', filename)
                self.count += 1

                if call == 0:
                    return BytesIO(b'>id1\nACTG\n>id2\nAACCTTGG\n')
                if call == 1:
                    return StringIO('>id1\nACTG\n>id2\nAACCTTGG\n')
                if call == 2:
                    return self.manager
                # Fourth call: hand back whatever was written via the
                # manager on the third call.
                return StringIO(pyfaidxIndex.getvalue())

        @contextmanager
        def manager():
            yield pyfaidxIndex

        opener = Open(self, manager())
        with patch.object(builtins, 'open') as mockOpen:
            mockOpen.side_effect = opener.sideEffect
            reads = FastaFaiReads('filename.fasta')
            self.assertEqual(DNARead('id1', 'ACTG'), reads['id1'])
            self.assertEqual(DNARead('id2', 'AACCTTGG'), reads['id2'])
            # Check that the fai index was built correctly.
            self.assertEqual(pyfaidxIndex.getvalue(),
                             'id1\t4\t5\t4\t5\nid2\t8\t15\t8\t9\n')
Exemple #26
0
 def testBaseCountsTwoReads(self):
     """
     The 'base_counts' entry of the summary must hold the per-base totals
     over both reads.
     """
     fasta = '>hey\nagtcagtcagtc\n>you\nacctg'
     summary = summarizeReads(StringIO(fasta), 'fasta')
     expected = {'a': 4, 'c': 5, 't': 4, 'g': 4}
     self.assertEqual(summary['base_counts'], expected)
Exemple #27
0
 def sideEffect(self, filename, *args, **kwargs):
     """
     Replacement for open that serves the same canned FASTA for exactly
     two calls, both of which must be for 'filename.fasta'.

     Any further call fails the test held in self.test.
     """
     if self.count not in (0, 1):
         self.test.fail(
             'Open called too many times. Filename: %r, Args: %r, '
             'Keyword args: %r.' % (filename, args, kwargs))
         return None

     self.test.assertEqual('filename.fasta', filename)
     self.count += 1
     return StringIO('>id1\nMM\n>id2\n')
Exemple #28
0
    def testSubtractEverything(self):
        """
        Subtracting a file from an identical copy of itself must leave no
        reads.
        """
        fasta = '\n'.join([
            '>one',
            'agtcagtcagtc',
            '>two',
            'acctg',
            '>three',
            'atgggtc',
            '>four',
            'atggctattgaactgtatct',
        ])

        handles = [StringIO(fasta), StringIO(fasta)]
        self.assertEqual([], list(fastaSubtract(handles)))
Exemple #29
0
 def testBaseCountsOneRead(self):
     """
     The 'base_counts' entry of the summary must hold the per-base totals
     of the single read.
     """
     fasta = '>hey\nagtcagtcagtc'
     summary = summarizeReads(StringIO(fasta), 'fasta')
     expected = {'a': 3, 'c': 3, 't': 3, 'g': 3}
     self.assertEqual(summary['base_counts'], expected)
Exemple #30
0
 def sideEffect(self, filename, *args, **kwargs):
     """
     Replacement for open that allows exactly four calls.

     The first two calls must be for 'filename.fasta' and return the same
     FASTA content as bytes and as text respectively. The third call
     returns self.manager and the fourth returns a copy of the current
     pyfaidxIndex contents. Any further call fails the test held in
     self.test.
     """
     call = self.count
     if call not in (0, 1, 2, 3):
         self.test.fail(
             'Open called too many times. Filename: %r, Args: %r, '
             'Keyword args: %r.' % (filename, args, kwargs))
         return None

     # Only the first two calls have their filename checked.
     if call in (0, 1):
         self.test.assertEqual('filename.fasta', filename)
     self.count += 1

     if call == 0:
         return BytesIO(b'>id1\nACTG\n>id2\nAACCTTGG\n')
     if call == 1:
         return StringIO('>id1\nACTG\n>id2\nAACCTTGG\n')
     if call == 2:
         return self.manager
     return StringIO(pyfaidxIndex.getvalue())
    def testGzip(self):
        """
        When asHandle is given a filename ending in '.gz', reading from the
        fp it provides must give the expected data.
        """
        if six.PY3:
            self.skipTest('Mocking gzip.GzipFile disabled under Python 3')

        # This test could be stronger: it should really create some
        # gzip-compressed data and check that it is decompressed properly,
        # instead of replacing gzip.GzipFile with a mock.
        canned = StringIO('xxx')

        with patch.object(gzip, 'GzipFile') as mockGzipFile:
            mockGzipFile.return_value = canned
            with asHandle('file.gz') as fp:
                contents = fp.read()
                self.assertEqual('xxx', contents)
Exemple #32
0
    def testOneFile(self):
        """
        With a single input file there is nothing to subtract, so the result
        must have as many reads as that file.
        """
        fasta = '\n'.join([
            '>one',
            'agtcagtcagtc',
            '>two',
            'acctg',
            '>three',
            'atgggtc',
            '>four',
            'atggctattgaactgtatct',
        ])

        remaining = list(fastaSubtract([StringIO(fasta)]))
        self.assertEqual(len(remaining), 4)
Exemple #33
0
    def testMissingKey(self):
        """
        Looking up a sequence id that is not in the FASTA file must raise a
        KeyError.
        """

        pyfaidxIndex = StringIO()

        class Open(object):
            """
            Supplies a side effect for a mocked open that allows four calls.
            """
            def __init__(self, test, manager):
                self.test = test
                self.manager = manager
                self.count = 0

            def sideEffect(self, filename, *args, **kwargs):
                call = self.count
                if call not in (0, 1, 2, 3):
                    self.test.fail(
                        'Open called too many times. Filename: %r, Args: %r, '
                        'Keyword args: %r.' % (filename, args, kwargs))
                    return None

                # Only the first two calls have their filename checked.
                if call in (0, 1):
                    self.test.assertEqual('filename.fasta', filename)
                self.count += 1

                if call == 0:
                    return BytesIO(b'>id1\nACTG\n')
                if call == 1:
                    return StringIO('>id1\nACTG\n')
                if call == 2:
                    return self.manager
                # Fourth call: hand back whatever was written via the
                # manager on the third call.
                return StringIO(pyfaidxIndex.getvalue())

        @contextmanager
        def manager():
            yield pyfaidxIndex

        opener = Open(self, manager())
        with patch.object(builtins, 'open') as mockOpen:
            mockOpen.side_effect = opener.sideEffect
            reads = FastaFaiReads('filename.fasta')
            expectedError = "^'id2 not in filename\\.fasta\\.'"
            assertRaisesRegex(
                self, KeyError, expectedError, reads.__getitem__, 'id2')
 def testSamePrefixAndSuffixTwoReads(self):
     """
     Two reads that both start with 'agt' and both end with 'gtc' must
     give a result of (3, 3).
     """
     fasta = StringIO('>hey\nagtcagtcagtc\n>you\nagttcctggtc')
     lengths = getPrefixAndSuffix(fasta)
     self.assertEqual(lengths, (3, 3))
Exemple #35
0
 def testInitiallyEmpty(self):
     """
     A StringIO instance created without arguments must hold the empty
     string.
     """
     fresh = StringIO()
     self.assertEqual('', fresh.getvalue())
 def testSamePrefixDifferentSuffixThreeReads(self):
     """
     Three reads that all start with 'agt' but do not all end with the
     same base must give a result of (3, 0).
     """
     fasta = StringIO(
         '>hey\nagtcagtcagtc\n>you\nagttcctggtc\n>how\nagtcggtat')
     lengths = getPrefixAndSuffix(fasta)
     self.assertEqual(lengths, (3, 0))
 def testOneInput(self):
     """
     Trimming an empty input must give the same reads as parsing an empty
     StringIO as FASTA.
     """
     trimmed = trimReads(10, 10, StringIO())
     expected = list(SeqIO.parse(StringIO(), 'fasta'))
     self.assertEqual([str(read) for read in tuple(trimmed)],
                      [str(read) for read in tuple(expected)])
 def testDifferentPrefixSameSuffixThreeReads(self):
     """
     Three reads that do not all start with the same base but all end
     with 'tcg' must give a result of (0, 3).
     """
     fasta = StringIO(
         '>hey\nagtccttagatcg\n>you\ncgaatcg\n>how\natgacctcg')
     lengths = getPrefixAndSuffix(fasta)
     self.assertEqual(lengths, (0, 3))