Ejemplo n.º 1
0
    def testDictLookupWithFastaDirectory(self):
        """
        Dictionary-style lookup (__getitem__) must return the expected read,
        read from the expected file name, when a FASTA base directory is
        given.

        The file is added as '/tmp/f.fasta', but the lookup is expected to
        open 'f.fasta' inside the FASTA directory passed to the index.
        """
        fastaData = '>id1\nACTG\r\nCCCC\nGGG\n>id2\nAACCTG\n'
        # The file names open() is expected to receive, in call order.
        expectedNames = [
            '/tmp/f.fasta',
            os.path.join('/usr/local/fasta', 'f.fasta'),
        ]
        # A one-element list is used as a counter the closure can update.
        calls = [0]

        def fakeOpen(filename, *args, **kwargs):
            callNumber = calls[0]
            if callNumber >= len(expectedNames):
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            self.assertEqual(expectedNames[callNumber], filename)
            calls[0] = callNumber + 1
            return StringIO(fastaData)

        with patch.object(builtins, 'open', side_effect=fakeOpen):
            index = SqliteIndex(':memory:', fastaDirectory='/usr/local/fasta')
            index.addFile('/tmp/f.fasta')
            expected = DNARead('id1', 'ACTGCCCCGGG')
            self.assertEqual(expected, index['id1'])
            index.close()
Ejemplo n.º 2
0
    def testAddDuplicateFile(self):
        """
        Once a file has been added with addFile, registering the same file
        name again (here via _addFilename) must raise a ValueError.
        """
        # A one-element list is used as a counter the closure can update.
        calls = [0]

        def fakeOpen(filename, *args, **kwargs):
            # Only a single call to open() is expected.
            if calls[0] != 0:
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            self.assertEqual('filename.fasta', filename)
            calls[0] += 1
            return StringIO('>id1\nACTG\n>id2\nAACCTTGG\n')

        with patch.object(builtins, 'open', side_effect=fakeOpen):
            index = SqliteIndex(':memory:')
            sequenceCount = index.addFile('filename.fasta')
            self.assertEqual(2, sequenceCount)
            assertRaisesRegex(
                self, ValueError,
                "^Duplicate file name: 'filename\\.fasta'$",
                index._addFilename, 'filename.fasta')
            index.close()
Ejemplo n.º 3
0
    def testAddDuplicateFile(self):
        """
        Once a file has been added with addFile, registering the same file
        name again (here via _addFilename) must raise a ValueError.
        """
        class Open(object):
            """
            Stand-in for open() that serves canned FASTA data on its first
            call and fails the test on any later call.
            """
            def __init__(self, test):
                self.test = test
                self.count = 0

            def sideEffect(self, filename, *args, **kwargs):
                if self.count == 0:
                    self.test.assertEqual('filename.fasta', filename)
                    self.count += 1
                    return StringIO('>id1\nACTG\n>id2\nAACCTTGG\n')
                else:
                    self.test.fail(
                        'Open called too many times. Filename: %r, Args: %r, '
                        'Keyword args: %r.' % (filename, args, kwargs))

        sideEffect = Open(self).sideEffect
        with patch.object(builtins, 'open') as mockMethod:
            mockMethod.side_effect = sideEffect
            index = SqliteIndex(':memory:')
            self.assertEqual(2, index.addFile('filename.fasta'))
            # The dot is escaped so that it can only match a literal '.'.
            error = "^Duplicate file name: 'filename\\.fasta'$"
            # TestCase.assertRaisesRegexp is a deprecated alias that recent
            # Python versions no longer provide, so the assertRaisesRegex
            # helper is used instead.
            assertRaisesRegex(self, ValueError, error, index._addFilename,
                              'filename.fasta')
            index.close()
Ejemplo n.º 4
0
    def testDictLookupWithTwoFiles(self):
        """
        Dictionary-style lookup (__getitem__) must return the expected reads
        when sequences have been added from two files.
        """
        file1 = ('filename1.fasta', '>id1\nACTG\n>id2\nAACCTTGG\n')
        file2 = ('filename2.fasta', '>seq3\nAAACCC\n')
        # The (file name, content) pairs open() is expected to serve, in
        # call order.
        expectedOpens = [file1, file2, file1, file1, file2]
        # A one-element list is used as a counter the closure can update.
        calls = [0]

        def fakeOpen(filename, *args, **kwargs):
            callNumber = calls[0]
            if callNumber >= len(expectedOpens):
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            expectedName, content = expectedOpens[callNumber]
            self.assertEqual(expectedName, filename)
            calls[0] = callNumber + 1
            return StringIO(content)

        with patch.object(builtins, 'open', side_effect=fakeOpen):
            index = SqliteIndex(':memory:')
            for name in 'filename1.fasta', 'filename2.fasta':
                index.addFile(name)
            expectedReads = (
                ('id1', 'ACTG'),
                ('id2', 'AACCTTGG'),
                ('seq3', 'AAACCC'),
            )
            for readId, sequence in expectedReads:
                self.assertEqual(DNARead(readId, sequence), index[readId])
            index.close()
Ejemplo n.º 5
0
    def testAddOneFile(self):
        """
        Adding a single FASTA file holding two sequences must make addFile
        return 2.
        """
        # A one-element list is used as a counter the closure can update.
        calls = [0]

        def fakeOpen(filename, *args, **kwargs):
            # Only a single call to open() is expected.
            if calls[0] != 0:
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            self.assertEqual('filename.fasta', filename)
            calls[0] += 1
            return StringIO('>id1\nACTG\n>id2\nAACCTTGG\n')

        with patch.object(builtins, 'open', side_effect=fakeOpen):
            index = SqliteIndex(':memory:')
            sequenceCount = index.addFile('filename.fasta')
            self.assertEqual(2, sequenceCount)
            index.close()
Ejemplo n.º 6
0
    def testDictLookupGzipDataWithBGZsuffix(self):
        """
        Dictionary-style lookup (__getitem__) must return the expected read
        when the FASTA file is BGZF-compressed and has a .bgz suffix.
        """
        # A one-element list is used as a counter the closure can update.
        calls = [0]

        def fakeBgzfOpen(filename, *args, **kwargs):
            # At most two calls to bgzf.open() are expected.
            if calls[0] > 1:
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            self.assertEqual('filename.fasta.bgz', filename)
            calls[0] += 1
            # Compress the FASTA in memory, then hand back a reader over
            # the compressed bytes.
            compressed = BytesIO()
            bgzfWriter = bgzf.BgzfWriter(fileobj=compressed)
            bgzfWriter.write(b'>id0\nAC\n')
            bgzfWriter.flush()
            source = BytesIO(compressed.getvalue())
            # Presumably BgzfReader inspects the mode of its file object.
            source.mode = 'rb'
            return bgzf.BgzfReader(fileobj=source)

        with patch.object(bgzf, 'open', side_effect=fakeBgzfOpen):
            index = SqliteIndex(':memory:')
            index.addFile('filename.fasta.bgz')
            expected = DNARead('id0', 'AC')
            self.assertEqual(expected, index['id0'])
            index.close()
Ejemplo n.º 7
0
    def testAddOneFile(self):
        """
        Adding a single FASTA file holding two sequences must make addFile
        return 2.
        """
        # A one-element list is used as a counter the closure can update.
        calls = [0]

        def fakeOpen(filename, *args, **kwargs):
            # Only a single call to open() is expected.
            if calls[0] != 0:
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            self.assertEqual('filename.fasta', filename)
            calls[0] += 1
            return StringIO('>id1\nACTG\n>id2\nAACCTTGG\n')

        with patch.object(builtins, 'open', side_effect=fakeOpen):
            index = SqliteIndex(':memory:')
            sequenceCount = index.addFile('filename.fasta')
            self.assertEqual(2, sequenceCount)
            index.close()
Ejemplo n.º 8
0
    def testAddFileWithDuplicateSequence(self):
        """
        If a sequence id is duplicated in a FASTA file, a ValueError must be
        raised.
        """
        class Open(object):
            """
            Stand-in for open() that serves FASTA data containing the id
            'id1' twice on its first call and fails the test on any later
            call.
            """
            def __init__(self, test):
                self.test = test
                self.count = 0

            def sideEffect(self, filename, *args, **kwargs):
                if self.count == 0:
                    self.test.assertEqual('filename.fasta', filename)
                    self.count += 1
                    return StringIO('>id1\nACTG\n>id1\nAACCTTGG\n')
                else:
                    self.test.fail(
                        'Open called too many times. Filename: %r, Args: %r, '
                        'Keyword args: %r.' % (filename, args, kwargs))

        sideEffect = Open(self).sideEffect
        with patch.object(builtins, 'open') as mockMethod:
            mockMethod.side_effect = sideEffect
            index = SqliteIndex(':memory:')
            # Both dots are escaped so that they only match a literal '.'.
            error = ("^FASTA sequence id 'id1' found twice in file "
                     "'filename\\.fasta'\\.$")
            assertRaisesRegex(self, ValueError, error, index.addFile,
                              'filename.fasta')
            index.close()
Ejemplo n.º 9
0
    def testAddFilesWithDuplicateSequence(self):
        """
        If a sequence id occurs in more than one FASTA file, a ValueError must
        be raised.
        """
        class Open(object):
            """
            Stand-in for open() that serves canned FASTA data for the two
            expected files (both containing the id 'id2') and fails the test
            on any later call.
            """
            def __init__(self, test):
                self.test = test
                self.count = 0

            def sideEffect(self, filename, *args, **kwargs):
                if self.count == 0:
                    self.test.assertEqual('filename1.fasta', filename)
                    self.count += 1
                    return StringIO('>id1\nACTG\n>id2\nAACCTTGG\n')
                elif self.count == 1:
                    self.test.assertEqual('filename2.fasta', filename)
                    self.count += 1
                    return StringIO('>id2\nAAACCC\n')
                else:
                    self.test.fail(
                        'Open called too many times. Filename: %r, Args: %r, '
                        'Keyword args: %r.' % (filename, args, kwargs))

        sideEffect = Open(self).sideEffect
        with patch.object(builtins, 'open') as mockMethod:
            mockMethod.side_effect = sideEffect
            index = SqliteIndex(':memory:')
            index.addFile('filename1.fasta')
            # Backslashes are doubled: a bare '\.' in a non-raw string is an
            # invalid escape sequence. The resulting pattern is unchanged.
            error = ("^FASTA sequence id 'id2', found in file "
                     "'filename2\\.fasta', was previously added from file "
                     "'filename1\\.fasta'\\.$")
            # TestCase.assertRaisesRegexp is a deprecated alias that recent
            # Python versions no longer provide, so the assertRaisesRegex
            # helper is used instead.
            assertRaisesRegex(self, ValueError, error, index.addFile,
                              'filename2.fasta')
            index.close()
Ejemplo n.º 10
0
    def testDictLookupSequenceCrossesNewlines(self):
        """
        Dictionary-style lookup (__getitem__) must return the expected read
        when the sequence spans several lines of the input file, including
        lines ending in LF and lines ending in CRLF.
        """
        fastaData = '>id1\nACTG\r\nCCCC\nGGG\n>id2\nAACCTG\n'
        # A one-element list is used as a counter the closure can update.
        calls = [0]

        def fakeOpen(filename, *args, **kwargs):
            # At most two calls to open() are expected.
            if calls[0] not in (0, 1):
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            self.assertEqual('filename.fasta', filename)
            calls[0] += 1
            return StringIO(fastaData)

        with patch.object(builtins, 'open', side_effect=fakeOpen):
            index = SqliteIndex(':memory:')
            index.addFile('filename.fasta')
            expected = DNARead('id1', 'ACTGCCCCGGG')
            self.assertEqual(expected, index['id1'])
            index.close()
Ejemplo n.º 11
0
    def testDictLookupSpecificReadClass(self):
        """
        Dictionary-style lookup (__getitem__) must return a read of the
        class that was passed to the index as readClass.
        """
        # A one-element list is used as a counter the closure can update.
        calls = [0]

        def fakeOpen(filename, *args, **kwargs):
            # At most two calls to open() are expected.
            if calls[0] not in (0, 1):
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            self.assertEqual('filename.fasta', filename)
            calls[0] += 1
            return StringIO('>id1\nMM\n>id2\n')

        with patch.object(builtins, 'open', side_effect=fakeOpen):
            index = SqliteIndex(':memory:', readClass=AARead)
            index.addFile('filename.fasta')
            read = index['id1']
            self.assertTrue(isinstance(read, AARead))
            self.assertEqual(AARead('id1', 'MM'), read)
            index.close()
Ejemplo n.º 12
0
 def testAddFilename(self):
     """
     The internal _addFilename method must return 1 for the first file
     name added and 2 for the second.
     """
     index = SqliteIndex(':memory:')
     names = ('filename1.fasta', 'filename2.fasta')
     for expectedNumber, name in enumerate(names, 1):
         self.assertEqual(expectedNumber, index._addFilename(name))
     index.close()
Ejemplo n.º 13
0
 def testGetNonexistentFileNumber(self):
     """
     The internal _getFileNumber method must return None when given a file
     name that has not been added to the index.
     """
     index = SqliteIndex(':memory:')
     fileNumber = index._getFileNumber('filename.fasta')
     self.assertEqual(None, fileNumber)
     index.close()
Ejemplo n.º 14
0
 def testAddFilename(self):
     """
     The internal _addFilename method must return 1 for the first file
     name added and 2 for the second.
     """
     index = SqliteIndex(':memory:')
     names = ('filename1.fasta', 'filename2.fasta')
     for expectedNumber, name in enumerate(names, 1):
         self.assertEqual(expectedNumber, index._addFilename(name))
     index.close()
Ejemplo n.º 15
0
 def testGetNonexistentFileNumber(self):
     """
     The internal _getFileNumber method must return None when given a file
     name that has not been added to the index.
     """
     index = SqliteIndex(':memory:')
     fileNumber = index._getFileNumber('filename.fasta')
     self.assertEqual(None, fileNumber)
     index.close()
Ejemplo n.º 16
0
 def testGetFileNumber(self):
     """
     The internal _getFileNumber method must return the number that
     _addFilename gave to the file name.
     """
     name = 'filename.fasta'
     index = SqliteIndex(':memory:')
     added = index._addFilename(name)
     self.assertEqual(1, added)
     found = index._getFileNumber(name)
     self.assertEqual(1, found)
     index.close()
Ejemplo n.º 17
0
 def testAddDuplicateFilename(self):
     """
     When _addFilename is called twice with the same name, a ValueError
     must be raised.
     """
     index = SqliteIndex(':memory:')
     self.assertEqual(1, index._addFilename('f.fas'))
     # The dot is escaped so that it can only match a literal '.'.
     error = "^Duplicate file name: 'f\\.fas'$"
     # TestCase.assertRaisesRegexp is a deprecated alias that recent Python
     # versions no longer provide, so the assertRaisesRegex helper is used.
     assertRaisesRegex(self, ValueError, error, index._addFilename, 'f.fas')
     # Close the index once the test is done with it.
     index.close()
Ejemplo n.º 18
0
 def testAddDuplicateFilename(self):
     """
     When _addFilename is called twice with the same name, a ValueError
     must be raised.
     """
     index = SqliteIndex(':memory:')
     self.assertEqual(1, index._addFilename('f.fas'))
     # The dot is escaped so that it can only match a literal '.'.
     error = "^Duplicate file name: 'f\\.fas'$"
     assertRaisesRegex(self, ValueError, error, index._addFilename, 'f.fas')
     # Close the index once the test is done with it.
     index.close()
Ejemplo n.º 19
0
    def testFindWithTwoFiles(self):
        """
        The _find method must return the expected file name and offset for
        each sequence when sequences have been added from two files.
        """
        # The (file name, content) pairs open() is expected to serve, in
        # call order.
        expectedOpens = [
            ('filename1.fasta', '>id1\nACTG\n>id2\nAACCTTGG\n'),
            ('filename2.fasta', '>sequence3\nAAACCC\n'),
        ]
        # A one-element list is used as a counter the closure can update.
        calls = [0]

        def fakeOpen(filename, *args, **kwargs):
            callNumber = calls[0]
            if callNumber >= len(expectedOpens):
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            expectedName, content = expectedOpens[callNumber]
            self.assertEqual(expectedName, filename)
            calls[0] = callNumber + 1
            return StringIO(content)

        with patch.object(builtins, 'open', side_effect=fakeOpen):
            index = SqliteIndex(':memory:')
            for name in 'filename1.fasta', 'filename2.fasta':
                index.addFile(name)
            expectedLocations = (
                ('id1', ('filename1.fasta', 5)),
                ('id2', ('filename1.fasta', 15)),
                ('sequence3', ('filename2.fasta', 11)),
            )
            for readId, location in expectedLocations:
                self.assertEqual(location, index._find(readId))
            index.close()
Ejemplo n.º 20
0
    def testFindWithTwoFiles(self):
        """
        The _find method must return the expected file name and offset for
        each sequence when sequences have been added from two files.
        """
        # The (file name, content) pairs open() is expected to serve, in
        # call order.
        expectedOpens = [
            ('filename1.fasta', '>id1\nACTG\n>id2\nAACCTTGG\n'),
            ('filename2.fasta', '>sequence3\nAAACCC\n'),
        ]
        # A one-element list is used as a counter the closure can update.
        calls = [0]

        def fakeOpen(filename, *args, **kwargs):
            callNumber = calls[0]
            if callNumber >= len(expectedOpens):
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            expectedName, content = expectedOpens[callNumber]
            self.assertEqual(expectedName, filename)
            calls[0] = callNumber + 1
            return StringIO(content)

        with patch.object(builtins, 'open', side_effect=fakeOpen):
            index = SqliteIndex(':memory:')
            for name in 'filename1.fasta', 'filename2.fasta':
                index.addFile(name)
            expectedLocations = (
                ('id1', ('filename1.fasta', 5)),
                ('id2', ('filename1.fasta', 15)),
                ('sequence3', ('filename2.fasta', 11)),
            )
            for readId, location in expectedLocations:
                self.assertEqual(location, index._find(readId))
            index.close()
Ejemplo n.º 21
0
    def getSubjectSequence(self, title):
        """
        Look up the subject sequence that has the given title.

        The lookup table is created on first use and kept in
        C{self._subjectTitleToSubject}. It is an C{SqliteIndex} when no
        database FASTA file name is set, otherwise a C{dict} built by reading
        all reads from the FASTA file.

        @param title: A C{str} sequence title from a DIAMOND hit.
        @raise KeyError: If the C{title} is not present in the DIAMOND
            database.
        @return: An C{AAReadWithX} instance.
        """
        subjects = self._subjectTitleToSubject

        if subjects is None:
            if self._databaseFilename is not None:
                # Read the whole FASTA file into a dict keyed by read id.
                subjects = {
                    read.id: read
                    for read in FastaReads(self._databaseFilename,
                                           readClass=AAReadWithX)}
            else:
                # Subjects are looked up in an Sqlite3 database.
                subjects = SqliteIndex(
                    self._sqliteDatabaseFilename,
                    fastaDirectory=self._databaseDirectory,
                    readClass=AAReadWithX)
            self._subjectTitleToSubject = subjects

        return subjects[title]
Ejemplo n.º 22
0
 def testBZ2File(self):
     """
     Trying to add a .bz2 file must result in a ValueError.
     """
     index = SqliteIndex(':memory:')
     # The spelling 'compresss' is kept as is: the pattern has to match the
     # message produced by the code under test.
     error = ('^Compressed FASTA is only supported in BGZF format\\. Use '
              'bgzip to compresss your FASTA\\.$')
     assertRaisesRegex(self, ValueError, error, index.addFile, 'file.bz2')
     # Close the index once the test is done with it.
     index.close()
Ejemplo n.º 23
0
    def testDictLookupGzipData(self):
        """
        Dictionary-style lookup (__getitem__) must return the expected reads
        from a file compressed with bgzip when sequences span several lines,
        when lines end in LF or CRLF, and when a sequence starts more than
        64K bytes into the input.
        """
        fasta = (
            b'>id0\nAC\n' +
            b'>id1\n' + (b'A' * 70000) + b'\n' +
            b'>id2\r\nACTG\r\nCCCC\r\nGGG\r\n' +
            b'>id3\nAACCTG\n')
        # A one-element list is used as a counter the closure can update.
        calls = [0]

        def fakeBgzfOpen(filename, *args, **kwargs):
            # At most five calls to bgzf.open() are expected.
            if calls[0] > 4:
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            self.assertEqual('filename.fasta.gz', filename)
            calls[0] += 1
            # Compress the FASTA in memory, then hand back a reader over
            # the compressed bytes.
            compressed = BytesIO()
            bgzfWriter = bgzf.BgzfWriter(fileobj=compressed)
            bgzfWriter.write(fasta)
            bgzfWriter.flush()
            source = BytesIO(compressed.getvalue())
            # Presumably BgzfReader inspects the mode of its file object.
            source.mode = 'rb'
            return bgzf.BgzfReader(fileobj=source)

        with patch.object(bgzf, 'open', side_effect=fakeBgzfOpen):
            index = SqliteIndex(':memory:')
            index.addFile('filename.fasta.gz')
            expectedReads = (
                ('id0', 'AC'),
                ('id1', 'A' * 70000),
                ('id2', 'ACTGCCCCGGG'),
                ('id3', 'AACCTG'),
            )
            for readId, sequence in expectedReads:
                self.assertEqual(DNARead(readId, sequence), index[readId])
            index.close()
Ejemplo n.º 24
0
    def testDictLookupGzipDataWithBGZsuffix(self):
        """
        Dictionary-style lookup (__getitem__) must return the expected read
        when the FASTA file is BGZF-compressed and has a .bgz suffix.
        """
        # A one-element list is used as a counter the closure can update.
        calls = [0]

        def fakeBgzfOpen(filename, *args, **kwargs):
            # At most two calls to bgzf.open() are expected.
            if calls[0] > 1:
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            self.assertEqual('filename.fasta.bgz', filename)
            calls[0] += 1
            # Compress the FASTA in memory, then hand back a reader over
            # the compressed bytes.
            compressed = BytesIO()
            bgzfWriter = bgzf.BgzfWriter(fileobj=compressed)
            bgzfWriter.write(b'>id0\nAC\n')
            bgzfWriter.flush()
            source = BytesIO(compressed.getvalue())
            # Presumably BgzfReader inspects the mode of its file object.
            source.mode = 'rb'
            return bgzf.BgzfReader(fileobj=source)

        with patch.object(bgzf, 'open', side_effect=fakeBgzfOpen):
            index = SqliteIndex(':memory:')
            index.addFile('filename.fasta.bgz')
            expected = DNARead('id0', 'AC')
            self.assertEqual(expected, index['id0'])
            index.close()
Ejemplo n.º 25
0
    def testDictLookupSpecificReadClass(self):
        """
        Dictionary-style lookup (__getitem__) must return a read of the
        class that was passed to the index as readClass.
        """
        # A one-element list is used as a counter the closure can update.
        calls = [0]

        def fakeOpen(filename, *args, **kwargs):
            # At most two calls to open() are expected.
            if calls[0] not in (0, 1):
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            self.assertEqual('filename.fasta', filename)
            calls[0] += 1
            return StringIO('>id1\nMM\n>id2\n')

        with patch.object(builtins, 'open', side_effect=fakeOpen):
            index = SqliteIndex(':memory:', readClass=AARead)
            index.addFile('filename.fasta')
            read = index['id1']
            self.assertTrue(isinstance(read, AARead))
            self.assertEqual(AARead('id1', 'MM'), read)
            index.close()
Ejemplo n.º 26
0
    def testDictLookupWithFastaDirectory(self):
        """
        Dictionary-style lookup (__getitem__) must return the expected read,
        read from the expected file name, when a FASTA base directory is
        given.

        The file is added as '/tmp/f.fasta', but the lookup is expected to
        open 'f.fasta' inside the FASTA directory passed to the index.
        """
        fastaData = '>id1\nACTG\r\nCCCC\nGGG\n>id2\nAACCTG\n'
        # The file names open() is expected to receive, in call order.
        expectedNames = [
            '/tmp/f.fasta',
            os.path.join('/usr/local/fasta', 'f.fasta'),
        ]
        # A one-element list is used as a counter the closure can update.
        calls = [0]

        def fakeOpen(filename, *args, **kwargs):
            callNumber = calls[0]
            if callNumber >= len(expectedNames):
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            self.assertEqual(expectedNames[callNumber], filename)
            calls[0] = callNumber + 1
            return StringIO(fastaData)

        with patch.object(builtins, 'open', side_effect=fakeOpen):
            index = SqliteIndex(':memory:', fastaDirectory='/usr/local/fasta')
            index.addFile('/tmp/f.fasta')
            expected = DNARead('id1', 'ACTGCCCCGGG')
            self.assertEqual(expected, index['id1'])
            index.close()
Ejemplo n.º 27
0
    def testDictLookupSequenceCrossesNewlines(self):
        """
        Dictionary-style lookup (__getitem__) must return the expected read
        when the sequence spans several lines of the input file, including
        lines ending in LF and lines ending in CRLF.
        """
        fastaData = '>id1\nACTG\r\nCCCC\nGGG\n>id2\nAACCTG\n'
        # A one-element list is used as a counter the closure can update.
        calls = [0]

        def fakeOpen(filename, *args, **kwargs):
            # At most two calls to open() are expected.
            if calls[0] not in (0, 1):
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            self.assertEqual('filename.fasta', filename)
            calls[0] += 1
            return StringIO(fastaData)

        with patch.object(builtins, 'open', side_effect=fakeOpen):
            index = SqliteIndex(':memory:')
            index.addFile('filename.fasta')
            expected = DNARead('id1', 'ACTGCCCCGGG')
            self.assertEqual(expected, index['id1'])
            index.close()
Ejemplo n.º 28
0
    def testDictLookupWithTwoFiles(self):
        """
        Dictionary-style lookup (__getitem__) must return the expected reads
        when sequences have been added from two files.
        """
        file1 = ('filename1.fasta', '>id1\nACTG\n>id2\nAACCTTGG\n')
        file2 = ('filename2.fasta', '>seq3\nAAACCC\n')
        # The (file name, content) pairs open() is expected to serve, in
        # call order.
        expectedOpens = [file1, file2, file1, file1, file2]
        # A one-element list is used as a counter the closure can update.
        calls = [0]

        def fakeOpen(filename, *args, **kwargs):
            callNumber = calls[0]
            if callNumber >= len(expectedOpens):
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            expectedName, content = expectedOpens[callNumber]
            self.assertEqual(expectedName, filename)
            calls[0] = callNumber + 1
            return StringIO(content)

        with patch.object(builtins, 'open', side_effect=fakeOpen):
            index = SqliteIndex(':memory:')
            for name in 'filename1.fasta', 'filename2.fasta':
                index.addFile(name)
            expectedReads = (
                ('id1', 'ACTG'),
                ('id2', 'AACCTTGG'),
                ('seq3', 'AAACCC'),
            )
            for readId, sequence in expectedReads:
                self.assertEqual(DNARead(readId, sequence), index[readId])
            index.close()
Ejemplo n.º 29
0
 def testGetFileNumber(self):
     """
     The internal _getFileNumber method must return the number that
     _addFilename gave to the file name.
     """
     name = 'filename.fasta'
     index = SqliteIndex(':memory:')
     added = index._addFilename(name)
     self.assertEqual(1, added)
     found = index._getFileNumber(name)
     self.assertEqual(1, found)
     index.close()
Ejemplo n.º 30
0
    def getSubjectSequence(self, title):
        """
        Look up the subject sequence that has the given title.

        Subjects come from one of three places, tried in this order:
        a) the FASTA file that was originally given to BLAST (the
        databaseFilename argument), read into an in-memory dict, b) an sqlite
        database (the sqliteDatabaseFilename argument to __init__), or c) the
        BLAST database, via blastdbcmd (which can be unreliable -
        occasionally failing to find subjects that are in its database).

        For a) and b) the lookup table is kept in
        self._subjectTitleToSubject and reused by later calls. A subject
        fetched via c) is returned directly and nothing is cached.

        @param title: A C{str} sequence title from a BLAST hit. Of the form
            'gi|63148399|gb|DQ011818.1| Description...'.
        @return: An C{AARead} or C{DNARead} instance, depending on the type of
            BLAST database in use.
        """
        isProtein = self.params.application in {'blastp', 'blastx'}
        readClass = AARead if isProtein else DNARead

        if self._subjectTitleToSubject is not None:
            return self._subjectTitleToSubject[title]

        if self._databaseFilename is not None:
            # Build an in-memory dict to look up subjects. This only
            # works for small databases, obviously.
            self._subjectTitleToSubject = {
                read.id: read
                for read in FastaReads(self._databaseFilename,
                                       readClass=readClass)}
        elif self._sqliteDatabaseFilename is not None:
            # An Sqlite3 database is used to look up subjects.
            self._subjectTitleToSubject = SqliteIndex(
                self._sqliteDatabaseFilename,
                fastaDirectory=self._databaseDirectory,
                readClass=readClass)
        else:
            # Fall back to blastdbcmd.  ncbidb has to be imported as below
            # so ncbidb.getSequence can be patched by our test suite.
            from dark import ncbidb
            database = self.params.applicationParams['database']
            seq = ncbidb.getSequence(title, database)
            return readClass(seq.description, str(seq.seq))

        return self._subjectTitleToSubject[title]
Ejemplo n.º 31
0
    def testDictLookupGzipData(self):
        r"""
        The __getitem__ method (i.e., dictionary-like lookup) must return the
        expected reads when sequences span multiple lines of the input file,
        and include lines ending in \n and \r\n and have been compressed with
        bgzip, including when sequences are more than 64K bytes into the input
        file.
        """
        class Open(object):
            """
            Provide a side effect for the patched bgzf.open that checks the
            requested file name and returns a reader over bgzip-compressed
            FASTA data built in memory.
            """
            def __init__(self, test):
                self.test = test
                self.count = 0

            def sideEffect(self, filename, *args, **kwargs):
                # The first five calls (count 0 to 4) are all given the same
                # data. Any further call fails the test.
                if self.count <= 4:
                    self.test.assertEqual('filename.fasta.gz', filename)
                    self.count += 1
                    writerIO = BytesIO()
                    writer = bgzf.BgzfWriter(fileobj=writerIO)
                    # The 70000-base id1 sequence makes id2 and id3 start
                    # more than 64K bytes into the uncompressed data.
                    writer.write(b'>id0\nAC\n' + b'>id1\n' + (b'A' * 70000) +
                                 b'\n' + b'>id2\r\nACTG\r\nCCCC\r\nGGG\r\n' +
                                 b'>id3\nAACCTG\n')
                    writer.flush()
                    fileobj = BytesIO(writerIO.getvalue())
                    # BytesIO objects have no 'mode' attribute. One is set
                    # here, presumably because BgzfReader inspects it -
                    # TODO confirm.
                    fileobj.mode = 'rb'
                    return bgzf.BgzfReader(fileobj=fileobj)
                else:
                    self.test.fail(
                        'Open called too many times. Filename: %r, Args: %r, '
                        'Keyword args: %r.' % (filename, args, kwargs))

        sideEffect = Open(self).sideEffect
        with patch.object(bgzf, 'open') as mockMethod:
            mockMethod.side_effect = sideEffect
            index = SqliteIndex(':memory:')
            # Close the index even when an assertion fails, so a failing
            # test does not leave the database connection open.
            try:
                index.addFile('filename.fasta.gz')
                self.assertEqual(DNARead('id0', 'AC'), index['id0'])
                self.assertEqual(DNARead('id1', 'A' * 70000), index['id1'])
                self.assertEqual(DNARead('id2', 'ACTGCCCCGGG'), index['id2'])
                self.assertEqual(DNARead('id3', 'AACCTG'), index['id3'])
            finally:
                index.close()
Ejemplo n.º 32
0
              'uncompressed, or compressed with bgzip (from samtools), with '
              'a .gz suffix.'))

    args = parser.parse_args()

    if os.path.exists(args.out):
        if args.force:
            os.unlink(args.out)
        else:
            print(
                "Output file '%s' already exists. Use --force to overwrite." %
                args.out,
                file=sys.stderr)
            sys.exit(1)

    index = SqliteIndex(args.out)

    # Flatten the lists of lists that we get from using both nargs='+' and
    # action='append'. We use both because it allows people to use (e.g.)
    # --fasta on the command line either via "--fasta file1 --fasta file2"
    # or "--fasta file1 file2", or a combination of these. That way it's
    # not necessary to remember which way you're supposed to use it and you
    # also can't be hit by the subtle problem encountered in
    # https://github.com/acorg/dark-matter/issues/453
    fastaFiles = list(chain.from_iterable(args.fasta))

    verbose = not args.quiet

    for filename in fastaFiles:
        if verbose:
            print("Indexing '%s' ... " % filename, end='', file=sys.stderr)
Ejemplo n.º 33
0
        required=True,
        help=('the FASTA file(s) to make the database from. These may be '
              'uncompressed, or compressed with bgzip (from samtools), with '
              'a .gz suffix.'))

    args = parser.parse_args()

    if os.path.exists(args.out):
        if args.force:
            os.unlink(args.out)
        else:
            print("Output file '%s' already exists. Use --force to overwrite."
                  % args.out, file=sys.stderr)
            sys.exit(1)

    index = SqliteIndex(args.out)

    # Flatten the lists of lists that we get from using both nargs='+' and
    # action='append'. We use both because it allows people to use (e.g.)
    # --fasta on the command line either via "--fasta file1 --fasta file2"
    # or "--fasta file1 file2", or a combination of these. That way it's
    # not necessary to remember which way you're supposed to use it and you
    # also can't be hit by the subtle problem encountered in
    # https://github.com/acorg/dark-matter/issues/453
    fastaFiles = list(chain.from_iterable(args.fasta))

    verbose = not args.quiet

    for filename in fastaFiles:
        if verbose:
            print("Indexing '%s' ... " % filename, end='', file=sys.stderr)