コード例 #1
0
    def testAddDuplicateFile(self):
        """
        Registering a file name that has already been given to addFile must
        raise a ValueError.

        The second registration is attempted via the internal _addFilename
        method, after addFile has been called once with the same name.
        """
        fastaData = '>id1\nACTG\n>id2\nAACCTTGG\n'
        opened = []

        def fakeOpen(filename, *args, **kwargs):
            # Exactly one call to open() is expected.
            if opened:
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            self.assertEqual('filename.fasta', filename)
            opened.append(filename)
            return StringIO(fastaData)

        with patch.object(builtins, 'open') as openMock:
            openMock.side_effect = fakeOpen
            index = SqliteIndex(':memory:')
            added = index.addFile('filename.fasta')
            self.assertEqual(2, added)
            assertRaisesRegex(
                self, ValueError, "^Duplicate file name: 'filename\\.fasta'$",
                index._addFilename, 'filename.fasta')
            index.close()
コード例 #2
0
    def testFindWithTwoFiles(self):
        """
        After sequences have been added from two files, the _find method
        must return the expected (filename, offset) pair for each id.
        """
        # (filename, content) pairs that open() must be asked for, in order.
        expectedOpens = [
            ('filename1.fasta', '>id1\nACTG\n>id2\nAACCTTGG\n'),
            ('filename2.fasta', '>sequence3\nAAACCC\n'),
        ]
        opened = []

        def fakeOpen(filename, *args, **kwargs):
            if len(opened) >= len(expectedOpens):
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            expectedName, content = expectedOpens[len(opened)]
            self.assertEqual(expectedName, filename)
            opened.append(filename)
            return StringIO(content)

        with patch.object(builtins, 'open') as openMock:
            openMock.side_effect = fakeOpen
            index = SqliteIndex(':memory:')
            for name in ('filename1.fasta', 'filename2.fasta'):
                index.addFile(name)
            expectedLocations = (
                ('id1', ('filename1.fasta', 5)),
                ('id2', ('filename1.fasta', 15)),
                ('sequence3', ('filename2.fasta', 11)),
            )
            for sequenceId, location in expectedLocations:
                self.assertEqual(location, index._find(sequenceId))
            index.close()
コード例 #3
0
    def testAddOneFile(self):
        """
        Adding a single FASTA file holding two sequences to a new index must
        make addFile return 2.
        """
        fastaData = '>id1\nACTG\n>id2\nAACCTTGG\n'
        opened = []

        def fakeOpen(filename, *args, **kwargs):
            # Exactly one call to open() is expected.
            if opened:
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            self.assertEqual('filename.fasta', filename)
            opened.append(filename)
            return StringIO(fastaData)

        with patch.object(builtins, 'open') as openMock:
            openMock.side_effect = fakeOpen
            index = SqliteIndex(':memory:')
            added = index.addFile('filename.fasta')
            self.assertEqual(2, added)
            index.close()
コード例 #4
0
    def testAddFilesWithDuplicateSequence(self):
        """
        Adding a second FASTA file that repeats a sequence id already added
        from an earlier file must raise a ValueError.
        """
        # (filename, content) pairs that open() must be asked for, in order.
        # The id 'id2' appears in both files.
        expectedOpens = [
            ('filename1.fasta', '>id1\nACTG\n>id2\nAACCTTGG\n'),
            ('filename2.fasta', '>id2\nAAACCC\n'),
        ]
        opened = []

        def fakeOpen(filename, *args, **kwargs):
            if len(opened) >= len(expectedOpens):
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            expectedName, content = expectedOpens[len(opened)]
            self.assertEqual(expectedName, filename)
            opened.append(filename)
            return StringIO(content)

        expectedError = (
            "^FASTA sequence id 'id2', found in file "
            "'filename2\\.fasta', was previously added from file "
            "'filename1\\.fasta'\\.$")

        with patch.object(builtins, 'open') as openMock:
            openMock.side_effect = fakeOpen
            index = SqliteIndex(':memory:')
            index.addFile('filename1.fasta')
            assertRaisesRegex(self, ValueError, expectedError,
                              index.addFile, 'filename2.fasta')
            index.close()
コード例 #5
0
    def getSubjectSequence(self, title):
        """
        Look up a subject sequence by its title.

        The lookup table is created on the first call and cached in
        C{self._subjectTitleToSubject}. It is an in-memory C{dict} built
        from the FASTA file when C{self._databaseFilename} is set, and an
        C{SqliteIndex} otherwise.

        @param title: A C{str} sequence title from a DIAMOND hit.
        @raise KeyError: If the C{title} is not present in the DIAMOND
            database.
        @return: An C{AAReadWithX} instance.
        """
        subjects = self._subjectTitleToSubject

        if subjects is None:
            if self._databaseFilename is not None:
                # Read the whole FASTA file into a dict keyed by read id.
                subjects = {
                    read.id: read
                    for read in FastaReads(self._databaseFilename,
                                           readClass=AAReadWithX)
                }
            else:
                # No FASTA file was given: look subjects up via Sqlite3.
                subjects = SqliteIndex(
                    self._sqliteDatabaseFilename,
                    fastaDirectory=self._databaseDirectory,
                    readClass=AAReadWithX)
            self._subjectTitleToSubject = subjects

        return subjects[title]
コード例 #6
0
    def testDictLookupWithFastaDirectory(self):
        """
        Dictionary-like lookup (via __getitem__) must return the expected
        read, and must open the expected file name, when a FASTA base
        directory is given to the index.
        """
        fastaData = '>id1\nACTG\r\nCCCC\nGGG\n>id2\nAACCTG\n'
        # File names open() must be asked for, in order: first the name
        # passed to addFile, then that base name under the FASTA directory.
        expectedNames = [
            '/tmp/f.fasta',
            os.path.join('/usr/local/fasta', 'f.fasta'),
        ]
        opened = []

        def fakeOpen(filename, *args, **kwargs):
            if len(opened) >= len(expectedNames):
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            self.assertEqual(expectedNames[len(opened)], filename)
            opened.append(filename)
            return StringIO(fastaData)

        with patch.object(builtins, 'open') as openMock:
            openMock.side_effect = fakeOpen
            index = SqliteIndex(':memory:', fastaDirectory='/usr/local/fasta')
            index.addFile('/tmp/f.fasta')
            found = index['id1']
            self.assertEqual(DNARead('id1', 'ACTGCCCCGGG'), found)
            index.close()
コード例 #7
0
    def testDictLookupGzipDataWithBGZsuffix(self):
        """
        Dictionary-like lookup (via __getitem__) must return the expected
        read when the added file is in BGZF format and has a .bgz suffix.
        """
        # At most this many calls to bgzf.open are tolerated.
        maxOpens = 2
        opened = []

        def fakeBgzfOpen(filename, *args, **kwargs):
            if len(opened) >= maxOpens:
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            self.assertEqual('filename.fasta.bgz', filename)
            opened.append(filename)
            # Compress the FASTA into memory, then return a fresh reader
            # positioned at the start of the compressed bytes.
            compressed = BytesIO()
            writer = bgzf.BgzfWriter(fileobj=compressed)
            writer.write(b'>id0\nAC\n')
            writer.flush()
            source = BytesIO(compressed.getvalue())
            source.mode = 'rb'
            return bgzf.BgzfReader(fileobj=source)

        with patch.object(bgzf, 'open') as openMock:
            openMock.side_effect = fakeBgzfOpen
            index = SqliteIndex(':memory:')
            index.addFile('filename.fasta.bgz')
            found = index['id0']
            self.assertEqual(DNARead('id0', 'AC'), found)
            index.close()
コード例 #8
0
    def testDictLookupSpecificReadClass(self):
        """
        Dictionary-like lookup (via __getitem__) must return a read of the
        class that was given to the index as readClass.
        """
        fastaData = '>id1\nMM\n>id2\n'
        # At most this many calls to open() are tolerated.
        maxOpens = 2
        opened = []

        def fakeOpen(filename, *args, **kwargs):
            if len(opened) >= maxOpens:
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            self.assertEqual('filename.fasta', filename)
            opened.append(filename)
            return StringIO(fastaData)

        with patch.object(builtins, 'open') as openMock:
            openMock.side_effect = fakeOpen
            index = SqliteIndex(':memory:', readClass=AARead)
            index.addFile('filename.fasta')
            found = index['id1']
            self.assertTrue(isinstance(found, AARead))
            self.assertEqual(AARead('id1', 'MM'), found)
            index.close()
コード例 #9
0
    def testDictLookupWithTwoFiles(self):
        """
        Dictionary-like lookup (via __getitem__) must return the expected
        reads when sequences have been added from two files.
        """
        contents = {
            'filename1.fasta': '>id1\nACTG\n>id2\nAACCTTGG\n',
            'filename2.fasta': '>seq3\nAAACCC\n',
        }
        # File names open() must be asked for, in order. The first two are
        # the addFile calls, the other three the lookups made below.
        expectedNames = [
            'filename1.fasta',
            'filename2.fasta',
            'filename1.fasta',
            'filename1.fasta',
            'filename2.fasta',
        ]
        opened = []

        def fakeOpen(filename, *args, **kwargs):
            if len(opened) >= len(expectedNames):
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            expectedName = expectedNames[len(opened)]
            self.assertEqual(expectedName, filename)
            opened.append(filename)
            return StringIO(contents[expectedName])

        with patch.object(builtins, 'open') as openMock:
            openMock.side_effect = fakeOpen
            index = SqliteIndex(':memory:')
            for name in ('filename1.fasta', 'filename2.fasta'):
                index.addFile(name)
            expectedReads = (
                ('id1', 'ACTG'),
                ('id2', 'AACCTTGG'),
                ('seq3', 'AAACCC'),
            )
            for sequenceId, sequence in expectedReads:
                self.assertEqual(DNARead(sequenceId, sequence),
                                 index[sequenceId])
            index.close()
コード例 #10
0
    def testDictLookupSequenceCrossesNewlines(self):
        """
        Dictionary-like lookup (via __getitem__) must return the expected
        read when its sequence spans several lines of the input file, with
        both LF and CRLF line endings present.
        """
        fastaData = '>id1\nACTG\r\nCCCC\nGGG\n>id2\nAACCTG\n'
        # At most this many calls to open() are tolerated.
        maxOpens = 2
        opened = []

        def fakeOpen(filename, *args, **kwargs):
            if len(opened) >= maxOpens:
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            self.assertEqual('filename.fasta', filename)
            opened.append(filename)
            return StringIO(fastaData)

        with patch.object(builtins, 'open') as openMock:
            openMock.side_effect = fakeOpen
            index = SqliteIndex(':memory:')
            index.addFile('filename.fasta')
            found = index['id1']
            self.assertEqual(DNARead('id1', 'ACTGCCCCGGG'), found)
            index.close()
コード例 #11
0
 def testBZ2File(self):
     """
     Trying to add a .bz2 file must result in a ValueError.

     The index is closed once the check has been made, so that it is not
     left open when the test finishes.
     """
     index = SqliteIndex(':memory:')
     # NOTE(review): 'compresss' is presumably spelled this way in the
     # message raised by addFile - confirm before correcting it here.
     error = ('^Compressed FASTA is only supported in BGZF format\\. Use '
              'bgzip to compresss your FASTA\\.$')
     assertRaisesRegex(self, ValueError, error, index.addFile, 'file.bz2')
     index.close()
コード例 #12
0
 def testGetNonexistentFileNumber(self):
     """
     If the internal _getFileNumber method is called with a file whose name
     has not been added, it must return None.
     """
     index = SqliteIndex(':memory:')
     # assertIsNone checks identity, so a value that merely compares equal
     # to None cannot satisfy the test.
     self.assertIsNone(index._getFileNumber('filename.fasta'))
     index.close()
コード例 #13
0
 def testAddFilename(self):
     """
     The internal _addFilename method must return 1 for the first file
     name added and 2 for the second.
     """
     index = SqliteIndex(':memory:')
     names = ('filename1.fasta', 'filename2.fasta')
     for expectedNumber, name in enumerate(names, 1):
         self.assertEqual(expectedNumber, index._addFilename(name))
     index.close()
コード例 #14
0
 def testGetFileNumber(self):
     """
     The internal _getFileNumber method must return the number that
     _addFilename returned for the same file name.
     """
     name = 'filename.fasta'
     index = SqliteIndex(':memory:')
     addedNumber = index._addFilename(name)
     self.assertEqual(1, addedNumber)
     foundNumber = index._getFileNumber(name)
     self.assertEqual(1, foundNumber)
     index.close()
コード例 #15
0
 def testAddDuplicateFilename(self):
     """
     When _addFilename is called twice with the same name, a ValueError
     must be raised.

     The index is closed once the check has been made, so that it is not
     left open when the test finishes.
     """
     index = SqliteIndex(':memory:')
     self.assertEqual(1, index._addFilename('f.fas'))
     # The dot is escaped so that it only matches a literal '.' in the
     # error message, not an arbitrary character.
     error = "^Duplicate file name: 'f\\.fas'$"
     assertRaisesRegex(self, ValueError, error, index._addFilename, 'f.fas')
     index.close()
コード例 #16
0
    def getSubjectSequence(self, title):
        """
        Look up a subject sequence by its title.

        Subjects are looked up in self._subjectTitleToSubject, which is
        created on first use. The source is chosen in this order:
        a) the FASTA that was originally given to BLAST (the
        databaseFilename argument to __init__), read into an in-memory
        dict, b) an sqlite database (the sqliteDatabaseFilename argument),
        or c) the BLAST database itself, via blastdbcmd. The last option
        can be unreliable (it occasionally fails to find subjects that are
        in its database) and its results are not cached.

        @param title: A C{str} sequence title from a BLAST hit. Of the form
            'gi|63148399|gb|DQ011818.1| Description...'.
        @return: An C{AARead} or C{DNARead} instance, depending on the type of
            BLAST database in use.

        """
        isProtein = self.params.application in {'blastp', 'blastx'}
        readClass = AARead if isProtein else DNARead

        subjects = self._subjectTitleToSubject
        if subjects is not None:
            return subjects[title]

        if self._databaseFilename is not None:
            # Build an in-memory dict to look up subjects. This only
            # works for small databases, obviously.
            subjects = {
                read.id: read
                for read in FastaReads(self._databaseFilename,
                                       readClass=readClass)
            }
        elif self._sqliteDatabaseFilename is not None:
            # An Sqlite3 database is used to look up subjects.
            subjects = SqliteIndex(
                self._sqliteDatabaseFilename,
                fastaDirectory=self._databaseDirectory,
                readClass=readClass)
        else:
            # Fall back to blastdbcmd.  ncbidb has to be imported as
            # below so ncbidb.getSequence can be patched by our test
            # suite.
            from dark import ncbidb
            database = self.params.applicationParams['database']
            seq = ncbidb.getSequence(title, database)
            return readClass(seq.description, str(seq.seq))

        self._subjectTitleToSubject = subjects
        return subjects[title]
コード例 #17
0
    def testDictLookupGzipData(self):
        """
        Dictionary-like lookup (via __getitem__) must return the expected
        reads from input compressed with bgzip, when sequences span several
        lines of the input file, lines end in either LF or CRLF, and some
        sequences lie more than 64K bytes into the input file.
        """
        fastaData = b''.join([
            b'>id0\nAC\n',
            b'>id1\n', b'A' * 70000, b'\n',
            b'>id2\r\nACTG\r\nCCCC\r\nGGG\r\n',
            b'>id3\nAACCTG\n',
        ])
        # At most this many calls to bgzf.open are tolerated.
        maxOpens = 5
        opened = []

        def fakeBgzfOpen(filename, *args, **kwargs):
            if len(opened) >= maxOpens:
                self.fail(
                    'Open called too many times. Filename: %r, Args: %r, '
                    'Keyword args: %r.' % (filename, args, kwargs))
            self.assertEqual('filename.fasta.gz', filename)
            opened.append(filename)
            # Compress the FASTA into memory, then return a fresh reader
            # positioned at the start of the compressed bytes.
            compressed = BytesIO()
            writer = bgzf.BgzfWriter(fileobj=compressed)
            writer.write(fastaData)
            writer.flush()
            source = BytesIO(compressed.getvalue())
            source.mode = 'rb'
            return bgzf.BgzfReader(fileobj=source)

        with patch.object(bgzf, 'open') as openMock:
            openMock.side_effect = fakeBgzfOpen
            index = SqliteIndex(':memory:')
            index.addFile('filename.fasta.gz')
            expectedReads = (
                ('id0', 'AC'),
                ('id1', 'A' * 70000),
                ('id2', 'ACTGCCCCGGG'),
                ('id3', 'AACCTG'),
            )
            for sequenceId, sequence in expectedReads:
                self.assertEqual(DNARead(sequenceId, sequence),
                                 index[sequenceId])
            index.close()
コード例 #18
0
              'uncompressed, or compressed with bgzip (from samtools), with '
              'a .gz suffix.'))

    args = parser.parse_args()

    if os.path.exists(args.out):
        if args.force:
            os.unlink(args.out)
        else:
            print(
                "Output file '%s' already exists. Use --force to overwrite." %
                args.out,
                file=sys.stderr)
            sys.exit(1)

    index = SqliteIndex(args.out)

    # Flatten the lists of lists that we get from using both nargs='+' and
    # action='append'. We use both because it allows people to use (e.g.)
    # --fasta on the command line either via "--fasta file1 --fasta file2"
    # or "--fasta file1 file2", or a combination of these. That way it's
    # not necessary to remember which way you're supposed to use it and you
    # also can't be hit by the subtle problem encountered in
    # https://github.com/acorg/dark-matter/issues/453
    fastaFiles = list(chain.from_iterable(args.fasta))

    verbose = not args.quiet

    for filename in fastaFiles:
        if verbose:
            print("Indexing '%s' ... " % filename, end='', file=sys.stderr)