Beispiel #1
0
 def testNoKeywords(self):
     """
     Calling getDatabaseFromKeywords without any keywords must still
     produce a Database instance.
     """
     specifier = DatabaseSpecifier()
     self.assertIsInstance(specifier.getDatabaseFromKeywords(), Database)
Beispiel #2
0
    def testPopulationFromCommandLineSequencesAndFastaFile(self):
        """
        When subjects are given both via --databaseSequence and via
        --databaseFasta, the returned database must contain the subjects
        from the command line as well as those read from the file.
        """
        parser = argparse.ArgumentParser()
        specifier = DatabaseSpecifier()
        specifier.addArgsToParser(parser)
        commandLine = [
            '--databaseFasta', 'file.fasta',
            '--databaseSequence', 'id1 FFF',
            '--databaseSequence', 'id2 RRR',
        ]
        args = parser.parse_args(commandLine)

        # The FASTA file is never read from disk: open() is patched to
        # serve this content instead.
        fastaContent = '\n'.join(('>id3', 'FFFF', '>id4', 'RRRR'))
        with patch.object(builtins, 'open', mockOpen(read_data=fastaContent)):
            db = specifier.getDatabaseFromArgs(args)

        expected = {
            AARead('id1', 'FFF'),
            AARead('id2', 'RRR'),
            AARead('id3', 'FFFF'),
            AARead('id4', 'RRRR'),
        }
        found = {subject.read for subject in db.getSubjects()}
        self.assertEqual(expected, found)
Beispiel #3
0
 def testNoKeywordsDefaultParameters(self):
     """
     A database obtained from getDatabaseFromKeywords with no keywords
     must carry parameters that compare equal to the defaults (compare
     returning None is asserted).
     """
     specifier = DatabaseSpecifier()
     dbParams = specifier.getDatabaseFromKeywords().dbParams
     difference = dbParams.compare(DatabaseParameters())
     self.assertIs(None, difference)
Beispiel #4
0
    def getFractionOfStructuresCovered(self):
        """
        Compute the fraction of reads in the structure file in which the
        'AC <structureType>' landmark finder locates at least one landmark.

        @return: The number of reads with one or more landmarks divided by
            the total number of reads, or 0.0 if the file has no reads.
        """
        specifier = DatabaseSpecifier()
        db = specifier.getDatabaseFromKeywords(
            trigPoints=[],
            landmarks=['AC ' + self.structureType],
            acAlphaHelixFilename=self.acAlphaHelixFilename,
            acAlphaHelix310Filename=self.acAlphaHelix310Filename,
            acAlphaHelixCombinedFilename=self.acAlphaHelixCombinedFilename,
            acAlphaHelixPiFilename=self.acAlphaHelixPiFilename,
            acExtendedStrandFilename=self.acExtendedStrandFilename)

        backend = Backend()
        backend.configure(db.dbParams)

        structures = FastaReads(self.structureFile,
                                readClass=AAReadWithX,
                                checkAlphabet=0)

        covered = 0
        # nStructures ends up holding the number of reads seen (it stays 0
        # when the file yields nothing).
        nStructures = 0
        for nStructures, structure in enumerate(structures, start=1):
            if len(backend.scan(structure).landmarks) > 0:
                covered += 1

        if not nStructures:
            return 0.0
        return covered / nStructures
Beispiel #5
0
 def testPopulationNotAllowed(self):
     """
     Using --databaseFasta must result in a ValueError if database
     population has not been enabled.
     """
     # NOTE(review): as visible here, this test only sets up the parser
     # and makes no assertion about the ValueError - confirm whether the
     # rest of the test is missing.
     noPopulation = DatabaseSpecifier(allowPopulation=False)
     argParser = argparse.ArgumentParser()
     noPopulation.addArgsToParser(argParser)
Beispiel #6
0
 def testInMemoryDatabaseIsReturned(self):
     """
     When an existing in-memory database is supplied via the database
     keyword, that very same object must be handed back.
     """
     inMemory = Database()
     specifier = DatabaseSpecifier()
     returned = specifier.getDatabaseFromKeywords(database=inMemory)
     self.assertIs(inMemory, returned)
Beispiel #7
0
 def testNoArgs(self):
     """
     getDatabaseFromArgs must return a Database when the command line
     is empty.
     """
     argParser = argparse.ArgumentParser()
     specifier = DatabaseSpecifier()
     specifier.addArgsToParser(argParser)
     emptyArgs = argParser.parse_args([])
     self.assertIsInstance(
         specifier.getDatabaseFromArgs(emptyArgs), Database)
Beispiel #8
0
 def testPassedParamsAreUsed(self):
     """
     A DatabaseParameters instance passed to getDatabaseFromArgs must be
     the one (by identity) used by the returned database.
     """
     argParser = argparse.ArgumentParser()
     specifier = DatabaseSpecifier()
     specifier.addArgsToParser(argParser)
     emptyArgs = argParser.parse_args([])
     suppliedParams = DatabaseParameters()
     database = specifier.getDatabaseFromArgs(emptyArgs, suppliedParams)
     self.assertIs(database.dbParams, suppliedParams)
Beispiel #9
0
 def testCreationNotAllowed(self):
     """
     Not passing a database keyword when creation (or a WAMP connection)
     is not allowed must result in a RuntimeError.
     """
     specifier = DatabaseSpecifier(allowCreation=False, allowWamp=False)
     # Raw strings are used so that the regex escape '\.' is not treated
     # as an (invalid) string escape sequence by the Python compiler.
     error = (r'^Not enough information given to specify a database, '
              r'database creation is not enabled, and '
              r'no remote WAMP database could be found\.$')
     six.assertRaisesRegex(self, RuntimeError, error,
                           specifier.getDatabaseFromKeywords)
Beispiel #10
0
 def testNoArgsDefaultParameters(self):
     """
     A database obtained from getDatabaseFromArgs with an empty command
     line must carry parameters that compare equal to the defaults
     (compare returning None is asserted).
     """
     argParser = argparse.ArgumentParser()
     specifier = DatabaseSpecifier()
     specifier.addArgsToParser(argParser)
     emptyArgs = argParser.parse_args([])
     dbParams = specifier.getDatabaseFromArgs(emptyArgs).dbParams
     difference = dbParams.compare(DatabaseParameters())
     self.assertIs(None, difference)
Beispiel #11
0
 def testPopulationNotAllowed(self):
     """
     Supplying subjects via the subjects keyword must raise a ValueError
     when the specifier was created with population disabled.
     """
     emptyReads = Reads()
     noPopulation = DatabaseSpecifier(allowPopulation=False)
     expectedMessage = '^Database population is not enabled.$'
     with six.assertRaisesRegex(self, ValueError, expectedMessage):
         noPopulation.getDatabaseFromKeywords(subjects=emptyReads)
Beispiel #12
0
 def testInMemoryDatabaseNotAllowed(self):
     """
     Supplying an in-memory database via the database keyword must raise
     a ValueError when the specifier was created with in-memory
     databases disabled.
     """
     inMemory = Database()
     noInMemory = DatabaseSpecifier(allowInMemory=False)
     expectedMessage = '^In-memory database specification not enabled.$'
     with six.assertRaisesRegex(self, ValueError, expectedMessage):
         noInMemory.getDatabaseFromKeywords(database=inMemory)
Beispiel #13
0
 def testCreationNotAllowed(self):
     """
     Not passing any arguments when creation (or a WAMP connection)
     is not allowed must result in a RuntimeError.
     """
     parser = argparse.ArgumentParser()
     specifier = DatabaseSpecifier(allowCreation=False, allowWamp=False)
     specifier.addArgsToParser(parser)
     args = parser.parse_args([])
     # Raw strings are used so that the regex escape '\.' is not treated
     # as an (invalid) string escape sequence by the Python compiler.
     error = (r'^Not enough information given to specify a database, '
              r'database creation is not enabled, and '
              r'no remote WAMP database could be found\.$')
     six.assertRaisesRegex(self, RuntimeError, error,
                           specifier.getDatabaseFromArgs, args)
Beispiel #14
0
    def testPopulationFromFastaFile(self):
        """
        The subjects found in the file named by the databaseFasta keyword
        must all be present in the returned database.
        """
        # open() is patched so no real file is needed.
        fastaContent = '\n'.join(('>id1', 'FFF', '>id2', 'RRR'))
        specifierOpen = mockOpen(read_data=fastaContent)
        with patch.object(builtins, 'open', specifierOpen):
            specifier = DatabaseSpecifier()
            db = specifier.getDatabaseFromKeywords(databaseFasta='file.fasta')

        expected = {AARead('id1', 'FFF'), AARead('id2', 'RRR')}
        found = {subject.read for subject in db.getSubjects()}
        self.assertEqual(expected, found)
Beispiel #15
0
 def testPopulationFromCommandLineSequences(self):
     """
     Each sequence given with --databaseSequence must be added as a
     subject to the returned database.
     """
     argParser = argparse.ArgumentParser()
     specifier = DatabaseSpecifier()
     specifier.addArgsToParser(argParser)
     commandLine = [
         '--databaseSequence', 'id1 FFF',
         '--databaseSequence', 'id2 RRR',
     ]
     database = specifier.getDatabaseFromArgs(
         argParser.parse_args(commandLine))
     expected = {AARead('id1', 'FFF'), AARead('id2', 'RRR')}
     found = {subject.read for subject in database.getSubjects()}
     self.assertEqual(expected, found)
Beispiel #16
0
 def testInMemoryDatabaseIsPopulated(self):
     """
     When both an in-memory database and subjects are passed as keywords,
     the subjects must be added to the database that is returned.
     """
     inMemory = Database()
     reads = Reads()
     first = AARead('id1', 'FFF')
     second = AARead('id2', 'RRR')
     for read in (first, second):
         reads.add(read)
     specifier = DatabaseSpecifier()
     returned = specifier.getDatabaseFromKeywords(database=inMemory,
                                                  subjects=reads)
     found = {subject.read for subject in returned.getSubjects()}
     self.assertEqual({first, second}, found)
Beispiel #17
0
    def __init__(self, **kwargs):
        """
        Configure a backend from database keywords and record the names of
        the finders in use.

        @param kwargs: Keywords passed on to
            C{DatabaseSpecifier.getDatabaseFromKeywords}. If C{landmarks}
            is absent, all landmark classes plus the development landmark
            classes whose name starts with 'PDB ' are used. If
            C{trigPoints} is absent, all trig point classes except the one
            named 'Volume' are used.
        """
        if 'landmarks' not in kwargs:
            pdbFinders = [cls for cls in DEV_LANDMARK_CLASSES
                          if cls.NAME.startswith('PDB ')]
            kwargs['landmarks'] = ALL_LANDMARK_CLASSES + pdbFinders

        if 'trigPoints' not in kwargs:
            kwargs['trigPoints'] = [cls for cls in ALL_TRIG_CLASSES
                                    if cls.NAME != 'Volume']

        specifier = DatabaseSpecifier()
        dbParams = specifier.getDatabaseFromKeywords(**kwargs).dbParams

        self._backend = Backend()
        self._backend.configure(dbParams)

        # Landmark finder names come first, followed by trig point finder
        # names.
        landmarkNames = dbParams.landmarkFinderNames()
        trigPointNames = dbParams.trigPointFinderNames()
        self._names = landmarkNames + trigPointNames
Beispiel #18
0
    def testPopulationFromFastaFile(self):
        """
        The subjects found in the file named by --databaseFasta must all be
        present in the returned database.
        """
        argParser = argparse.ArgumentParser()
        specifier = DatabaseSpecifier()
        specifier.addArgsToParser(argParser)
        parsed = argParser.parse_args(['--databaseFasta', 'file.fasta'])

        # open() is patched so no real file is needed.
        fastaContent = '\n'.join(('>id1', 'FFF', '>id2', 'RRR'))
        with patch.object(builtins, 'open', mockOpen(read_data=fastaContent)):
            database = specifier.getDatabaseFromArgs(parsed)

        expected = {AARead('id1', 'FFF'), AARead('id2', 'RRR')}
        found = {subject.read for subject in database.getSubjects()}
        self.assertEqual(expected, found)
Beispiel #19
0
    def testPopulationFromInMemoryAndFastaFile(self):
        """
        When subjects are given both in memory (subjects keyword) and in a
        file (databaseFasta keyword), the returned database must contain
        the subjects from both sources.
        """
        first = AARead('id1', 'FFF')
        second = AARead('id2', 'RRR')
        inMemory = Reads()
        for read in (first, second):
            inMemory.add(read)

        # open() is patched so no real file is needed.
        fastaContent = '\n'.join(('>id3', 'FFFF', '>id4', 'RRRR'))
        with patch.object(builtins, 'open', mockOpen(read_data=fastaContent)):
            specifier = DatabaseSpecifier()
            database = specifier.getDatabaseFromKeywords(
                subjects=inMemory, databaseFasta='file.fasta')

        expected = {
            first,
            second,
            AARead('id3', 'FFFF'),
            AARead('id4', 'RRRR'),
        }
        found = {subject.read for subject in database.getSubjects()}
        self.assertEqual(expected, found)
Beispiel #20
0
    def __init__(self, sequences, cutoff, **kwargs):
        """
        A class to work with hashes.
        For a set of given sequences, find all hashes and for each sequence
        make a string of 1 or 0 denoting whether a hash is present in that
        sequence or not. Only include hashes that occur in more than a
        specified fraction of all given sequences.

        The result is stored in C{self.hashString}, a C{dict} mapping each
        sequence id to its string of '1' and '0' characters. All strings
        have the same length and use the same hash order.

        @param sequences: A C{str} filename with a fasta file of sequences to
            be used or a C{dark.reads.Reads} object.
        @param cutoff: A C{float} between 0.0 and 1.0. A hash is included in
            the final string only if the number of sequences it occurs in
            is strictly greater than C{cutoff} times the number of
            sequences.
        @param kwargs: See
            C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
            additional keywords, all of which are optional.
        """
        if isinstance(sequences, str):
            reads = FastaReads(sequences,
                               readClass=AAReadWithX,
                               upperCase=True)
        else:
            reads = sequences

        database = DatabaseSpecifier().getDatabaseFromKeywords(**kwargs)
        backend = Backend()
        backend.configure(database.dbParams)

        # Map each sequence id to its hashes, as returned from getHashes().
        # The reads are counted here so that they only need to be iterated
        # once and do not need to support len().
        hashes = {}
        nReads = 0
        for read in reads:
            nReads += 1
            hashes[read.id] = backend.getHashes(backend.scan(read))

        # Map each hash to the set of read ids it occurs in. This is built
        # in a single pass over the hashes of each read, and (on Pythons
        # with ordered dicts) keeps the hashes in first-seen order so the
        # columns of the resulting strings do not vary from run to run.
        byHashes = {}
        for readId, readHashes in hashes.items():
            for hash_ in readHashes:
                byHashes.setdefault(hash_, set()).add(readId)

        # Keep only the hashes that occur in enough reads.
        co = cutoff * nReads
        frequent = [readIds for readIds in byHashes.values()
                    if len(readIds) > co]

        # For each read id, make a string of 1 and 0 denoting which of the
        # retained hashes occur in that read.
        self.hashString = {
            readId: ''.join('1' if readId in readIds else '0'
                            for readIds in frequent)
            for readId in hashes
        }
Beispiel #21
0
    def __init__(self, sequences, labels, defaultLabel=None, **kwargs):
        """
        Base class for using cluster analysis to evaluate how well various
        feature finders and database parameter settings can separate a set of
        sequences. The clustering is based on feature offset deltas.

        Sets C{self.nReads} (the number of reads processed),
        C{self.affinity} (a symmetric C{nReads} x C{nReads} numpy array
        with 1.0 on the diagonal) and C{self.trueLabels} (the label of each
        read, in read order).

        @param sequences: Either A C{str} filename of sequences to consider or
            a C{light.reads.Reads} instance.
        @param labels: A C{dict} with a label for each sequence id in
            C{sequences}. These are the known categories of each sequence.
        @param defaultLabel: If not C{None}, a label to use for reads whose ids
            are not present in C{labels}. If C{None} and a read id has no label
            a ValueError is raised.
        @param kwargs: See
            C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
            additional keywords, all of which are optional.
        @raises ValueError: If the id of a read is not in labels and no default
            label has been set, or if there are no reads in C{sequences}.
        """
        if isinstance(sequences, str):
            reads = FastaReads(sequences,
                               readClass=AAReadWithX,
                               upperCase=True)
        else:
            reads = sequences
        database = DatabaseSpecifier().getDatabaseFromKeywords(**kwargs)
        backend = Backend()
        backend.configure(database.dbParams)
        distanceBase = database.dbParams.distanceBase
        allOffsetDeltas = []
        trueLabels = []

        for read in reads:
            trueLabel = labels.get(read.id, defaultLabel)
            if trueLabel is None:
                raise ValueError('Read %r has no corresponding label' %
                                 read.id)
            trueLabels.append(trueLabel)
            # Count how often each (log-scaled) landmark to trig point
            # offset delta occurs in this read.
            offsetDeltas = Counter()
            scannedRead = backend.scan(read)
            for landmark, trigPoint in backend.getScannedPairs(scannedRead):
                delta = scaleLog(trigPoint.offset - landmark.offset,
                                 distanceBase)
                offsetDeltas[delta] += 1
            allOffsetDeltas.append(offsetDeltas)

        # Use the number of reads actually processed, so the affinity matrix
        # always matches the collected data and the reads do not need to
        # support len().
        nReads = len(allOffsetDeltas)

        if nReads == 0:
            raise ValueError('No sequences were found in %r' % sequences)

        # Don't check that len(reads) == len(labels). I.e., ignore extra labels
        # to make using this class interactively more convenient.

        # Create an affinity matrix. Initially set all values to 1.0 so we
        # don't need to later initialize the diagonal.
        affinity = np.ones((nReads, nReads))

        # The matrix is symmetric, so compute each off-diagonal pair once
        # and store it in both positions.
        for row in range(nReads):
            for col in range(row + 1, nReads):
                value = self.affinityFromOffsetDeltas(allOffsetDeltas[row],
                                                      allOffsetDeltas[col])
                affinity[row, col] = affinity[col, row] = value

        self.nReads = nReads
        self.affinity = affinity
        self.trueLabels = trueLabels
        # 'if not noSave' or 'noSave=False' (in method definitions).
        #
        # To get both these things at once, we present the user with an
        # option called '--noSave' but we store (dest) the command line
        # value into the 'save' attribute (whose value is the opposite of
        # noSave). For this reason, the default of the variable is True and
        # the argparse action is to save a False value. That makes for a
        # little awkwardness here (and this long comment) but everywhere
        # else in our code we just have simple 'save' variables that
        # default to True and we avoid all the double negatives.
        '--noSave',
        action='store_false',
        default=True,
        dest='save',
        help='If True, shut down the database WITHOUT saving.')

    databaseSpecifier = DatabaseSpecifier(allowCreation=False,
                                          allowInMemory=False)
    databaseSpecifier.addArgsToParser(parser)
    args = parser.parse_args()

    # We know we're a WAMP client, so don't needlessly make the user
    # specify --wampClient.
    args.wampClient = True

    runner = ApplicationRunner(args.wampUrl,
                               args.realm,
                               extra=dict(save=args.save,
                                          filePrefix=args.filePrefix))
    runner.run(ShutdownComponent)