def testPopulationFromCommandLineSequencesAndFastaFile(self):
    """
    Using both command line sequences and --databaseFasta must result in
    all the command line subjects and those in the file being added to the
    returned database.
    """
    parser = argparse.ArgumentParser()
    specifier = DatabaseSpecifier()
    specifier.addArgsToParser(parser)
    args = parser.parse_args([
        '--databaseFasta', 'file.fasta',
        '--databaseSequence', 'id1 FFF',
        '--databaseSequence', 'id2 RRR',
    ])
    # The FASTA file is mocked, so no file named 'file.fasta' is needed.
    fastaData = '\n'.join(['>id3', 'FFFF', '>id4', 'RRRR'])
    with patch.object(builtins, 'open', mockOpen(read_data=fastaData)):
        database = specifier.getDatabaseFromArgs(args)
    reads = set(subject.read for subject in database.getSubjects())
    self.assertEqual(
        {
            AARead('id1', 'FFF'),
            AARead('id2', 'RRR'),
            AARead('id3', 'FFFF'),
            AARead('id4', 'RRRR'),
        },
        reads)
def testPopulationNotAllowed(self):
    """
    Using --databaseFasta must result in a ValueError if database
    population has not been enabled.
    """
    parser = argparse.ArgumentParser()
    specifier = DatabaseSpecifier(allowPopulation=False)
    specifier.addArgsToParser(parser)
    # NOTE(review): the original test stopped after addArgsToParser and
    # never exercised the specifier or asserted anything, so it could not
    # fail. Complete it by parsing a --databaseFasta argument and checking
    # that getDatabaseFromArgs raises the expected ValueError (mirroring
    # the keyword-based testPopulationNotAllowed) — confirm the error
    # message against DatabaseSpecifier.
    args = parser.parse_args(['--databaseFasta', 'file.fasta'])
    error = r'^Database population is not enabled\.$'
    six.assertRaisesRegex(self, ValueError, error,
                          specifier.getDatabaseFromArgs, args)
def testNoArgs(self):
    """
    If no arguments are given, getDatabaseFromArgs must return a database.
    """
    parser = argparse.ArgumentParser()
    specifier = DatabaseSpecifier()
    specifier.addArgsToParser(parser)
    database = specifier.getDatabaseFromArgs(parser.parse_args([]))
    self.assertIsInstance(database, Database)
def testPassedParamsAreUsed(self):
    """
    If specific parameters are given, they must be used.
    """
    parser = argparse.ArgumentParser()
    specifier = DatabaseSpecifier()
    specifier.addArgsToParser(parser)
    params = DatabaseParameters()
    database = specifier.getDatabaseFromArgs(parser.parse_args([]), params)
    # The exact parameters instance we passed must be on the database.
    self.assertIs(database.dbParams, params)
def testNoArgsDefaultParameters(self):
    """
    The database returned from getDatabaseFromArgs when it is passed no
    command-line arguments must have the default database parameters.
    """
    parser = argparse.ArgumentParser()
    specifier = DatabaseSpecifier()
    specifier.addArgsToParser(parser)
    args = parser.parse_args([])
    db = specifier.getDatabaseFromArgs(args)
    # compare() is expected to return None when the two parameter sets
    # do not differ.
    self.assertIs(None, db.dbParams.compare(DatabaseParameters()))
def testPopulationByInMemorySubjects(self):
    """
    Passing a subjects keyword must result in the subjects being added to
    the returned database.
    """
    read1 = AARead('id1', 'FFF')
    read2 = AARead('id2', 'RRR')
    subjects = Reads()
    subjects.add(read1)
    subjects.add(read2)
    database = DatabaseSpecifier().getDatabaseFromKeywords(
        subjects=subjects)
    self.assertEqual(
        {read1, read2},
        set(subject.read for subject in database.getSubjects()))
def testCreationNotAllowed(self):
    """
    Not passing any arguments when creation (or a WAMP connection) is not
    allowed must result in a RuntimeError.
    """
    parser = argparse.ArgumentParser()
    specifier = DatabaseSpecifier(allowCreation=False, allowWamp=False)
    specifier.addArgsToParser(parser)
    args = parser.parse_args([])
    # Use raw strings: '\.' in a non-raw string literal is an invalid
    # escape sequence (a DeprecationWarning that later became a
    # SyntaxWarning).
    error = (r'^Not enough information given to specify a database, '
             r'database creation is not enabled, and '
             r'no remote WAMP database could be found\.$')
    six.assertRaisesRegex(self, RuntimeError, error,
                          specifier.getDatabaseFromArgs, args)
def testPopulationFromCommandLineSequences(self):
    """
    Passing --databaseSequence arguments must result in the subjects in
    the sequences being added to the returned database.
    """
    parser = argparse.ArgumentParser()
    specifier = DatabaseSpecifier()
    specifier.addArgsToParser(parser)
    args = parser.parse_args([
        '--databaseSequence', 'id1 FFF',
        '--databaseSequence', 'id2 RRR',
    ])
    database = specifier.getDatabaseFromArgs(args)
    self.assertEqual(
        {AARead('id1', 'FFF'), AARead('id2', 'RRR')},
        set(subject.read for subject in database.getSubjects()))
def testPopulationFromFastaFile(self):
    """
    Passing a databaseFasta keyword must result in the subjects in the
    file being added to the returned database.
    """
    # The FASTA file is mocked, so no file named 'file.fasta' is needed.
    fastaData = '\n'.join(['>id1', 'FFF', '>id2', 'RRR'])
    with patch.object(builtins, 'open', mockOpen(read_data=fastaData)):
        database = DatabaseSpecifier().getDatabaseFromKeywords(
            databaseFasta='file.fasta')
    self.assertEqual(
        {AARead('id1', 'FFF'), AARead('id2', 'RRR')},
        set(subject.read for subject in database.getSubjects()))
def testInMemoryDatabaseIsPopulated(self):
    """
    Passing a database keyword with an in-memory database results in that
    database being populated.
    """
    read1 = AARead('id1', 'FFF')
    read2 = AARead('id2', 'RRR')
    subjects = Reads()
    subjects.add(read1)
    subjects.add(read2)
    database = DatabaseSpecifier().getDatabaseFromKeywords(
        database=Database(), subjects=subjects)
    self.assertEqual(
        {read1, read2},
        set(subject.read for subject in database.getSubjects()))
def getFractionOfStructuresCovered(self):
    """
    Return the fraction of known structures matched by at least one
    substring in the subset that is being evaluated.
    """
    database = DatabaseSpecifier().getDatabaseFromKeywords(
        trigPoints=[],
        landmarks=['AC ' + self.structureType],
        acAlphaHelixFilename=self.acAlphaHelixFilename,
        acAlphaHelix310Filename=self.acAlphaHelix310Filename,
        acAlphaHelixCombinedFilename=self.acAlphaHelixCombinedFilename,
        acAlphaHelixPiFilename=self.acAlphaHelixPiFilename,
        acExtendedStrandFilename=self.acExtendedStrandFilename)
    backend = Backend()
    backend.configure(database.dbParams)

    matched = total = 0
    for read in FastaReads(self.structureFile, readClass=AAReadWithX,
                           checkAlphabet=0):
        total += 1
        # A structure counts as covered if scanning finds any landmark.
        if backend.scan(read).landmarks:
            matched += 1

    return matched / total if total else 0.0
def testNoKeywordsDefaultParameters(self):
    """
    The database returned from getDatabaseFromKeywords when it is passed
    no keywords must have the default database parameters.
    """
    database = DatabaseSpecifier().getDatabaseFromKeywords()
    # compare() is expected to return None when there are no differences.
    self.assertIs(None,
                  database.dbParams.compare(DatabaseParameters()))
def testNoKeywords(self):
    """
    The getDatabaseFromKeywords method must return a database when it is
    passed no keywords.
    """
    self.assertIsInstance(
        DatabaseSpecifier().getDatabaseFromKeywords(), Database)
def testPopulationFromFastaFile(self):
    """
    Passing a --databaseFasta argument must result in the subjects in the
    file being added to the returned database.
    """
    parser = argparse.ArgumentParser()
    specifier = DatabaseSpecifier()
    specifier.addArgsToParser(parser)
    args = parser.parse_args(['--databaseFasta', 'file.fasta'])
    # The FASTA file is mocked, so no file named 'file.fasta' is needed.
    fastaData = '\n'.join(['>id1', 'FFF', '>id2', 'RRR'])
    with patch.object(builtins, 'open', mockOpen(read_data=fastaData)):
        database = specifier.getDatabaseFromArgs(args)
    self.assertEqual(
        {AARead('id1', 'FFF'), AARead('id2', 'RRR')},
        set(subject.read for subject in database.getSubjects()))
def testInMemoryDatabaseIsReturned(self):
    """
    Passing a database keyword with an in-memory database results in that
    database being returned.
    """
    database = Database()
    returned = DatabaseSpecifier().getDatabaseFromKeywords(
        database=database)
    self.assertIs(database, returned)
def testCreationNotAllowed(self):
    """
    Not passing a database keyword when creation (or a WAMP connection) is
    not allowed must result in a RuntimeError.
    """
    specifier = DatabaseSpecifier(allowCreation=False, allowWamp=False)
    # Use raw strings: '\.' in a non-raw string literal is an invalid
    # escape sequence (a DeprecationWarning that later became a
    # SyntaxWarning).
    error = (r'^Not enough information given to specify a database, '
             r'database creation is not enabled, and '
             r'no remote WAMP database could be found\.$')
    six.assertRaisesRegex(self, RuntimeError, error,
                          specifier.getDatabaseFromKeywords)
def testPopulationNotAllowed(self):
    """
    Passing a subjects keyword must result in a ValueError if database
    population has not been enabled.
    """
    subjects = Reads()
    specifier = DatabaseSpecifier(allowPopulation=False)
    # Use a raw string and escape the final period so the regex matches a
    # literal '.' (an unescaped '.' matches any character, making the
    # assertion weaker than intended).
    error = r'^Database population is not enabled\.$'
    six.assertRaisesRegex(self, ValueError, error,
                          specifier.getDatabaseFromKeywords,
                          subjects=subjects)
def testInMemoryDatabaseNotAllowed(self):
    """
    Passing a database keyword results in a ValueError if an in-memory
    database is not allowed.
    """
    original = Database()
    specifier = DatabaseSpecifier(allowInMemory=False)
    # Use a raw string and escape the final period so the regex matches a
    # literal '.' (an unescaped '.' matches any character, making the
    # assertion weaker than intended).
    error = r'^In-memory database specification not enabled\.$'
    six.assertRaisesRegex(self, ValueError, error,
                          specifier.getDatabaseFromKeywords,
                          database=original)
def testPopulationFromInMemoryAndFastaFile(self):
    """
    Passing both subjects and databaseFasta keywords must result in all
    the subjects in memory and in the file being added to the returned
    database.
    """
    read1 = AARead('id1', 'FFF')
    read2 = AARead('id2', 'RRR')
    subjects = Reads()
    subjects.add(read1)
    subjects.add(read2)
    # The FASTA file is mocked, so no file named 'file.fasta' is needed.
    fastaData = '\n'.join(['>id3', 'FFFF', '>id4', 'RRRR'])
    with patch.object(builtins, 'open', mockOpen(read_data=fastaData)):
        database = DatabaseSpecifier().getDatabaseFromKeywords(
            subjects=subjects, databaseFasta='file.fasta')
    self.assertEqual(
        {read1, read2, AARead('id3', 'FFFF'), AARead('id4', 'RRRR')},
        set(subject.read for subject in database.getSubjects()))
def __init__(self, **kwargs):
    """
    Configure a backend using database keywords, with default landmark
    and trig point finders when the caller supplies none.

    @param kwargs: See
        C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
        keywords, all of which are optional.
    """
    # Default landmarks: all production finders plus the development
    # finders whose names start with 'PDB '.
    kwargs.setdefault(
        'landmarks',
        ALL_LANDMARK_CLASSES +
        [cls for cls in DEV_LANDMARK_CLASSES
         if cls.NAME.startswith('PDB ')])
    # Default trig points: everything except 'Volume'.
    kwargs.setdefault(
        'trigPoints',
        [cls for cls in ALL_TRIG_CLASSES if cls.NAME != 'Volume'])

    database = DatabaseSpecifier().getDatabaseFromKeywords(**kwargs)
    self._backend = Backend()
    self._backend.configure(database.dbParams)
    self._names = (database.dbParams.landmarkFinderNames() +
                   database.dbParams.trigPointFinderNames())
def affinityMatrix(queries, findParams=None, subjects=None, symmetric=True,
                   computeDiagonal=False, diagonalValue=1.0,
                   progressFunc=None, returnDict=False, returnAnalysis=False,
                   **kwargs):
    """
    Produce an affinity matrix containing scores for a set of reads matched
    against a set of subjects.

    @param queries: Either a C{str} filename of sequences to consider or a
        C{light.reads.Reads} instance.
    @param findParams: A C{light.parameters.FindParameters} instance.
    @param subjects: Either 1) a C{str} filename of sequences to consider,
        or 2) a C{light.reads.Reads} instance, or 3) C{None}, in which case
        the C{queries} will also be used as the subjects.
    @param symmetric: If C{True}, corresponding off-diagonal scores will be
        assumed to be equal and only computed once. I.e., this is a speedup
        when scores (affinities) are symmetric. This option gives the
        biggest speed up on a square matrix, but can also be used when the
        matrix is not square (e.g., when making a 2x4 matrix comparing
        {A, B} against {A, B, C, D}, the A->B distance can be used to set
        the B->A distance).
    @param computeDiagonal: If C{True}, values on the diagonal will be
        computed (i.e., obtained from find). Otherwise, all diagonal values
        will be set to C{diagonalValue}.
    @param diagonalValue: The result that diagonal values will all be set
        to if C{computeDiagonal} is C{False}.
    @param progressFunc: If not C{None}, a function that takes two
        arguments. The function will be called before each query sequence
        is processed. The arguments will be the C{int} (zero-based) number
        of the query and the query (an AAReadWithX instance) itself.
    @param returnDict: If C{True}, return a C{dict} keyed by query id,
        whose values are C{dict}s keyed by subject id, whose values are
        C{float} scores. In other words, a 2-level deep C{dict} that allows
        the caller to look up a score via something like
        C{result[query.id][subject.id]}.
    @param returnAnalysis: This determines what information is returned in
        each affinity matrix location. If C{False}, the default, each
        location will contain the overall score for the corresponding
        query/subject pair, or 0.0 if the query did not match the subject.
        If C{True}, the location will contain the analysis C{dict} computed
        by the C{light.result.Result} instance, or C{None} if the query did
        not match the subject.
    @param kwargs: See C{database.DatabaseSpecifier.getDatabaseFromKeywords}
        for additional keywords, all of which are optional.
    @raise ValueError: If C{returnDict} is C{True} and there is a
        duplicated query or subject id.
    @return: If C{returnDict} is C{True}, a C{dict} as described above,
        else a two-dimensional array whose dimensions are the query index
        (in C{queries}) and then the subject index (in C{subjects}). The
        values in the returned structure are as described in
        C{returnAnalysis}, above.
    """
    if isinstance(queries, str):
        queries = list(
            FastaReads(queries, readClass=AAReadWithX, upperCase=True))

    if subjects is None:
        subjects = queries
    else:
        if isinstance(subjects, str):
            subjects = list(
                FastaReads(subjects, readClass=AAReadWithX, upperCase=True))

    findParams = findParams or FindParameters()
    db = DatabaseSpecifier().getDatabaseFromKeywords(**kwargs)
    subjectIndices = []
    for subject in subjects:
        _, subjectIndex, _ = db.addSubject(subject)
        subjectIndices.append(subjectIndex)

    nQueries = len(queries)
    nSubjects = len(subjectIndices)

    # Prepare a result array (we'll walk through this later to make a dict
    # if returnDict is True).
    affinity = []
    # A non-match is represented by None when full analyses are wanted,
    # else by a 0.0 score.
    noMatchValue = None if returnAnalysis else 0.0
    for _ in range(nQueries):
        affinity.append([noMatchValue] * nSubjects)

    for i, query in enumerate(queries):
        if progressFunc:
            progressFunc(i, query)
        if symmetric:
            # We don't have to consider all subjects in the find, so pass a
            # restricted set of subject indices to restrict the search to.
            # The ones we omit have already been looked up.
            #
            # For clarity, there's a little code repetition here.
            if computeDiagonal:
                wantedIndices = set(subjectIndices[i:])
            else:
                wantedIndices = set(subjectIndices[i + 1:])
            result = db.find(query, findParams,
                             storeFullAnalysis=returnAnalysis,
                             subjectIndices=wantedIndices)
        else:
            result = db.find(query, findParams,
                             storeFullAnalysis=returnAnalysis)

        analysis = result.analysis

        for j in range(nSubjects):
            if j < i and symmetric:
                # Already computed on an earlier row; mirror it.
                score = affinity[j][i]
            elif j == i and not computeDiagonal:
                score = diagonalValue
            else:
                # Be careful how we access the analysis. It is a defaultdict,
                # so its keys are created on access. I.e., we must use 'in'
                # to test for membership not try/except, because
                # analysis[subjectIndex] will never raise a KeyError.
                if subjectIndices[j] in analysis:
                    if returnAnalysis:
                        score = analysis[subjectIndices[j]]
                    else:
                        score = analysis[subjectIndices[j]]['overallScore']
                else:
                    # The query didn't match the subject. We don't actually
                    # need to set this value as it is already present due
                    # to the initialization of affinity above, but it
                    # simplifies the code to do so.
                    score = noMatchValue
            affinity[i][j] = score

    if returnDict:
        result = {}
        for i, query in enumerate(queries):
            if query.id in result:
                raise ValueError('Query id %r appears more than once.' %
                                 query.id)
            result[query.id] = values = {}
            for j, subject in enumerate(subjects):
                if subject.id in values:
                    raise ValueError('Subject id %r appears more than once.' %
                                     subject.id)
                values[subject.id] = affinity[i][j]
        return result
    else:
        return affinity
#!/usr/bin/env python

import argparse

from autobahn.asyncio.wamp import ApplicationRunner

from light.autobahn.database import DatabaseComponent
from light.database import DatabaseSpecifier


if __name__ == '__main__':
    argParser = argparse.ArgumentParser(
        description='Start a WAMP-based distributed light-matter database.')
    specifier = DatabaseSpecifier(allowInMemory=False)
    specifier.addArgsToParser(argParser)
    args = argParser.parse_args()

    # We're always using WAMP for distributed databases.
    args.wampServer = True

    db = specifier.getDatabaseFromArgs(args)
    runner = ApplicationRunner(args.wampUrl, args.realm,
                               extra={'database': db})
    runner.run(DatabaseComponent)
def __init__(self, sequences, cutoff, **kwargs):
    """
    A class to work with hashes. For a set of given sequences, find all
    hashes and for each sequence make a string of 1 or 0 denoting whether
    a hash is present in that sequence or not. Only include hashes that
    occur in more than a specified fraction of all given sequences.

    @param sequences: A C{str} filename with a fasta file of sequences to
        be used or a C{dark.reads.Reads} object.
    @param cutoff: A C{float} between 0.0 and 1.0 of the fraction of
        sequences in which a hash has to be present to be included in the
        final string.
    @param kwargs: See
        C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
        additional keywords, all of which are optional.
    """
    if isinstance(sequences, str):
        reads = FastaReads(sequences, readClass=AAReadWithX,
                           upperCase=True)
    else:
        reads = sequences

    database = DatabaseSpecifier().getDatabaseFromKeywords(**kwargs)
    backend = Backend()
    backend.configure(database.dbParams)

    # Map each read id to the dict of hashes found in that read, as
    # returned by getHashes().
    hashesByReadId = {}
    for read in reads:
        scannedRead = backend.scan(read)
        hashesByReadId[read.id] = backend.getHashes(scannedRead)

    # The set of all hashes that occur in any read.
    allHashes = set()
    for readHashes in hashesByReadId.values():
        allHashes.update(readHashes)

    # Map each hash to the list of ids of the reads it occurs in. Use a
    # membership test rather than the original try/except KeyError, which
    # is clearer and cannot accidentally create keys.
    readIdsByHash = {}
    for hash_ in allHashes:
        readIdsByHash[hash_] = [
            readId for readId, readHashes in hashesByReadId.items()
            if hash_ in readHashes]

    # For each read, build a string of '1' and '0' characters denoting
    # which of the sufficiently common hashes occur in that read.
    minCount = cutoff * len(reads)
    self.hashString = {read.id: '' for read in reads}
    for hash_, readIds in readIdsByHash.items():
        if len(readIds) > minCount:
            presentIds = set(readIds)
            for readId in self.hashString:
                self.hashString[readId] += (
                    '1' if readId in presentIds else '0')
#!/usr/bin/env python

import sys
import argparse

from autobahn.asyncio.wamp import ApplicationRunner

from light.autobahn.backend import BackendComponent
from light.database import DatabaseSpecifier

# The asyncio-based autobahn code requires a modern Python.
if sys.version_info < (3, 3):
    raise Exception('The light matter autobahn code needs Python 3.3 or '
                    'later.')


if __name__ == '__main__':
    argParser = argparse.ArgumentParser(
        description=('Start a WAMP-based distributed light-matter database '
                     'backend.'))
    specifier = DatabaseSpecifier(allowInMemory=False)
    specifier.addArgsToParser(argParser)
    args = argParser.parse_args()
    runner = ApplicationRunner(args.wampUrl, args.realm,
                               extra={'args': args})
    runner.run(BackendComponent)
def __init__(self, sequences, labels, defaultLabel=None, **kwargs):
    """
    Base class for using cluster analysis to evaluate how well various
    feature finders and database parameter settings can separate a set of
    sequences. The clustering is based on feature offset deltas.

    @param sequences: Either a C{str} filename of sequences to consider or
        a C{light.reads.Reads} instance.
    @param labels: A C{dict} with a label for each sequence id in
        C{sequences}. These are the known categories of each sequence.
    @param defaultLabel: If not C{None}, a label to use for reads whose
        ids are not present in C{labels}. If C{None} and a read id has no
        label a ValueError is raised.
    @param kwargs: See
        C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
        additional keywords, all of which are optional.
    @raises ValueError: If the id of a read is not in labels and no
        default label has been set, or if there are no reads in
        C{sequences}.
    """
    if isinstance(sequences, str):
        reads = FastaReads(sequences, readClass=AAReadWithX,
                           upperCase=True)
    else:
        reads = sequences

    database = DatabaseSpecifier().getDatabaseFromKeywords(**kwargs)
    backend = Backend()
    backend.configure(database.dbParams)

    allOffsetDeltas = []
    trueLabels = []

    for read in reads:
        label = labels.get(read.id, defaultLabel)
        if label is None:
            raise ValueError('Read %r has no corresponding label' %
                             read.id)
        trueLabels.append(label)

        # Count the (log-scaled) offset deltas of all landmark/trig point
        # pairs found in this read.
        deltaCounts = Counter()
        scanned = backend.scan(read)
        for landmark, trigPoint in backend.getScannedPairs(scanned):
            delta = scaleLog(trigPoint.offset - landmark.offset,
                             database.dbParams.distanceBase)
            deltaCounts[delta] += 1
        allOffsetDeltas.append(deltaCounts)

    nReads = len(reads)
    if nReads == 0:
        raise ValueError('No sequences were found in %r' % sequences)

    # Don't check that len(reads) == len(labels). I.e., ignore extra
    # labels to make using this class interactively more convenient.

    # Create an affinity matrix. Initially set all values to 1.0 so we
    # don't need to later initialize the diagonal.
    affinity = np.ones((nReads, nReads))

    for row in range(nReads):
        for col in range(row + 1, nReads):
            value = self.affinityFromOffsetDeltas(
                allOffsetDeltas[row], allOffsetDeltas[col])
            # The affinity is symmetric, so set both triangles at once.
            affinity[row, col] = affinity[col, row] = value

    self.nReads = nReads
    self.affinity = affinity
    self.trueLabels = trueLabels