def testCollectReadHashes(self):
    """
    The getHashes method must return a dict keyed by (landmark, trigPoints)
    hash, with each value a list of the [landmark, trigPoint] feature
    pairs that produced that hash in the read.
    """
    dbParams = DatabaseParameters(landmarks=[AlphaHelix],
                                  trigPoints=[Peaks], distanceBase=1.0)
    backend = Backend()
    backend.configure(dbParams)
    read = AARead('query', 'FRRRFRRRFASAASAFRRRFRRRFASAASA')
    hashes = backend.getHashes(backend.scan(read))

    # The two alpha helices and four peaks the scan must find.
    firstHelix = Landmark(AlphaHelix.NAME, AlphaHelix.SYMBOL, 0, 9, 2)
    secondHelix = Landmark(AlphaHelix.NAME, AlphaHelix.SYMBOL, 15, 9, 2)
    peak10 = TrigPoint(Peaks.NAME, Peaks.SYMBOL, 10)
    peak13 = TrigPoint(Peaks.NAME, Peaks.SYMBOL, 13)
    peak25 = TrigPoint(Peaks.NAME, Peaks.SYMBOL, 25)
    peak28 = TrigPoint(Peaks.NAME, Peaks.SYMBOL, 28)

    expected = {
        'A2:P:28': [[firstHelix, peak28]],
        'A2:P:25': [[firstHelix, peak25]],
        'A2:P:13': [[firstHelix, peak13], [secondHelix, peak28]],
        'A2:P:10': [[firstHelix, peak10], [secondHelix, peak25]],
        'A2:P:-5': [[secondHelix, peak10]],
        'A2:P:-2': [[secondHelix, peak13]],
        'A2:A2:15': [[firstHelix, secondHelix]],
    }
    self.assertEqual(expected, hashes)
def __init__(self, histogram, query, subject, dbParams):
    """
    Store the histogram and the query/subject lengths, and collect the
    full sets of features for both sequences via a freshly configured
    backend.
    """
    # Imported locally, presumably to avoid a circular import — TODO
    # confirm.
    from light.backend import Backend

    self._histogram = histogram
    self._queryLen = len(query)
    self._subjectLen = len(subject)

    backend = Backend()
    backend.configure(dbParams)
    self._allQueryFeatures = getHashFeatures(
        backend.getHashes(backend.scan(query)))
    self._allSubjectFeatures = getHashFeatures(
        backend.getHashes(backend.scan(subject.read)))
def __init__(self, histogram, query, subject, dbParams, weights=None):
    """
    Store the histogram, the query/subject lengths, and the scoring
    weights (falling back to DEFAULT_WEIGHTS), then collect the full
    sets of features for both sequences via a freshly configured backend.
    """
    # Imported locally, presumably to avoid a circular import — TODO
    # confirm.
    from light.backend import Backend

    self._histogram = histogram
    self._queryLen = len(query)
    self._subjectLen = len(subject)
    self._weights = weights if weights is not None else self.DEFAULT_WEIGHTS

    backend = Backend()
    backend.configure(dbParams)
    self._allQueryFeatures = getHashFeatures(
        backend.getHashes(backend.scan(query)))
    self._allSubjectFeatures = getHashFeatures(
        backend.getHashes(backend.scan(subject.read)))
def testCollectReadHashesWithOneLandmark(self):
    """
    The getHashes method must return a dict keyed by (landmark,
    trigPoints) hash. The result must be empty when the read contains
    only a single landmark, since a lone feature cannot form a hash.
    """
    dbParams = DatabaseParameters(landmarks=[AlphaHelix], trigPoints=[])
    backend = Backend()
    backend.configure(dbParams)
    scanned = backend.scan(AARead('query', 'FRRRFRRRF'))
    self.assertEqual({}, backend.getHashes(scanned))
def __init__(self, sequences, cutoff, **kwargs):
    """
    A class to work with hashes. For a set of given sequences, find all
    hashes and for each sequence make a string of 1 or 0 denoting whether
    a hash is present in that sequence or not. A hash is only included if
    it occurs in more than the specified fraction of all given sequences.

    @param sequences: A C{str} filename with a fasta file of sequences to
        be used or a C{dark.reads.Reads} object.
    @param cutoff: A C{float} between 0.0 and 1.0 of the fraction of
        sequences in which a hash has to be present to be included in the
        final string. Note that the comparison is strict: a hash must be
        in more than C{cutoff * len(sequences)} sequences.
    @param kwargs: See
        C{database.DatabaseSpecifier.getDatabaseFromKeywords} for
        additional keywords, all of which are optional.
    """
    if isinstance(sequences, str):
        reads = FastaReads(sequences, readClass=AAReadWithX,
                           upperCase=True)
    else:
        reads = sequences

    database = DatabaseSpecifier().getDatabaseFromKeywords(**kwargs)
    backend = Backend()
    backend.configure(database.dbParams)

    # Map each read id to the hashes found in that read (as returned
    # from getHashes()).
    hashesByReadId = {}
    for read in reads:
        hashesByReadId[read.id] = backend.getHashes(backend.scan(read))

    # Invert the mapping: for each hash, the set of read ids in which it
    # occurs. (The original probed every read's hash dict for every hash;
    # a single inversion pass with set membership avoids that quadratic
    # scan.)
    readIdsByHash = {}
    for readId, readHashes in hashesByReadId.items():
        for hash_ in readHashes:
            readIdsByHash.setdefault(hash_, set()).add(readId)

    # For each read id, build a string of '1'/'0' marking the presence
    # of each sufficiently common hash. All strings are extended in the
    # same hash order, so position i refers to the same hash in every
    # string.
    minReadCount = cutoff * len(reads)
    self.hashString = {readId: '' for readId in hashesByReadId}
    for hash_, readIds in readIdsByHash.items():
        if len(readIds) > minReadCount:
            for readId in self.hashString:
                self.hashString[readId] += (
                    '1' if readId in readIds else '0')
def calculateScore(self):
    """
    Calculates the overall score, as described above.

    @return: a C{float} overall score for all significant bins (or
        C{None} if there are no significant bins) and a C{dict} with
        information about the score.
    """
    # With no significant bins there is nothing to score. (We could do
    # more checking here and use the score of the best bin as the overall
    # score if there is only one significant bin or if the score of the
    # best bin is 1.0.)
    if not self._significantBins:
        return None, {
            'score': None,
            'scoreClass': self.__class__,
        }

    # Imported locally, presumably to avoid a circular import — TODO
    # confirm.
    from light.backend import Backend
    backend = Backend()
    backend.configure(self._dbParams)
    queryFeatures = getHashFeatures(
        backend.getHashes(backend.scan(self._query)))
    subjectFeatures = getHashFeatures(
        backend.getHashes(backend.scan(self._subject.read)))

    # The running state handed to (and updated from) addBin.
    state = {
        # overallMatchedQueryOffsets and overallMatchedSubjectOffsets
        # will contain all int offsets that are in matching features
        # (and thus inside the matched region).
        'overallMatchedQueryOffsets': set(),
        'overallMatchedSubjectOffsets': set(),
        # overallUnmatchedQueryOffsets and overallUnmatchedSubjectOffsets
        # will contain all int offsets that are in features that don't
        # match, but which are inside the matched region.
        'overallUnmatchedQueryOffsets': set(),
        'overallUnmatchedSubjectOffsets': set(),
        # The set of all offsets in all bins (whether or not the offsets
        # are in matched features, unmatched features, or not in any
        # feature).
        'queryOffsetsInBins': set(),
        'subjectOffsetsInBins': set(),
        'score': 0.0,
        'denominatorQuery': 0.0,
        'denominatorSubject': 0.0,
        'matchedOffsetCount': 0,
        'matchedRegionScore': 0.0,
        'numeratorQuery': 0.0,
        'numeratorSubject': 0.0,
        'normalizerQuery': 0.0,
        'normalizerSubject': 0.0,
        'totalOffsetCount': 0,
        'scoreClass': self.__class__,
        'queryOffsetsInBinsCount': 0,
        'subjectOffsetsInBinsCount': 0,
        'numberOfBinsConsidered': 0,
    }

    # Add the significant bins one at a time until adding a bin would
    # lower the overall score, or we run out of bins.
    for binsConsidered, significantBin in enumerate(self._significantBins,
                                                    start=1):
        candidate = addBin(significantBin['bin'], queryFeatures,
                           subjectFeatures, state)
        if candidate['score'] < state['score']:
            # This bin lowered the overall score; discard its result and
            # stop.
            break
        # The new overall score is at least as high as the current one;
        # accept the newly calculated values and keep going.
        state.update(candidate)

    state['numberOfBinsConsidered'] = binsConsidered
    return state['score'], state
def calculateScore(self):
    """
    Calculates the overall score for all significant bins, as described
    above.

    @return: a C{float} overall score for all significant bins (or C{None}
        if there are no significant bins) and a C{dict} with information
        about the score.
    """
    # Remember the best (first) bin's score; the overall score is never
    # allowed to drop below it (see the adjustment at the end).
    if self._significantBins:
        bestBinScore = self._significantBins[0]['score']

    # Don't attempt an overall score if there are no significant bins.
    if not self._significantBins:
        analysis = {
            'score': None,
            'scoreClass': self.__class__,
        }
        return None, analysis

    # Imported locally, presumably to avoid a circular import — TODO
    # confirm.
    from light.backend import Backend
    backend = Backend()
    backend.configure(self._dbParams)
    allQueryFeatures = getHashFeatures(
        backend.getHashes(backend.scan(self._query)))
    allSubjectFeatures = getHashFeatures(
        backend.getHashes(backend.scan(self._subject.read)))

    # overallMatchedQueryOffsets and overallMatchedSubjectOffsets will
    # contain all int offsets that are in matching features (and thus
    # inside the matched region).
    overallMatchedQueryOffsets = set()
    overallMatchedSubjectOffsets = set()

    # overallUnmatchedQueryOffsets and overallUnmatchedSubjectOffsets
    # will contain all int offsets that are in features that don't match,
    # but which are inside the matched region.
    overallUnmatchedQueryOffsets = set()
    overallUnmatchedSubjectOffsets = set()

    # The set of all offsets in all bins (whether or not the offsets are
    # in matched features, unmatched features, or not in any feature).
    queryOffsetsInBins = set()
    subjectOffsetsInBins = set()

    # Get the features and their offsets which are matched and unmatched
    # in subject and query in all bins.
    for bin_ in (sb['bin'] for sb in self._significantBins):
        # Query.
        matchedOffsets, unmatchedOffsets, minOffset, maxOffset = (
            offsetsInBin(bin_, 'query', allQueryFeatures))
        overallMatchedQueryOffsets.update(matchedOffsets)
        overallUnmatchedQueryOffsets.update(unmatchedOffsets)
        queryOffsetsInBins.update(range(minOffset, maxOffset + 1))

        # Subject.
        matchedOffsets, unmatchedOffsets, minOffset, maxOffset = (
            offsetsInBin(bin_, 'subject', allSubjectFeatures))
        overallMatchedSubjectOffsets.update(matchedOffsets)
        overallUnmatchedSubjectOffsets.update(unmatchedOffsets)
        subjectOffsetsInBins.update(range(minOffset, maxOffset + 1))

    # Make sure none of the overall matched offsets are in the overall
    # unmatchedOffsets. (An offset may match in one bin but not in
    # another; the unmatched classification wins.)
    overallMatchedQueryOffsets -= overallUnmatchedQueryOffsets
    overallMatchedSubjectOffsets -= overallUnmatchedSubjectOffsets

    # Overall score calculation step 1: the matched region score (MRS).
    matchedOffsetCount = (len(overallMatchedQueryOffsets) +
                          len(overallMatchedSubjectOffsets))
    totalOffsetCount = (matchedOffsetCount +
                        len(overallUnmatchedQueryOffsets) +
                        len(overallUnmatchedSubjectOffsets))
    try:
        matchedRegionScore = matchedOffsetCount / totalOffsetCount
    except ZeroDivisionError:
        # A small optimization could be done here. If the MRS is zero,
        # we already know the overall score will be zero, so we could
        # return at this point. To keep things simple, for now, just
        # continue with the overall calculation.
        matchedRegionScore = 0.0

    # Overall score calculation step 2: the length normalizer (LN).
    normalizerQuery, numeratorQuery, denominatorQuery = (
        computeLengthNormalizer(allQueryFeatures,
                                overallMatchedQueryOffsets,
                                overallUnmatchedQueryOffsets,
                                queryOffsetsInBins))

    # There is a small optimization that could be done at this point.
    # If the query normalizer is 1.0, don't bother to compute a
    # normalizer for the subject (due to the use of max() below and
    # because a normalizer is always <= 1.0). But to keep the code
    # simpler, for now, we still compute both normalizers.
    normalizerSubject, numeratorSubject, denominatorSubject = (
        computeLengthNormalizer(allSubjectFeatures,
                                overallMatchedSubjectOffsets,
                                overallUnmatchedSubjectOffsets,
                                subjectOffsetsInBins))

    # Calculate the final score, as described in the docstring.
    score = matchedRegionScore * max(normalizerQuery, normalizerSubject)

    # The overall score can be lower than the best bin score, for
    # example when a sequence is compared against itself, where the
    # bestBinScore will be 1.0, but the overallScore can be lower,
    # because worse bins are taken into account. We don't allow that.
    if bestBinScore is not None and score < bestBinScore:
        overallScore = bestBinScore
        adjusted = True
    else:
        overallScore = score
        adjusted = False

    analysis = {
        'denominatorQuery': denominatorQuery,
        'denominatorSubject': denominatorSubject,
        'matchedOffsetCount': matchedOffsetCount,
        'matchedSubjectOffsetCount': len(overallMatchedSubjectOffsets),
        'matchedQueryOffsetCount': len(overallMatchedQueryOffsets),
        'matchedRegionScore': matchedRegionScore,
        'numeratorQuery': numeratorQuery,
        'numeratorSubject': numeratorSubject,
        'normalizerQuery': normalizerQuery,
        'normalizerSubject': normalizerSubject,
        'score': overallScore,
        'scoreClass': self.__class__,
        'totalOffsetCount': totalOffsetCount,
        'queryOffsetsInBins': len(queryOffsetsInBins),
        'subjectOffsetsInBins': len(subjectOffsetsInBins),
        'overallScoreAdjustedToBestBinScore': adjusted,
    }
    return overallScore, analysis