def calculateScore(self, binIndex):
    """
    Compute the score of a single histogram bin.

    Each feature involved in the bin's matching hashes earns the match
    score; each feature that is not part of the match but falls inside the
    matched region of the query or subject earns the mismatch score.

    @param binIndex: The C{int} index of the bin to examine.
    @return: A 2-tuple, containing the C{float} score of the bin and a
        C{dict} with the analysis leading to the score.
    """
    bin_ = self._histogram[binIndex]
    queryFeatures, queryOffsets = histogramBinFeatures(bin_, 'query')
    subjectFeatures, subjectOffsets = histogramBinFeatures(bin_, 'subject')

    # Reward every matched feature, on either sequence.
    matchScore = self._findParams.featureMatchScore * (
        len(queryFeatures) + len(subjectFeatures))

    # The extreme offsets delimit the matched region (None when the bin
    # has no features).
    minQueryOffset = minWithDefault(queryOffsets, default=None)
    maxQueryOffset = maxWithDefault(queryOffsets, default=None)
    minSubjectOffset = minWithDefault(subjectOffsets, default=None)
    maxSubjectOffset = maxWithDefault(subjectOffsets, default=None)

    # Penalise features that are not among those in the bin but which
    # fall inside the extreme offsets of the bin's features.
    unmatchedQueryCount = sum(
        1 for feature in self._allQueryFeatures - queryFeatures
        if featureInRange(feature, minQueryOffset, maxQueryOffset))
    unmatchedSubjectCount = sum(
        1 for feature in self._allSubjectFeatures - subjectFeatures
        if featureInRange(feature, minSubjectOffset, maxSubjectOffset))
    mismatchScore = self._findParams.featureMismatchScore * (
        unmatchedQueryCount + unmatchedSubjectCount)

    score = matchScore + mismatchScore

    # The analysis is deliberately minimal; more detail (e.g., the number
    # of features in each class) can be added easily if this score method
    # stays in use.
    analysis = {
        'minQueryOffset': minQueryOffset,
        'maxQueryOffset': maxQueryOffset,
        'minSubjectOffset': minSubjectOffset,
        'maxSubjectOffset': maxSubjectOffset,
        'matchScore': matchScore,
        'mismatchScore': mismatchScore,
        'score': score,
        'scoreClass': self.__class__,
    }

    return score, analysis
def calculateScore(self, binIndex):
    """
    Compute the score of a single histogram bin.

    The score is the product of two quotients. The matched region score
    (MRS) is the fraction of feature-covered AA locations inside the
    matched region that belong to hashes matching between subject and
    query. Because a short match could otherwise score as well as a long
    one, the MRS is multiplied by a length normaliser (LN): the matched
    region length divided by the full sequence length, taking whichever of
    the query / subject quotients is bigger.

    @param binIndex: The C{int} index of the bin to examine.
    @return: A 2-tuple, containing the C{float} score of the bin and a
        C{dict} with the analysis leading to the score.
    """
    bin_ = self._histogram[binIndex]

    # Features (and their offsets) in hashes that match between query and
    # subject. These produce the numerator of the matched region score.
    matchedQueryFeatures, matchedQueryOffsets = histogramBinFeatures(
        bin_, 'query')
    matchedSubjectFeatures, matchedSubjectOffsets = histogramBinFeatures(
        bin_, 'subject')

    # Extreme offsets of the matched region (None when the bin is empty).
    minQueryOffset = minWithDefault(matchedQueryOffsets, default=None)
    maxQueryOffset = maxWithDefault(matchedQueryOffsets, default=None)
    minSubjectOffset = minWithDefault(matchedSubjectOffsets, default=None)
    maxSubjectOffset = maxWithDefault(matchedSubjectOffsets, default=None)

    def _regionLength(minOffset, maxOffset):
        # An empty bin has None extremes and so a zero-length region.
        return 0 if maxOffset is None else maxOffset - minOffset + 1

    queryMatchedRegionLength = _regionLength(minQueryOffset, maxQueryOffset)
    subjectMatchedRegionLength = _regionLength(
        minSubjectOffset, maxSubjectOffset)

    def _unmatchedOffsets(allFeatures, matchedFeatures, matchedOffsets,
                          minOffset, maxOffset):
        # Offsets covered by non-matching features that lie inside the
        # matched region. Matched offsets are removed at the end because
        # an unmatched feature can overlap with a matched one.
        offsets = set()
        for feature in allFeatures - matchedFeatures:
            if featureInRange(feature, minOffset, maxOffset):
                offsets.update(feature.coveredOffsets())
        return offsets - matchedOffsets

    unmatchedQueryOffsets = _unmatchedOffsets(
        self._allQueryFeatures, matchedQueryFeatures, matchedQueryOffsets,
        minQueryOffset, maxQueryOffset)
    unmatchedSubjectOffsets = _unmatchedOffsets(
        self._allSubjectFeatures, matchedSubjectFeatures,
        matchedSubjectOffsets, minSubjectOffset, maxSubjectOffset)

    matchedOffsetCount = (len(matchedQueryOffsets) +
                          len(matchedSubjectOffsets))
    totalOffsetCount = (matchedOffsetCount + len(unmatchedQueryOffsets) +
                        len(unmatchedSubjectOffsets))

    matchedRegionScore = (matchedOffsetCount / totalOffsetCount
                          if totalOffsetCount else 0.0)

    # Normalise by length, using whichever sequence yields the bigger
    # matched-region fraction.
    lengthNormaliser = max(queryMatchedRegionLength / self._queryLen,
                           subjectMatchedRegionLength / self._subjectLen)

    score = matchedRegionScore * lengthNormaliser

    analysis = {
        'denominatorQuery': self._queryLen,
        'denominatorSubject': self._subjectLen,
        'matchedOffsetCount': matchedOffsetCount,
        'matchedSubjectOffsetCount': len(matchedSubjectOffsets),
        'matchedQueryOffsetCount': len(matchedQueryOffsets),
        'matchedRegionScore': matchedRegionScore,
        'maxQueryOffset': maxQueryOffset,
        'maxSubjectOffset': maxSubjectOffset,
        'minQueryOffset': minQueryOffset,
        'minSubjectOffset': minSubjectOffset,
        'queryMatchedRegionSize': queryMatchedRegionLength,
        'subjectMatchedRegionSize': subjectMatchedRegionLength,
        'normaliserQuery': queryMatchedRegionLength / self._queryLen,
        'normaliserSubject': subjectMatchedRegionLength / self._subjectLen,
        'score': score,
        'scoreClass': self.__class__,
        'totalOffsetCount': totalOffsetCount,
    }

    return score, analysis
def calculateScore(self, binIndex):
    """
    Calculates the score for a given histogram bin.

    The score is a quotient. In the numerator, we have the weighted number
    of AA locations that are in features that are in hashes that match
    between the subject and the query. The denominator is the weighted
    number of AA locations that are in features which are in all hashes in
    the matching regions of the query and subject. Leaving the score like
    this would mean that a short match can have the same score as a long
    match. To account for this, the quotient from above is multiplied by
    the matched fraction of the shorter sequence.

    @param binIndex: The C{int} index of the bin to examine.
    @return: A 2-tuple, containing the C{float} score of the bin and a
        C{dict} with the analysis leading to the score.
    """
    # Get the features and their offsets with associated weights which
    # match in subject and query.
    # These will be used to calculate the numerator of the score.
    matchedQFeatures, matchedQOffsets = weightedHistogramBinFeatures(
        self._histogram[binIndex], 'query', self._weights)
    matchedSFeatures, matchedSOffsets = weightedHistogramBinFeatures(
        self._histogram[binIndex], 'subject', self._weights)

    # Get the extreme offsets in the matched region of query and subject.
    # NOTE(review): these are None when the bin holds no features, in
    # which case featureInRange (below) presumably selects nothing —
    # confirm against its implementation.
    minQueryOffset = minWithDefault(matchedQOffsets, default=None)
    maxQueryOffset = maxWithDefault(matchedQOffsets, default=None)
    minSubjectOffset = minWithDefault(matchedSOffsets, default=None)
    maxSubjectOffset = maxWithDefault(matchedSOffsets, default=None)

    # Get all features and their offsets with their associated weights
    # which are present in the subject and the query within the matched
    # region.
    # These will be used to calculate the denominator of the score.
    # Each offset maps to the list of weights of the features covering it
    # (an offset covered by several features collects several weights).
    unmatchedQueryOffsets = defaultdict(list)
    for feature in filter(
            lambda f: featureInRange(f, minQueryOffset, maxQueryOffset),
            self._allQueryFeatures - matchedQFeatures):
        for offset in feature.coveredOffsets():
            unmatchedQueryOffsets[offset].append(
                self._weights[feature.name])

    unmatchedSubjectOffsets = defaultdict(list)
    for feature in filter(
            lambda f: featureInRange(f, minSubjectOffset, maxSubjectOffset),
            self._allSubjectFeatures - matchedSFeatures):
        for offset in feature.coveredOffsets():
            unmatchedSubjectOffsets[offset].append(
                self._weights[feature.name])

    # The unmatched offsets in the query and the subject shouldn't contain
    # any offsets that were matched. This can occur if an unmatched feature
    # overlaps with a matched feature.
    for offset in matchedQOffsets.keys():
        unmatchedQueryOffsets.pop(offset, None)
    for offset in matchedSOffsets.keys():
        unmatchedSubjectOffsets.pop(offset, None)

    # Sum the weights of the matched offsets (query plus subject): the
    # numerator of the matched region score.
    matchedWeightsCount = (
        getWeightedOffsets(matchedQOffsets) +
        getWeightedOffsets(matchedSOffsets))

    # The denominator additionally counts the weights of the unmatched
    # offsets inside the matched region.
    totalWeightsCount = matchedWeightsCount + (
        getWeightedOffsets(unmatchedQueryOffsets) +
        getWeightedOffsets(unmatchedSubjectOffsets))

    # Calculate the weighted score of the features within the matched
    # region. A bin with no covered offsets at all scores zero.
    try:
        matchedRegionScore = matchedWeightsCount / totalWeightsCount
    except ZeroDivisionError:
        matchedRegionScore = 0.0

    # The calculation of the fraction to normalise by length consists of
    # three parts: the numerator is the matchedOffsetCount + either the
    # unmatchedQueryOffsets or the unmatchedSubjectOffsets. The denominator
    # is the numerator + the length of hashes in either subject or query
    # which are outside the matched region. The sequence with less covered
    # indices is used to do the normalisation.
    # Note that the fraction to normalise by length is not weighted.
    offsetsNotInMatchQuery = set()
    for feature in filterfalse(
            lambda f: featureInRange(f, minQueryOffset, maxQueryOffset),
            self._allQueryFeatures - matchedQFeatures):
        offsetsNotInMatchQuery.update(feature.coveredOffsets())
    # Drop offsets that were matched (an out-of-region feature may still
    # overlap the matched region).
    matchedQOffsetsSet = set(matchedQOffsets.keys())
    offsetsNotInMatchQuery -= matchedQOffsetsSet
    numeratorQuery = (len(matchedQOffsets) +
                      len(unmatchedQueryOffsets))
    denominatorQuery = numeratorQuery + len(offsetsNotInMatchQuery)

    offsetsNotInMatchSubject = set()
    for feature in filterfalse(
            lambda f: featureInRange(f, minSubjectOffset, maxSubjectOffset),
            self._allSubjectFeatures - matchedSFeatures):
        offsetsNotInMatchSubject.update(feature.coveredOffsets())
    matchedSOffsetsSet = set(matchedSOffsets.keys())
    offsetsNotInMatchSubject -= matchedSOffsetsSet
    numeratorSubject = (len(matchedSOffsets) +
                        len(unmatchedSubjectOffsets))
    denominatorSubject = numeratorSubject + len(offsetsNotInMatchSubject)

    # Calculate the fraction to normalise by length. A sequence with no
    # covered offsets at all imposes no length penalty.
    try:
        normaliserQuery = numeratorQuery / denominatorQuery
    except ZeroDivisionError:
        normaliserQuery = 1.0
    try:
        normaliserSubject = numeratorSubject / denominatorSubject
    except ZeroDivisionError:
        normaliserSubject = 1.0

    # Calculate the final score.
    score = matchedRegionScore * max(normaliserQuery, normaliserSubject)

    analysis = {
        'denominatorQuery': denominatorQuery,
        'denominatorSubject': denominatorSubject,
        'matchedOffsetCount': matchedWeightsCount,
        'matchedSubjectOffsetCount': len(matchedSOffsets),
        'matchedQueryOffsetCount': len(matchedQOffsets),
        'weightedMatchedQueryOffsetCount':
            getWeightedOffsets(matchedQOffsets),
        'weightedMatchedSubjectOffsetCount':
            getWeightedOffsets(matchedSOffsets),
        'matchedRegionScore': matchedRegionScore,
        'maxQueryOffset': maxQueryOffset,
        'maxSubjectOffset': maxSubjectOffset,
        'minQueryOffset': minQueryOffset,
        'minSubjectOffset': minSubjectOffset,
        'numeratorQuery': numeratorQuery,
        'numeratorSubject': numeratorSubject,
        'normaliserQuery': normaliserQuery,
        'normaliserSubject': normaliserSubject,
        'score': score,
        'scoreClass': self.__class__,
        'totalOffsetCount': totalWeightsCount,
    }

    return score, analysis
def calculateScore(self, binIndex):
    """
    Compute the score of a single histogram bin.

    The score is the product of two quotients. The matched region score
    (MRS) is the fraction of feature-covered AA locations inside the
    matched region that belong to hashes matching between subject and
    query. Because a short match could otherwise score as well as a long
    one, the MRS is multiplied by a length normaliser (LN): the number of
    AA locations in hashes inside the matched region divided by the number
    of AA locations in hashes in the whole sequence, taking whichever of
    the query / subject quotients is bigger.

    @param binIndex: The C{int} index of the bin to examine.
    @return: A 2-tuple, containing the C{float} score of the bin and a
        C{dict} with the analysis leading to the score.
    """
    bin_ = self._histogram[binIndex]

    # Features (and their offsets) in hashes that match between query and
    # subject. These produce the numerator of the matched region score.
    matchedQueryFeatures, matchedQueryOffsets = histogramBinFeatures(
        bin_, 'query')
    matchedSubjectFeatures, matchedSubjectOffsets = histogramBinFeatures(
        bin_, 'subject')

    # Extreme offsets of the matched region (None when the bin is empty).
    minQueryOffset = minWithDefault(matchedQueryOffsets, default=None)
    maxQueryOffset = maxWithDefault(matchedQueryOffsets, default=None)
    minSubjectOffset = minWithDefault(matchedSubjectOffsets, default=None)
    maxSubjectOffset = maxWithDefault(matchedSubjectOffsets, default=None)

    def _coveredOffsets(select, features, minOffset, maxOffset,
                        matchedOffsets):
        # Offsets covered by the given non-matching features, keeping
        # those inside (select=filter) or outside (select=filterfalse)
        # the matched region. Matched offsets are removed at the end
        # because an unmatched feature can overlap with a matched one.
        offsets = set()
        for feature in select(
                lambda f: featureInRange(f, minOffset, maxOffset),
                features):
            offsets.update(feature.coveredOffsets())
        return offsets - matchedOffsets

    queryCandidates = self._allQueryFeatures - matchedQueryFeatures
    subjectCandidates = self._allSubjectFeatures - matchedSubjectFeatures

    # Non-matching offsets inside the matched region: the extra part of
    # the matched region score's denominator.
    unmatchedQueryOffsets = _coveredOffsets(
        filter, queryCandidates, minQueryOffset, maxQueryOffset,
        matchedQueryOffsets)
    unmatchedSubjectOffsets = _coveredOffsets(
        filter, subjectCandidates, minSubjectOffset, maxSubjectOffset,
        matchedSubjectOffsets)

    matchedOffsetCount = (len(matchedQueryOffsets) +
                          len(matchedSubjectOffsets))
    totalOffsetCount = (matchedOffsetCount + len(unmatchedQueryOffsets) +
                        len(unmatchedSubjectOffsets))

    matchedRegionScore = (matchedOffsetCount / totalOffsetCount
                          if totalOffsetCount else 0.0)

    # For the length normaliser, also count offsets covered by features
    # that lie outside the matched region.
    offsetsNotInMatchQuery = _coveredOffsets(
        filterfalse, queryCandidates, minQueryOffset, maxQueryOffset,
        matchedQueryOffsets)
    offsetsNotInMatchSubject = _coveredOffsets(
        filterfalse, subjectCandidates, minSubjectOffset, maxSubjectOffset,
        matchedSubjectOffsets)

    numeratorQuery = len(matchedQueryOffsets) + len(unmatchedQueryOffsets)
    denominatorQuery = numeratorQuery + len(offsetsNotInMatchQuery)
    numeratorSubject = (len(matchedSubjectOffsets) +
                        len(unmatchedSubjectOffsets))
    denominatorSubject = numeratorSubject + len(offsetsNotInMatchSubject)

    # A sequence with no covered offsets at all imposes no length penalty.
    normaliserQuery = (numeratorQuery / denominatorQuery
                       if denominatorQuery else 1.0)
    normaliserSubject = (numeratorSubject / denominatorSubject
                         if denominatorSubject else 1.0)

    score = matchedRegionScore * max(normaliserQuery, normaliserSubject)

    analysis = {
        'denominatorQuery': denominatorQuery,
        'denominatorSubject': denominatorSubject,
        'matchedOffsetCount': matchedOffsetCount,
        'matchedSubjectOffsetCount': len(matchedSubjectOffsets),
        'matchedQueryOffsetCount': len(matchedQueryOffsets),
        'matchedRegionScore': matchedRegionScore,
        'maxQueryOffset': maxQueryOffset,
        'maxSubjectOffset': maxSubjectOffset,
        'minQueryOffset': minQueryOffset,
        'minSubjectOffset': minSubjectOffset,
        'numeratorQuery': numeratorQuery,
        'numeratorSubject': numeratorSubject,
        'normaliserQuery': normaliserQuery,
        'normaliserSubject': normaliserSubject,
        'score': score,
        'scoreClass': self.__class__,
        'totalOffsetCount': totalOffsetCount,
    }

    return score, analysis
def testMinWithNonEmptyListAndDefault(self):
    """
    If minWithDefault is called with a non-empty list (and a default),
    then the min of the list must be returned (and the default ignored).
    """
    # Use assertEqual, not assertIs: assertIs compares object identity
    # and only passed here because CPython caches small integers — the
    # contract under test is the returned value, not its identity.
    self.assertEqual(2, minWithDefault([3, 4, 2], default=7))
def testMinWithEmptyListAndDefault(self):
    """
    If minWithDefault is called with an empty list and a default value,
    then the default value must be returned.
    """
    # Use assertEqual, not assertIs: assertIs compares object identity
    # and only passed here because CPython caches small integers — the
    # contract under test is the returned value, not its identity.
    self.assertEqual(3, minWithDefault([], default=3))