Esempio n. 1
0
    def calculateScore(self, binIndex):
        """
        Compute the score of a single histogram bin.

        Each feature in the bin earns the configured feature match score.
        Each feature that is not in the bin, but which falls inside the
        minimum and maximum offsets of the features in the bin, earns the
        configured feature mismatch score. The bin score is the sum of both
        contributions.

        @param binIndex: The C{int} index of the bin to examine.
        @return: A 2-tuple, containing the C{float} score of the bin and a
            C{dict} with the analysis leading to the score.
        """
        histogramBin = self._histogram[binIndex]
        inBinQuery, queryOffsets = histogramBinFeatures(
            histogramBin, 'query')
        inBinSubject, subjectOffsets = histogramBinFeatures(
            histogramBin, 'subject')

        matchScore = self._findParams.featureMatchScore * (
            len(inBinQuery) + len(inBinSubject))

        # Extreme offsets of the features in the bin, on each side.
        minQueryOffset = minWithDefault(queryOffsets, default=None)
        maxQueryOffset = maxWithDefault(queryOffsets, default=None)
        minSubjectOffset = minWithDefault(subjectOffsets, default=None)
        maxSubjectOffset = maxWithDefault(subjectOffsets, default=None)

        # Count the features that are not in the bin but that fall inside
        # the offset range of the features that are. Only these features
        # attract the mismatch score.
        queryMismatchCount = sum(
            1 for feature in self._allQueryFeatures - inBinQuery
            if featureInRange(feature, minQueryOffset, maxQueryOffset))
        subjectMismatchCount = sum(
            1 for feature in self._allSubjectFeatures - inBinSubject
            if featureInRange(feature, minSubjectOffset, maxSubjectOffset))

        mismatchScore = self._findParams.featureMismatchScore * (
            queryMismatchCount + subjectMismatchCount)

        score = matchScore + mismatchScore

        # Only the essentials are reported here. More detail (e.g., the
        # number of features in each class) can be added if this score
        # method turns out to be needed alongside FeatureAAScore.
        analysis = {
            'minQueryOffset': minQueryOffset,
            'maxQueryOffset': maxQueryOffset,
            'minSubjectOffset': minSubjectOffset,
            'maxSubjectOffset': maxSubjectOffset,
            'matchScore': matchScore,
            'mismatchScore': mismatchScore,
            'score': score,
            'scoreClass': self.__class__,
        }

        return score, analysis
Esempio n. 2
0
    def calculateScore(self, binIndex):
        """
        Compute the score of a single histogram bin.

        The score is the product of a matched region score (MRS) and a
        length normaliser (LN).

        The MRS is a quotient. Its numerator is the number of AA locations
        that are in features that are in hashes that match between the
        subject and the query. Its denominator adds to that the AA locations
        of the other features that lie within the matched regions of the
        query and the subject.

        On its own the MRS would let a short match score as highly as a long
        one, so it is multiplied by the LN. For the query and for the subject
        separately, the length of the matched region is divided by the length
        of the sequence; the LN is the larger of those two quotients.

        @param binIndex: The C{int} index of the bin to examine.
        @return: A 2-tuple, containing the C{float} score of the bin and a
            C{dict} with the analysis leading to the score.
        """
        histogramBin = self._histogram[binIndex]

        # Features, and their offsets, that match in the query and in the
        # subject. These give the numerator of the MRS.
        matchedQueryFeatures, matchedQueryOffsets = histogramBinFeatures(
            histogramBin, 'query')
        matchedSubjectFeatures, matchedSubjectOffsets = histogramBinFeatures(
            histogramBin, 'subject')

        # Extreme offsets of the matched region on each side.
        minQueryOffset = minWithDefault(matchedQueryOffsets, default=None)
        maxQueryOffset = maxWithDefault(matchedQueryOffsets, default=None)
        minSubjectOffset = minWithDefault(matchedSubjectOffsets, default=None)
        maxSubjectOffset = maxWithDefault(matchedSubjectOffsets, default=None)

        # Length of the matched region on each side. The subtraction raises
        # TypeError when the extremes are None, in which case the region is
        # taken to have zero length.
        try:
            queryMatchedRegionLength = maxQueryOffset - minQueryOffset + 1
        except TypeError:
            queryMatchedRegionLength = 0

        try:
            subjectMatchedRegionLength = (
                maxSubjectOffset - minSubjectOffset + 1)
        except TypeError:
            subjectMatchedRegionLength = 0

        # Offsets covered by features that did not match but that fall
        # within the matched region. Offsets that were matched are removed,
        # since an unmatched feature may overlap a matched one.
        unmatchedQueryOffsets = {
            offset
            for feature in self._allQueryFeatures - matchedQueryFeatures
            if featureInRange(feature, minQueryOffset, maxQueryOffset)
            for offset in feature.coveredOffsets()
        } - matchedQueryOffsets

        unmatchedSubjectOffsets = {
            offset
            for feature in self._allSubjectFeatures - matchedSubjectFeatures
            if featureInRange(feature, minSubjectOffset, maxSubjectOffset)
            for offset in feature.coveredOffsets()
        } - matchedSubjectOffsets

        matchedQueryOffsetCount = len(matchedQueryOffsets)
        matchedSubjectOffsetCount = len(matchedSubjectOffsets)
        matchedOffsetCount = (
            matchedQueryOffsetCount + matchedSubjectOffsetCount)
        totalOffsetCount = matchedOffsetCount + (
            len(unmatchedQueryOffsets) + len(unmatchedSubjectOffsets))

        # With no offsets at all there is nothing to divide by, and the
        # matched region score is zero.
        matchedRegionScore = (
            matchedOffsetCount / totalOffsetCount
            if totalOffsetCount else 0.0)

        # One length quotient per side (matched region length over sequence
        # length). The bigger of the two does the length normalisation.
        normaliserQuery = queryMatchedRegionLength / self._queryLen
        normaliserSubject = subjectMatchedRegionLength / self._subjectLen
        lengthNormaliser = max(normaliserQuery, normaliserSubject)

        score = matchedRegionScore * lengthNormaliser

        analysis = {
            'denominatorQuery': self._queryLen,
            'denominatorSubject': self._subjectLen,
            'matchedOffsetCount': matchedOffsetCount,
            'matchedSubjectOffsetCount': matchedSubjectOffsetCount,
            'matchedQueryOffsetCount': matchedQueryOffsetCount,
            'matchedRegionScore': matchedRegionScore,
            'maxQueryOffset': maxQueryOffset,
            'maxSubjectOffset': maxSubjectOffset,
            'minQueryOffset': minQueryOffset,
            'minSubjectOffset': minSubjectOffset,
            'queryMatchedRegionSize': queryMatchedRegionLength,
            'subjectMatchedRegionSize': subjectMatchedRegionLength,
            'normaliserQuery': normaliserQuery,
            'normaliserSubject': normaliserSubject,
            'score': score,
            'scoreClass': self.__class__,
            'totalOffsetCount': totalOffsetCount,
        }

        return score, analysis
Esempio n. 3
0
    def calculateScore(self, binIndex):
        """
        Compute the weighted score of a single histogram bin.

        The score is the product of a weighted matched region score and an
        unweighted length fraction.

        The matched region score is a quotient. Its numerator is the
        weighted number of AA locations that are in features that are in
        hashes that match between the subject and the query. Its
        denominator adds to that the weighted number of AA locations of the
        other features that lie within the matched regions of the query and
        the subject.

        On its own that quotient would let a short match score as highly as
        a long one, so it is multiplied by a length fraction. The fraction
        is computed separately for the query and the subject, and the
        larger of the two is used.

        @param binIndex: The C{int} index of the bin to examine.
        @return: A 2-tuple, containing the C{float} score of the bin and a
            C{dict} with the analysis leading to the score.
        """
        histogramBin = self._histogram[binIndex]
        weights = self._weights

        # Features, and their offsets with associated weights, that match
        # in the query and in the subject. These give the numerator.
        matchedQFeatures, matchedQOffsets = weightedHistogramBinFeatures(
            histogramBin, 'query', weights)
        matchedSFeatures, matchedSOffsets = weightedHistogramBinFeatures(
            histogramBin, 'subject', weights)

        # Extreme offsets of the matched region on each side.
        minQueryOffset = minWithDefault(matchedQOffsets, default=None)
        maxQueryOffset = maxWithDefault(matchedQOffsets, default=None)
        minSubjectOffset = minWithDefault(matchedSOffsets, default=None)
        maxSubjectOffset = maxWithDefault(matchedSOffsets, default=None)

        # The features on each side that are not among the matched ones.
        otherQueryFeatures = self._allQueryFeatures - matchedQFeatures
        otherSubjectFeatures = self._allSubjectFeatures - matchedSFeatures

        # For the unmatched features that fall within the matched region,
        # collect the weights that apply at each covered offset. These are
        # used for the denominator of the score.
        unmatchedQueryOffsets = defaultdict(list)
        for feature in otherQueryFeatures:
            if featureInRange(feature, minQueryOffset, maxQueryOffset):
                for offset in feature.coveredOffsets():
                    unmatchedQueryOffsets[offset].append(
                        weights[feature.name])

        unmatchedSubjectOffsets = defaultdict(list)
        for feature in otherSubjectFeatures:
            if featureInRange(feature, minSubjectOffset, maxSubjectOffset):
                for offset in feature.coveredOffsets():
                    unmatchedSubjectOffsets[offset].append(
                        weights[feature.name])

        # An unmatched feature may overlap a matched one, so drop every
        # offset that was matched from the unmatched collections.
        for offset in matchedQOffsets:
            unmatchedQueryOffsets.pop(offset, None)

        for offset in matchedSOffsets:
            unmatchedSubjectOffsets.pop(offset, None)

        weightedMatchedQuery = getWeightedOffsets(matchedQOffsets)
        weightedMatchedSubject = getWeightedOffsets(matchedSOffsets)
        matchedWeightsCount = weightedMatchedQuery + weightedMatchedSubject

        totalWeightsCount = matchedWeightsCount + (
            getWeightedOffsets(unmatchedQueryOffsets) +
            getWeightedOffsets(unmatchedSubjectOffsets))

        # The weighted score of the features within the matched region.
        try:
            matchedRegionScore = matchedWeightsCount / totalWeightsCount
        except ZeroDivisionError:
            matchedRegionScore = 0.0

        # The length fraction is not weighted. Per side, the numerator is
        # the number of matched offsets plus the number of unmatched offsets
        # in the matched region. The denominator is the numerator plus the
        # number of offsets covered by features outside the matched region.
        offsetsNotInMatchQuery = set()
        for feature in otherQueryFeatures:
            if not featureInRange(feature, minQueryOffset, maxQueryOffset):
                offsetsNotInMatchQuery.update(feature.coveredOffsets())
        offsetsNotInMatchQuery.difference_update(matchedQOffsets)

        matchedQueryOffsetCount = len(matchedQOffsets)
        numeratorQuery = matchedQueryOffsetCount + len(unmatchedQueryOffsets)
        denominatorQuery = numeratorQuery + len(offsetsNotInMatchQuery)

        offsetsNotInMatchSubject = set()
        for feature in otherSubjectFeatures:
            if not featureInRange(feature, minSubjectOffset,
                                  maxSubjectOffset):
                offsetsNotInMatchSubject.update(feature.coveredOffsets())
        offsetsNotInMatchSubject.difference_update(matchedSOffsets)

        matchedSubjectOffsetCount = len(matchedSOffsets)
        numeratorSubject = (
            matchedSubjectOffsetCount + len(unmatchedSubjectOffsets))
        denominatorSubject = numeratorSubject + len(offsetsNotInMatchSubject)

        # A side with nothing to divide by gets a fraction of 1.0.
        normaliserQuery = (
            numeratorQuery / denominatorQuery if denominatorQuery else 1.0)
        normaliserSubject = (
            numeratorSubject / denominatorSubject
            if denominatorSubject else 1.0)

        # The final score.
        score = matchedRegionScore * max(normaliserQuery, normaliserSubject)

        analysis = {
            'denominatorQuery': denominatorQuery,
            'denominatorSubject': denominatorSubject,
            'matchedOffsetCount': matchedWeightsCount,
            'matchedSubjectOffsetCount': matchedSubjectOffsetCount,
            'matchedQueryOffsetCount': matchedQueryOffsetCount,
            'weightedMatchedQueryOffsetCount': weightedMatchedQuery,
            'weightedMatchedSubjectOffsetCount': weightedMatchedSubject,
            'matchedRegionScore': matchedRegionScore,
            'maxQueryOffset': maxQueryOffset,
            'maxSubjectOffset': maxSubjectOffset,
            'minQueryOffset': minQueryOffset,
            'minSubjectOffset': minSubjectOffset,
            'numeratorQuery': numeratorQuery,
            'numeratorSubject': numeratorSubject,
            'normaliserQuery': normaliserQuery,
            'normaliserSubject': normaliserSubject,
            'score': score,
            'scoreClass': self.__class__,
            'totalOffsetCount': totalWeightsCount,
        }

        return score, analysis
Esempio n. 4
0
    def calculateScore(self, binIndex):
        """
        Compute the score of a single histogram bin.

        The score is the product of a matched region score (MRS) and a
        length normaliser (LN).

        The MRS is a quotient. Its numerator is the number of AA locations
        that are in features that are in hashes that match between the
        subject and the query. Its denominator adds to that the AA locations
        of the other features that lie within the matched regions of the
        query and the subject.

        On its own the MRS would let a short match score as highly as a long
        one, so it is multiplied by the LN. For the query and for the subject
        separately, the AA locations in hashes in the matched region are
        divided by all AA locations in hashes in that sequence; the LN is the
        bigger of those two quotients.

        @param binIndex: The C{int} index of the bin to examine.
        @return: A 2-tuple, containing the C{float} score of the bin and a
            C{dict} with the analysis leading to the score.
        """
        histogramBin = self._histogram[binIndex]

        # Features, and their offsets, that match in the query and in the
        # subject. These give the numerator of the MRS.
        matchedQueryFeatures, matchedQueryOffsets = histogramBinFeatures(
            histogramBin, 'query')
        matchedSubjectFeatures, matchedSubjectOffsets = histogramBinFeatures(
            histogramBin, 'subject')

        # Extreme offsets of the matched region on each side.
        minQueryOffset = minWithDefault(matchedQueryOffsets, default=None)
        maxQueryOffset = maxWithDefault(matchedQueryOffsets, default=None)
        minSubjectOffset = minWithDefault(matchedSubjectOffsets, default=None)
        maxSubjectOffset = maxWithDefault(matchedSubjectOffsets, default=None)

        # Offsets covered by features that did not match but that fall
        # within the matched region. Offsets that were matched are removed,
        # since an unmatched feature may overlap a matched one.
        unmatchedQueryOffsets = {
            offset
            for feature in self._allQueryFeatures - matchedQueryFeatures
            if featureInRange(feature, minQueryOffset, maxQueryOffset)
            for offset in feature.coveredOffsets()
        } - matchedQueryOffsets

        unmatchedSubjectOffsets = {
            offset
            for feature in self._allSubjectFeatures - matchedSubjectFeatures
            if featureInRange(feature, minSubjectOffset, maxSubjectOffset)
            for offset in feature.coveredOffsets()
        } - matchedSubjectOffsets

        matchedQueryOffsetCount = len(matchedQueryOffsets)
        matchedSubjectOffsetCount = len(matchedSubjectOffsets)
        matchedOffsetCount = (
            matchedQueryOffsetCount + matchedSubjectOffsetCount)
        totalOffsetCount = matchedOffsetCount + (
            len(unmatchedQueryOffsets) + len(unmatchedSubjectOffsets))

        # With no offsets at all there is nothing to divide by, and the
        # matched region score is zero.
        matchedRegionScore = (
            matchedOffsetCount / totalOffsetCount
            if totalOffsetCount else 0.0)

        # Per side, the numerator of the length fraction is the number of
        # matched offsets plus the number of unmatched offsets in the
        # matched region. The denominator is the numerator plus the number
        # of offsets covered by features outside the matched region.
        offsetsNotInMatchQuery = {
            offset
            for feature in self._allQueryFeatures - matchedQueryFeatures
            if not featureInRange(feature, minQueryOffset, maxQueryOffset)
            for offset in feature.coveredOffsets()
        } - matchedQueryOffsets
        numeratorQuery = matchedQueryOffsetCount + len(unmatchedQueryOffsets)
        denominatorQuery = numeratorQuery + len(offsetsNotInMatchQuery)

        offsetsNotInMatchSubject = {
            offset
            for feature in self._allSubjectFeatures - matchedSubjectFeatures
            if not featureInRange(feature, minSubjectOffset,
                                  maxSubjectOffset)
            for offset in feature.coveredOffsets()
        } - matchedSubjectOffsets
        numeratorSubject = (
            matchedSubjectOffsetCount + len(unmatchedSubjectOffsets))
        denominatorSubject = numeratorSubject + len(offsetsNotInMatchSubject)

        # A side with nothing to divide by gets a fraction of 1.0.
        normaliserQuery = (
            numeratorQuery / denominatorQuery if denominatorQuery else 1.0)
        normaliserSubject = (
            numeratorSubject / denominatorSubject
            if denominatorSubject else 1.0)

        score = matchedRegionScore * max(normaliserQuery, normaliserSubject)

        analysis = {
            'denominatorQuery': denominatorQuery,
            'denominatorSubject': denominatorSubject,
            'matchedOffsetCount': matchedOffsetCount,
            'matchedSubjectOffsetCount': matchedSubjectOffsetCount,
            'matchedQueryOffsetCount': matchedQueryOffsetCount,
            'matchedRegionScore': matchedRegionScore,
            'maxQueryOffset': maxQueryOffset,
            'maxSubjectOffset': maxSubjectOffset,
            'minQueryOffset': minQueryOffset,
            'minSubjectOffset': minSubjectOffset,
            'numeratorQuery': numeratorQuery,
            'numeratorSubject': numeratorSubject,
            'normaliserQuery': normaliserQuery,
            'normaliserSubject': normaliserSubject,
            'score': score,
            'scoreClass': self.__class__,
            'totalOffsetCount': totalOffsetCount,
        }

        return score, analysis
Esempio n. 5
0
 def testMinWithNonEmptyListAndDefault(self):
     """
     If minWithDefault is called with a non-empty list (and a default),
     then the min of the list must be returned.
     """
     # Compare by value. Whether two equal ints are the same object is an
     # interpreter implementation detail, so assertIs is not a reliable
     # check here.
     self.assertEqual(2, minWithDefault([3, 4, 2], default=7))
Esempio n. 6
0
 def testMinWithEmptyListAndDefault(self):
     """
     If minWithDefault is called with an empty list and a default value,
     then the default value must be returned.
     """
     # Compare by value. Whether two equal ints are the same object is an
     # interpreter implementation detail, so assertIs is not a reliable
     # check here.
     self.assertEqual(3, minWithDefault([], default=3))