Beispiel #1
0
 def testFinalizeWithNoData(self):
     """
     Finalizing a histogram to which nothing has been added must leave
     every one of its bins empty.
     """
     histogram = Histogram(5)
     histogram.finalize()
     binCounts = [len(contents) for contents in histogram.bins]
     self.assertEqual([0, 0, 0, 0, 0], binCounts)
Beispiel #2
0
 def testNoDataValue(self):
     """
     When a value is added without an accompanying datum, the value itself
     must be what is stored in the bin.
     """
     histogram = Histogram(1)
     histogram.add(3)
     histogram.finalize()
     expectedBins = [[3]]
     self.assertEqual(expectedBins, histogram.bins)
Beispiel #3
0
 def testRepeatedFinalize(self):
     """
     Finalizing an already finalized histogram must raise a RuntimeError.
     """
     histogram = Histogram()
     histogram.add(3)
     histogram.finalize()
     six.assertRaisesRegex(self, RuntimeError,
                           '^Histogram already finalized$',
                           histogram.finalize)
Beispiel #4
0
 def testOneElementBinWidth(self):
     """
     A histogram holding a single element must have a bin width of zero.
     """
     histogram = Histogram()
     histogram.add(3)
     histogram.finalize()
     width = histogram.binWidth
     self.assertEqual(0.0, width)
Beispiel #5
0
 def testGetItem(self):
     """
     Indexing a histogram (via __getitem__) must return the bin at that
     index.
     """
     histogram = Histogram(3)
     for value in range(9):
         histogram.add(value)
     histogram.finalize()
     for index, expected in enumerate(([0, 1, 2], [3, 4, 5])):
         self.assertEqual(expected, histogram[index])
 def testAlwaysMustBeTrue(self):
     """
     The isSignificant method of the Always significance class must
     return True.
     """
     h = Histogram(5)
     for value in [1, 1, 1, 1, 1, 6, 7, 8, 9]:
         h.add(value)
     h.finalize()
     always = Always()
     self.assertTrue(always.isSignificant(0))
Beispiel #7
0
 def testTenElementsInThreeBinsBinWidth(self):
     """
     Placing the 10 values 0-9 into a histogram with 3 bins must result
     in a bin width of 3.0.
     """
     histogram = Histogram(3)
     for value in range(10):
         histogram.add(value)
     histogram.finalize()
     width = histogram.binWidth
     self.assertEqual(3.0, width)
Beispiel #8
0
 def testNineElementsInThreeBins(self):
     """
     Placing the 9 values 0-8 into a histogram with 3 bins must put the
     expected values into each bin.
     """
     histogram = Histogram(3)
     for value in range(9):
         histogram.add(value)
     histogram.finalize()
     expectedBins = [[0, 1, 2], [3, 4, 5], [6, 7, 8]]
     self.assertEqual(expectedBins, histogram.bins)
Beispiel #9
0
 def testTwoElementsBinWidth(self):
     """
     A 5-bin histogram holding two elements that are 1.0 apart must have
     a bin width of 0.2.
     """
     histogram = Histogram(5)
     for value in (3, 4):
         histogram.add(value)
     histogram.finalize()
     width = histogram.binWidth
     self.assertEqual(0.2, width)
Beispiel #10
0
 def testHashFractionIsSignificantWhenSignificant(self):
     """
     HashFraction.isSignificant must return True when asked about a
     significant bin.
     """
     h = Histogram(5)
     for value in [1, 1, 1, 1, 1, 6, 7, 8, 9]:
         h.add(value)
     h.finalize()
     hashFraction = HashFraction(h, 10, 0.1)
     self.assertTrue(hashFraction.isSignificant(0))
Beispiel #11
0
 def testElementIsStoredInBin(self):
     """
     With one bin and one added element, the bin must hold the very same
     object that was passed to add (identity, not just equality).
     """
     datum = object()
     histogram = Histogram(1)
     histogram.add(3, datum)
     histogram.finalize()
     firstBin = histogram.bins[0]
     self.assertIs(datum, firstBin[0])
Beispiel #12
0
 def testAlwaysSignificanceAnalysis(self):
     """
     The analysis attribute of an Always instance must name the
     significance method.
     """
     h = Histogram(5)
     for value in [1, 1, 1, 1, 1, 6, 7, 8, 9]:
         h.add(value)
     h.finalize()
     always = Always()
     expected = {'significanceMethod': 'Always'}
     self.assertEqual(expected, always.analysis)
Beispiel #13
0
 def testMaxBinHeightIsSignificantWhenNotSignificant(self):
     """
     MaxBinHeight.isSignificant must return False when asked about a bin
     that is not significant.
     """
     h = Histogram(5)
     for value in [1, 1, 1, 1, 1, 7, 8, 9]:
         h.add(value)
     h.finalize()
     maxBinHeight = MaxBinHeight(h, SQUIRRELPOX, DB)
     self.assertFalse(maxBinHeight.isSignificant(1))
Beispiel #14
0
 def testMeanBinHeightIsSignificantWhenSignificant(self):
     """
     MeanBinHeight.isSignificant must return True when asked about a
     significant bin.
     """
     h = Histogram(5)
     for value in [1, 1, 1, 1, 1, 6, 7, 8, 9]:
         h.add(value)
     h.finalize()
     meanBinHeight = MeanBinHeight(h, COWPOX, DB)
     self.assertTrue(meanBinHeight.isSignificant(0))
Beispiel #15
0
 def testGetItemInvalidIndex(self):
     """
     Asking __getitem__ for a bin index that does not exist must raise
     IndexError.
     """
     histogram = Histogram(3)
     for value in range(9):
         histogram.add(value)
     histogram.finalize()
     pattern = '^list index out of range$'
     six.assertRaisesRegex(self, IndexError, pattern,
                           histogram.__getitem__, 4)
Beispiel #16
0
 def testOneElementMaxMin(self):
     """
     A histogram holding a single element must report that element as
     both its max and its min.
     """
     histogram = Histogram()
     histogram.add(3)
     histogram.finalize()
     self.assertEqual(3, histogram.max)
     self.assertEqual(3, histogram.min)
Beispiel #17
0
 def testTwoElementsMaxMin(self):
     """
     A histogram holding two elements must report the larger as its max
     and the smaller as its min.
     """
     histogram = Histogram()
     for value in (3, 4):
         histogram.add(value)
     histogram.finalize()
     self.assertEqual(4, histogram.max)
     self.assertEqual(3, histogram.min)
Beispiel #18
0
 def testHashFractionSignificanceAnalysis(self):
     """
     The analysis attribute of a HashFraction instance must hold the
     expected cutoff and method name.
     """
     h = Histogram(5)
     for value in [1, 1, 1, 1, 1, 6, 7, 8, 9]:
         h.add(value)
     h.finalize()
     hashFraction = HashFraction(h, 10, 0.1)
     expected = {
         'significanceCutoff': 1.0,
         'significanceMethod': 'HashFraction',
     }
     self.assertEqual(expected, hashFraction.analysis)
Beispiel #19
0
 def testMaxBinHeightSignificanceAnalysis(self):
     """
     The analysis attribute of a MaxBinHeight instance must hold the
     expected cutoff and method name.
     """
     h = Histogram(5)
     for value in [1, 1, 1, 1, 1, 6, 7, 8, 9]:
         h.add(value)
     h.finalize()
     maxBinHeight = MaxBinHeight(h, COWPOX, DB)
     expected = {
         'significanceCutoff': 0.0,
         'significanceMethod': 'MaxBinHeight',
     }
     self.assertEqual(expected, maxBinHeight.analysis)
Beispiel #20
0
 def testAddDataAfterFinalized(self):
     """
     Adding to a histogram after it has been finalized must raise a
     RuntimeError.
     """
     histogram = Histogram()
     histogram.add(3)
     histogram.finalize()
     pattern = ('^Additional data cannot be added: histogram already '
                'finalized$')
     six.assertRaisesRegex(self, RuntimeError, pattern, histogram.add, 3)
Beispiel #21
0
 def testMeanBinHeightSignificanceAnalysis(self):
     """
     The analysis attribute of a MeanBinHeight instance must hold the
     expected mean, cutoff, method name and standard deviation.
     """
     h = Histogram(5)
     for value in [1, 1, 1, 1, 1, 6, 7, 8, 9]:
         h.add(value)
     h.finalize()
     meanBinHeight = MeanBinHeight(h, COWPOX, DB)
     expected = {
         'meanBinHeight': 0.0,
         'significanceCutoff': 0.0,
         'significanceMethod': 'MeanBinHeight',
         'standardDeviation': 0.0,
     }
     self.assertEqual(expected, meanBinHeight.analysis)
Beispiel #22
0
 def testAAFractionWhenSignificant(self):
     """
     AAFraction.isSignificant must return True when asked about a
     significant bin.
     """
     subjectLandmark = Landmark('AlphaHelix', 'A', 0, 9)
     subjectTrigPoint = TrigPoint('Peaks', 'P', 21)
     queryLandmark = Landmark('AlphaHelix', 'A', 10, 9)
     queryTrigPoint = TrigPoint('Peaks', 'P', 25)
     match = {
         'subjectLandmark': subjectLandmark,
         'subjectTrigPoint': subjectTrigPoint,
         'queryLandmark': queryLandmark,
         'queryTrigPoint': queryTrigPoint,
     }
     h = Histogram(1)
     h.add(0, match)
     h.finalize()
     aaFraction = AAFraction(h, 10, 0.75)
     self.assertTrue(aaFraction.isSignificant(0))
Beispiel #23
0
 def testAAFractionSignificanceAnalysis(self):
     """
     The analysis attribute of an AAFraction instance must hold the
     expected cutoff and method name.
     """
     subjectLandmark = Landmark('AlphaHelix', 'A', 0, 9)
     subjectTrigPoint = TrigPoint('Peaks', 'P', 21)
     queryLandmark = Landmark('AlphaHelix', 'A', 10, 9)
     queryTrigPoint = TrigPoint('Peaks', 'P', 25)
     match = {
         'subjectLandmark': subjectLandmark,
         'subjectTrigPoint': subjectTrigPoint,
         'queryLandmark': queryLandmark,
         'queryTrigPoint': queryTrigPoint,
     }
     h = Histogram(3)
     # The same match is placed at each of the values 0, 1 and 2.
     for value in range(3):
         h.add(value, match)
     h.finalize()
     aaFraction = AAFraction(h, 10, 0.75)
     self.assertTrue(aaFraction.isSignificant(0))
     expected = {
         'significanceCutoff': 7.5,
         'significanceMethod': 'AAFraction',
     }
     self.assertEqual(expected, aaFraction.analysis)
Beispiel #24
0
    def _checkPositiveNegative(self, nBins, values):
        """
        Check that histogramming a set of values and histogramming the same
        values with their signs flipped gives mirror-image bin counts (i.e.,
        the same counts, in reverse order).

        @param nBins: The C{int} number of bins to use in the histogram.
        @param values: A C{list} of values to insert into the histogram.
        """
        def fill(items):
            # Build and finalize a histogram of the items, returning it
            # along with the number of elements in each of its bins.
            histogram = Histogram(nBins)
            for item in items:
                histogram.add(item)
            histogram.finalize()
            return histogram, [len(contents) for contents in histogram.bins]

        h1, counts1 = fill(values)

        # The negated values fall into bins in the opposite order, so
        # flip their counts before comparing.
        h2, counts2 = fill([-value for value in values])
        counts2.reverse()

        # Assemble a detailed failure message describing each bin whose
        # counts do not agree.
        differences = ['Counts differ']
        for i, count1 in enumerate(counts1):
            count2 = counts2[i]
            if count1 == count2:
                continue
            h1Low = h1.min + i * h1.binWidth
            h1High = h1Low + h1.binWidth
            h2Low = h2.min + i * h2.binWidth
            h2High = h2Low + h2.binWidth
            differences.append(
                '  bin %d (h1 bin range: %.7f to %.7f, h2 bin range: '
                '%.7f to %.7f): count %d != count %d' %
                (i, h1Low, h1High, h2Low, h2High, count1, count2))

        # The (reversed) counts must agree exactly.
        self.assertEqual(counts1, counts2, '\n'.join(differences))
Beispiel #25
0
 def testFiveBinsMinusTwoPointFiveToPlusTwoPointFiveIntermediates(self):
     """
     In a 5-bin histogram whose data spans -2.5 to +2.5, a value lying
     between bin boundaries must be placed in the expected bin.
     """
     cases = (
         (-2, [1, 0, 0, 0, 0]),
         (-1, [0, 1, 0, 0, 0]),
         (0, [0, 0, 1, 0, 0]),
         (1, [0, 0, 0, 1, 0]),
         (2, [0, 0, 0, 0, 1]),
     )
     for value, expectedCounts in cases:
         histogram = Histogram(5)
         # Adding the two extremes first fixes the min and max values.
         for boundary in (-2.5, 2.5):
             histogram.add(boundary)
         histogram.add(value)
         histogram.finalize()
         counts = [len(contents) for contents in histogram.bins]
         # Subtract 1 from the first and last bin counts, to discount the
         # -2.5 and 2.5 boundary values that were added manually.
         counts[0] -= 1
         counts[-1] -= 1
         self.assertEqual(expectedCounts, counts)
Beispiel #26
0
class Template(object):
    """
    Parse an ASCII art picture of a light matter match and provide access to
    it.

    All paired features found in the template are placed (at value 0) into a
    single-bin histogram, which is finalized and made available as
    C{self.histogram}.

    @param template: A C{str} template picture of the match.
    @raise ValueError: If the query and subject do not have the same number of
        paired features.
    """
    def __init__(self, template):
        self.template = self.templateToList(template)
        self.query = Query(self.template)
        self.subject = Subject(self.template)

        if len(self.query.pairedFeatures) != len(self.subject.pairedFeatures):
            raise ValueError(
                'The query and subject do not have the same number of paired '
                'features (%d != %d)' % (len(self.query.pairedFeatures),
                                         len(self.subject.pairedFeatures)))

        # Union the landmark and trig point names from the query and subject.
        self.landmarks = self.query.landmarks | self.subject.landmarks
        self.trigPoints = self.query.trigPoints | self.subject.trigPoints

        # A single bin suffices: every match is added at value 0 below.
        self.histogram = Histogram(1)

        for queryPair, subjectPair in zip(self.query.pairedFeatures,
                                          self.subject.pairedFeatures):
            _, queryLandmark, _, queryTrigPoint = queryPair
            _, subjectLandmark, _, subjectTrigPoint = subjectPair

            self.histogram.add(0, {
                'queryLandmark': queryLandmark,
                'queryTrigPoint': queryTrigPoint,
                'subjectLandmark': subjectLandmark,
                'subjectTrigPoint': subjectTrigPoint,
            })

        self.histogram.finalize()

    @staticmethod
    def templateToList(template):
        """
        Convert a picture to a list of right-trimmed non-blank lines.

        @param template: A C{str} template picture of the match.
        @return: A C{list} of the newline-separated lines of C{template}
            that are not entirely whitespace, each with its trailing
            whitespace removed (leading whitespace is preserved).
        """
        # A raw string is used so that the \s reaches the regex engine
        # without Python first treating it as an (invalid) string escape.
        whitespace = re.compile(r'^\s*$')
        return [line.rstrip() for line in template.split('\n')
                if whitespace.match(line) is None]

    def calculateScore(self, dbParams=None, findParams=None):
        """
        Using a given scoring method, calculate the score of the alignment
        between the query and subject in the template.

        @param dbParams: An instance of C{DatabaseParameters} or C{None}, in
            which case database parameters are created using the landmark
            and trig point names found in the template.
        @param findParams: An instance of C{light.parameters.FindParameters} or
            C{None} to use default find parameters.
        @raises ValueError: If C{dbParams} is passed and the landmarks and
            trig points it specifies do not include all the landmarks and trig
            points named in the template. Or if the C{binScoreMethod} in
            C{findParams} is unknown.
        @return: A 2-tuple, being the result of calling the C{calculateScore}
            method of the C{binScoreMethod} class. The tuple contains a
            C{float} score of the bin and a C{dict} with the analysis leading
            to the score (see light/bin_score.py).
        """
        findParams = findParams or FindParameters()
        if dbParams is None:
            dbParams = DatabaseParameters(landmarks=self.landmarks,
                                          trigPoints=self.trigPoints)
        else:
            # The passed parameters must cover every finder the template
            # mentions.
            missing = self.landmarks - set(dbParams.landmarkFinderNames())
            if missing:
                raise ValueError(
                    'The template mentions landmark finders (%s) that are '
                    'not present in the passed DatabaseParameters instance' %
                    ', '.join(sorted(missing)))

            missing = self.trigPoints - set(dbParams.trigPointFinderNames())
            if missing:
                raise ValueError(
                    'The template mentions trig point finders (%s) that are '
                    'not present in the passed DatabaseParameters instance' %
                    ', '.join(sorted(missing)))

        database = Database(dbParams=dbParams)
        _, subjectIndex, subjectHashCount = database.addSubject(
            self.subject.read)
        dbSubject = database.getSubjectByIndex(subjectIndex)

        binScoreMethod = findParams.binScoreMethod
        if binScoreMethod == 'NoneScore':
            scorer = NoneScore()
        elif binScoreMethod == 'MinHashesScore':
            # Count the query's hashes by scanning it with the database's
            # backend.
            be = database._connector._backend
            scannedQuery = be.scan(self.query.read)
            queryHashCount = sum(
                len(hashInfo)
                for hashInfo in be.getHashes(scannedQuery).values())
            scorer = MinHashesScore(self.histogram,
                                    min(queryHashCount, subjectHashCount))
        elif binScoreMethod == 'FeatureMatchingScore':
            scorer = FeatureMatchingScore(
                self.histogram, self.query.read, dbSubject, dbParams,
                findParams)
        elif binScoreMethod == 'FeatureAAScore':
            scorer = FeatureAAScore(
                self.histogram, self.query.read, dbSubject, dbParams)
        elif binScoreMethod == 'WeightedFeatureAAScore':
            scorer = WeightedFeatureAAScore(
                self.histogram, self.query.read, dbSubject, dbParams,
                findParams.weights)
        else:
            raise ValueError('Unknown bin score method %r' % binScoreMethod)

        # The histogram has only one bin (index 0), holding all matches.
        return scorer.calculateScore(0)
Beispiel #27
0
    def __init__(self,
                 query,
                 connector,
                 matches,
                 queryHashCount,
                 findParams=None,
                 nonMatchingHashes=None,
                 storeFullAnalysis=False):
        """
        For each matched subject, bin the scaled landmark offset deltas of
        its matches into a histogram, find the significant bins, score them,
        compute an overall score, and record the outcome in C{self.analysis}
        (keyed by subject index).

        @param query: The query that was matched. Its length and C{sequence}
            attribute are read, and it is passed on to the significance and
            scoring classes. (Presumably a read instance -- confirm against
            callers.)
        @param connector: An object providing a C{dbParams} attribute and a
            C{getSubjectByIndex} method.
        @param matches: A mapping from subject index to an iterable of
            matches. Each match must have C{'subjectLandmark'} and
            C{'queryLandmark'} items, each with an C{offset} attribute.
        @param queryHashCount: A count that is compared (via C{min}) with each
            subject's C{hashCount}. (Presumably the number of hashes found in
            the query -- confirm against callers.)
        @param findParams: Find parameters, or a false value (e.g., C{None})
            to use a default C{FindParameters} instance.
        @param nonMatchingHashes: Stored on the instance; not otherwise used
            by this method.
        @param storeFullAnalysis: If true, a full analysis (including the
            histogram and the score analyses) is stored for every matched
            subject. Otherwise a reduced analysis is stored, and only for
            subjects that have at least one significant bin.
        @raise ValueError: If the significance method, bin score method, or
            overall score method named in the find parameters is unknown.
        """
        self.query = query
        self.connector = connector
        self.matches = matches  # Only saved on self for testing.
        self.queryHashCount = queryHashCount
        findParams = findParams or FindParameters()
        self._findParams = findParams
        self.nonMatchingHashes = nonMatchingHashes
        self._storeFullAnalysis = storeFullAnalysis
        self.analysis = defaultdict(dict)
        deltaScale = findParams.deltaScale
        scoreGetter = itemgetter('score')
        be = Backend()
        be.configure(connector.dbParams)

        # queryAACount is only needed (and only defined) when the AAFraction
        # significance method is in use; it is read again in the matching
        # branch inside the loop below.
        if findParams.significanceMethod == 'AAFraction':
            queryAACount = len(be.scan(query).coveredIndices())

        # Go through all the subjects that were matched at all, and put the
        # match offset deltas into bins so we can decide which (if any) of
        # the matches is significant.
        for subjectIndex in matches:
            subject = connector.getSubjectByIndex(subjectIndex)
            # Use a histogram to bin scaled (landmark, trigPoint) offset
            # deltas.
            nBins = max(len(query), len(subject))
            # Make sure the number of bins is odd, else Histogram() will raise.
            nBins |= 0x1
            histogram = Histogram(nBins)
            add = histogram.add

            # To ensure the set of query/subject offset deltas is the same
            # no matter which of the sequences is the query and which is
            # the subject, we negate all deltas if the subject sequence
            # sorts first. This is just a way of canonicalizing the set of
            # deltas. If we don't canonicalize, we get sets of deltas with
            # opposite signs, like {-4, -2, 6} and {-6, 2, 4} depending on
            # which sequence is the subject and which the query. This
            # occasionally leads to hard-to-debug and awkward-to-fix
            # differences in the histogram binning at bin boundaries due to
            # tiny floating point differences. The simple solution is to
            # canonicalize the deltas based on an arbitrary consistent
            # difference between the subject and query.
            negateDeltas = subject.read.sequence < query.sequence

            for match in matches[subjectIndex]:
                # The delta is the difference between the
                # corresponding landmark offsets
                subjectLandmarkOffset = match['subjectLandmark'].offset
                queryLandmarkOffset = match['queryLandmark'].offset
                delta = subjectLandmarkOffset - queryLandmarkOffset
                if negateDeltas:
                    delta = -delta

                # Add the information about this common landmark /
                # trig point hash to the histogram bucket for the
                # query landmark to subject landmark offset delta.
                add(scaleLinear(delta, deltaScale), match)

            histogram.finalize()

            minHashCount = min(queryHashCount, subject.hashCount)

            # Pick the object used to decide which bins are significant.
            significanceMethod = findParams.significanceMethod
            if significanceMethod == 'Always':
                significance = Always()
            elif significanceMethod == 'HashFraction':
                significance = HashFraction(histogram, minHashCount,
                                            findParams.significanceFraction)
            elif significanceMethod == 'MaxBinHeight':
                significance = MaxBinHeight(histogram, query, connector)
            elif significanceMethod == 'MeanBinHeight':
                significance = MeanBinHeight(histogram, query, connector)
            elif significanceMethod == 'AAFraction':
                featureAACount = (queryAACount +
                                  len(be.scan(subject.read).coveredIndices()))
                significance = AAFraction(histogram, featureAACount,
                                          findParams.significanceFraction)
            else:
                raise ValueError('Unknown significance method %r' %
                                 significanceMethod)

            # Pick the object used to score individual bins.
            binScoreMethod = findParams.binScoreMethod
            if binScoreMethod == 'NoneScore':
                scorer = NoneScore()
            elif binScoreMethod == 'MinHashesScore':
                scorer = MinHashesScore(histogram, minHashCount)
            elif binScoreMethod == 'FeatureMatchingScore':
                scorer = FeatureMatchingScore(histogram, query, subject,
                                              connector.dbParams, findParams)
            elif binScoreMethod == 'FeatureAAScore':
                scorer = FeatureAAScore(histogram, query, subject,
                                        connector.dbParams)
            elif binScoreMethod == 'WeightedFeatureAAScore':
                scorer = WeightedFeatureAAScore(histogram, query, subject,
                                                connector.dbParams,
                                                findParams.weights)
            elif binScoreMethod == 'FeatureAALengthScore':
                scorer = FeatureAALengthScore(histogram, query, subject,
                                              connector.dbParams)
            else:
                raise ValueError('Unknown bin score method %r' %
                                 binScoreMethod)

            # Find bins with a significant number of elements and score them.
            significantBins = []
            for binIndex, bin_ in enumerate(histogram.bins):
                if significance.isSignificant(binIndex):
                    score, scoreAnalysis = scorer.calculateScore(binIndex)
                    significantBin = {
                        'bin': bin_,
                        'index': binIndex,
                        'score': score
                    }
                    if storeFullAnalysis:
                        significantBin['scoreAnalysis'] = scoreAnalysis
                    significantBins.append(significantBin)

            # Order the significant bins best score first, so the best bin
            # score is that of the first bin (or None if there are no
            # significant bins).
            if significantBins:
                significantBins.sort(key=scoreGetter, reverse=True)
                bestBinScore = significantBins[0]['score']
            else:
                bestBinScore = None

            # Note that 'scorer' is rebound here: from this point on it is
            # the overall scorer, not the per-bin scorer used above.
            overallScoreMethod = findParams.overallScoreMethod
            if overallScoreMethod == 'BestBinScore':
                scorer = BestBinScore(histogram, significantBins)
            elif overallScoreMethod == 'SignificantBinScore':
                scorer = SignificantBinScore(significantBins, query, subject,
                                             connector.dbParams)
            elif overallScoreMethod == 'GreedySignificantBinScore':
                scorer = GreedySignificantBinScore(significantBins, query,
                                                   subject, connector.dbParams)
            else:
                raise ValueError('Unknown overall score method %r' %
                                 overallScoreMethod)

            overallScore, overallScoreAnalysis = scorer.calculateScore()

            # With full analysis, every matched subject gets an entry.
            # Otherwise only subjects with at least one significant bin are
            # recorded, and with fewer details.
            if storeFullAnalysis:
                self.analysis[subjectIndex] = {
                    'histogram': histogram,
                    'bestBinScore': bestBinScore,
                    'overallScore': overallScore,
                    'overallScoreAnalysis': overallScoreAnalysis,
                    'significantBins': significantBins,
                    'significanceAnalysis': significance.analysis,
                }
            elif significantBins:
                self.analysis[subjectIndex] = {
                    'bestBinScore': bestBinScore,
                    'overallScore': overallScore,
                    'significantBins': significantBins,
                }