def testFinalizeWithNoData(self):
    """
    Finalizing a histogram that never received any data must leave every
    one of its bins empty.
    """
    histogram = Histogram(5)
    histogram.finalize()
    binSizes = list(map(len, histogram.bins))
    self.assertEqual([0] * 5, binSizes)
def testNoDataValue(self):
    """
    When a value is added without an accompanying datum, the value itself
    must be what is stored in the bin.
    """
    histogram = Histogram(1)
    histogram.add(3)
    histogram.finalize()
    expectedBins = [[3]]
    self.assertEqual(expectedBins, histogram.bins)
def testRepeatedFinalize(self):
    """
    Calling finalize on an already finalized histogram must raise a
    RuntimeError.
    """
    histogram = Histogram()
    histogram.add(3)
    histogram.finalize()
    expectedMessage = '^Histogram already finalized$'
    six.assertRaisesRegex(self, RuntimeError, expectedMessage,
                          histogram.finalize)
def testOneElementBinWidth(self):
    """
    A histogram that holds a single element must report a bin width of
    zero.
    """
    histogram = Histogram()
    histogram.add(3)
    histogram.finalize()
    width = histogram.binWidth
    self.assertEqual(0.0, width)
def testGetItem(self):
    """
    Indexing a histogram (via __getitem__) must give back the bin at that
    index.
    """
    histogram = Histogram(3)
    for value in range(9):
        histogram.add(value)
    histogram.finalize()
    firstBin = histogram[0]
    self.assertEqual([0, 1, 2], firstBin)
    secondBin = histogram[1]
    self.assertEqual([3, 4, 5], secondBin)
def testAlwaysMustBeTrue(self):
    """
    The Always significance method must report a bin as significant.
    """
    histogram = Histogram(5)
    for value in (1, 1, 1, 1, 1, 6, 7, 8, 9):
        histogram.add(value)
    histogram.finalize()
    always = Always()
    self.assertTrue(always.isSignificant(0))
def testTenElementsInThreeBinsBinWidth(self):
    """
    Ten elements (0 through 9) spread over 3 bins must give a bin width of
    3.0.
    """
    histogram = Histogram(3)
    for value in range(10):
        histogram.add(value)
    histogram.finalize()
    width = histogram.binWidth
    self.assertEqual(3.0, width)
def testNineElementsInThreeBins(self):
    """
    Nine elements (0 through 8) placed into 3 bins must end up in the
    expected bins.
    """
    histogram = Histogram(3)
    for value in range(9):
        histogram.add(value)
    histogram.finalize()
    expectedBins = [
        [0, 1, 2],
        [3, 4, 5],
        [6, 7, 8],
    ]
    self.assertEqual(expectedBins, histogram.bins)
def testTwoElementsBinWidth(self):
    """
    With 5 buckets and two elements that are 1.0 apart, the bin width must
    be 0.2.
    """
    histogram = Histogram(5)
    for value in (3, 4):
        histogram.add(value)
    histogram.finalize()
    width = histogram.binWidth
    self.assertEqual(0.2, width)
def testHashFractionIsSignificantWhenSignificant(self):
    """
    HashFraction.isSignificant must return True when asked about a bin
    that is significant.
    """
    histogram = Histogram(5)
    for value in (1, 1, 1, 1, 1, 6, 7, 8, 9):
        histogram.add(value)
    histogram.finalize()
    hashFraction = HashFraction(histogram, 10, 0.1)
    self.assertTrue(hashFraction.isSignificant(0))
def testElementIsStoredInBin(self):
    """
    With one bin and one added element, the very object that was passed to
    add must be the one found in the bin.
    """
    datum = object()
    histogram = Histogram(1)
    histogram.add(3, datum)
    histogram.finalize()
    stored = histogram.bins[0][0]
    self.assertIs(datum, stored)
def testAlwaysSignificanceAnalysis(self):
    """
    The Always significance method must expose the expected analysis.
    """
    histogram = Histogram(5)
    for value in (1, 1, 1, 1, 1, 6, 7, 8, 9):
        histogram.add(value)
    histogram.finalize()
    always = Always()
    expected = {'significanceMethod': 'Always'}
    self.assertEqual(expected, always.analysis)
def testMaxBinHeightIsSignificantWhenNotSignificant(self):
    """
    MaxBinHeight.isSignificant must return False when asked about a bin
    that is not significant.
    """
    histogram = Histogram(5)
    for value in (1, 1, 1, 1, 1, 7, 8, 9):
        histogram.add(value)
    histogram.finalize()
    maxBinHeight = MaxBinHeight(histogram, SQUIRRELPOX, DB)
    self.assertFalse(maxBinHeight.isSignificant(1))
def testMeanBinHeightIsSignificantWhenSignificant(self):
    """
    MeanBinHeight.isSignificant must return True when asked about a bin
    that is significant.
    """
    histogram = Histogram(5)
    for value in (1, 1, 1, 1, 1, 6, 7, 8, 9):
        histogram.add(value)
    histogram.finalize()
    meanBinHeight = MeanBinHeight(histogram, COWPOX, DB)
    self.assertTrue(meanBinHeight.isSignificant(0))
def testGetItemInvalidIndex(self):
    """
    Asking __getitem__ for a bin index that does not exist must raise an
    IndexError.
    """
    histogram = Histogram(3)
    for value in range(9):
        histogram.add(value)
    histogram.finalize()
    expectedMessage = '^list index out of range$'
    six.assertRaisesRegex(self, IndexError, expectedMessage,
                          histogram.__getitem__, 4)
def testOneElementMaxMin(self):
    """
    A histogram that holds a single element must have both its max and its
    min equal to that element's value.
    """
    histogram = Histogram()
    histogram.add(3)
    histogram.finalize()
    largest = histogram.max
    self.assertEqual(3, largest)
    smallest = histogram.min
    self.assertEqual(3, smallest)
def testTwoElementsMaxMin(self):
    """
    A histogram that holds two elements must report the larger as its max
    and the smaller as its min.
    """
    histogram = Histogram()
    for value in (3, 4):
        histogram.add(value)
    histogram.finalize()
    largest = histogram.max
    self.assertEqual(4, largest)
    smallest = histogram.min
    self.assertEqual(3, smallest)
def testHashFractionSignificanceAnalysis(self):
    """
    The HashFraction significance method must expose the expected analysis.
    """
    histogram = Histogram(5)
    for value in (1, 1, 1, 1, 1, 6, 7, 8, 9):
        histogram.add(value)
    histogram.finalize()
    hashFraction = HashFraction(histogram, 10, 0.1)
    expected = {
        'significanceCutoff': 1.0,
        'significanceMethod': 'HashFraction',
    }
    self.assertEqual(expected, hashFraction.analysis)
def testMaxBinHeightSignificanceAnalysis(self):
    """
    The MaxBinHeight significance method must expose the expected analysis.
    """
    histogram = Histogram(5)
    for value in (1, 1, 1, 1, 1, 6, 7, 8, 9):
        histogram.add(value)
    histogram.finalize()
    maxBinHeight = MaxBinHeight(histogram, COWPOX, DB)
    expected = {
        'significanceCutoff': 0.0,
        'significanceMethod': 'MaxBinHeight',
    }
    self.assertEqual(expected, maxBinHeight.analysis)
def testAddDataAfterFinalized(self):
    """
    Adding to a histogram after it has been finalized must raise a
    RuntimeError.
    """
    histogram = Histogram()
    histogram.add(3)
    histogram.finalize()
    expectedMessage = ('^Additional data cannot be added: histogram already '
                       'finalized$')
    six.assertRaisesRegex(self, RuntimeError, expectedMessage,
                          histogram.add, 3)
def testMeanBinHeightSignificanceAnalysis(self):
    """
    The MeanBinHeight significance method must expose the expected
    analysis.
    """
    histogram = Histogram(5)
    for value in (1, 1, 1, 1, 1, 6, 7, 8, 9):
        histogram.add(value)
    histogram.finalize()
    meanBinHeight = MeanBinHeight(histogram, COWPOX, DB)
    expected = {
        'meanBinHeight': 0.0,
        'significanceCutoff': 0.0,
        'significanceMethod': 'MeanBinHeight',
        'standardDeviation': 0.0,
    }
    self.assertEqual(expected, meanBinHeight.analysis)
def testAAFractionWhenSignificant(self):
    """
    AAFraction.isSignificant must return True when asked about a bin that
    is significant.
    """
    subjectLandmark = Landmark('AlphaHelix', 'A', 0, 9)
    subjectTrigPoint = TrigPoint('Peaks', 'P', 21)
    queryLandmark = Landmark('AlphaHelix', 'A', 10, 9)
    queryTrigPoint = TrigPoint('Peaks', 'P', 25)
    match = {
        'subjectLandmark': subjectLandmark,
        'subjectTrigPoint': subjectTrigPoint,
        'queryLandmark': queryLandmark,
        'queryTrigPoint': queryTrigPoint,
    }
    histogram = Histogram(1)
    histogram.add(0, match)
    histogram.finalize()
    aaFraction = AAFraction(histogram, 10, 0.75)
    self.assertTrue(aaFraction.isSignificant(0))
def testAAFractionSignificanceAnalysis(self):
    """
    The AAFraction significance method must expose the expected analysis
    once isSignificant has been called.
    """
    subjectLandmark = Landmark('AlphaHelix', 'A', 0, 9)
    subjectTrigPoint = TrigPoint('Peaks', 'P', 21)
    queryLandmark = Landmark('AlphaHelix', 'A', 10, 9)
    queryTrigPoint = TrigPoint('Peaks', 'P', 25)
    match = {
        'subjectLandmark': subjectLandmark,
        'subjectTrigPoint': subjectTrigPoint,
        'queryLandmark': queryLandmark,
        'queryTrigPoint': queryTrigPoint,
    }
    histogram = Histogram(3)
    # The same match is stored under three different values.
    for value in range(3):
        histogram.add(value, match)
    histogram.finalize()
    aaFraction = AAFraction(histogram, 10, 0.75)
    self.assertTrue(aaFraction.isSignificant(0))
    expected = {
        'significanceCutoff': 7.5,
        'significanceMethod': 'AAFraction',
    }
    self.assertEqual(expected, aaFraction.analysis)
def _checkPositiveNegative(self, nBins, values):
    """
    When a set of values is put into a histogram, the bin counts that
    result must be the same (just with the order reversed) as those that
    result from a histogram made with the same set of values but with
    opposite sign.

    @param nBins: The C{int} number of bins to use in the histogram.
    @param values: A C{list} of values to insert into the histogram.
    """
    # Make a histogram of the values and get all the bin counts.
    h1 = Histogram(nBins)
    for value in values:
        h1.add(value)
    h1.finalize()
    counts1 = [len(bin_) for bin_ in h1.bins]

    # Make a histogram of the negated values and get all the bin counts,
    # reversed so that counts2[i] is the count of the h2 bin that mirrors
    # bin i of h1.
    h2 = Histogram(nBins)
    for value in values:
        h2.add(-value)
    h2.finalize()
    counts2 = [len(bin_) for bin_ in h2.bins][::-1]

    # Prepare a useful error message, in case there are any differences.
    differences = ['Counts differ']
    lastIndex = len(counts2) - 1
    for i, (count1, count2) in enumerate(zip(counts1, counts2)):
        if count1 != count2:
            h1Low = h1.min + i * h1.binWidth
            h1High = h1Low + h1.binWidth
            # Because counts2 is reversed, the count at position i came
            # from h2 bin (lastIndex - i), so that is the bin whose range
            # must be reported.
            h2Index = lastIndex - i
            h2Low = h2.min + h2Index * h2.binWidth
            h2High = h2Low + h2.binWidth
            differences.append(
                '  bin %d (h1 bin range: %.7f to %.7f, h2 bin range: '
                '%.7f to %.7f): count %d != count %d' %
                (i, h1Low, h1High, h2Low, h2High, count1, count2))

    # Bin counts must be the same.
    self.assertEqual(counts1, counts2, '\n'.join(differences))
def testFiveBinsMinusTwoPointFiveToPlusTwoPointFiveIntermediates(self):
    """
    With 5 bins and a data range of -2.5 to +2.5, a value added between
    the histogram boundaries must be placed in the expected bin.
    """
    cases = (
        (-2, [1, 0, 0, 0, 0]),
        (-1, [0, 1, 0, 0, 0]),
        (+0, [0, 0, 1, 0, 0]),
        (+1, [0, 0, 0, 1, 0]),
        (+2, [0, 0, 0, 0, 1]),
    )
    for value, expectedCounts in cases:
        histogram = Histogram(5)
        # The first two additions fix the min and max of the data range,
        # the third is the value under test.
        for item in (-2.5, 2.5, value):
            histogram.add(item)
        histogram.finalize()
        counts = list(map(len, histogram.bins))
        # Subtract 1 from the first and last bin counts, to discount the
        # -2.5 and 2.5 boundary values that were added manually.
        counts[0] -= 1
        counts[-1] -= 1
        self.assertEqual(expectedCounts, counts)
class Template(object):
    """
    Parse an ASCII art picture of a light matter match and provide access to
    it.

    @param template: A C{str} template picture of the match.
    @raise ValueError: If the query and subject do not have the same number
        of paired features.
    """

    def __init__(self, template):
        self.template = self.templateToList(template)
        self.query = Query(self.template)
        self.subject = Subject(self.template)

        if len(self.query.pairedFeatures) != len(self.subject.pairedFeatures):
            raise ValueError(
                'The query and subject do not have the same number of paired '
                'features (%d != %d)' %
                (len(self.query.pairedFeatures),
                 len(self.subject.pairedFeatures)))

        # Union the landmark and trig point names from the query and subject.
        self.landmarks = self.query.landmarks | self.subject.landmarks
        self.trigPoints = self.query.trigPoints | self.subject.trigPoints

        # All paired features go into the single bin (index 0) of a one-bin
        # histogram, which is the bin that calculateScore scores.
        self.histogram = Histogram(1)

        for queryPair, subjectPair in zip(self.query.pairedFeatures,
                                          self.subject.pairedFeatures):
            _, queryLandmark, _, queryTrigPoint = queryPair
            _, subjectLandmark, _, subjectTrigPoint = subjectPair

            self.histogram.add(0, {
                'queryLandmark': queryLandmark,
                'queryTrigPoint': queryTrigPoint,
                'subjectLandmark': subjectLandmark,
                'subjectTrigPoint': subjectTrigPoint,
            })

        self.histogram.finalize()

    @staticmethod
    def templateToList(template):
        """
        Convert a picture to a list of non-blank lines with trailing
        whitespace removed.

        @param template: A C{str} template picture of the match.
        @return: A C{list} of the non-blank lines found by splitting
            C{template} on newlines, each with its trailing whitespace
            stripped (leading whitespace is kept).
        """
        # A raw string is needed here: backslash-s is not a valid escape
        # sequence in a regular string literal.
        whitespace = re.compile(r'^\s*$')
        return [line.rstrip() for line in template.split('\n')
                if whitespace.match(line) is None]

    def calculateScore(self, dbParams=None, findParams=None):
        """
        Using a given scoring method, calculate the score of the alignment
        between the query and subject in the template.

        @param dbParams: An instance of C{DatabaseParameters} or C{None}, in
            which case database parameters are made from the landmark and
            trig point finder names that appear in the template.
        @param findParams: An instance of C{light.parameters.FindParameters}
            or C{None} to use default find parameters.
        @raises ValueError: If C{dbParams} is passed and the landmarks and
            trig points it specifies do not include all the landmarks and
            trig points named in the template. Or if the C{binScoreMethod}
            in C{findParams} is unknown.
        @return: A 2-tuple, being the result of calling the
            C{calculateScore} method of the C{binScoreMethod} class. The
            tuple contains a C{float} score of the bin and a C{dict} with
            the analysis leading to the score (see light/bin_score.py).
        """
        findParams = findParams or FindParameters()
        if dbParams is None:
            dbParams = DatabaseParameters(landmarks=self.landmarks,
                                          trigPoints=self.trigPoints)
        else:
            # The passed parameters must know about every finder that the
            # template mentions.
            missing = self.landmarks - set(dbParams.landmarkFinderNames())
            if missing:
                raise ValueError(
                    'The template mentions landmark finders (%s) that are '
                    'not present in the passed DatabaseParameters instance' %
                    ', '.join(sorted(missing)))

            missing = self.trigPoints - set(dbParams.trigPointFinderNames())
            if missing:
                raise ValueError(
                    'The template mentions trig point finders (%s) that are '
                    'not present in the passed DatabaseParameters instance' %
                    ', '.join(sorted(missing)))

        database = Database(dbParams=dbParams)
        _, subjectIndex, subjectHashCount = database.addSubject(
            self.subject.read)
        dbSubject = database.getSubjectByIndex(subjectIndex)

        binScoreMethod = findParams.binScoreMethod
        if binScoreMethod == 'NoneScore':
            scorer = NoneScore()
        elif binScoreMethod == 'MinHashesScore':
            # Count the query's hashes using the database's own backend.
            be = database._connector._backend
            scannedQuery = be.scan(self.query.read)
            queryHashCount = sum(
                len(hashInfo)
                for hashInfo in be.getHashes(scannedQuery).values())
            scorer = MinHashesScore(self.histogram,
                                    min(queryHashCount, subjectHashCount))
        elif binScoreMethod == 'FeatureMatchingScore':
            scorer = FeatureMatchingScore(
                self.histogram, self.query.read, dbSubject, dbParams,
                findParams)
        elif binScoreMethod == 'FeatureAAScore':
            scorer = FeatureAAScore(
                self.histogram, self.query.read, dbSubject, dbParams)
        elif binScoreMethod == 'WeightedFeatureAAScore':
            scorer = WeightedFeatureAAScore(
                self.histogram, self.query.read, dbSubject, dbParams,
                findParams.weights)
        else:
            raise ValueError('Unknown bin score method %r' % binScoreMethod)

        # The template histogram has just one bin, at index 0.
        return scorer.calculateScore(0)
def __init__(self, query, connector, matches, queryHashCount,
             findParams=None, nonMatchingHashes=None,
             storeFullAnalysis=False):
    """
    Bin the offset deltas of the matches for each matched subject into a
    histogram, find the significant bins, score them, compute an overall
    score, and record the outcome in C{self.analysis}, keyed by subject
    index.

    @param query: The query that was matched. Its length and C{sequence}
        attribute are used, and it is passed on to the backend scanner and
        the score classes.
    @param connector: An object providing C{dbParams} and a
        C{getSubjectByIndex} method.
    @param matches: A C{dict}-like object keyed by subject index. Each
        value is an iterable of match C{dict}s that have (at least)
        'subjectLandmark' and 'queryLandmark' keys whose values have an
        C{offset} attribute.
    @param queryHashCount: The number of hashes in the query. The smaller
        of this and the subject's C{hashCount} is passed to the
        C{HashFraction} significance method and the C{MinHashesScore}
        scorer.
    @param findParams: An instance of C{FindParameters} or C{None} to use
        default find parameters.
    @param nonMatchingHashes: Stored on the instance as-is; not otherwise
        used here.
    @param storeFullAnalysis: If true, the full analysis (histogram,
        per-bin score analysis, overall score analysis and significance
        analysis) is stored for every matched subject. Otherwise only a
        summary is stored, and only for subjects that have at least one
        significant bin.
    @raises ValueError: If the significance method, the bin score method,
        or the overall score method named in C{findParams} is unknown.
    """
    self.query = query
    self.connector = connector
    self.matches = matches  # Only saved on self for testing.
    self.queryHashCount = queryHashCount
    findParams = findParams or FindParameters()
    self._findParams = findParams
    self.nonMatchingHashes = nonMatchingHashes
    self._storeFullAnalysis = storeFullAnalysis
    self.analysis = defaultdict(dict)
    deltaScale = findParams.deltaScale
    scoreGetter = itemgetter('score')
    be = Backend()
    be.configure(connector.dbParams)

    # The query's covered index count is only needed by the AAFraction
    # significance method, so it is only computed (once) in that case.
    if findParams.significanceMethod == 'AAFraction':
        queryAACount = len(be.scan(query).coveredIndices())

    # Go through all the subjects that were matched at all, and put the
    # match offset deltas into bins so we can decide which (if any) of
    # the matches is significant.
    for subjectIndex in matches:
        subject = connector.getSubjectByIndex(subjectIndex)
        # Use a histogram to bin scaled (landmark, trigPoint) offset
        # deltas.
        nBins = max(len(query), len(subject))
        # Make sure the number of bins is odd, else Histogram() will raise.
        nBins |= 0x1
        histogram = Histogram(nBins)
        add = histogram.add

        # To ensure the set of query/subject offset deltas is the same
        # no matter which of the sequences is the query and which is
        # the subject, we negate all deltas if the subject sequence
        # sorts first. This is just a way of canonicalizing the set of
        # deltas. If we don't canonicalize, we get sets of deltas with
        # opposite signs, like {-4, -2, 6} and {-6, 2, 4} depending on
        # which sequence is the subject and which the query. This
        # occasionally leads to hard-to-debug and awkward-to-fix
        # differences in the histogram binning at bin boundaries due to
        # tiny floating point differences. The simple solution is to
        # canonicalize the deltas based on an arbitrary consistent
        # difference between the subject and query.
        negateDeltas = subject.read.sequence < query.sequence

        for match in matches[subjectIndex]:
            # The delta is the difference between the
            # corresponding landmark offsets
            subjectLandmarkOffset = match['subjectLandmark'].offset
            queryLandmarkOffset = match['queryLandmark'].offset
            delta = subjectLandmarkOffset - queryLandmarkOffset
            if negateDeltas:
                delta = -delta

            # Add the information about this common landmark /
            # trig point hash to the histogram bucket for the
            # query landmark to subject landmark offset delta.
            add(scaleLinear(delta, deltaScale), match)

        histogram.finalize()

        minHashCount = min(queryHashCount, subject.hashCount)

        # Choose the object that decides whether a bin is significant.
        significanceMethod = findParams.significanceMethod
        if significanceMethod == 'Always':
            significance = Always()
        elif significanceMethod == 'HashFraction':
            significance = HashFraction(histogram, minHashCount,
                                        findParams.significanceFraction)
        elif significanceMethod == 'MaxBinHeight':
            significance = MaxBinHeight(histogram, query, connector)
        elif significanceMethod == 'MeanBinHeight':
            significance = MeanBinHeight(histogram, query, connector)
        elif significanceMethod == 'AAFraction':
            # queryAACount was computed before the loop, under the same
            # AAFraction condition.
            featureAACount = (queryAACount +
                              len(be.scan(subject.read).coveredIndices()))
            significance = AAFraction(histogram, featureAACount,
                                      findParams.significanceFraction)
        else:
            raise ValueError('Unknown significance method %r' %
                             significanceMethod)

        # Choose the object that scores an individual bin.
        binScoreMethod = findParams.binScoreMethod
        if binScoreMethod == 'NoneScore':
            scorer = NoneScore()
        elif binScoreMethod == 'MinHashesScore':
            scorer = MinHashesScore(histogram, minHashCount)
        elif binScoreMethod == 'FeatureMatchingScore':
            scorer = FeatureMatchingScore(histogram, query, subject,
                                          connector.dbParams, findParams)
        elif binScoreMethod == 'FeatureAAScore':
            scorer = FeatureAAScore(histogram, query, subject,
                                    connector.dbParams)
        elif binScoreMethod == 'WeightedFeatureAAScore':
            scorer = WeightedFeatureAAScore(histogram, query, subject,
                                            connector.dbParams,
                                            findParams.weights)
        elif binScoreMethod == 'FeatureAALengthScore':
            scorer = FeatureAALengthScore(histogram, query, subject,
                                          connector.dbParams)
        else:
            raise ValueError('Unknown bin score method %r' %
                             binScoreMethod)

        # Find bins with a significant number of elements and score them.
        significantBins = []
        for binIndex, bin_ in enumerate(histogram.bins):
            if significance.isSignificant(binIndex):
                score, scoreAnalysis = scorer.calculateScore(binIndex)
                significantBin = {
                    'bin': bin_,
                    'index': binIndex,
                    'score': score
                }

                # The per-bin score analysis is only kept on request.
                if storeFullAnalysis:
                    significantBin['scoreAnalysis'] = scoreAnalysis

                significantBins.append(significantBin)

        # Order the significant bins best-first; the best bin score is
        # None when no bin was significant.
        if significantBins:
            significantBins.sort(key=scoreGetter, reverse=True)
            bestBinScore = significantBins[0]['score']
        else:
            bestBinScore = None

        # Choose the object that computes the overall score. Note that
        # the name 'scorer' is re-used here: the bin scorer is no longer
        # needed at this point.
        overallScoreMethod = findParams.overallScoreMethod
        if overallScoreMethod == 'BestBinScore':
            scorer = BestBinScore(histogram, significantBins)
        elif overallScoreMethod == 'SignificantBinScore':
            scorer = SignificantBinScore(significantBins, query, subject,
                                         connector.dbParams)
        elif overallScoreMethod == 'GreedySignificantBinScore':
            scorer = GreedySignificantBinScore(significantBins, query,
                                               subject, connector.dbParams)
        else:
            raise ValueError('Unknown overall score method %r' %
                             overallScoreMethod)

        overallScore, overallScoreAnalysis = scorer.calculateScore()

        # With full analysis every matched subject gets an entry;
        # otherwise only subjects with at least one significant bin do.
        if storeFullAnalysis:
            self.analysis[subjectIndex] = {
                'histogram': histogram,
                'bestBinScore': bestBinScore,
                'overallScore': overallScore,
                'overallScoreAnalysis': overallScoreAnalysis,
                'significantBins': significantBins,
                'significanceAnalysis': significance.analysis,
            }
        elif significantBins:
            self.analysis[subjectIndex] = {
                'bestBinScore': bestBinScore,
                'overallScore': overallScore,
                'significantBins': significantBins,
            }