# Imports assumed by the excerpts below; the module path follows the
# docstring references to nupic/algorithms/anomaly_likelihood.py, and the
# two aliases match how the functions are called in the code.
import datetime
import itertools

import numpy

from nupic.algorithms import anomaly_likelihood

an = anomaly_likelihood          # short alias used by the test methods
algorithms = anomaly_likelihood  # alias used by _generateAnomalyParams() below


def testSkipRecords(self):
  """
  This calls estimateAnomalyLikelihoods with various values of skipRecords
  """

  # Check the happy path
  data1 = _generateSampleData(mean=0.1)[0:200]
  data1 = data1 + _generateSampleData(mean=0.9)[0:200]
  likelihoods, _, estimatorParams = (
    an.estimateAnomalyLikelihoods(data1, skipRecords=200)
  )

  # Check results are correct, i.e. we are actually skipping the first 200
  # records, so the estimated mean reflects only the second half of the data
  dParams = estimatorParams["distribution"]
  self.assertWithinEpsilon(dParams["mean"], 0.9, epsilon=0.1)

  # Check the case where skipRecords > num records
  # In this case a null distribution should be returned, which makes all
  # the likelihoods reasonably high
  likelihoods, _, estimatorParams = (
    an.estimateAnomalyLikelihoods(data1, skipRecords=500)
  )
  self.assertEqual(len(likelihoods), len(data1))
  self.assertTrue(likelihoods.sum() >= 0.3 * len(likelihoods))

  # Check the case where skipRecords == num records
  likelihoods, _, estimatorParams = (
    an.estimateAnomalyLikelihoods(data1, skipRecords=len(data1))
  )
  self.assertEqual(len(likelihoods), len(data1))
  self.assertTrue(likelihoods.sum() >= 0.3 * len(likelihoods))
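# A minimal sketch of the _generateSampleData() helper assumed throughout
# these tests, reconstructed from its call sites: it returns a list of
# (timestamp, metricValue, anomalyScore) tuples at 5-minute intervals, with
# metric values drawn from N(metricMean, metricVariance) and anomaly scores
# drawn from N(mean, variance). The sample count and the use of numpy.random
# are assumptions, not the original implementation.
def _generateSampleData(mean=0.2, variance=0.2, metricMean=0.2,
                        metricVariance=0.2, numSamples=1440):
  start = datetime.datetime(2017, 1, 1, 0, 0, 0)
  delta = datetime.timedelta(minutes=5)
  data = []
  for i in xrange(numSamples):
    metricValue = numpy.random.normal(metricMean, numpy.sqrt(metricVariance))
    anomalyScore = numpy.random.normal(mean, numpy.sqrt(variance))
    data.append((start + i * delta, metricValue, anomalyScore))
  return data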
def testVeryFewScores(self):
  """
  This calls estimateAnomalyLikelihoods and updateAnomalyLikelihoods with
  one or no scores.
  """

  # Generate an estimate using two data points
  data1 = _generateSampleData(mean=42.0, variance=1e-10)
  _, _, estimatorParams = (
    an.estimateAnomalyLikelihoods(data1[0:2])
  )
  self.assertTrue(an.isValidEstimatorParams(estimatorParams))

  # Check that the estimated mean is that value
  dParams = estimatorParams["distribution"]
  self.assertWithinEpsilon(dParams["mean"], data1[0][2])

  # Can't generate an estimate using no data points
  data1 = numpy.zeros(0)
  with self.assertRaises(ValueError):
    an.estimateAnomalyLikelihoods(data1)

  # Can't update with no scores
  with self.assertRaises(ValueError):
    an.updateAnomalyLikelihoods(data1, estimatorParams)
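# A plausible sketch of the assertWithinEpsilon() helper used by the
# assertions above; the default epsilon value is an assumption.
def assertWithinEpsilon(self, a, b, epsilon=0.001):
  self.assertLessEqual(abs(a - b), epsilon,
                       "Values %g and %g are not within %g" % (a, b, epsilon))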
def testFlatAnomalyScores(self):
  """
  This calls estimateAnomalyLikelihoods with flat distributions and
  ensures things don't crash.
  """

  # Generate an estimate using a fake distribution of anomaly scores.
  data1 = _generateSampleData(mean=42.0, variance=1e-10)

  likelihoods, avgRecordList, estimatorParams = (
    an.estimateAnomalyLikelihoods(data1[0:1000])
  )
  self.assertEqual(len(likelihoods), 1000)
  self.assertEqual(len(avgRecordList), 1000)
  self.assertTrue(an.isValidEstimatorParams(estimatorParams))

  # Check that the estimated mean is correct
  dParams = estimatorParams["distribution"]
  self.assertWithinEpsilon(dParams["mean"], data1[0][2])

  # If you deviate from the mean, you should get probability 0.
  # Test this by sending in just slightly different values.
  data2 = _generateSampleData(mean=42.5, variance=1e-10)
  likelihoods2, _, _ = (
    an.updateAnomalyLikelihoods(data2[0:10], estimatorParams)
  )

  # The likelihoods should go to zero very quickly
  self.assertLessEqual(likelihoods2.sum(), 0.01)

  # Test the edge case where anomaly scores are very close to 0.
  # In this case we don't let the likelihood get too low: an average
  # anomaly score of 0.1 should be essentially zero, but an average of
  # 0.05 should be higher.
  data3 = _generateSampleData(mean=0.01, variance=1e-6)
  _, _, estimatorParams3 = (
    an.estimateAnomalyLikelihoods(data3[0:1000])
  )

  data4 = _generateSampleData(mean=0.1, variance=1e-6)
  likelihoods4, _, estimatorParams4 = (
    an.updateAnomalyLikelihoods(data4[0:20], estimatorParams3)
  )

  # An average of 0.1 should go to zero
  self.assertLessEqual(likelihoods4[10:].mean(), 0.002)

  data5 = _generateSampleData(mean=0.05, variance=1e-6)
  likelihoods5, _, _ = (
    an.updateAnomalyLikelihoods(data5[0:20], estimatorParams4)
  )

  # The likelihoods should be low but not near zero
  self.assertLessEqual(likelihoods5[10:].mean(), 0.28)
  self.assertGreater(likelihoods5[10:].mean(), 0.015)
def likelihood(self, value, anomalyScore, dttm):
  """
  Given the current metric value, plus the current anomaly score, output the
  anomalyLikelihood for this record.
  """
  dataPoint = (dttm, value, anomalyScore)

  # We ignore the first probationaryPeriod data points
  if len(self._historicalScores) < self._probationaryPeriod:
    likelihood = 0.5
  else:
    # On a rolling basis we re-estimate the distribution every 100 iterations
    if self._distribution is None or (self._iteration % 100 == 0):
      _, _, self._distribution = (
        anomaly_likelihood.estimateAnomalyLikelihoods(
          self._historicalScores,
          skipRecords=self._numentaLearningPeriod)
      )

    likelihoods, _, self._distribution = (
      anomaly_likelihood.updateAnomalyLikelihoods([dataPoint],
                                                  self._distribution)
    )
    likelihood = 1.0 - likelihoods[0]

  # Before we exit, update the historical scores and the iteration count
  self._historicalScores.append(dataPoint)
  self._iteration += 1

  return likelihood
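# A minimal sketch of the detector state that likelihood() above assumes.
# The attribute names are taken from the method body; the class name and the
# default values are illustrative assumptions, not the NAB detector's actual
# configuration.
class AnomalyLikelihoodDetectorSketch(object):

  def __init__(self, probationaryPeriod=600, learningPeriod=300):
    self._probationaryPeriod = probationaryPeriod  # records that emit 0.5
    self._numentaLearningPeriod = learningPeriod   # records skipped when
                                                   # estimating the distribution
    self._historicalScores = []  # (dttm, value, anomalyScore) tuples seen so far
    self._distribution = None    # estimator params, re-fit every 100 iterations
    self._iteration = 0

  # The likelihood() method defined above would live on this class:
  likelihood = likelihood

# Example drive loop (hypothetical records):
#   detector = AnomalyLikelihoodDetectorSketch(probationaryPeriod=4)
#   for dttm, value, rawScore in someRecords:
#     print detector.likelihood(value, rawScore, dttm)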
def testCaseIncreasedAnomalyScore(self):
  """
  Test F: a small anomaly score every 20 records, but then a large one when
  you would expect a small one. This should be anomalous.
  """

  # Generate initial data
  data = []
  data = self._addSampleData(data, spikePeriod=20, spikeValue=0.4,
                             numSamples=1000)
  _, _, estimatorParams = (
    an.estimateAnomalyLikelihoods(data)
  )

  # Now feed in the same spike frequency, but with a much larger spike value
  data = self._addSampleData(spikePeriod=20, spikeValue=1.0,
                             numSamples=100)
  likelihoods2, _, _ = (
    an.updateAnomalyLikelihoods(data, estimatorParams)
  )

  # We should detect highly unusual behavior
  self.assertLess(likelihoods2.min(), 0.0003)

  # We should detect it pretty often
  self.assertGreater((likelihoods2 < 0.0003).sum(), 40)
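# A minimal sketch of the _addSampleData() helper these "testCase*" methods
# rely on, reconstructed from its call sites. Assumptions: records are
# (timestamp, metricValue, anomalyScore) tuples at 5-minute intervals, the
# anomaly score is 0.0 except for a spike of spikeValue every spikePeriod
# records, spikePeriod=0 means "no spikes", and the flat metric value 42.0
# is arbitrary.
def _addSampleData(self, data=None, spikePeriod=20, spikeValue=1.0,
                   numSamples=20):
  data = list(data) if data is not None else []
  if data:
    nextTime = data[-1][0] + datetime.timedelta(minutes=5)
  else:
    nextTime = datetime.datetime(2017, 1, 1, 0, 0, 0)
  for i in xrange(numSamples):
    if spikePeriod > 0 and (i + 1) % spikePeriod == 0:
      score = spikeValue  # periodic anomaly spike
    else:
      score = 0.0
    data.append((nextTime, 42.0, score))
    nextTime += datetime.timedelta(minutes=5)
  return data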
def testCaseIncreasedSpikeFrequency(self):
  """
  Test E: bunches of anomalies every 40 records that become even more
  frequent. This should be anomalous.
  """

  # Generate initial data: 30 quiet records followed by a 10-record bunch of
  # spikes, repeated 30 times
  data = []
  for _ in range(30):
    data = self._addSampleData(data, spikePeriod=0, numSamples=30)
    data = self._addSampleData(data, spikePeriod=3, numSamples=10)

  _, _, estimatorParams = (
    an.estimateAnomalyLikelihoods(data[0:1000])
  )

  # Now feed in a more frequent distribution
  data = self._addSampleData(spikePeriod=0, numSamples=30)
  data = self._addSampleData(data, spikePeriod=1, numSamples=10)
  likelihoods2, _, _ = (
    an.updateAnomalyLikelihoods(data, estimatorParams)
  )

  # The likelihood should become anomalous, but only near the end
  self.assertGreater(likelihoods2[0:30].min(), 0.01)
  self.assertLess(likelihoods2[-5:].min(), 0.002)
def testCaseContinuousBunchesOfSpikes(self):
  """
  Test D: bunches of anomalies every 40 records that continue. This should
  not be anomalous.
  """

  # Generate initial data: 30 quiet records followed by a 10-record bunch of
  # spikes, repeated 30 times
  data = []
  for _ in range(30):
    data = self._addSampleData(data, spikePeriod=0, numSamples=30)
    data = self._addSampleData(data, spikePeriod=3, numSamples=10)

  _, _, estimatorParams = (
    an.estimateAnomalyLikelihoods(data[0:1000])
  )

  # Now feed in the same distribution
  data = self._addSampleData(spikePeriod=0, numSamples=30)
  data = self._addSampleData(data, spikePeriod=3, numSamples=10)
  likelihoods2, _, _ = (
    an.updateAnomalyLikelihoods(data, estimatorParams)
  )

  # The likelihood should be reasonably high everywhere
  self.assertGreater(likelihoods2.min(), 0.01)
def testCaseUnusuallyHighSpikeFrequency(self):
  """
  Test B: one anomaly spike every 20 records. Then we suddenly get a bunch
  in a row. The likelihood of those spikes should be low.
  """
  data = self._addSampleData(spikePeriod=20, numSamples=1019)
  _, _, estimatorParams = (
    an.estimateAnomalyLikelihoods(data[0:1000])
  )

  # If we continue to see the same distribution, we should get reasonable
  # likelihoods
  data = self._addSampleData(numSamples=119, spikePeriod=20)
  likelihoods1, _, estimatorParams1 = (
    an.updateAnomalyLikelihoods(data, estimatorParams)
  )

  # The minimum likelihood should be reasonably high
  self.assertGreater(likelihoods1.min(), 0.1)

  data = self._addSampleData(numSamples=20, spikePeriod=2)
  likelihoods2, _, _ = (
    an.updateAnomalyLikelihoods(data, estimatorParams1)
  )

  # The average likelihood, once you get past the initial averaging window,
  # should be very low
  self.assertLess(likelihoods2[5:].mean(), 0.001)
def testEstimateAnomalyLikelihoods(self):
  """
  This calls estimateAnomalyLikelihoods to estimate the distribution on fake
  data and validates the results
  """

  # Generate an estimate using a fake distribution of anomaly scores.
  data1 = _generateSampleData(mean=0.2)

  likelihoods, avgRecordList, estimatorParams = (
    an.estimateAnomalyLikelihoods(data1[0:1000])
  )
  self.assertEqual(len(likelihoods), 1000)
  self.assertEqual(len(avgRecordList), 1000)
  self.assertTrue(an.isValidEstimatorParams(estimatorParams))

  # Check that the moving average's running total is consistent with the
  # values in its window
  avgParams = estimatorParams["movingAverage"]
  self.assertAlmostEqual(avgParams["total"],
                         sum(avgParams["historicalValues"]))

  # Check that the estimated mean matches the mean of the averaged records
  total = sum(v[2] for v in avgRecordList)
  dParams = estimatorParams["distribution"]
  self.assertWithinEpsilon(dParams["mean"],
                           total / float(len(avgRecordList)))

  # The number of points with probability below 2% should be fairly low,
  # but not zero. We can't use an exact 2% here due to random variation.
  self.assertLessEqual(numpy.sum(likelihoods < 0.02), 50)
  self.assertGreaterEqual(numpy.sum(likelihoods < 0.02), 1)
def testUpdateAnomalyLikelihoods(self):
  """
  A slightly more complex test. This calls estimateAnomalyLikelihoods
  to estimate the distribution on fake data, followed by several calls
  to updateAnomalyLikelihoods.
  """

  #------------------------------------------
  # Step 1. Generate an initial estimate using a fake distribution of
  # anomaly scores.
  data1 = _generateSampleData(mean=0.2)[0:1000]
  _, _, estimatorParams = (
    an.estimateAnomalyLikelihoods(data1, averagingWindow=5)
  )

  #------------------------------------------
  # Step 2. Generate some new data with a higher average anomaly score. Use
  # the estimator from step 1 to compute the likelihoods. Now we should see
  # a lot more anomalies.
  data2 = _generateSampleData(mean=0.6)[0:300]
  likelihoods2, avgRecordList2, estimatorParams2 = (
    an.updateAnomalyLikelihoods(data2, estimatorParams)
  )
  self.assertEqual(len(likelihoods2), len(data2))
  self.assertEqual(len(avgRecordList2), len(data2))
  self.assertTrue(an.isValidEstimatorParams(estimatorParams2))

  # The new running total should be different
  self.assertNotEqual(estimatorParams2["movingAverage"]["total"],
                      estimatorParams["movingAverage"]["total"])

  # We should have many more samples where likelihood is < 0.01, but not all
  self.assertGreaterEqual(numpy.sum(likelihoods2 < 0.01), 25)
  self.assertLessEqual(numpy.sum(likelihoods2 < 0.01), 250)

  #------------------------------------------
  # Step 3. Generate some new data with the expected average anomaly score.
  # We should see fewer anomalies than in Step 2.
  data3 = _generateSampleData(mean=0.2)[0:1000]
  likelihoods3, avgRecordList3, estimatorParams3 = (
    an.updateAnomalyLikelihoods(data3, estimatorParams2)
  )
  self.assertEqual(len(likelihoods3), len(data3))
  self.assertEqual(len(avgRecordList3), len(data3))
  self.assertTrue(an.isValidEstimatorParams(estimatorParams3))

  # The new running total should be different
  self.assertNotEqual(estimatorParams3["movingAverage"]["total"],
                      estimatorParams["movingAverage"]["total"])
  self.assertNotEqual(estimatorParams3["movingAverage"]["total"],
                      estimatorParams2["movingAverage"]["total"])

  # We should have a small number of samples where likelihood is < 0.01,
  # but at least one
  self.assertGreaterEqual(numpy.sum(likelihoods3 < 0.01), 1)
  self.assertLessEqual(numpy.sum(likelihoods3 < 0.01), 100)

  #------------------------------------------
  # Step 4. Validate that sending the data incrementally is the same as
  # sending it in one batch
  allData = data1
  allData.extend(data2)
  allData.extend(data3)

  # Compute the moving average of all the data and check that it's the same
  _, historicalValuesAll, totalAll = (
    an._anomalyScoreMovingAverage(allData, windowSize=5)
  )
  self.assertEqual(sum(historicalValuesAll),
                   sum(estimatorParams3["movingAverage"]["historicalValues"]))
  self.assertEqual(totalAll,
                   estimatorParams3["movingAverage"]["total"])
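# The batch-vs-incremental equivalence checked in Step 4 above holds because
# the moving average is a simple sliding window over the raw scores. A
# minimal sketch of the recurrence _anomalyScoreMovingAverage() presumably
# applies per record (the real implementation also carries timestamps and
# metric values through); the return shape mirrors
# (averagedValues, historicalValues, total):
def _slidingAverageSketch(scores, windowSize):
  window = []
  total = 0.0
  averages = []
  for score in scores:
    window.append(score)
    total += score
    if len(window) > windowSize:
      total -= window.pop(0)  # drop the oldest score from the running total
    averages.append(total / len(window))
  return averages, window, total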
def testEstimateAnomalyLikelihoodsMalformedRecords(self):
  """
  This calls estimateAnomalyLikelihoods with malformed records, which should
  be quietly skipped.
  """

  # Generate a fake distribution of anomaly scores, and append malformed
  # records of various shapes. Note that (2) is just the int 2, another
  # malformed record.
  data1 = _generateSampleData(mean=0.2)
  data1 = data1[0:1000] + [(2, 2)] + [(2, 2, 2, 2)] + [()] + [(2)]

  likelihoods, avgRecordList, estimatorParams = (
    an.estimateAnomalyLikelihoods(data1[0:1004])
  )

  # The malformed records should be skipped, so we should still get exactly
  # 1000 likelihoods back
  self.assertEqual(len(likelihoods), 1000)
  self.assertEqual(len(avgRecordList), 1000)
  self.assertTrue(an.isValidEstimatorParams(estimatorParams))

  # Check that the moving average's running total is consistent with the
  # values in its window
  avgParams = estimatorParams["movingAverage"]
  self.assertAlmostEqual(avgParams["total"],
                         sum(avgParams["historicalValues"]))

  # Check that the estimated mean matches the mean of the averaged records
  total = sum(v[2] for v in avgRecordList)
  dParams = estimatorParams["distribution"]
  self.assertWithinEpsilon(dParams["mean"],
                           total / float(len(avgRecordList)))
def testEstimateAnomalyLikelihoodsCategoryValues(self):
  """
  This calls estimateAnomalyLikelihoods with categorical (string) metric
  values and validates that the results are still well formed.
  """
  start = datetime.datetime(2017, 1, 1, 0, 0, 0)
  delta = datetime.timedelta(minutes=5)
  dts = [start + (i * delta) for i in xrange(10)]
  values = ["a", "b", "c", "d", "e"] * 2
  rawScores = [0.1 * i for i in xrange(10)]
  data = zip(dts, values, rawScores)

  likelihoods, avgRecordList, estimatorParams = (
    an.estimateAnomalyLikelihoods(data)
  )
  self.assertEqual(len(likelihoods), 10)
  self.assertEqual(len(avgRecordList), 10)
  self.assertTrue(an.isValidEstimatorParams(estimatorParams))
def testCaseSingleSpike(self):
  """
  Test A: no anomalies, and then you see a single spike. The likelihood of
  that spike should be 0.
  """
  data = self._addSampleData(spikePeriod=0, numSamples=1000)
  _, _, estimatorParams = (
    an.estimateAnomalyLikelihoods(data[0:1000])
  )

  data = self._addSampleData(numSamples=1, spikePeriod=1)
  likelihoods1, _, _ = (
    an.updateAnomalyLikelihoods(data, estimatorParams)
  )
  self.assertWithinEpsilon(likelihoods1[0], 0.0)
def testCaseMissingSpike(self):
  """
  Test C: one anomaly every 20 records, but then we see none. The likelihood
  at the end should be very low.
  """

  # Initial data
  data = self._addSampleData(spikePeriod=20, numSamples=1019)
  _, _, estimatorParams = (
    an.estimateAnomalyLikelihoods(data[0:1000])
  )

  # Now feed in no spikes at all
  data = self._addSampleData(numSamples=100, spikePeriod=0)
  likelihoods2, _, _ = (
    an.updateAnomalyLikelihoods(data, estimatorParams)
  )

  # The average likelihood, once you get past the initial averaging window,
  # should be very low
  self.assertLess(likelihoods2[5:].mean(), 0.0001)
def testFlatMetricScores(self):
  """
  This calls estimateAnomalyLikelihoods with flat metric values. In this
  case we should use the null distribution, which gets reasonably high
  likelihood for everything.
  """

  # Generate samples with very flat metric values
  data1 = _generateSampleData(metricMean=42.0, metricVariance=1e-10)[0:1000]

  likelihoods, _, estimatorParams = (
    an.estimateAnomalyLikelihoods(data1)
  )

  # Check that we do indeed get reasonable likelihood values
  self.assertEqual(len(likelihoods), len(data1))
  self.assertTrue(likelihoods.sum() >= 0.4 * len(likelihoods))

  # Check that we do indeed get the null distribution
  self.assertDictEqual(estimatorParams["distribution"],
                       an.nullDistribution())
def testBadParams(self):
  """
  Calls updateAnomalyLikelihoods with bad params.
  """

  # Generate an estimate using one data point
  data1 = _generateSampleData(mean=42.0, variance=1e-10)
  _, _, estimatorParams = (
    an.estimateAnomalyLikelihoods(data1[0:1])
  )
  self.assertTrue(an.isValidEstimatorParams(estimatorParams))

  # Can't pass in a bad params structure
  with self.assertRaises(ValueError):
    an.updateAnomalyLikelihoods(data1, {"haha": "heehee"})

  # Can't pass in something that is not a dict
  with self.assertRaises(ValueError):
    an.updateAnomalyLikelihoods(data1, 42.0)
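# The estimator params passed around above are plain dicts. A sketch of the
# shape isValidEstimatorParams() expects, with the top-level keys and the
# "mean", "historicalValues", and "total" fields inferred from the
# assertions in this file; the remaining fields and values are assumptions:
_EXAMPLE_ESTIMATOR_PARAMS = {
  "distribution": {            # the fitted normal distribution
    "name": "normal",
    "mean": 0.2,
    "variance": 0.01,
    "stdev": 0.1,
  },
  "movingAverage": {           # state of the sliding-window average
    "historicalValues": [0.1, 0.2, 0.3],
    "total": 0.6,              # == sum(historicalValues)
    "windowSize": 10,
  },
}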
def testNABAnomalyLikelihood(self):
  """
  Tests the specific calls to nupic/algorithms/anomaly_likelihood as they're
  made in NAB/detectors/numenta/numenta_detector.py.

  Note that NAB/.../numenta_detector.py has its own class AnomalyLikelihood,
  different from nupic/algorithms/anomaly_likelihood.AnomalyLikelihood, but
  it calls the functions estimateAnomalyLikelihoods() and
  updateAnomalyLikelihoods() from nupic/algorithms/anomaly_likelihood.py.
  """

  # AnomalyLikelihood object initial values
  iteration = 0
  probationaryPeriod = 4
  historicalScores = []

  likelihoodList = []
  for dataPoint in self.data:
    # Ignore the first probationaryPeriod data points
    if len(historicalScores) < probationaryPeriod:
      likelihood = 0.5
    else:
      # Re-estimate the distribution on a rolling basis
      if iteration % 4 == 0:
        _, _, distribution = an.estimateAnomalyLikelihoods(
          historicalScores,
          skipRecords=probationaryPeriod)
      likelihoods, _, distribution = an.updateAnomalyLikelihoods(
        [dataPoint],
        distribution)
      likelihood = 1.0 - likelihoods[0]
    historicalScores.append(dataPoint)
    iteration += 1
    likelihoodList.append(likelihood)

  truthLikelihoodList = [0.5, 0.5, 0.5, 0.5,
                         0.5, 0.5, 0.5, 0.5,
                         0.044565462999999972, 0.044565462999999972,
                         0.044565462999999972, 0.044565462999999972,
                         0.90319951499999995, 0.90319951499999995,
                         0.90319951499999995, 0.90319951499999995,
                         0.78814460099999994, 0.78814460099999994,
                         0.78814460099999994, 0.78814460099999994]

  for i in xrange(len(likelihoodList)):
    self.assertAlmostEqual(likelihoodList[i], truthLikelihoodList[i],
                           msg="unequal values are at index %i" % i)
# Records ignored when estimating the distribution. Per the comment in
# _generateAnomalyParams() below, this corresponds to the first day of data
# at 5-minute intervals.
NUM_SKIP_RECORDS = 288


def _generateAnomalyParams(self, metricID, statsSampleCache,
                           defaultAnomalyParams):
  """ Generate the model's anomaly likelihood parameters from the given
  sample cache.

  :param metricID: the metric ID
  :param statsSampleCache: a sequence of MetricData instances that
    comprise the cache of samples for the current inference result batch with
    valid raw_anomaly_score in the processed order (by rowid/timestamp). At
    least self._statisticsMinSampleSize samples are needed.
  :param defaultAnomalyParams: the default anomaly params value; if new
    ones can't be generated (not enough samples in cache), this value will be
    returned verbatim

  :returns: new anomaly likelihood parameters; defaultAnomalyParams, if
    there are not enough samples in statsSampleCache.
  """
  if len(statsSampleCache) < self._statisticsMinSampleSize:
    # Not enough samples in cache
    # TODO: unit-test this
    self._log.error(
      "Not enough samples in cache to update anomaly params for model=%s: "
      "have=%d, which is less than min=%d; firstRowID=%s; lastRowID=%s.",
      metricID, len(statsSampleCache), self._statisticsMinSampleSize,
      statsSampleCache[0].rowid if statsSampleCache else None,
      statsSampleCache[-1].rowid if statsSampleCache else None)
    return defaultAnomalyParams

  # We have enough samples to generate anomaly params
  lastRowID = statsSampleCache[-1].rowid

  numSamples = min(len(statsSampleCache), self._statisticsSampleSize)

  # Create the input sequence for the algorithms
  samplesIter = itertools.islice(statsSampleCache,
                                 len(statsSampleCache) - numSamples,
                                 len(statsSampleCache))

  scores = tuple(
    (row.timestamp, row.metric_value, row.raw_anomaly_score,)
    for row in samplesIter)

  assert len(scores) >= self._statisticsMinSampleSize, (
    "_generateAnomalyParams: samples count=%d is smaller than min=%d; "
    "model=%s; lastRowID=%s") % (len(scores),
                                 self._statisticsMinSampleSize,
                                 metricID,
                                 lastRowID,)

  assert len(scores) <= self._statisticsSampleSize, (
    "_generateAnomalyParams: samples count=%d is larger than max=%d; "
    "model=%s; lastRowID=%s") % (len(scores),
                                 self._statisticsSampleSize,
                                 metricID,
                                 lastRowID,)

  # Calculate the estimator parameters. We ignore statistics from the first
  # day of data (288 records) since the CLA is still learning. For
  # simplicity, this logic continues to ignore the first day of data even
  # once the window starts sliding.
  _, _, params = algorithms.estimateAnomalyLikelihoods(
    anomalyScores=scores,
    skipRecords=NUM_SKIP_RECORDS)

  anomalyParams = {}
  anomalyParams["last_rowid_for_stats"] = lastRowID
  anomalyParams["params"] = params

  self._log.debug(
    "Generated anomaly params for model=%s using numRows=%d with "
    "rows=[%s..%s]",
    metricID, numSamples,
    statsSampleCache[-numSamples].rowid, statsSampleCache[-1].rowid)

  return anomalyParams
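# Hypothetical usage sketch for _generateAnomalyParams() above. MetricData
# here is a minimal stand-in containing only the four fields the method
# reads; the service object, its _statistics* attributes, and the row values
# are illustrative assumptions, not the real configuration.
import collections

MetricData = collections.namedtuple(
  "MetricData", ["rowid", "timestamp", "metric_value", "raw_anomaly_score"])

# cache = [MetricData(rowid=i,
#                     timestamp=(datetime.datetime(2017, 1, 1) +
#                                i * datetime.timedelta(minutes=5)),
#                     metric_value=42.0,
#                     raw_anomaly_score=0.1)
#          for i in xrange(1000)]
# anomalyParams = service._generateAnomalyParams(
#   metricID="metric-1",
#   statsSampleCache=cache,
#   defaultAnomalyParams={})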