def testUpdateAnomalyLikelihoods(self):
    """
    A slightly more complex test. This calls estimateAnomalyLikelihoods to
    estimate the distribution on fake data, followed by several calls to
    updateAnomalyLikelihoods, and finally checks that the incrementally
    maintained moving average matches one computed over all data at once.
    """
    # ------------------------------------------
    # Step 1. Generate an initial estimate using fake distribution of anomaly
    # scores.
    data1 = _generateSampleData(mean=0.2)[0:1000]
    _, _, estimatorParams = an.estimateAnomalyLikelihoods(
        data1, averagingWindow=5)

    # ------------------------------------------
    # Step 2. Generate some new data with a higher average anomaly
    # score. Using the estimator from step 1, to compute likelihoods. Now we
    # should see a lot more anomalies.
    data2 = _generateSampleData(mean=0.6)[0:300]
    likelihoods2, avgRecordList2, estimatorParams2 = (
        an.updateAnomalyLikelihoods(data2, estimatorParams))
    self.assertEqual(len(likelihoods2), len(data2))
    self.assertEqual(len(avgRecordList2), len(data2))
    # The params passed in must still be valid after the update, and the
    # params returned by the update must be valid as well (the latter was
    # never checked before; Step 3 performs the same check on its result).
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))
    self.assertTrue(an.isValidEstimatorParams(estimatorParams2))

    # The new running total should be different
    self.assertNotEqual(estimatorParams2["movingAverage"]["total"],
                        estimatorParams["movingAverage"]["total"])

    # We should have many more samples where likelihood is < 0.01, but not all
    lowCount2 = numpy.sum(likelihoods2 < 0.01)
    self.assertGreaterEqual(lowCount2, 25)
    self.assertLessEqual(lowCount2, 250)

    # ------------------------------------------
    # Step 3. Generate some new data with the expected average anomaly score.
    # We should see fewer anomalies than in Step 2.
    data3 = _generateSampleData(mean=0.2)[0:1000]
    likelihoods3, avgRecordList3, estimatorParams3 = (
        an.updateAnomalyLikelihoods(data3, estimatorParams2))
    self.assertEqual(len(likelihoods3), len(data3))
    self.assertEqual(len(avgRecordList3), len(data3))
    self.assertTrue(an.isValidEstimatorParams(estimatorParams3))

    # The new running total should be different
    self.assertNotEqual(estimatorParams3["movingAverage"]["total"],
                        estimatorParams["movingAverage"]["total"])
    self.assertNotEqual(estimatorParams3["movingAverage"]["total"],
                        estimatorParams2["movingAverage"]["total"])

    # We should have a small number of samples where likelihood is < 0.01,
    # but at least one
    lowCount3 = numpy.sum(likelihoods3 < 0.01)
    self.assertGreaterEqual(lowCount3, 1)
    self.assertLessEqual(lowCount3, 100)

    # ------------------------------------------
    # Step 4. Validate that sending data incrementally is the same as sending
    # in one batch. Build a new list instead of extending data1 in place so
    # the inputs used in the earlier steps are left untouched.
    allData = data1 + data2 + data3

    # Compute moving average of all the data and check it's the same.
    # windowSize must match the averagingWindow used in Step 1.
    _, historicalValuesAll, totalAll = an._anomalyScoreMovingAverage(
        allData, windowSize=5)
    self.assertEqual(
        sum(historicalValuesAll),
        sum(estimatorParams3["movingAverage"]["historicalValues"]))
    self.assertEqual(totalAll, estimatorParams3["movingAverage"]["total"])
def testVeryFewScores(self):
    """
    Exercise estimateAnomalyLikelihoods and updateAnomalyLikelihoods with
    very small inputs: an estimate from two records must succeed, while an
    empty input must make both functions raise ValueError.
    """
    # Sample data requested with mean=42.0 and a tiny variance; only the
    # first two records are used for the estimate.
    samples = _generateSampleData(mean=42.0, variance=1e-10)
    firstTwo = samples[0:2]
    _, _, estimatorParams = an.estimateAnomalyLikelihoods(firstTwo)
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # The estimated mean should agree with the score of the first record
    distribution = estimatorParams["distribution"]
    expectedMean = samples[0][2]
    self.assertWithinEpsilon(distribution["mean"], expectedMean)

    # An empty array of scores is rejected by the estimator...
    noScores = numpy.zeros(0)
    with self.assertRaises(ValueError):
        an.estimateAnomalyLikelihoods(noScores)

    # ...and by the updater, even with valid estimator params
    with self.assertRaises(ValueError):
        an.updateAnomalyLikelihoods(noScores, estimatorParams)
def testEstimateAnomalyLikelihoodsMalformedRecords(self):
    """
    Feed estimateAnomalyLikelihoods 1000 generated records followed by four
    malformed ones and check that results come back only for the 1000
    well-formed records (the malformed ones are expected to be skipped).
    """
    # 1000 generated records, then malformed entries of the wrong shape.
    # Note that (2) is the bare integer 2, not a one-element tuple.
    generated = _generateSampleData(mean=0.2)
    malformed = [(2, 2), (2, 2, 2, 2), (), (2)]
    data1 = generated[0:1000] + malformed
    likelihoods, avgRecordList, estimatorParams = (
        an.estimateAnomalyLikelihoods(data1[:1004]))
    self.assertEqual(len(likelihoods), 1000)
    self.assertEqual(len(avgRecordList), 1000)
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # Sum the third field of every averaged record
    avgParams = estimatorParams["movingAverage"]
    total = 0
    for record in avgRecordList:
        total += record[2]
    # NOTE(review): assertTrue treats its second argument as the failure
    # message, so this only checks that avgParams["total"] is truthy; it does
    # not compare it with `total`. Confirm the intended comparison before
    # tightening this assertion.
    self.assertTrue(avgParams["total"], total)

    # The estimated mean should equal the mean of the averaged scores
    dParams = estimatorParams["distribution"]
    expectedMean = total / float(len(avgRecordList))
    self.assertWithinEpsilon(dParams["mean"], expectedMean)
def testEstimateAnomalyLikelihoods(self):
    """
    Run estimateAnomalyLikelihoods on 1000 generated records and validate
    the shape of the results, the estimated mean, and the number of
    low-likelihood points.
    """
    # Estimate from the first 1000 records of a fake score distribution
    samples = _generateSampleData(mean=0.2)
    likelihoods, avgRecordList, estimatorParams = (
        an.estimateAnomalyLikelihoods(samples[0:1000]))
    self.assertEqual(len(likelihoods), 1000)
    self.assertEqual(len(avgRecordList), 1000)
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # Sum the third field of every averaged record
    avgParams = estimatorParams["movingAverage"]
    total = 0
    for record in avgRecordList:
        total += record[2]
    # NOTE(review): assertTrue treats its second argument as the failure
    # message, so this only checks that avgParams["total"] is truthy; it does
    # not compare it with `total`. Confirm the intended comparison before
    # tightening this assertion.
    self.assertTrue(avgParams["total"], total)

    # The estimated mean should equal the mean of the averaged scores
    dParams = estimatorParams["distribution"]
    expectedMean = total / float(len(avgRecordList))
    self.assertWithinEpsilon(dParams["mean"], expectedMean)

    # The count of points below 2% likelihood should be small but non-zero.
    # An exact 2% cannot be asserted because of random variation.
    rareCount = numpy.sum(likelihoods < 0.02)
    self.assertLessEqual(rareCount, 50)
    self.assertGreaterEqual(rareCount, 1)
def testVeryFewScores(self):
    """
    Call estimateAnomalyLikelihoods and updateAnomalyLikelihoods with very
    few scores: two records are enough for an estimate, whereas an empty
    input must raise ValueError from both functions.
    """
    # Build the estimate from just the first two generated records
    generated = _generateSampleData(mean=42.0, variance=1e-10)
    pair = generated[0:2]
    _, _, estimatorParams = an.estimateAnomalyLikelihoods(pair)
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # The estimated mean is compared against the first record's score
    firstScore = generated[0][2]
    dParams = estimatorParams["distribution"]
    self.assertWithinEpsilon(dParams["mean"], firstScore)

    # With zero data points no estimate can be produced
    emptyData = numpy.zeros(0)
    with self.assertRaises(ValueError):
        an.estimateAnomalyLikelihoods(emptyData)

    # Likewise, an update with zero scores is refused
    with self.assertRaises(ValueError):
        an.updateAnomalyLikelihoods(emptyData, estimatorParams)
def testEstimateAnomalyLikelihoodsMalformedRecords(self):
    """
    This calls estimateAnomalyLikelihoods with malformed records, which
    should be quietly skipped: only the 1000 well-formed records may show up
    in the results.
    """
    # Take exactly 1000 well-formed records FIRST, then append the malformed
    # ones. Appending to the full sample and slicing [0:1000] afterwards
    # would cut the malformed records off again, so they would never reach
    # estimateAnomalyLikelihoods and the test would not exercise skipping.
    validRecords = _generateSampleData(mean=0.2)[0:1000]
    # Entries of the wrong shape; note that (2) is the bare integer 2, not a
    # one-element tuple.
    malformedRecords = [(2, 2), (2, 2, 2, 2), (), (2)]
    data1 = validRecords + malformedRecords

    likelihoods, avgRecordList, estimatorParams = (
        an.estimateAnomalyLikelihoods(data1))
    self.assertEqual(len(likelihoods), 1000)
    self.assertEqual(len(avgRecordList), 1000)
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # Sum the third field of every averaged record
    avgParams = estimatorParams["movingAverage"]
    total = 0
    for v in avgRecordList:
        total = total + v[2]
    # NOTE(review): assertTrue treats its second argument as the failure
    # message, so this only checks that avgParams["total"] is truthy; it does
    # not compare it with `total`. Confirm the intended comparison before
    # tightening this assertion.
    self.assertTrue(avgParams["total"], total)

    # Check that the estimated mean is correct
    dParams = estimatorParams["distribution"]
    self.assertWithinEpsilon(dParams["mean"],
                             total / float(len(avgRecordList)))
def testEstimateAnomalyLikelihoods(self):
    """
    Estimate the likelihood distribution from fake anomaly scores via
    estimateAnomalyLikelihoods and validate what comes back.
    """
    # Use the first 1000 generated records as input
    fakeScores = _generateSampleData(mean=0.2)
    trainingSet = fakeScores[0:1000]
    likelihoods, avgRecordList, estimatorParams = (
        an.estimateAnomalyLikelihoods(trainingSet))
    self.assertEqual(len(likelihoods), 1000)
    self.assertEqual(len(avgRecordList), 1000)
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # Accumulate the averaged score (third field) of each record
    total = 0
    for entry in avgRecordList:
        total += entry[2]
    avgParams = estimatorParams["movingAverage"]
    # NOTE(review): assertTrue treats its second argument as the failure
    # message, so this only checks that avgParams["total"] is truthy; it does
    # not compare it with `total`. Confirm the intended comparison before
    # tightening this assertion.
    self.assertTrue(avgParams["total"], total)

    # Estimated mean versus the mean of the averaged scores
    recordCount = float(len(avgRecordList))
    dParams = estimatorParams["distribution"]
    self.assertWithinEpsilon(dParams["mean"], total / recordCount)

    # Points with under 2% likelihood: expected to be few, but not none.
    # Random variation rules out asserting an exact 2%.
    below2Percent = numpy.sum(likelihoods < 0.02)
    self.assertLessEqual(below2Percent, 50)
    self.assertGreaterEqual(below2Percent, 1)
def testFlatAnomalyScores(self):
    """
    Run estimateAnomalyLikelihoods and updateAnomalyLikelihoods on nearly
    flat score distributions to make sure nothing crashes and that the
    resulting likelihoods stay within the expected bounds.
    """
    # Estimate from 1000 records generated with a tiny variance
    flatData = _generateSampleData(mean=42.0, variance=1e-10)
    likelihoods, avgRecordList, estimatorParams = (
        an.estimateAnomalyLikelihoods(flatData[0:1000]))
    self.assertEqual(len(likelihoods), 1000)
    self.assertEqual(len(avgRecordList), 1000)
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # The estimated mean is compared against the first record's score
    dParams = estimatorParams["distribution"]
    self.assertWithinEpsilon(dParams["mean"], flatData[0][2])

    # Scores that deviate even slightly from the mean should get
    # likelihoods that drop to zero very quickly.
    shiftedData = _generateSampleData(mean=42.5, variance=1e-10)
    likelihoods2, _, _ = (
        an.updateAnomalyLikelihoods(shiftedData[0:10], estimatorParams))
    self.assertLessEqual(likelihoods2.sum(), 0.01)

    # Edge case: anomaly scores very close to 0. Here the likelihood is not
    # allowed to get too low. After estimating on scores around 0.01, an
    # average of 0.1 should be essentially zero, but an average of 0.05
    # should stay higher.
    nearZeroData = _generateSampleData(mean=0.01, variance=1e-6)
    _, _, estimatorParams3 = (
        an.estimateAnomalyLikelihoods(nearZeroData[0:1000]))

    # Average of 0.1 should go to zero
    data4 = _generateSampleData(mean=0.1, variance=1e-6)
    likelihoods4, _, estimatorParams4 = (
        an.updateAnomalyLikelihoods(data4[0:20], estimatorParams3))
    self.assertLessEqual(likelihoods4[10:].mean(), 0.002)

    # Average of 0.05: likelihoods should be low but not near zero
    data5 = _generateSampleData(mean=0.05, variance=1e-6)
    likelihoods5, _, _ = (
        an.updateAnomalyLikelihoods(data5[0:20], estimatorParams4))
    tailMean5 = likelihoods5[10:].mean()
    self.assertLessEqual(tailMean5, 0.28)
    self.assertGreater(tailMean5, 0.015)
def testEstimateAnomalyLikelihoodsCategoryValues(self):
    """
    Call estimateAnomalyLikelihoods on ten records whose value field is a
    string category rather than a number, and check that one likelihood and
    one averaged record come back per input record.
    """
    # Ten timestamps spaced five minutes apart
    firstTimestamp = datetime.datetime(2017, 1, 1, 0, 0, 0)
    step = datetime.timedelta(minutes=5)
    timestamps = [firstTimestamp + (n * step) for n in xrange(10)]

    # String categories as values, with raw scores 0.0, 0.1, ..., 0.9
    categories = ["a", "b", "c", "d", "e"] * 2
    scores = [0.1 * n for n in xrange(10)]
    data = zip(timestamps, categories, scores)

    likelihoods, avgRecordList, estimatorParams = (
        an.estimateAnomalyLikelihoods(data))
    self.assertEqual(len(likelihoods), 10)
    self.assertEqual(len(avgRecordList), 10)
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))
def testEstimateAnomalyLikelihoodsCategoryValues(self):
    """
    Verify that estimateAnomalyLikelihoods handles records carrying string
    category values: ten such records must yield ten likelihoods, ten
    averaged records and valid estimator params.
    """
    recordCount = 10

    # Timestamps start at 2017-01-01 00:00:00 and advance by five minutes
    origin = datetime.datetime(2017, 1, 1, 0, 0, 0)
    interval = datetime.timedelta(minutes=5)
    dts = [origin + (k * interval) for k in xrange(recordCount)]

    # Values cycle through five string categories; scores rise by 0.1
    labels = ["a", "b", "c", "d", "e"] * 2
    rawScores = [0.1 * k for k in xrange(recordCount)]
    data = zip(dts, labels, rawScores)

    likelihoods, avgRecordList, estimatorParams = (
        an.estimateAnomalyLikelihoods(data))
    self.assertEqual(len(likelihoods), 10)
    self.assertEqual(len(avgRecordList), 10)
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))
def testBadParams(self):
    """
    Check that updateAnomalyLikelihoods raises ValueError when it is handed
    estimator params that are not a valid params structure.
    """
    # Sanity check: an estimate from a single record yields valid params
    samples = _generateSampleData(mean=42.0, variance=1e-10)
    singleRecord = samples[0:1]
    _, _, estimatorParams = an.estimateAnomalyLikelihoods(singleRecord)
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # A dict without the expected structure is rejected
    bogusDict = {"haha": "heehee"}
    with self.assertRaises(ValueError):
        an.updateAnomalyLikelihoods(samples, bogusDict)

    # So is a value that is not a dict at all
    notADict = 42.0
    with self.assertRaises(ValueError):
        an.updateAnomalyLikelihoods(samples, notADict)
def testBadParams(self):
    """
    Pass malformed estimator params to updateAnomalyLikelihoods and expect a
    ValueError each time.
    """
    # Start with an estimate built from one data point; it must be valid
    data1 = _generateSampleData(mean=42.0, variance=1e-10)
    estimateResult = an.estimateAnomalyLikelihoods(data1[0:1])
    _, _, estimatorParams = estimateResult
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # Params given as a dict with the wrong contents
    with self.assertRaises(ValueError):
        an.updateAnomalyLikelihoods(data1, {"haha": "heehee"})

    # Params given as a float instead of a dict
    with self.assertRaises(ValueError):
        an.updateAnomalyLikelihoods(data1, 42.0)
def testUpdateAnomalyLikelihoods(self):
    """
    A slightly more complex test. This calls estimateAnomalyLikelihoods to
    estimate the distribution on fake data, followed by several calls to
    updateAnomalyLikelihoods, and finally checks that the incrementally
    maintained moving average matches one computed over all data at once.
    """
    #------------------------------------------
    # Step 1. Generate an initial estimate using fake distribution of anomaly
    # scores.
    data1 = _generateSampleData(mean=0.2)[0:1000]
    _, _, estimatorParams = (
        an.estimateAnomalyLikelihoods(data1, averagingWindow=5))

    #------------------------------------------
    # Step 2. Generate some new data with a higher average anomaly
    # score. Using the estimator from step 1, to compute likelihoods. Now we
    # should see a lot more anomalies.
    data2 = _generateSampleData(mean=0.6)[0:300]
    likelihoods2, avgRecordList2, estimatorParams2 = (
        an.updateAnomalyLikelihoods(data2, estimatorParams))
    self.assertEqual(len(likelihoods2), len(data2))
    self.assertEqual(len(avgRecordList2), len(data2))
    # The params passed in must still be valid after the update, and the
    # params returned by the update must be valid as well (the latter was
    # never checked before; Step 3 performs the same check on its result).
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))
    self.assertTrue(an.isValidEstimatorParams(estimatorParams2))

    # The new running total should be different
    self.assertNotEqual(estimatorParams2["movingAverage"]["total"],
                        estimatorParams["movingAverage"]["total"])

    # We should have many more samples where likelihood is < 0.01, but not all
    numAnomalous2 = numpy.sum(likelihoods2 < 0.01)
    self.assertGreaterEqual(numAnomalous2, 25)
    self.assertLessEqual(numAnomalous2, 250)

    #------------------------------------------
    # Step 3. Generate some new data with the expected average anomaly score.
    # We should see fewer anomalies than in Step 2.
    data3 = _generateSampleData(mean=0.2)[0:1000]
    likelihoods3, avgRecordList3, estimatorParams3 = (
        an.updateAnomalyLikelihoods(data3, estimatorParams2))
    self.assertEqual(len(likelihoods3), len(data3))
    self.assertEqual(len(avgRecordList3), len(data3))
    self.assertTrue(an.isValidEstimatorParams(estimatorParams3))

    # The new running total should be different
    self.assertNotEqual(estimatorParams3["movingAverage"]["total"],
                        estimatorParams["movingAverage"]["total"])
    self.assertNotEqual(estimatorParams3["movingAverage"]["total"],
                        estimatorParams2["movingAverage"]["total"])

    # We should have a small number of samples where likelihood is < 0.01,
    # but at least one
    numAnomalous3 = numpy.sum(likelihoods3 < 0.01)
    self.assertGreaterEqual(numAnomalous3, 1)
    self.assertLessEqual(numAnomalous3, 100)

    #------------------------------------------
    # Step 4. Validate that sending data incrementally is the same as sending
    # in one batch. Build a new list instead of extending data1 in place so
    # the inputs used in the earlier steps are left untouched.
    allData = data1 + data2 + data3

    # Compute moving average of all the data and check it's the same.
    # windowSize must match the averagingWindow used in Step 1.
    _, historicalValuesAll, totalAll = (
        an._anomalyScoreMovingAverage(allData, windowSize=5))
    self.assertEqual(
        sum(historicalValuesAll),
        sum(estimatorParams3["movingAverage"]["historicalValues"]))
    self.assertEqual(totalAll, estimatorParams3["movingAverage"]["total"])