  def testVeryFewScores(self):
    """
    This calls estimateAnomalyLikelihoods and updateAnomalyLikelihoods
    with one or no scores.
    """

    # Generate an estimate using two data points
    data1 = _generateSampleData(mean=42.0, variance=1e-10)

    _, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data1[0:2])
    )

    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # Check that the estimated mean is that value
    dParams = estimatorParams["distribution"]
    self.assertWithinEpsilon(dParams["mean"], data1[0][2])

    # Can't generate an estimate using no data points
    data1 = numpy.zeros(0)
    with self.assertRaises(ValueError):
      an.estimateAnomalyLikelihoods(data1)

    # Can't update with no scores
    with self.assertRaises(ValueError):
      an.updateAnomalyLikelihoods(data1, estimatorParams)
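# These tests rely on a module-level helper, _generateSampleData, and on "an"
# as an alias for nupic.algorithms.anomaly_likelihood, neither of which appears
# in this listing. A minimal sketch of what such a helper could look like (an
# assumption, not the original implementation), producing the
# (timestamp, metricValue, anomalyScore) records that the tests index as data[i][2]:
import datetime

import numpy

from nupic.algorithms import anomaly_likelihood as an


def _generateSampleData(mean=0.2, variance=0.2, numSamples=1320):
  """Return (timestamp, metricValue, anomalyScore) tuples whose anomaly scores
  are drawn from a normal distribution with the given mean and variance."""
  start = datetime.datetime(2013, 2, 2)
  scores = numpy.random.normal(loc=mean, scale=numpy.sqrt(variance),
                               size=numSamples)
  return [
      (start + datetime.timedelta(minutes=5 * i),   # timestamp
       10.0 * numpy.random.random(),                # fake metric value
       float(score))                                # raw anomaly score
      for i, score in enumerate(scores)
  ]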
  def testCaseUnusuallyHighSpikeFrequency(self):
    """
    Test B: one anomaly spike every 20 records. Then we suddenly get a bunch
    in a row. The likelihood of those spikes should be low.
    """
    data = self._addSampleData(spikePeriod=20, numSamples=1019)

    _, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data[0:1000])
    )

    # If we continue to see the same distribution, we should get reasonable
    # likelihoods
    data = self._addSampleData(numSamples=119, spikePeriod=20)
    likelihoods1, _, estimatorParams1 = (
      an.updateAnomalyLikelihoods(data, estimatorParams)
    )

    # The minimum likelihood should be reasonably high
    self.assertTrue(likelihoods1.min() > 0.1)

    data = self._addSampleData(numSamples=20, spikePeriod=2)
    likelihoods2, _, _ = (
      an.updateAnomalyLikelihoods(data, estimatorParams1)
    )

    # The likelihood once you get past the initial averaging should be very low.
    self.assertTrue((likelihoods2[5:].sum() / 15.0) < 0.001)
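# The spike-based cases in this listing call a helper, self._addSampleData,
# that is not shown. A self-contained sketch of what it might look like (an
# assumed reconstruction, not the original): it appends numSamples records to
# origData, placing an anomaly spike of spikeValue every spikePeriod records.
import datetime
import random


class _SpikeDataMixin(object):
  """Hypothetical mixin providing the _addSampleData helper used above."""

  def _addSampleData(self, origData=None, numSamples=1440,
                     spikeValue=1.0, spikePeriod=20):
    """Return origData extended with numSamples new records; spikePeriod=0
    means no spikes, otherwise every spikePeriod-th score is spikeValue."""
    data = list(origData) if origData else []
    lastDate = data[-1][0] if data else datetime.datetime(2013, 2, 2)
    for i in range(numSamples):
      timestamp = lastDate + datetime.timedelta(minutes=5 * (i + 1))
      if spikePeriod > 0 and (i + 1) % spikePeriod == 0:
        score = spikeValue                 # periodic anomaly spike
      else:
        score = random.random() * 0.2      # low "normal" anomaly score
      data.append((timestamp, 10.0 * random.random(), score))
    return data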
  def testFlatAnomalyScores(self):
    """
    This calls estimateAnomalyLikelihoods with flat distributions and
    ensures things don't crash.
    """

    # Generate an estimate using fake distribution of anomaly scores.
    data1 = _generateSampleData(mean=42.0, variance=1e-10)

    likelihoods, avgRecordList, estimatorParams = (
      an.estimateAnomalyLikelihoods(data1[0:1000])
    )
    self.assertEqual(len(likelihoods), 1000)
    self.assertEqual(len(avgRecordList), 1000)
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # Check that the estimated mean is correct
    dParams = estimatorParams["distribution"]
    self.assertWithinEpsilon(dParams["mean"], data1[0][2])

    # If you deviate from the mean, you should get probability 0
    # Test this by sending in just slightly different values.
    data2 = _generateSampleData(mean=42.5, variance=1e-10)
    likelihoods2, _, _ = (
      an.updateAnomalyLikelihoods(data2[0:10], estimatorParams)
    )

    # The likelihoods should go to zero very quickly
    self.assertLessEqual(likelihoods2.sum(), 0.01)


    # Test the edge case where anomaly scores are very close to 0.
    # In this case we don't let the likelihood get too low: an average
    # anomaly score of 0.1 should yield essentially zero likelihood, but a
    # lower average (0.05 here) should stay noticeably higher.
    data3 = _generateSampleData(mean=0.01, variance=1e-6)

    _, _, estimatorParams3 = (
      an.estimateAnomalyLikelihoods(data3[0:1000])
    )

    data4 = _generateSampleData(mean=0.1, variance=1e-6)
    likelihoods4, _, estimatorParams4 = (
      an.updateAnomalyLikelihoods(data4[0:20], estimatorParams3)
    )

    # Average of 0.1 should go to zero
    self.assertLessEqual(likelihoods4[10:].mean(), 0.002)

    data5 = _generateSampleData(mean=0.05, variance=1e-6)
    likelihoods5, _, _ = (
      an.updateAnomalyLikelihoods(data5[0:20], estimatorParams4)
    )

    # The likelihoods should be low but not near zero
    self.assertLessEqual(likelihoods5[10:].mean(), 0.28)
    self.assertGreater(likelihoods5[10:].mean(), 0.015)
  def testCaseContinuousBunchesOfSpikes(self):
    """
    Test D: bunches of anomalies every 20 records that continue. This should not
    be anomalous.
    """

    # Generate initial data
    data = []
    for _ in range(30):
      data = self._addSampleData(data, spikePeriod=0, numSamples=30)
      data = self._addSampleData(data, spikePeriod=3, numSamples=10)

    _, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data[0:1000])
    )

    # Now feed in the same distribution
    data = self._addSampleData(spikePeriod=0, numSamples=30)
    data = self._addSampleData(data, spikePeriod=3, numSamples=10)
    likelihoods2, _, _ = (
      an.updateAnomalyLikelihoods(data, estimatorParams)
    )

    # The likelihood should be reasonably high everywhere
    self.assertTrue(likelihoods2.min() > 0.01)
  def likelihood(self, value, anomalyScore, dttm):
    """
    Given the current metric value, plus the current anomaly score, output the
    anomalyLikelihood for this record.
    """
    dataPoint = (dttm, value, anomalyScore)
    # We ignore the first probationaryPeriod data points
    if len(self._historicalScores) < self._probationaryPeriod:
      likelihood = 0.5
    else:
      # On a rolling basis we re-estimate the distribution every 100 iterations
      if self._distribution is None or (self._iteration % 100 == 0):
        _, _, self._distribution = (
          anomaly_likelihood.estimateAnomalyLikelihoods(
            self._historicalScores,
            skipRecords = self._numentaLearningPeriod)
          )

      likelihoods, _, self._distribution = (
        anomaly_likelihood.updateAnomalyLikelihoods([dataPoint],
          self._distribution)
      )
      likelihood = 1.0 - likelihoods[0]

    # Before we exit, update the historical scores and the iteration count
    self._historicalScores.append(dataPoint)
    self._iteration += 1

    return likelihood
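# A minimal sketch (assumed, not taken from the NAB source) of the instance
# state the likelihood() method above relies on; field names are inferred from
# the method body, and the default periods are illustrative guesses only.
class _LikelihoodWrapperState(object):

  def __init__(self, probationaryPeriod=600, learningPeriod=300):
    self._historicalScores = []     # (dttm, value, anomalyScore) tuples
    self._distribution = None       # estimator params, rebuilt every 100 steps
    self._probationaryPeriod = probationaryPeriod
    self._numentaLearningPeriod = learningPeriod
    self._iteration = 0
# Re-estimating the distribution only every 100 iterations (and skipping the
# learning period) amortizes the cost of estimateAnomalyLikelihoods, while
# updateAnomalyLikelihoods keeps the per-record likelihoods current in between.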
  def testCaseIncreasedAnomalyScore(self):
    """
    Test F: small anomaly score every 20 records, but then a large one when you
    would expect a small one. This should be anomalous.
    """

    # Generate initial data
    data = []
    data = self._addSampleData(data, spikePeriod=20,
                               spikeValue=0.4, numSamples=1000)

    _, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data)
    )

    # Now feed in a more frequent distribution
    data = self._addSampleData(spikePeriod=20, spikeValue=1.0,
                               numSamples=100)
    likelihoods2, _, _ = (
      an.updateAnomalyLikelihoods(data, estimatorParams)
    )

    # We should detect highly unusual behavior
    self.assertTrue(likelihoods2.min() < 0.0003)

    # We should detect it pretty often
    self.assertTrue((likelihoods2 < 0.0003).sum() > 40)
  def testCaseIncreasedSpikeFrequency(self):
    """
    Test E: bunches of anomalies every 20 records that become even more
    frequent. This should be anomalous.
    """

    # Generate initial data
    data = []
    for _ in range(30):
      data = self._addSampleData(data, spikePeriod=0, numSamples=30)
      data = self._addSampleData(data, spikePeriod=3, numSamples=10)

    _, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data[0:1000])
    )

    # Now feed in a more frequent distribution
    data = self._addSampleData(spikePeriod=0, numSamples=30)
    data = self._addSampleData(data, spikePeriod=1, numSamples=10)
    likelihoods2, _, _ = (
      an.updateAnomalyLikelihoods(data, estimatorParams)
    )

    # The likelihood should become anomalous but only near the end
    self.assertTrue(likelihoods2[0:30].min() > 0.01)
    self.assertTrue(likelihoods2[-5:].min() < 0.002)
    def testBadParams(self):
        """
    Calls updateAnomalyLikelihoods with bad params.
    """

        # Generate an estimate using one data point
        data1 = _generateSampleData(mean=42.0, variance=1e-10)

        _, _, estimatorParams = an.estimateAnomalyLikelihoods(data1[0:1])

        self.assertTrue(an.isValidEstimatorParams(estimatorParams))

        # Can't pass in a bad params structure
        with self.assertRaises(ValueError):
            an.updateAnomalyLikelihoods(data1, {"haha": "heehee"})

        # Can't pass in something not a dict
        with self.assertRaises(ValueError):
            an.updateAnomalyLikelihoods(data1, 42.0)
  def testCaseSingleSpike(self):
    """
    No anomalies, and then you see a single spike. The likelihood of that
    spike should be 0
    """
    data = self._addSampleData(spikePeriod=0, numSamples=1000)

    _, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data[0:1000])
    )

    data = self._addSampleData(numSamples=1, spikePeriod=1)
    likelihoods1, _, _ = (
      an.updateAnomalyLikelihoods(data, estimatorParams)
    )

    self.assertWithinEpsilon(likelihoods1[0], 0.0)
  def testCaseMissingSpike(self):
    """
    Test C: one anomaly every 20 records, but then see none. The likelihood
    at the end should be very low.
    """

    # Initial data
    data = self._addSampleData(spikePeriod=20, numSamples=1019)
    _, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data[0:1000])
    )

    # Now feed in none
    data = self._addSampleData(numSamples=100, spikePeriod=0)
    likelihoods2, _, _ = (
      an.updateAnomalyLikelihoods(data, estimatorParams)
    )

    # The likelihood once you get past the initial averaging should be very low.
    self.assertTrue((likelihoods2[5:].sum() / 15.0) < 0.0001)
    def testNABAnomalyLikelihood(self):
        """
    Tests the specific calls to nupic/algorithms/anomaly_likelihood as they"re
    made in "NAB/detectors/numenta/numenta_detector.py".
    Note "NAB/.../numenta_detector.py" has its own class AnomalyLikelihood,
    different from nupic/algorithms/anomaly_likelihood.AnomalyLikelihood, but
    which calls the functions estimateAnomalyLikelihoods() and 
    updateAnomalyLikelihoods() from "nupic/algorithms/anomaly_likelihood.py".
    """
        # AnomalyLikelihood object initial values
        iteration = 0
        probationaryPeriod = 4
        historicalScores = []

        likelihoodList = []
        for dataPoint in self.data:
            # Ignore the first probationaryPeriod data points
            if len(historicalScores) < probationaryPeriod:
                likelihood = 0.5
            else:
                if iteration % 4 == 0:
                    _, _, distribution = an.estimateAnomalyLikelihoods(
                        historicalScores, skipRecords=probationaryPeriod)
                    likelihoods, _, distribution = an.updateAnomalyLikelihoods(
                        [dataPoint], distribution)
                    likelihood = 1.0 - likelihoods[0]
            historicalScores.append(dataPoint)
            iteration += 1
            likelihoodList.append(likelihood)

        truthLikelihoodList = [
            0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.044565462999999972,
            0.044565462999999972, 0.044565462999999972, 0.044565462999999972,
            0.90319951499999995, 0.90319951499999995, 0.90319951499999995,
            0.90319951499999995, 0.78814460099999994, 0.78814460099999994,
            0.78814460099999994, 0.78814460099999994
        ]
        for i in xrange(len(likelihoodList)):
            self.assertAlmostEqual(likelihoodList[i],
                                   truthLikelihoodList[i],
                                   msg="unequal values are at index %i" % i)
    def testUpdateAnomalyLikelihoods(self):
        """
    A slightly more complex test. This calls estimateAnomalyLikelihoods
    to estimate the distribution on fake data, followed by several calls
    to updateAnomalyLikelihoods.
    """

        #------------------------------------------
        # Step 1. Generate an initial estimate using fake distribution of anomaly
        # scores.
        data1 = _generateSampleData(mean=0.2)[0:1000]
        _, _, estimatorParams = (an.estimateAnomalyLikelihoods(
            data1, averagingWindow=5))

        #------------------------------------------
        # Step 2. Generate some new data with a higher average anomaly
        # score. Use the estimator from step 1 to compute likelihoods; now we
        # should see a lot more anomalies.
        data2 = _generateSampleData(mean=0.6)[0:300]
        likelihoods2, avgRecordList2, estimatorParams2 = (
            an.updateAnomalyLikelihoods(data2, estimatorParams))
        self.assertEqual(len(likelihoods2), len(data2))
        self.assertEqual(len(avgRecordList2), len(data2))
        self.assertTrue(an.isValidEstimatorParams(estimatorParams))

        # The new running total should be different
        self.assertNotEqual(estimatorParams2["movingAverage"]["total"],
                            estimatorParams["movingAverage"]["total"])

        # We should have many more samples where likelihood is < 0.01, but not all
        self.assertGreaterEqual(numpy.sum(likelihoods2 < 0.01), 25)
        self.assertLessEqual(numpy.sum(likelihoods2 < 0.01), 250)

        #------------------------------------------
        # Step 3. Generate some new data with the expected average anomaly score. We
        # should see fewer anomalies than in Step 2.
        data3 = _generateSampleData(mean=0.2)[0:1000]
        likelihoods3, avgRecordList3, estimatorParams3 = (
            an.updateAnomalyLikelihoods(data3, estimatorParams2))

        self.assertEqual(len(likelihoods3), len(data3))
        self.assertEqual(len(avgRecordList3), len(data3))
        self.assertTrue(an.isValidEstimatorParams(estimatorParams3))

        # The new running total should be different
        self.assertNotEqual(estimatorParams3["movingAverage"]["total"],
                            estimatorParams["movingAverage"]["total"])
        self.assertNotEqual(estimatorParams3["movingAverage"]["total"],
                            estimatorParams2["movingAverage"]["total"])

        # We should have a small number of samples where likelihood is < 0.01,
        # but at least one
        self.assertGreaterEqual(numpy.sum(likelihoods3 < 0.01), 1)
        self.assertLessEqual(numpy.sum(likelihoods3 < 0.01), 100)

        #------------------------------------------
        # Step 4. Validate that sending data incrementally is the same as sending
        # in one batch
        allData = data1
        allData.extend(data2)
        allData.extend(data3)

        # Compute moving average of all the data and check it's the same
        _, historicalValuesAll, totalAll = (an._anomalyScoreMovingAverage(
            allData, windowSize=5))
        self.assertEqual(
            sum(historicalValuesAll),
            sum(estimatorParams3["movingAverage"]["historicalValues"]))
        self.assertEqual(totalAll, estimatorParams3["movingAverage"]["total"])
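# The estimate/update flow exercised by the test above, condensed into a small
# standalone sketch (function name and parameters are illustrative, not part
# of the library API); it assumes the same (timestamp, value, anomalyScore)
# record format used throughout this listing.
from nupic.algorithms import anomaly_likelihood as an


def scoreNewRecords(trainingRecords, newRecords, averagingWindow=10):
  """Bootstrap likelihood stats on trainingRecords, then score newRecords."""
  # One-time (or periodic) estimation of the distribution of anomaly scores
  _, _, params = an.estimateAnomalyLikelihoods(
      trainingRecords, averagingWindow=averagingWindow)
  # Incremental scoring of the new records against those statistics
  likelihoods, _, params = an.updateAnomalyLikelihoods(newRecords, params)
  return likelihoods, params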
  def updateModelAnomalyScores(self, engine, metricObj, metricDataRows):
    """
    Calculate the anomaly scores based on the anomaly likelihoods. Update
    anomaly scores in the given metricDataRows MetricData instances, and
    calculate new anomaly likelihood params for the model.

    :param engine: SQLAlchemy engine object
    :type engine: sqlalchemy.engine.Engine
    :param metricObj: the model's Metric instance
    :param metricDataRows: a sequence of MetricData instances in the
      processed order (ascending by timestamp) with updated raw_anomaly_score
      and zeroed out anomaly_score corresponding to the new model inference
      results, but not yet updated in the database. Will update their
      anomaly_score properties, as needed.

    :returns: new anomaly likelihood params for the model

    *NOTE:*
      the processing must be idempotent due to the "at least once" delivery
      semantics of the message bus

    *NOTE:*
      the performance goal is to minimize costly database access and avoid
      falling behind while processing model results, especially during the
      model's initial "catch-up" phase when large inference result batches are
      prevalent.
    """
    # When populated, a cached list of MetricData instances for updating
    # anomaly likelihood params
    statsSampleCache = None

    # Index into metricDataRows where processing is to resume
    startRowIndex = 0

    statisticsRefreshInterval = self._getStatisticsRefreshInterval(
      batchSize=len(metricDataRows))

    if metricObj.status != MetricStatus.ACTIVE:
      raise MetricNotActiveError(
        "getAnomalyLikelihoodParams failed because metric=%s is not ACTIVE; "
        "status=%s; resource=%s" % (metricObj.uid,
                                    metricObj.status,
                                    metricObj.server,))

    modelParams = jsonDecode(metricObj.model_params)
    anomalyParams = modelParams.get("anomalyLikelihoodParams", None)
    if not anomalyParams:
      # We don't have a likelihood model yet. Create one if we have sufficient
      # records with raw anomaly scores
      (anomalyParams, statsSampleCache, startRowIndex) = (
        self._initAnomalyLikelihoodModel(engine=engine,
                                         metricObj=metricObj,
                                         metricDataRows=metricDataRows))

    # Do anomaly likelihood processing on the rest of the new samples
    # NOTE: this loop will be skipped if there are still not enough samples for
    #  creating the anomaly likelihood params
    while startRowIndex < len(metricDataRows):
      # Determine where to stop processing rows prior to next statistics refresh

      if (statsSampleCache is None or
          len(statsSampleCache) >= self._statisticsMinSampleSize):
        # We're here if:
        #   a. We haven't tried updating anomaly likelihood stats yet
        #                 OR
        #   b. We already updated anomaly likelihood stats (we had sufficient
        #      samples for it)
        # TODO: unit-test
        endRowID = (anomalyParams["last_rowid_for_stats"] +
                    statisticsRefreshInterval)

        if endRowID < metricDataRows[startRowIndex].rowid:
          # We're here if:
          #   a. Statistics refresh interval is smaller than during last stats
          #      update; this is the typical/normal case when backlog catch-up
          #      is tapering off, and refresh interval is reduced for smaller
          #      batches. OR
          #   b. There is a gap of anomaly scores preceding the start of the
          #      current chunk. OR
          #   c. Statistics config changed.
          # TODO: unit-test

          self._log.warning(
            "Anomaly run cutoff precedes samples (smaller stats "
            "refreshInterval or gap in anomaly scores or statistics config "
            "changed) : model=%s; rows=[%s..%s]",
            metricObj.uid, metricDataRows[startRowIndex].rowid, endRowID)

          if statsSampleCache is not None:
            # We already attempted to update anomaly likelihood params, so fix
            # up endRowID to make sure we make progress and don't get stuck in
            # an infinite loop
            endRowID = metricDataRows[startRowIndex].rowid
            self._log.warning(
              "Advanced anomaly run cutoff to make progress: "
              "model=%s; rows=[%s..%s]",
              metricObj.uid, metricDataRows[startRowIndex].rowid, endRowID)
      else:
        # During prior iteration, there were not enough samples in cache for
        # updating anomaly params

        # We extend the end row so that there will be enough samples
        # to avoid getting stuck in this rut in the current and following
        # iterations
        # TODO: unit-test this
        endRowID = metricDataRows[startRowIndex].rowid + (
          self._statisticsMinSampleSize - len(statsSampleCache) - 1)

      # Translate endRowID into metricDataRows limitIndex for current run
      if endRowID < metricDataRows[startRowIndex].rowid:
        # Cut-off precedes the remaining samples
        # Normally shouldn't be here (unless statistics config changed or there
        # is a gap in anomaly scores in metric_data table)
        # TODO: unit-test this

        # Set limit to bypass processing of samples for immediate refresh of
        # anomaly likelihood params
        limitIndex = startRowIndex
        self._log.warning(
          "Anomaly run cutoff precedes samples, so forcing refresh of anomaly "
          "likelihood params: modelInfo=<%s>; rows=[%s..%s]",
          getMetricLogPrefix(metricObj),
          metricDataRows[startRowIndex].rowid, endRowID)
      else:
        # Cutoff is either inside or after the remaining samples
        # TODO: unit-test this
        limitIndex = startRowIndex + min(
          len(metricDataRows) - startRowIndex,
          endRowID + 1 - metricDataRows[startRowIndex].rowid)

      # Process the next new sample run
      self._log.debug(
        "Starting anomaly run: model=%s; "
        "startRowIndex=%s; limitIndex=%s; rows=[%s..%s]; "
        "last_rowid_for_stats=%s; refreshInterval=%s; batchSize=%s",
        metricObj.uid,
        startRowIndex, limitIndex, metricDataRows[startRowIndex].rowid,
        endRowID, anomalyParams["last_rowid_for_stats"],
        statisticsRefreshInterval, len(metricDataRows))

      consumedSamples = []
      for md in itertools.islice(metricDataRows, startRowIndex, limitIndex):
        consumedSamples.append(md)

        (likelihood,), _, anomalyParams["params"] = (
          algorithms.updateAnomalyLikelihoods(
            ((md.timestamp, md.metric_value, md.raw_anomaly_score),),
            anomalyParams["params"]))

        # TODO: the float "cast" here seems redundant
        md.anomaly_score = float(1.0 - likelihood)

        # If anomaly score > 0.99 then we greedily update the statistics. 0.99
        # should not repeat too often, but to be safe we wait a few more
        # records before updating again, in order to avoid overloading the DB.
        #
        # TODO: the magic 0.99 and the magic 3 value below should either
        #  be constants or config settings. Where should they be defined?
        if (md.anomaly_score > 0.99 and
            (anomalyParams["last_rowid_for_stats"] + 3) < md.rowid):
          if statsSampleCache is None or (
              len(statsSampleCache) + len(consumedSamples) >=
              self._statisticsMinSampleSize):
            # TODO: unit-test this
            self._log.info("Forcing refresh of anomaly params for model=%s due "
                           "to exceeded anomaly_score threshold in sample=%r",
                           metricObj.uid, md)
            break

      if startRowIndex + len(consumedSamples) < len(metricDataRows) or (
          consumedSamples[-1].rowid >= endRowID):
        # We stopped before the end of new samples, including a bypass-run,
        # or stopped after processing the last item and need one final refresh
        # of anomaly params
        anomalyParams, statsSampleCache = self._refreshAnomalyParams(
          engine=engine,
          metricID=metricObj.uid,
          statsSampleCache=statsSampleCache,
          consumedSamples=consumedSamples,
          defaultAnomalyParams=anomalyParams)


      startRowIndex += len(consumedSamples)
    # <--- while

    return anomalyParams
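# Worked example (illustrative numbers only) of the cutoff-to-limitIndex
# translation inside the loop above: suppose the batch holds rows with rowids
# 100..199, startRowIndex is 0, last_rowid_for_stats is 90 and the statistics
# refresh interval is 50.
startRowIndex = 0
firstRowid = 100                 # metricDataRows[startRowIndex].rowid
numRows = 100                    # len(metricDataRows)
endRowID = 90 + 50               # last_rowid_for_stats + refresh interval = 140
limitIndex = startRowIndex + min(
    numRows - startRowIndex,     # don't run past the end of the batch
    endRowID + 1 - firstRowid)   # rows 100..140 inclusive -> 41 rows
assert limitIndex == 41          # the run consumes 41 samples, then refreshes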