  def testVeryFewScores(self):
    """
    This calls estimateAnomalyLikelihoods and updateAnomalyLikelihoods
    with very few or no scores.
    """
    # Generate an estimate using two data points
    data1 = _generateSampleData(mean=42.0,
                                variance=1e-10,
                                seed=self.GLOBAL_TEST_SEED)

    _, _, estimatorParams = an.estimateAnomalyLikelihoods(data1[0:2])

    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # Check that the estimated mean matches the anomaly score in the data
    dParams = estimatorParams["distribution"]
    self.assertWithinEpsilon(dParams["mean"], data1[0][2])

    # Can't generate an estimate using no data points
    data1 = numpy.zeros(0)
    with self.assertRaises(ValueError):
      an.estimateAnomalyLikelihoods(data1)

    # Can't update with no scores
    with self.assertRaises(ValueError):
      an.updateAnomalyLikelihoods(data1, estimatorParams)

  def testFlatAnomalyScores(self):
    """
    This calls estimateAnomalyLikelihoods with flat distributions and
    ensures things don't crash.
    """

    # Generate an estimate using a fake distribution of anomaly scores.
    data1 = _generateSampleData(mean=42.0,
                                variance=1e-10,
                                seed=self.GLOBAL_TEST_SEED)

    likelihoods, avgRecordList, estimatorParams = (
      an.estimateAnomalyLikelihoods(data1[0:1000]))
    self.assertEqual(len(likelihoods), 1000)
    self.assertEqual(len(avgRecordList), 1000)
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # Check that the estimated mean is correct
    dParams = estimatorParams["distribution"]
    self.assertWithinEpsilon(dParams["mean"], data1[0][2])

    # Even a slight deviation from the mean should get a likelihood of 0.
    # Test this by sending in values that are just slightly different.
    data2 = _generateSampleData(mean=42.5,
                                variance=1e-10,
                                seed=self.GLOBAL_TEST_SEED)
    likelihoods2, _, _ = (an.updateAnomalyLikelihoods(
      data2[0:10], estimatorParams))

    # The likelihoods should go to zero very quickly
    self.assertLessEqual(likelihoods2.sum(), 0.01)

    # Test the edge case where anomaly scores are very close to 0.
    # In this case we don't let the likelihood get too low: an average
    # anomaly score of 0.1 should be essentially zero, but an average
    # of 0.05 should be higher.
    data3 = _generateSampleData(mean=0.01,
                                variance=1e-6,
                                seed=self.GLOBAL_TEST_SEED)

    _, _, estimatorParams3 = an.estimateAnomalyLikelihoods(data3[0:1000])

    data4 = _generateSampleData(mean=0.1,
                                variance=1e-6,
                                seed=self.GLOBAL_TEST_SEED)
    likelihoods4, _, estimatorParams4 = (an.updateAnomalyLikelihoods(
      data4[0:20], estimatorParams3))

    # An average of 0.1 should go to zero
    self.assertLessEqual(likelihoods4[10:].mean(), 0.002)

    data5 = _generateSampleData(mean=0.05,
                                variance=1e-6,
                                seed=self.GLOBAL_TEST_SEED)
    likelihoods5, _, _ = (an.updateAnomalyLikelihoods(
      data5[0:20], estimatorParams4))

    # The likelihoods should be low but not near zero
    self.assertLessEqual(likelihoods5[10:].mean(), 0.28)
    self.assertGreater(likelihoods5[10:].mean(), 0.015)

  def testSkipRecords(self):
    """
    This calls estimateAnomalyLikelihoods with various values of skipRecords.
    """

    # Check the happy path
    data1 = _generateSampleData(mean=0.1,
                                seed=self.GLOBAL_TEST_SEED)[0:200]
    data1 = data1 + (_generateSampleData(
      mean=0.9, seed=self.GLOBAL_TEST_SEED)[0:200])

    likelihoods, _, estimatorParams = (an.estimateAnomalyLikelihoods(
      data1, skipRecords=200))

    # Check the results are correct, i.e. we are actually skipping the first
    # 200 records, so the estimated mean reflects only the second batch
    dParams = estimatorParams["distribution"]
    self.assertWithinEpsilon(dParams["mean"], 0.9, epsilon=0.15)

    # Check the case where skipRecords > num records.
    # In this case a null distribution should be returned, which makes all
    # the likelihoods reasonably high.
    likelihoods, _, estimatorParams = (an.estimateAnomalyLikelihoods(
      data1, skipRecords=500))
    self.assertEqual(len(likelihoods), len(data1))
    self.assertTrue(likelihoods.sum() >= 0.3 * len(likelihoods))

    # Check the case where skipRecords == num records
    likelihoods, _, estimatorParams = (an.estimateAnomalyLikelihoods(
      data1, skipRecords=len(data1)))
    self.assertEqual(len(likelihoods), len(data1))
    self.assertTrue(likelihoods.sum() >= 0.3 * len(likelihoods))

  def testCaseIncreasedAnomalyScore(self):
    """
    Test F: small anomaly score every 20 records, but then a large one when you
    would expect a small one. This should be anomalous.
    """

    # Generate initial data
    data = []
    data = self._addSampleData(data, spikePeriod=20,
                               spikeValue=0.4, numSamples=1000)
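    # (_addSampleData is defined elsewhere in this test module; presumably it
    # appends numSamples records whose anomaly score is spikeValue on every
    # spikePeriod'th record and a small background score otherwise, with
    # spikePeriod=0 meaning no spikes.)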

    _, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data)
    )

    # Now feed in the same spike period, but with a much larger spike value
    data = self._addSampleData(spikePeriod=20, spikeValue=1.0,
                               numSamples=100)
    likelihoods2, _, _ = (
      an.updateAnomalyLikelihoods(data, estimatorParams)
    )

    # We should detect highly unusual behavior
    self.assertTrue(likelihoods2.min() < 0.0003)

    # We should detect it pretty often
    self.assertTrue((likelihoods2 < 0.0003).sum() > 40)

  def testCaseIncreasedSpikeFrequency(self):
    """
    Test E: bunches of anomalies every 40 records that become even more
    frequent. This should be anomalous.
    """

    # Generate initial data
    data = []
    for _ in range(30):
      data = self._addSampleData(data, spikePeriod=0, numSamples=30)
      data = self._addSampleData(data, spikePeriod=3, numSamples=10)

    _, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data[0:1000])
    )

    # Now feed in a more frequent distribution
    data = self._addSampleData(spikePeriod=0, numSamples=30)
    data = self._addSampleData(data, spikePeriod=1, numSamples=10)
    likelihoods2, _, _ = (
      an.updateAnomalyLikelihoods(data, estimatorParams)
    )

    # The likelihoods should drop to anomalous levels, but only near the end
    self.assertTrue(likelihoods2[0:30].min() > 0.01)
    self.assertTrue(likelihoods2[-5:].min() < 0.002)

  def testCaseContinuousBunchesOfSpikes(self):
    """
    Test D: bunches of anomalies every 40 records that continue. This should
    not be anomalous.
    """

    # Generate initial data
    data = []
    for _ in range(30):
      data = self._addSampleData(data, spikePeriod=0, numSamples=30)
      data = self._addSampleData(data, spikePeriod=3, numSamples=10)

    _, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data[0:1000])
    )

    # Now feed in the same distribution
    data = self._addSampleData(spikePeriod=0, numSamples=30)
    data = self._addSampleData(data, spikePeriod=3, numSamples=10)
    likelihoods2, _, _ = (
      an.updateAnomalyLikelihoods(data, estimatorParams)
    )

    # The likelihoods should be reasonably high everywhere
    self.assertTrue(likelihoods2.min() > 0.01)

  def testCaseUnusuallyHighSpikeFrequency(self):
    """
    Test B: one anomaly spike every 20 records. Then we suddenly get a bunch
    in a row. The likelihood of those spikes should be low.
    """
    data = self._addSampleData(spikePeriod=20, numSamples=1019)

    _, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data[0:1000])
    )

    # If we continue to see the same distribution, we should get reasonable
    # likelihoods
    data = self._addSampleData(numSamples=119, spikePeriod=20)
    likelihoods1, _, estimatorParams1 = (
      an.updateAnomalyLikelihoods(data, estimatorParams)
    )

    # The minimum likelihood should be reasonably high
    self.assertTrue(likelihoods1.min() > 0.1)

    data = self._addSampleData(numSamples=20, spikePeriod=2)
    likelihoods2, _, _ = (
      an.updateAnomalyLikelihoods(data, estimatorParams1)
    )

    # The likelihood once you get past the initial averaging should be very low.
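    # (likelihoods2 has 20 entries here, so likelihoods2[5:].sum() / 15.0 is
    # the mean of the remaining 15 likelihoods.)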
    self.assertTrue((likelihoods2[5:].sum() / 15.0) < 0.001)

  def testEstimateAnomalyLikelihoodsMalformedRecords(self):
    """
    This calls estimateAnomalyLikelihoods with malformed records, which should
    be quietly skipped.
    """

    # Generate a fake distribution of anomaly scores, and add malformed
    # records: too short, too long, empty, and a bare int (note that (2) is
    # just the int 2, not a one-element tuple).
    data1 = _generateSampleData(mean=0.2, seed=self.GLOBAL_TEST_SEED)
    data1 = data1[0:1000] + [(2, 2)] + [(2, 2, 2, 2)] + [()] + [(2)]

    likelihoods, avgRecordList, estimatorParams = (
      an.estimateAnomalyLikelihoods(data1[0:1004]))
    self.assertEqual(len(likelihoods), 1000)
    self.assertEqual(len(avgRecordList), 1000)
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # Check that the running total is correct
    avgParams = estimatorParams["movingAverage"]
    total = 0
    for v in avgRecordList:
      total = total + v[2]
    self.assertEqual(avgParams["total"], total)

    # Check that the estimated mean is correct
    dParams = estimatorParams["distribution"]
    self.assertWithinEpsilon(dParams["mean"],
                             total / float(len(avgRecordList)))

  def testEstimateAnomalyLikelihoods(self):
    """
    This calls estimateAnomalyLikelihoods to estimate the distribution on fake
    data and validates the results.
    """

    # Generate an estimate using a fake distribution of anomaly scores.
    data1 = _generateSampleData(mean=0.2, seed=self.GLOBAL_TEST_SEED)

    likelihoods, avgRecordList, estimatorParams = (
      an.estimateAnomalyLikelihoods(data1[0:1000]))
    self.assertEqual(len(likelihoods), 1000)
    self.assertEqual(len(avgRecordList), 1000)
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # Check that the running total is correct
    avgParams = estimatorParams["movingAverage"]
    total = 0
    for v in avgRecordList:
      total = total + v[2]
    self.assertEqual(avgParams["total"], total)

    # Check that the estimated mean is correct
    dParams = estimatorParams["distribution"]
    self.assertWithinEpsilon(dParams["mean"],
                             total / float(len(avgRecordList)))

    # The number of points with probability lower than 0.02 should be fairly
    # small but nonzero. We can't pin down an exact count because of random
    # variation.
    self.assertLessEqual(numpy.sum(likelihoods < 0.02), 66)
    self.assertGreaterEqual(numpy.sum(likelihoods < 0.02), 1)

  def testEstimateAnomalyLikelihoodsCategoryValues(self):
    """
    This calls estimateAnomalyLikelihoods with category (string) metric values
    and ensures nothing breaks.
    """
    start = datetime.datetime(2017, 1, 1, 0, 0, 0)
    delta = datetime.timedelta(minutes=5)
    dts = [start + (i * delta) for i in range(10)]
    values = ["a", "b", "c", "d", "e"] * 2
    rawScores = [0.1 * i for i in range(10)]
    # Records are (timestamp, metricValue, anomalyScore) tuples; only the
    # anomaly score should matter here, so category metric values are fine.
    data = list(zip(dts, values, rawScores))

    likelihoods, avgRecordList, estimatorParams = (
      an.estimateAnomalyLikelihoods(data))
    self.assertEqual(len(likelihoods), 10)
    self.assertEqual(len(avgRecordList), 10)
    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

  def testCaseSingleSpike(self):
    """
    No anomalies, and then you see a single spike. The likelihood of that
    spike should be 0
    """
    data = self._addSampleData(spikePeriod=0, numSamples=1000)

    _, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data[0:1000])
    )
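    # After 1000 spike-free records the estimated distribution should be
    # extremely narrow, so the single spike below should land far out in the
    # tail and get a likelihood of essentially 0.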

    data = self._addSampleData(numSamples=1, spikePeriod=1)
    likelihoods1, _, _ = (
      an.updateAnomalyLikelihoods(data, estimatorParams)
    )

    self.assertWithinEpsilon(likelihoods1[0], 0.0)

  def testCaseMissingSpike(self):
    """
    Test C: one anomaly every 20 records, but then see none. The likelihood
    at the end should be very low.
    """

    # Initial data
    data = self._addSampleData(spikePeriod=20, numSamples=1019)
    _, _, estimatorParams = (
      an.estimateAnomalyLikelihoods(data[0:1000])
    )
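    # The estimator presumably treats scores far below the learned mean as
    # unlikely too, so a long run with no spikes should itself look anomalous.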

    # Now feed in none
    data = self._addSampleData(numSamples=100, spikePeriod=0)
    likelihoods2, _, _ = (
      an.updateAnomalyLikelihoods(data, estimatorParams)
    )

    # The likelihood once you get past the initial averaging should be very low.
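    # (likelihoods2[5:] has 95 entries here, so the 15.0 divisor is not a
    # mean; it makes this bound stricter than in the 20-sample test above.)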
    self.assertTrue((likelihoods2[5:].sum() / 15.0) < 0.0001)

  def testBadParams(self):
    """
    Calls updateAnomalyLikelihoods with bad params.
    """

    # Generate an estimate using one data point
    data1 = _generateSampleData(mean=42.0,
                                variance=1e-10,
                                seed=self.GLOBAL_TEST_SEED)

    _, _, estimatorParams = an.estimateAnomalyLikelihoods(data1[0:1])

    self.assertTrue(an.isValidEstimatorParams(estimatorParams))

    # Can't pass in a bad params structure
    with self.assertRaises(ValueError):
      an.updateAnomalyLikelihoods(data1, {"haha": "heehee"})

    # Can't pass in something that is not a dict
    with self.assertRaises(ValueError):
      an.updateAnomalyLikelihoods(data1, 42.0)

  def testFlatMetricScores(self):
    """
    This calls estimateAnomalyLikelihoods with flat metric values. In this
    case we should use the null distribution, which gives a reasonably high
    likelihood to everything.
    """
    # Generate samples with very flat metric values
    data1 = _generateSampleData(metricMean=42.0,
                                metricVariance=1e-10,
                                seed=self.GLOBAL_TEST_SEED)[0:1000]

    likelihoods, _, estimatorParams = an.estimateAnomalyLikelihoods(data1)

    # Check that we do indeed get reasonable likelihood values
    self.assertEqual(len(likelihoods), len(data1))
    self.assertTrue(likelihoods.sum() >= 0.4 * len(likelihoods))

    # Check that we do indeed get the null distribution, with the appropriate
    # mean
    self.assertDictEqual(
      estimatorParams["distribution"],
      an.nullDistribution(estimatorParams["distribution"]["mean"]))

  def testUpdateAnomalyLikelihoods(self):
    """
    A slightly more complex test. This calls estimateAnomalyLikelihoods
    to estimate the distribution on fake data, followed by several calls
    to updateAnomalyLikelihoods.
    """

    #------------------------------------------
    # Step 1. Generate an initial estimate using a fake distribution of
    # anomaly scores.
    data1 = _generateSampleData(mean=0.2,
                                seed=self.GLOBAL_TEST_SEED)[0:1000]
    _, _, estimatorParams = (an.estimateAnomalyLikelihoods(
      data1, averagingWindow=5))
    # (averagingWindow=5 means the likelihoods are computed over a 5-record
    # moving average of the raw anomaly scores; step 4 relies on this.)

    #------------------------------------------
    # Step 2. Generate some new data with a higher average anomaly score.
    # Use the estimator from step 1 to compute the likelihoods. Now we
    # should see a lot more anomalies.
    data2 = _generateSampleData(mean=0.6,
                                seed=self.GLOBAL_TEST_SEED)[0:300]
    likelihoods2, avgRecordList2, estimatorParams2 = (
      an.updateAnomalyLikelihoods(data2, estimatorParams))
    self.assertEqual(len(likelihoods2), len(data2))
    self.assertEqual(len(avgRecordList2), len(data2))
    self.assertTrue(an.isValidEstimatorParams(estimatorParams2))

    # The new running total should be different
    self.assertNotEqual(estimatorParams2["movingAverage"]["total"],
                        estimatorParams["movingAverage"]["total"])

    # We should have many more samples where the likelihood is < 0.01, but
    # not all of them
    self.assertGreaterEqual(numpy.sum(likelihoods2 < 0.01), 25)
    self.assertLessEqual(numpy.sum(likelihoods2 < 0.01), 250)

    #------------------------------------------
    # Step 3. Generate some new data with the expected average anomaly score.
    # We should see fewer anomalies than in step 2.
    # Note: same data properties as in step 1, but a different seed.
    seed2 = (self.GLOBAL_TEST_SEED + 1
             if self.GLOBAL_TEST_SEED is not None else None)

    data3 = _generateSampleData(mean=0.2, seed=seed2)[0:1000]
    likelihoods3, avgRecordList3, estimatorParams3 = (
      an.updateAnomalyLikelihoods(data3, estimatorParams2))

    self.assertEqual(len(likelihoods3), len(data3))
    self.assertEqual(len(avgRecordList3), len(data3))
    self.assertTrue(an.isValidEstimatorParams(estimatorParams3))

    # The new running total should be different
    self.assertNotEqual(estimatorParams3["movingAverage"]["total"],
                        estimatorParams["movingAverage"]["total"])
    self.assertNotEqual(estimatorParams3["movingAverage"]["total"],
                        estimatorParams2["movingAverage"]["total"])

    # We should have a small number of samples where the likelihood is
    # < 0.01, but at least one
    self.assertGreaterEqual(numpy.sum(likelihoods3 < 0.01), 1)
    self.assertLessEqual(numpy.sum(likelihoods3 < 0.01), 100)

    #------------------------------------------
    # Step 4. Validate that sending the data incrementally is the same as
    # sending it in one batch
    allData = data1
    allData.extend(data2)
    allData.extend(data3)

    # Compute the moving average of all the data and check that it matches
    _, historicalValuesAll, totalAll = (an._anomalyScoreMovingAverage(
      allData, windowSize=5))
    self.assertEqual(
      sum(historicalValuesAll),
      sum(estimatorParams3["movingAverage"]["historicalValues"]))
    self.assertEqual(totalAll, estimatorParams3["movingAverage"]["total"])