def testFourFalseNegatives(self):
        """
        With four windows and no detections at all, the score should be
        (to within 0.01) four times the negative of the false negative
        weight.
        """
        length = 2000
        numWindows = 4
        windowSize = 10
        windowRecords = windowSize * numWindows

        timestamps = generateTimestamps(datetime.datetime.now(),
                                        datetime.timedelta(minutes=5),
                                        length)
        windows = generateWindows(timestamps, numWindows, windowSize)
        labels = generateLabels(timestamps, windows)
        # Nothing is flagged anywhere in the series.
        predictions = pandas.Series([0] * length)

        scorer = Scorer(timestamps, predictions, labels, windows,
                        self.costMatrix, probationaryPeriod=0)
        _, score = scorer.getScore()

        fnWeight = self.costMatrix['fnWeight']
        self.assertTrue(abs(score + 4 * fnWeight) < 0.01)
        self._checkCounts(scorer.counts, length - windowRecords, 0, 0,
                          windowRecords)
Beispiel #2
0
  def testRowsLabeledAnomalousWithinAWindow(self):
    """
    All timestamps labeled as anomalous should be within a label window.

    Writes a small corpus file and a single label window to the temporary
    paths, loads both, and checks every row whose "label" is 1 against the
    windows stored for its file.
    """
    data = pandas.DataFrame({"timestamp" :
      generateTimestamps(strp("2014-01-01"),
      datetime.timedelta(minutes=5), 10)})

    windows = [["2014-01-01 00:15", "2014-01-01 00:30"]]

    writeCorpus(self.tempCorpusPath, {"test_data_file.csv": data})
    writeCorpusLabel(self.tempCorpusLabelPath, {"test_data_file.csv": windows})

    corpus = nab.corpus.Corpus(self.tempCorpusPath)

    corpusLabel = nab.labeler.CorpusLabel(self.tempCorpusLabelPath, corpus)

    for relativePath, lab in corpusLabel.labels.iteritems():
      windows = corpusLabel.windows[relativePath]

      for _, row in lab[lab["label"] == 1].iterrows():
        timestamp = row["timestamp"]
        # A label only has to fall inside ONE of the windows. Requiring it to
        # be inside every window (all) would wrongly fail as soon as a file
        # has two disjoint windows.
        self.assertTrue(
          any(w[0] <= timestamp <= w[1] for w in windows),
          "The label at %s of file %s is not within a label window"
          % (timestamp, relativePath))
Beispiel #3
0
    def testFalsePositiveMeansNegativeScore(self):
        """
        A single detection on the first record (checked below to be counted
        as the only false positive) should drive the score below zero.
        """
        length = 1000
        numWindows = 1
        windowSize = 10
        windowRecords = windowSize * numWindows

        timestamps = generateTimestamps(datetime.datetime.now(),
                                        datetime.timedelta(minutes=5),
                                        length)
        windows = generateWindows(timestamps, numWindows, windowSize)
        labels = generateLabels(timestamps, windows)

        # Only the very first record is flagged.
        predictions = pandas.Series([0] * length)
        predictions[0] = 1

        scorer = Scorer(timestamps, predictions, labels, windows,
                        self.costMatrix, probationaryPeriod=0)
        _, score = scorer.getScore()

        self.assertTrue(score < 0)
        self._checkCounts(scorer.counts, length - windowRecords - 1, 0, 1,
                          windowRecords)
Beispiel #4
0
  def testFourFalseNegatives(self):
    """
    With four windows and an all-zero anomaly score series scored at a
    threshold of 1, the score must equal exactly four times the negative of
    the false negative weight.
    """
    length = 2000
    numWindows = 4
    windowSize = 10
    threshold = 1
    windowRecords = windowSize * numWindows

    timestamps = generateTimestamps(
      datetime.datetime.now(), datetime.timedelta(minutes=5), length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    anomalyScores = pandas.Series([0] * length)

    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    _, matchingRow = sweeper.scoreDataSet(
      timestamps, anomalyScores, windows, "testData", threshold)

    expectedScore = 4 * -self.costMatrix["fnWeight"]
    self.assertEqual(matchingRow.score, expectedScore)
    self._checkCounts(matchingRow, length - windowRecords, 0, 0,
                      windowRecords)
Beispiel #5
0
  def testTwoFalsePositivesIsWorseThanOne(self):
    """
    Scoring a file with detections on its first two records should give a
    strictly lower score than scoring it with only the first of them.
    """
    length = 1000
    numWindows = 1
    windowSize = 10
    windowRecords = windowSize*numWindows

    timestamps = generateTimestamps(
      datetime.datetime.now(), datetime.timedelta(minutes=5), length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)
    predictions = pandas.Series([0]*length)

    # First pass: a single detection on record 0.
    predictions[0] = 1
    scorerOne = Scorer(
      timestamps, predictions, labels, windows, self.costMatrix,
      probationaryPeriod=0)
    _, scoreOne = scorerOne.getScore()

    # Second pass: the same series with an extra detection on record 1.
    predictions[1] = 1
    scorerTwo = Scorer(
      timestamps, predictions, labels, windows, self.costMatrix,
      probationaryPeriod=0)
    _, scoreTwo = scorerTwo.getScore()

    self.assertTrue(scoreTwo < scoreOne)
    self._checkCounts(
      scorerOne.counts, length - windowRecords - 1, 0, 1, windowRecords)
    self._checkCounts(
      scorerTwo.counts, length - windowRecords - 2, 0, 2, windowRecords)
Beispiel #6
0
  def testFirstTruePositiveWithinWindow(self):
    """
    A single detection on the first record of the window should score
    approximately self.costMatrix["tpWeight"]; agreement to 4 decimal places
    is more than enough precision.
    """
    start = datetime.datetime.now()
    increment = datetime.timedelta(minutes=5)
    length = 10
    numWindows = 1
    windowSize = 2

    timestamps = generateTimestamps(start, increment, length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)
    predictions = pandas.Series([0]*length)

    # Flag only the record whose timestamp equals the window's start.
    index = timestamps[timestamps == windows[0][0]].index[0]
    predictions[index] = 1
    scorer = Scorer(timestamps, predictions, labels, windows, self.costMatrix,
      probationaryPeriod=0)
    (_, score) = scorer.getScore()

    # assertAlmostEqual is the supported spelling; the assertAlmostEquals
    # alias is deprecated and absent from recent Python releases.
    self.assertAlmostEqual(score, self.costMatrix["tpWeight"], 4)
    self._checkCounts(scorer.counts, length-windowSize*numWindows, 1, 0,
      windowSize*numWindows-1)
Beispiel #7
0
  def testNonexistentDatafileOrLabelsThrowsError(self):
    """
    Constructing a CorpusLabel must raise KeyError when the label file and
    the corpus disagree about which data files exist, in either direction.
    """
    knownFile = "test_data_file.csv"
    strayFile = "non_existent_data_file.csv"

    timestamps = generateTimestamps(
      strp("2014-01-01"), datetime.timedelta(minutes=5), 10)
    data = pandas.DataFrame({"timestamp": timestamps})
    windows = [["2014-01-01 00:15", "2014-01-01 00:30"]]

    # Case 1: the labels mention a data file the corpus does not contain.
    writeCorpus(self.tempCorpusPath, {knownFile: data})
    writeCorpusLabel(self.tempCorpusLabelPath,
                     {knownFile: windows, strayFile: windows})
    corpus = nab.corpus.Corpus(self.tempCorpusPath)
    with self.assertRaises(KeyError):
      nab.labeler.CorpusLabel(self.tempCorpusLabelPath, corpus)

    # Case 2: the corpus contains a data file the labels do not mention.
    writeCorpus(self.tempCorpusPath, {knownFile: data, strayFile: data})
    writeCorpusLabel(self.tempCorpusLabelPath, {knownFile: windows})
    corpus = nab.corpus.Corpus(self.tempCorpusPath)
    with self.assertRaises(KeyError):
      nab.labeler.CorpusLabel(self.tempCorpusLabelPath, corpus)
Beispiel #8
0
  def testFalsePositiveMeansNegativeScore(self):
    """
    One anomaly score of 1 on the first record, scored at a threshold of 0.5
    (and checked below to be counted as the only false positive), should
    make the score negative.
    """
    length = 1000
    numWindows = 1
    windowSize = 10
    threshold = 0.5
    windowRecords = windowSize*numWindows

    timestamps = generateTimestamps(
      datetime.datetime.now(), datetime.timedelta(minutes=5), length)
    windows = generateWindows(timestamps, numWindows, windowSize)

    # Every score is 0 except the very first record.
    anomalyScores = pandas.Series([0]*length)
    anomalyScores[0] = 1

    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    _, matchingRow = sweeper.scoreDataSet(
      timestamps, anomalyScores, windows, "testData", threshold)

    self.assertTrue(matchingRow.score < 0)
    self._checkCounts(
      matchingRow, length - windowRecords - 1, 0, 1, windowRecords)
Beispiel #9
0
  def testOneFalsePositiveNoWindow(self):
    """
    With zero windows requested (i.e. no anomaly), a single detection on the
    first record should score exactly the negative of the false positive
    weight.
    """
    length = 1000
    numWindows = 0
    windowSize = 10
    threshold = 0.5
    windowRecords = windowSize*numWindows  # 0, since no windows are requested

    timestamps = generateTimestamps(
      datetime.datetime.now(), datetime.timedelta(minutes=5), length)
    windows = generateWindows(timestamps, numWindows, windowSize)

    anomalyScores = pandas.Series([0]*length)
    anomalyScores[0] = 1

    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    _, matchingRow = sweeper.scoreDataSet(
      timestamps, anomalyScores, windows, "testData", threshold)

    expectedScore = -self.costMatrix["fpWeight"]
    self.assertEqual(matchingRow.score, expectedScore)
    self._checkCounts(
      matchingRow, length - windowRecords - 1, 0, 1, windowRecords)
Beispiel #10
0
  def testNonexistentDatafileOrLabelsThrowsError(self):
    """
    CorpusLabel construction is expected to raise KeyError whenever a data
    file has no window labels, or window labels have no data file.
    """
    presentName = "test_data_file.csv"
    missingName = "non_existent_data_file.csv"
    labelPath = self.tempCorpusLabelPath
    corpusPath = self.tempCorpusPath

    stamps = generateTimestamps(
      strp("2014-01-01"), datetime.timedelta(minutes=5), 10)
    data = pandas.DataFrame({"timestamp": stamps})
    windows = [["2014-01-01 00:15", "2014-01-01 00:30"]]

    # Case 1: window labels exist for a data file that is not in the corpus.
    writeCorpus(corpusPath, {presentName: data})
    writeCorpusLabel(labelPath, {presentName: windows, missingName: windows})
    corpus = nab.corpus.Corpus(corpusPath)
    self.assertRaises(KeyError, nab.labeler.CorpusLabel, labelPath, corpus)

    # Case 2: a data file exists in the corpus without any window labels.
    writeCorpus(corpusPath, {presentName: data, missingName: data})
    writeCorpusLabel(labelPath, {presentName: windows})
    corpus = nab.corpus.Corpus(corpusPath)
    self.assertRaises(KeyError, nab.labeler.CorpusLabel, labelPath, corpus)
    def testEarlierFalsePositiveAfterWindowIsBetter(self):
        """
        Compare two single-detection runs placed after the window: one on
        the record right after the window's end, the other one record later.
        The earlier detection must receive the strictly higher score.
        """
        length = 10
        numWindows = 1
        windowSize = 2
        threshold = 0.5
        windowRecords = windowSize * numWindows

        timestamps = generateTimestamps(datetime.datetime.now(),
                                        datetime.timedelta(minutes=5),
                                        length)
        windows = generateWindows(timestamps, numWindows, windowSize)
        _, windowEnd = windows[0]

        # Index of the first record past the end of the window.
        firstAfter = timestamps[timestamps == windowEnd].index[0] + 1

        earlyScores = pandas.Series([0] * length)
        earlyScores[firstAfter] = 1
        sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
        _, earlyRow = sweeper.scoreDataSet(timestamps, earlyScores, windows,
                                           "testData", threshold)

        lateScores = pandas.Series([0] * length)
        lateScores[firstAfter + 1] = 1
        sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
        _, lateRow = sweeper.scoreDataSet(timestamps, lateScores, windows,
                                          "testData", threshold)

        self.assertTrue(earlyRow.score > lateRow.score)
        self._checkCounts(earlyRow, length - windowRecords - 1, 0, 1,
                          windowRecords)
        self._checkCounts(lateRow, length - windowRecords - 1, 0, 1,
                          windowRecords)
    def testTwoFalsePositivesIsWorseThanOne(self):
        """
        Scoring a series with detections on its first two records should
        give a strictly lower score than scoring it with only the first.
        """
        length = 1000
        numWindows = 1
        windowSize = 10
        threshold = 0.5
        windowRecords = windowSize * numWindows

        timestamps = generateTimestamps(datetime.datetime.now(),
                                        datetime.timedelta(minutes=5),
                                        length)
        windows = generateWindows(timestamps, numWindows, windowSize)
        anomalyScores = pandas.Series([0] * length)

        # First pass: one detection on record 0.
        anomalyScores[0] = 1
        sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
        _, oneFpRow = sweeper.scoreDataSet(timestamps, anomalyScores,
                                           windows, "testData", threshold)

        # Second pass: the same series plus a detection on record 1.
        anomalyScores[1] = 1
        sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
        _, twoFpRow = sweeper.scoreDataSet(timestamps, anomalyScores,
                                           windows, "testData", threshold)

        self.assertTrue(twoFpRow.score < oneFpRow.score)
        self._checkCounts(oneFpRow, length - windowRecords - 1, 0, 1,
                          windowRecords)
        self._checkCounts(twoFpRow, length - windowRecords - 2, 0, 2,
                          windowRecords)
Beispiel #13
0
  def testGetLabels(self):
    """
    Labels generated by CorpusLabel should match the label windows: every
    timestamp that falls inside one of a file's windows must carry label 1.
    """
    data = pandas.DataFrame({"timestamp" :
      generateTimestamps(strp("2014-01-01"),
      datetime.timedelta(minutes=5), 10)})

    windows = [["2014-01-01 00:00", "2014-01-01 00:10"],
               ["2014-01-01 00:10", "2014-01-01 00:15"]]

    writeCorpus(self.tempCorpusPath, {"test_data_file.csv" : data})
    writeCorpusLabel(self.tempCorpusLabelPath, {"test_data_file.csv": windows})

    corpus = nab.corpus.Corpus(self.tempCorpusPath)

    corpusLabel = nab.labeler.CorpusLabel(self.tempCorpusLabelPath, corpus)

    for relativePath, fileLabels in corpusLabel.labels.iteritems():
      windows = corpusLabel.windows[relativePath]

      # Check the labels of the file being iterated against that same file's
      # windows, instead of always reading a hard-coded file name.
      for t, lab in fileLabels.values:
        for w in windows:
          if w[0] <= t <= w[1]:
            self.assertEqual(lab, 1,
              "Incorrect label value for timestamp %r" % t)
    def testOneFalsePositiveNoWindow(self):
        """
        With zero windows requested (i.e. no anomaly), a single detection on
        the first record should score exactly the negative of the false
        positive weight.
        """
        length = 1000
        numWindows = 0
        windowSize = 10
        threshold = 0.5
        windowRecords = windowSize * numWindows  # 0: no windows requested

        timestamps = generateTimestamps(datetime.datetime.now(),
                                        datetime.timedelta(minutes=5),
                                        length)
        windows = generateWindows(timestamps, numWindows, windowSize)

        anomalyScores = pandas.Series([0] * length)
        anomalyScores[0] = 1

        sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
        _, matchingRow = sweeper.scoreDataSet(timestamps, anomalyScores,
                                              windows, "testData", threshold)

        expectedScore = -self.costMatrix["fpWeight"]
        self.assertEqual(matchingRow.score, expectedScore)
        self._checkCounts(matchingRow, length - windowRecords - 1, 0, 1,
                          windowRecords)
Beispiel #15
0
  def testScoringAllMetrics(self):
    """
    This tests an example set of detections, where all metrics have counts > 0.

    Three records are flagged relative to the start of the first window
    (offsets 0, +1 and +7); the resulting score is compared to -0.9540 to
    4 decimal places and the counts are checked via self._checkCounts.
    """
    start = datetime.datetime.now()
    increment = datetime.timedelta(minutes=5)
    length = 100
    numWindows = 2
    windowSize = 5
    threshold = 0.5

    timestamps = generateTimestamps(start, increment, length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    anomalyScores = pandas.Series([0]*length)

    index = timestamps[timestamps == windows[0][0]].index[0]
    # TP, add'l TP, and FP
    anomalyScores[index] = 1
    anomalyScores[index+1] = 1
    anomalyScores[index+7] = 1

    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)

    (scores, matchingRow) = sweeper.scoreDataSet(
      timestamps,
      anomalyScores,
      windows,
      "testData",
      threshold
    )

    # assertAlmostEqual is the supported spelling; the assertAlmostEquals
    # alias is deprecated and absent from recent Python releases.
    self.assertAlmostEqual(matchingRow.score, -0.9540, 4)
    # The final 8 equals windowSize*numWindows - 2.
    self._checkCounts(matchingRow, length-windowSize*numWindows-1, 2, 1, 8)
Beispiel #16
0
  def test_FourFalseNegatives(self):
    """
    With four windows and no detections, the score should be (to within
    0.01) four times the negative of the false negative weight, and the
    tn/tp/fp/fn counts should match the window layout.
    """
    length = 2000
    numWindows = 4
    windowSize = 10
    windowRecords = windowSize*numWindows
    costMatrix = {"tpWeight": 1.0,
                  "fnWeight": 2.0,
                  "fpWeight": 3.0,
                  "tnWeight": 4.0}

    timestamps = generateTimestamps(
      datetime.datetime.now(), datetime.timedelta(minutes=5), length)
    predictions = pandas.Series([0]*length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)

    scorer = Scorer(
      timestamps, predictions, labels, windows, costMatrix,
      probationaryPeriod=0)

    score = scorer.getScore()
    self.assertTrue(abs(score + 4*costMatrix['fnWeight']) < 0.01)

    # Ensure counts are correct.
    expectedCounts = (('tn', length - windowRecords),
                      ('tp', 0),
                      ('fp', 0),
                      ('fn', windowRecords))
    for key, expected in expectedCounts:
      self.assertEqual(scorer.counts[key], expected)
Beispiel #17
0
  def test_firstTruePositiveWithinWindow(self):
    """
    A lone detection on the first record of the window should score close to
    costMatrix["tpWeight"]: the test accepts any score that is at most 1
    below that weight.
    """
    length = 10
    numWindows = 1
    windowSize = 2
    costMatrix = {"tpWeight": 1.0,
                  "fnWeight": 2.0,
                  "fpWeight": 3.0,
                  "tnWeight": 4.0}

    timestamps = generateTimestamps(
      datetime.datetime.now(), datetime.timedelta(minutes=5), length)
    predictions = pandas.Series([0]*length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)

    # Flag only the record whose timestamp equals the window's start.
    windowStart = windows[0][0]
    firstInWindow = timestamps[timestamps == windowStart].index[0]
    predictions[firstInWindow] = 1

    scorer = Scorer(
      timestamps, predictions, labels, windows, costMatrix,
      probationaryPeriod=0)

    gap = costMatrix["tpWeight"] - scorer.getScore()
    self.assertTrue(gap <= 1)
Beispiel #18
0
  def test_oneFalsePositiveNoWindow(self):
    """
    With zero windows requested (meaning no anomaly), one detection on the
    first record should score exactly the negative of the false positive
    weight, and be counted as the single false positive.
    """
    length = 1000
    numWindows = 0
    windowSize = 10
    windowRecords = windowSize*numWindows  # 0, since no windows are requested
    costMatrix = {"tpWeight": 1.0,
                  "fnWeight": 1.0,
                  "fpWeight": 1.0,
                  "tnWeight": 1.0}

    timestamps = generateTimestamps(
      datetime.datetime.now(), datetime.timedelta(minutes=5), length)
    predictions = pandas.Series([0]*length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)

    predictions[0] = 1

    scorer = Scorer(
      timestamps, predictions, labels, windows, costMatrix,
      probationaryPeriod=0)

    self.assertTrue(scorer.getScore() == -costMatrix["fpWeight"])

    # Ensure counts are correct.
    expectedCounts = (('tn', length - windowRecords - 1),
                      ('tp', 0),
                      ('fp', 1),
                      ('fn', windowRecords))
    for key, expected in expectedCounts:
      self.assertEqual(scorer.counts[key], expected)
Beispiel #19
0
  def test_falsePositiveMeansNegativeScore(self):
    """
    One detection on the first record (checked below to be counted as the
    only false positive) should make the score negative.
    """
    length = 1000
    numWindows = 1
    windowSize = 10
    windowRecords = windowSize*numWindows
    costMatrix = {"tpWeight": 1.0,
                  "fnWeight": 1.0,
                  "fpWeight": 1.0,
                  "tnWeight": 1.0}

    timestamps = generateTimestamps(
      datetime.datetime.now(), datetime.timedelta(minutes=5), length)
    predictions = pandas.Series([0]*length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)

    predictions[0] = 1

    scorer = Scorer(
      timestamps, predictions, labels, windows, costMatrix,
      probationaryPeriod=0)
    score = scorer.getScore()

    self.assertTrue(score < 0)

    # Ensure counts are correct.
    expectedCounts = (('tn', length - windowRecords - 1),
                      ('tp', 0),
                      ('fp', 1),
                      ('fn', windowRecords))
    for key, expected in expectedCounts:
      self.assertEqual(scorer.counts[key], expected)
  def testBucketMerge(self):
    """
    Write three raw label files for one data file, combine them with
    LabelCombiner, and check that the combined labels for that data file are
    exactly the two expected timestamps.
    """
    dataFileName = "test_data_file.csv"
    timestamps = generateTimestamps(
      strp("2015-12-01"), datetime.timedelta(days=1), 31)
    data = pandas.DataFrame({"timestamp": timestamps})
    writeCorpus(self.tempCorpusPath, {dataFileName: data})

    rawLabels = (["2015-12-24 00:00:00",
                  "2015-12-31 00:00:00"],
                 ["2015-12-01 00:00:00",
                  "2015-12-25 00:00:00",
                  "2015-12-31 00:00:00"],
                 ["2015-12-25 00:00:00"])

    # Each label list is written to <sep>raw<sep>label<i>.json, a path
    # derived by replacing "<sep>label.json" in the temp label path.
    sep = os.path.sep
    for i, labels in enumerate(rawLabels):
      rawSuffix = sep + "raw" + sep + "label{}.json".format(i)
      labelsPath = self.tempCorpusLabelPath.replace(
        sep + "label.json", rawSuffix)
      writeCorpusLabel(labelsPath, {"test_data_file.csv": labels})
    # Drop the file name of the last path written to get the raw labels
    # directory (relies on i and labelsPath surviving the loop).
    labelsDir = labelsPath.replace(sep + "label{}.json".format(i), "")

    corpus = nab.corpus.Corpus(self.tempCorpusPath)
    labelCombiner = nab.labeler.LabelCombiner(
      labelsDir, corpus, 0.5, 0.10, 0.15, 0)
    labelCombiner.getRawLabels()
    labelTimestamps, _ = labelCombiner.combineLabels()

    expectedLabels = ['2015-12-25 00:00:00', '2015-12-31 00:00:00']
    self.assertEqual(
      expectedLabels, labelTimestamps[dataFileName],
      "The combined labels did not bucket and merge as expected.")
    def testScoringAllMetrics(self):
        """
        This tests an example set of detections, where all metrics have
        counts > 0.

        Three records are flagged relative to the start of the first window
        (offsets 0, +1 and +7); the resulting score is compared to -0.9540
        to 4 decimal places and the counts are checked via
        self._checkCounts.
        """
        start = datetime.datetime.now()
        increment = datetime.timedelta(minutes=5)
        length = 100
        numWindows = 2
        windowSize = 5
        threshold = 0.5

        timestamps = generateTimestamps(start, increment, length)
        windows = generateWindows(timestamps, numWindows, windowSize)
        anomalyScores = pandas.Series([0] * length)

        index = timestamps[timestamps == windows[0][0]].index[0]
        # TP, add'l TP, and FP
        anomalyScores[index] = 1
        anomalyScores[index + 1] = 1
        anomalyScores[index + 7] = 1

        sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)

        (scores, matchingRow) = sweeper.scoreDataSet(timestamps, anomalyScores,
                                                     windows, "testData",
                                                     threshold)

        # assertAlmostEqual is the supported spelling; the assertAlmostEquals
        # alias is deprecated and absent from recent Python releases.
        self.assertAlmostEqual(matchingRow.score, -0.9540, 4)
        # The final 8 equals windowSize * numWindows - 2.
        self._checkCounts(matchingRow, length - windowSize * numWindows - 1, 2,
                          1, 8)
Beispiel #22
0
  def testOnlyScoreFirstTruePositiveWithinWindow(self):
    """
    Multiple detections within one window should only be scored for the
    earliest one: adding a second detection at the window's end must leave
    the score unchanged, even though the counts differ.
    """
    length = 10
    numWindows = 1
    windowSize = 2
    windowRecords = windowSize*numWindows

    timestamps = generateTimestamps(
      datetime.datetime.now(), datetime.timedelta(minutes=5), length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)
    predictions = pandas.Series([0]*length)
    windowStart, windowEnd = windows[0]

    # First pass: detection on the record at the window's start.
    startIndex = timestamps[timestamps == windowStart].index[0]
    predictions[startIndex] = 1
    scorerOne = Scorer(
      timestamps, predictions, labels, windows, self.costMatrix,
      probationaryPeriod=0)
    _, scoreOne = scorerOne.getScore()

    # Second pass: additionally flag the record at the window's end.
    endIndex = timestamps[timestamps == windowEnd].index[0]
    predictions[endIndex] = 1
    scorerTwo = Scorer(
      timestamps, predictions, labels, windows, self.costMatrix,
      probationaryPeriod=0)
    _, scoreTwo = scorerTwo.getScore()

    self.assertEqual(scoreOne, scoreTwo)
    self._checkCounts(
      scorerOne.counts, length - windowRecords, 1, 0, windowRecords - 1)
    self._checkCounts(
      scorerTwo.counts, length - windowRecords, 2, 0, windowRecords - 2)
Beispiel #23
0
  def testRewardLowFalsePositives(self):
    """
    Score the same single detection (no windows) under two cost matrices:
    self.costMatrix, and a copy with fpWeight set to 2.0 and fnWeight set to
    0.5. The test asserts that the first score is exactly half the second.
    """
    length = 100
    numWindows = 0
    windowSize = 10
    windowRecords = windowSize*numWindows  # 0, since there are no windows

    timestamps = generateTimestamps(
      datetime.datetime.now(), datetime.timedelta(minutes=5), length)
    windows = []
    labels = generateLabels(timestamps, windows)
    predictions = pandas.Series([0]*length)

    # Deep copy so that self.costMatrix itself is left untouched.
    costMatrixFP = copy.deepcopy(self.costMatrix)
    costMatrixFP["fpWeight"] = 2.0
    costMatrixFP["fnWeight"] = 0.5

    # FP
    predictions[0] = 1

    standardScorer = Scorer(
      timestamps, predictions, labels, windows, self.costMatrix,
      probationaryPeriod=0)
    _, standardScore = standardScorer.getScore()
    lowFpScorer = Scorer(
      timestamps, predictions, labels, windows, costMatrixFP,
      probationaryPeriod=0)
    _, lowFpScore = lowFpScorer.getScore()

    self.assertEqual(standardScore, 0.5*lowFpScore)
    for scorer in (standardScorer, lowFpScorer):
      self._checkCounts(scorer.counts, length - windowRecords - 1, 0, 1, 0)
  def testGetLabels(self):
    """
    Labels generated by CorpusLabel should match the label windows: every
    timestamp that falls inside one of a file's windows must carry label 1.
    """
    data = pandas.DataFrame({"timestamp" :
      generateTimestamps(strp("2014-01-01"),
      datetime.timedelta(minutes=5), 10)})

    windows = [["2014-01-01 00:00", "2014-01-01 00:10"],
               ["2014-01-01 00:10", "2014-01-01 00:15"]]

    writeCorpus(self.tempCorpusPath, {"test_data_file.csv" : data})
    writeCorpusLabel(self.tempCorpusLabelPath, {"test_data_file.csv": windows})

    corpus = nab.corpus.Corpus(self.tempCorpusPath)

    corpusLabel = nab.labeler.CorpusLabel(self.tempCorpusLabelPath, corpus)

    for relativePath, fileLabels in corpusLabel.labels.iteritems():
      windows = corpusLabel.windows[relativePath]

      # Check the labels of the file being iterated against that same file's
      # windows, instead of always reading a hard-coded file name.
      for t, lab in fileLabels.values:
        for w in windows:
          if w[0] <= t <= w[1]:
            self.assertEqual(lab, 1,
              "Incorrect label value for timestamp %r" % t)
Beispiel #25
0
    def testOneFalsePositiveNoWindow(self):
        """
        With zero windows requested (i.e. no anomaly), a single detection on
        the first record should score exactly the negative of the false
        positive weight.
        """
        length = 1000
        numWindows = 0
        windowSize = 10
        windowRecords = windowSize * numWindows  # 0: no windows requested

        timestamps = generateTimestamps(datetime.datetime.now(),
                                        datetime.timedelta(minutes=5),
                                        length)
        windows = generateWindows(timestamps, numWindows, windowSize)
        labels = generateLabels(timestamps, windows)

        predictions = pandas.Series([0] * length)
        predictions[0] = 1

        scorer = Scorer(timestamps, predictions, labels, windows,
                        self.costMatrix, probationaryPeriod=0)
        _, score = scorer.getScore()

        self.assertTrue(score == -self.costMatrix["fpWeight"])
        self._checkCounts(scorer.counts, length - windowRecords - 1, 0, 1,
                          windowRecords)
    def testFirstTruePositiveWithinWindow(self):
        """
        A single detection on the first record of the window should score
        approximately self.costMatrix["tpWeight"]; agreement to 4 decimal
        places is more than enough precision.
        """
        start = datetime.datetime.now()
        increment = datetime.timedelta(minutes=5)
        length = 10
        numWindows = 1
        windowSize = 2

        timestamps = generateTimestamps(start, increment, length)
        windows = generateWindows(timestamps, numWindows, windowSize)
        labels = generateLabels(timestamps, windows)
        predictions = pandas.Series([0] * length)

        # Flag only the record whose timestamp equals the window's start.
        index = timestamps[timestamps == windows[0][0]].index[0]
        predictions[index] = 1
        scorer = Scorer(timestamps,
                        predictions,
                        labels,
                        windows,
                        self.costMatrix,
                        probationaryPeriod=0)
        (_, score) = scorer.getScore()

        # assertAlmostEqual is the supported spelling; the assertAlmostEquals
        # alias is deprecated and absent from recent Python releases.
        self.assertAlmostEqual(score, self.costMatrix["tpWeight"], 4)
        self._checkCounts(scorer.counts, length - windowSize * numWindows, 1,
                          0, windowSize * numWindows - 1)
  def testRowsLabeledAnomalousWithinAWindow(self):
    """
    All timestamps labeled as anomalous should be within a label window.

    Writes a small corpus file and a single label window to the temporary
    paths, loads both, and checks every row whose "label" is 1 against the
    windows stored for its file.
    """
    data = pandas.DataFrame({"timestamp" :
      generateTimestamps(strp("2014-01-01"),
      datetime.timedelta(minutes=5), 10)})

    windows = [["2014-01-01 00:15", "2014-01-01 00:30"]]

    writeCorpus(self.tempCorpusPath, {"test_data_file.csv": data})
    writeCorpusLabel(self.tempCorpusLabelPath, {"test_data_file.csv": windows})

    corpus = nab.corpus.Corpus(self.tempCorpusPath)

    corpusLabel = nab.labeler.CorpusLabel(self.tempCorpusLabelPath, corpus)

    for relativePath, lab in corpusLabel.labels.iteritems():
      windows = corpusLabel.windows[relativePath]

      for _, row in lab[lab["label"] == 1].iterrows():
        timestamp = row["timestamp"]
        # A label only has to fall inside ONE of the windows. Requiring it to
        # be inside every window (all) would wrongly fail as soon as a file
        # has two disjoint windows.
        self.assertTrue(
          any(w[0] <= timestamp <= w[1] for w in windows),
          "The label at %s of file %s is not within a label window"
          % (timestamp, relativePath))
  def testFirstTruePositiveWithinWindow(self):
    """
    A single anomaly score of 1.0 on the first record of the window, scored
    at a threshold of 0.5, should produce a score exactly equal to
    self.costMatrix["tpWeight"].
    """
    length = 10
    numWindows = 1
    windowSize = 2
    threshold = 0.5
    windowRecords = windowSize * numWindows

    timestamps = generateTimestamps(
      datetime.datetime.now(), datetime.timedelta(minutes=5), length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    anomalyScores = pandas.Series([0] * length)

    # Set a single true positive, on the record at the window's start.
    windowStart = windows[0][0]
    firstInWindow = timestamps[timestamps == windowStart].index[0]
    anomalyScores[firstInWindow] = 1.0

    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    _, matchingRow = sweeper.scoreDataSet(
      timestamps, anomalyScores, windows, "testData", threshold)

    self.assertEqual(matchingRow.score, self.costMatrix["tpWeight"])
    self._checkCounts(matchingRow, length - windowRecords, 1, 0,
                      windowRecords - 1)
Beispiel #29
0
  def testRewardLowFalsePositives(self):
    """
    Score one detection on the first record, with no windows, under both
    self.costMatrix and a modified copy (fpWeight 2.0, fnWeight 0.5). The
    score under self.costMatrix must equal 0.5 times the other score.
    """
    length = 100
    numWindows = 0
    windowSize = 10
    expectedFirstCount = length - windowSize*numWindows - 1

    start = datetime.datetime.now()
    increment = datetime.timedelta(minutes=5)
    timestamps = generateTimestamps(start, increment, length)
    windows = []
    labels = generateLabels(timestamps, windows)
    predictions = pandas.Series([0]*length)

    # Work on a deep copy so self.costMatrix is not modified.
    costMatrixFP = copy.deepcopy(self.costMatrix)
    costMatrixFP["fpWeight"] = 2.0
    costMatrixFP["fnWeight"] = 0.5

    # FP
    predictions[0] = 1

    baseScorer = Scorer(
      timestamps, predictions, labels, windows, self.costMatrix,
      probationaryPeriod=0)
    _, baseScore = baseScorer.getScore()
    modifiedScorer = Scorer(
      timestamps, predictions, labels, windows, costMatrixFP,
      probationaryPeriod=0)
    _, modifiedScore = modifiedScorer.getScore()

    self.assertEqual(baseScore, 0.5*modifiedScore)
    self._checkCounts(baseScorer.counts, expectedFirstCount, 0, 1, 0)
    self._checkCounts(modifiedScorer.counts, expectedFirstCount, 0, 1, 0)
Beispiel #30
0
 def testScoringAllMetrics(self):
   """
   Score an example set of detections in which every metric has a count
   greater than zero, and compare the score against a known value.
   """
   start = datetime.datetime.now()
   increment = datetime.timedelta(minutes=5)
   length = 100
   numWindows = 2
   windowSize = 5

   timestamps = generateTimestamps(start, increment, length)
   windows = generateWindows(timestamps, numWindows, windowSize)
   labels = generateLabels(timestamps, windows)
   predictions = pandas.Series([0]*length)

   # Detections are placed relative to the start of the first window.
   index = timestamps[timestamps == windows[0][0]].index[0]
   # TP, add'l TP, and FP
   predictions[index] = 1
   predictions[index+1] = 1
   predictions[index+7] = 1

   scorer = Scorer(timestamps, predictions, labels, windows, self.costMatrix,
     probationaryPeriod=0)
   (_, score) = scorer.getScore()

   # assertAlmostEqual replaces the deprecated assertAlmostEquals alias,
   # which is no longer available in recent unittest versions.
   self.assertAlmostEqual(score, -0.9540, 4)
   self._checkCounts(scorer.counts, length-windowSize*numWindows-1, 2, 1, 8)
Beispiel #31
0
  def testEarlierFalsePositiveAfterWindowIsBetter(self):
    """
    A lone false positive on the record right after the window's end must
    score higher than a lone false positive one record later.
    """
    length = 10
    numWindows = 1
    windowSize = 2

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)
    _, windowEnd = windows[0]

    # Index of the record that follows the window's end timestamp.
    afterWindow = timestamps[timestamps == windowEnd].index[0] + 1

    earlyPredictions = pandas.Series([0] * length)
    latePredictions = pandas.Series([0] * length)

    earlyPredictions[afterWindow] = 1
    earlyScorer = Scorer(timestamps, earlyPredictions, labels, windows,
                         self.costMatrix, probationaryPeriod=0)
    _, earlyScore = earlyScorer.getScore()

    latePredictions[afterWindow + 1] = 1
    lateScorer = Scorer(timestamps, latePredictions, labels, windows,
                        self.costMatrix, probationaryPeriod=0)
    _, lateScore = lateScorer.getScore()

    self.assertTrue(earlyScore > lateScore)

    expectedTN = length - windowSize * numWindows - 1
    expectedFN = windowSize * numWindows
    self._checkCounts(earlyScorer.counts, expectedTN, 0, 1, expectedFN)
    self._checkCounts(lateScorer.counts, expectedTN, 0, 1, expectedFN)
Beispiel #32
0
    def testScoringAllMetrics(self):
        """
        Score an example set of detections in which every metric has a count
        greater than zero, and compare the score against a known value.
        """
        start = datetime.datetime.now()
        increment = datetime.timedelta(minutes=5)
        length = 100
        numWindows = 2
        windowSize = 5

        timestamps = generateTimestamps(start, increment, length)
        windows = generateWindows(timestamps, numWindows, windowSize)
        labels = generateLabels(timestamps, windows)
        predictions = pandas.Series([0] * length)

        # Detections are placed relative to the start of the first window.
        index = timestamps[timestamps == windows[0][0]].index[0]
        # TP, add'l TP, and FP
        predictions[index] = 1
        predictions[index + 1] = 1
        predictions[index + 7] = 1

        scorer = Scorer(timestamps,
                        predictions,
                        labels,
                        windows,
                        self.costMatrix,
                        probationaryPeriod=0)
        (_, score) = scorer.getScore()

        # assertAlmostEqual replaces the deprecated assertAlmostEquals alias,
        # which is no longer available in recent unittest versions.
        self.assertAlmostEqual(score, -0.9540, 4)
        self._checkCounts(scorer.counts, length - windowSize * numWindows - 1,
                          2, 1, 8)
Beispiel #33
0
  def test_earlierFalsePositiveAfterWindowIsBetter(self):
    """
    Compare two lone false positives placed just after a window: the one on
    the record right after the window's end must yield a greater getScore()
    result than the one placed one record later. All cost weights are 1.0.
    """
    length = 10
    numWindows = 1
    windowSize = 2

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)

    earlyPredictions = pandas.Series([0] * length)
    latePredictions = pandas.Series([0] * length)

    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)
    _, windowEnd = windows[0]

    costMatrix = dict(tpWeight=1.0, fnWeight=1.0, fpWeight=1.0, tnWeight=1.0)

    # Index of the record that follows the window's end timestamp.
    afterWindow = timestamps[timestamps == windowEnd].index[0] + 1

    earlyPredictions[afterWindow] = 1
    scorer1 = Scorer(timestamps, earlyPredictions, labels, windows, costMatrix,
                     probationaryPeriod=0)
    score1 = scorer1.getScore()

    latePredictions[afterWindow + 1] = 1
    scorer2 = Scorer(timestamps, latePredictions, labels, windows, costMatrix,
                     probationaryPeriod=0)
    score2 = scorer2.getScore()

    self.assertTrue(score1 > score2)

    # Both runs must report identical counts.
    expectedTN = length - windowSize * numWindows - 1
    expectedFN = windowSize * numWindows
    for scorer in (scorer1, scorer2):
      self.assertEqual(scorer.counts['tn'], expectedTN)
      self.assertEqual(scorer.counts['tp'], 0)
      self.assertEqual(scorer.counts['fp'], 1)
      self.assertEqual(scorer.counts['fn'], expectedFN)
Beispiel #34
0
  def test_twoFalsePositivesIsWorseThanOne(self):
    """
    Score a detection at record 0, then add a second detection at record 1
    and score again. The getScore() result with one false positive must be
    greater than the result with two, and the counts must show one and two
    false positives respectively. All cost weights are 1.0.
    """
    length = 1000
    numWindows = 1
    windowSize = 10

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    predictions = pandas.Series([0] * length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)

    costMatrix = dict(tpWeight=1.0, fnWeight=1.0, fpWeight=1.0, tnWeight=1.0)

    # First run: a single detection.
    predictions[0] = 1
    scorer1 = Scorer(timestamps, predictions, labels, windows, costMatrix,
                     probationaryPeriod=0)
    score1 = scorer1.getScore()

    # Second run: the same series with one more detection.
    predictions[1] = 1
    scorer2 = Scorer(timestamps, predictions, labels, windows, costMatrix,
                     probationaryPeriod=0)
    score2 = scorer2.getScore()

    self.assertTrue(score1 > score2)

    # Ensure counts are correct for each run.
    expectedFN = windowSize * numWindows
    for scorer, numFP in ((scorer1, 1), (scorer2, 2)):
      self.assertEqual(scorer.counts['tn'],
                       length - windowSize * numWindows - numFP)
      self.assertEqual(scorer.counts['tp'], 0)
      self.assertEqual(scorer.counts['fp'], numFP)
      self.assertEqual(scorer.counts['fn'], expectedFN)
  def testNonexistentDatafileForLabelsThrowsError(self):
    """
    Constructing a CorpusLabel should raise a KeyError when the label file
    lists windows for a data file that was not written to the corpus.
    """
    timestamps = generateTimestamps(strp("2014-01-01"),
                                    datetime.timedelta(minutes=5),
                                    10)
    data = pandas.DataFrame({"timestamp": timestamps})

    windows = [["2014-01-01 00:15", "2014-01-01 00:30"]]
    # Only the first of these two files is actually written to the corpus.
    labelsByFile = {"test_data_file.csv": windows,
                    "non_existent_data_file.csv": windows}

    writeCorpus(self.tempCorpusPath, {"test_data_file.csv": data})
    writeCorpusLabel(self.tempCorpusLabelPath, labelsByFile)

    corpus = nab.corpus.Corpus(self.tempCorpusPath)

    with self.assertRaises(KeyError):
      nab.labeler.CorpusLabel(self.tempCorpusLabelPath, corpus)
  def testTruePositiveAtRightEdgeOfWindow(self):
    """
    Score a detection on the window's last timestamp (a TP) and, separately,
    a detection on the record immediately following it (an FP). The intent is
    that the scoring function crosses zero between those two records, so the
    sum of the two scores plus 1 should be zero to 3 decimal places.
    """
    length = 1000
    numWindows = 1
    windowSize = 100
    threshold = 0.5

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    anomalyScores = pandas.Series([0] * length)
    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)

    def scoreCurrentDetections():
      """Score anomalyScores in its current state; return the matching row."""
      _, row = sweeper.scoreDataSet(timestamps, anomalyScores, windows,
                                    "testData", threshold)
      return row

    # Detection on the last timestamp of the window; TP
    lastInWindow = timestamps[timestamps == windows[0][1]].index[0]
    anomalyScores[lastInWindow] = 1
    tpRow = scoreCurrentDetections()

    # Move the detection to the record just after the window; FP
    anomalyScores[lastInWindow] = 0
    anomalyScores[lastInWindow + 1] = 1
    fpRow = scoreCurrentDetections()

    # TP score + FP score + 1 should be very close to 0; the 1 is added to
    # account for the subsequent FN contribution.
    self.assertAlmostEqual(tpRow.score + fpRow.score + 1, 0.0, 3)

    windowTotal = windowSize * numWindows
    self._checkCounts(tpRow, length - windowTotal, 1, 0, windowTotal - 1)
    self._checkCounts(fpRow, length - windowTotal - 1, 0, 1, windowTotal)
    def testTruePositiveAtRightEdgeOfWindow(self):
        """
        True positives at the right edge of a window should yield a score of
        approximately zero; the scaled sigmoid scoring function crosses zero
        between a given window's last timestamp and the next timestamp
        (immediately following the window).
        """
        start = datetime.datetime.now()
        increment = datetime.timedelta(minutes=5)
        length = 1000
        numWindows = 1
        windowSize = 100

        timestamps = generateTimestamps(start, increment, length)
        windows = generateWindows(timestamps, numWindows, windowSize)
        labels = generateLabels(timestamps, windows)
        predictions = pandas.Series([0] * length)

        # Make prediction at end of the window; TP
        index = timestamps[timestamps == windows[0][1]].index[0]
        predictions[index] = 1
        scorer1 = Scorer(timestamps,
                         predictions,
                         labels,
                         windows,
                         self.costMatrix,
                         probationaryPeriod=0)
        (_, score1) = scorer1.getScore()
        # Make prediction just after the window; FP
        predictions[index] = 0
        index += 1
        predictions[index] = 1
        scorer2 = Scorer(timestamps,
                         predictions,
                         labels,
                         windows,
                         self.costMatrix,
                         probationaryPeriod=0)
        (_, score2) = scorer2.getScore()

        # TP score + FP score + 1 should be very close to 0; the 1 is added to
        # account for the subsequent FN contribution.
        # assertAlmostEqual replaces the deprecated assertAlmostEquals alias,
        # which is no longer available in recent unittest versions.
        self.assertAlmostEqual(score1 + score2 + 1, 0.0, 3)
        self._checkCounts(scorer1.counts, length - windowSize * numWindows, 1,
                          0, windowSize * numWindows - 1)
        self._checkCounts(scorer2.counts, length - windowSize * numWindows - 1,
                          0, 1, windowSize * numWindows)
  def testEarlierTruePositiveIsBetter(self):
    """
    Of two detection series that each hit the same window once, the one whose
    detection is on the window's first timestamp must score higher than the
    one whose detection is on the window's last timestamp.
    """
    length = 10
    numWindows = 1
    windowSize = 2
    threshold = 0.5

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    earlyScores = pandas.Series([0] * length)
    lateScores = pandas.Series([0] * length)
    windowStart, windowEnd = windows[0]

    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)

    # Detection on the first timestamp of the window.
    earlyScores[timestamps[timestamps == windowStart].index[0]] = 1
    _, earlyRow = sweeper.scoreDataSet(timestamps, earlyScores, windows,
                                       "testData", threshold)

    # Detection on the last timestamp of the window.
    lateScores[timestamps[timestamps == windowEnd].index[0]] = 1
    _, lateRow = sweeper.scoreDataSet(timestamps, lateScores, windows,
                                      "testData", threshold)

    score1 = earlyRow.score
    score2 = lateRow.score
    failureMessage = ("The earlier TP score is not greater than "
                      "the later TP. They are %f and %f, respectively."
                      % (score1, score2))
    self.assertTrue(score1 > score2, failureMessage)

    windowTotal = windowSize * numWindows
    self._checkCounts(earlyRow, length - windowTotal, 1, 0, windowTotal - 1)
    self._checkCounts(lateRow, length - windowTotal, 1, 0, windowTotal - 1)
  def testOnlyScoreFirstTruePositiveWithinWindow(self):
    """
    Adding a second detection at the end of a window that already has a
    detection at its start must leave the score unchanged, while the counts
    move from one true positive to two.
    """
    length = 10
    numWindows = 1
    windowSize = 2
    threshold = 0.5

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    anomalyScores = pandas.Series([0] * length)
    windowStart, windowEnd = windows[0]

    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)

    def scoreCurrentDetections():
      """Score anomalyScores in its current state; return the matching row."""
      _, row = sweeper.scoreDataSet(timestamps, anomalyScores, windows,
                                    "testData", threshold)
      return row

    # Score with a single true positive at start of window
    anomalyScores[timestamps[timestamps == windowStart].index[0]] = 1
    singleRow = scoreCurrentDetections()

    # Add a second true positive to end of window
    anomalyScores[timestamps[timestamps == windowEnd].index[0]] = 1
    doubleRow = scoreCurrentDetections()

    self.assertEqual(singleRow.score, doubleRow.score)

    windowTotal = windowSize * numWindows
    self._checkCounts(singleRow, length - windowTotal, 1, 0, windowTotal - 1)
    self._checkCounts(doubleRow, length - windowTotal, 2, 0, windowTotal - 2)
Beispiel #40
0
  def testRewardLowFalseNegatives(self):
    """
    Score a series with no detections against one label window, once with the
    test's standard cost matrix and once with a copy that uses fnWeight=2.0
    and fpWeight=0.055. The standard score must equal half of the score
    obtained with the modified matrix, and both runs must report equal counts.
    """
    length = 100
    numWindows = 1
    windowSize = 10
    threshold = 0.5

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    anomalyScores = pandas.Series([0] * length)

    lowFNCostMatrix = copy.deepcopy(self.costMatrix)
    lowFNCostMatrix["fnWeight"] = 2.0
    lowFNCostMatrix["fpWeight"] = 0.055

    standardSweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    lowFNSweeper = Sweeper(probationPercent=0, costMatrix=lowFNCostMatrix)

    _, standardRow = standardSweeper.scoreDataSet(
      timestamps, anomalyScores, windows, "testData", threshold)
    _, lowFNRow = lowFNSweeper.scoreDataSet(
      timestamps, anomalyScores, windows, "testData", threshold)

    self.assertEqual(standardRow.score, 0.5 * lowFNRow.score)

    windowTotal = windowSize * numWindows
    self._checkCounts(standardRow, length - windowTotal, 0, 0, windowTotal)
    self._checkCounts(lowFNRow, length - windowTotal, 0, 0, windowTotal)
Beispiel #41
0
  def testNullCase(self):
    """With no windows and no detections the score must be exactly 0.0."""
    length = 10

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    noDetections = pandas.Series([0] * length)
    noLabels = pandas.Series([0] * length)
    noWindows = []

    scorer = Scorer(timestamps, noDetections, noLabels, noWindows,
                    self.costMatrix, probationaryPeriod=0)
    _, score = scorer.getScore()

    self.assertEqual(score, 0.0)
    # Only the first count is expected to be non-zero.
    self._checkCounts(scorer.counts, 10, 0, 0, 0)
  def testTruePositivesWithDifferentWindowSizes(self):
    """
    A single detection on the first timestamp of a window must receive the
    same score whether the window was generated with size 2 or size 3.
    """
    length = 10
    numWindows = 1
    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    threshold = 0.5

    def detectionAtWindowStart(windowSize):
      """Return (windows, anomalyScores) with one detection at window start."""
      windows = generateWindows(timestamps, numWindows, windowSize)
      position = timestamps[timestamps == windows[0][0]].index[0]
      anomalyScores = pandas.Series([0] * length)
      anomalyScores[position] = 1
      return windows, anomalyScores

    windowSize1 = 2
    windows1, anomalyScores1 = detectionAtWindowStart(windowSize1)

    windowSize2 = 3
    windows2, anomalyScores2 = detectionAtWindowStart(windowSize2)

    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    _, row1 = sweeper.scoreDataSet(timestamps, anomalyScores1, windows1,
                                   "testData", threshold)
    _, row2 = sweeper.scoreDataSet(timestamps, anomalyScores2, windows2,
                                   "testData", threshold)

    self.assertEqual(row1.score, row2.score)
    self._checkCounts(row1, length - windowSize1 * numWindows, 1, 0,
                      windowSize1 * numWindows - 1)
    self._checkCounts(row2, length - windowSize2 * numWindows, 1, 0,
                      windowSize2 * numWindows - 1)
    def testEarlierTruePositiveIsBetter(self):
        """
        Of two prediction series that each hit the same window once, the one
        whose detection is on the window's first timestamp must score higher
        than the one whose detection is on the window's last timestamp.
        """
        length = 10
        numWindows = 1
        windowSize = 2

        timestamps = generateTimestamps(datetime.datetime.now(),
                                        datetime.timedelta(minutes=5),
                                        length)
        windows = generateWindows(timestamps, numWindows, windowSize)
        labels = generateLabels(timestamps, windows)
        earlyPredictions = pandas.Series([0] * length)
        latePredictions = pandas.Series([0] * length)
        windowStart, windowEnd = windows[0]

        # Detection on the first timestamp of the window.
        earlyPredictions[timestamps[timestamps == windowStart].index[0]] = 1
        earlyScorer = Scorer(timestamps, earlyPredictions, labels, windows,
                             self.costMatrix, probationaryPeriod=0)
        _, score1 = earlyScorer.getScore()

        # Detection on the last timestamp of the window.
        latePredictions[timestamps[timestamps == windowEnd].index[0]] = 1
        lateScorer = Scorer(timestamps, latePredictions, labels, windows,
                            self.costMatrix, probationaryPeriod=0)
        _, score2 = lateScorer.getScore()

        failureMessage = ("The earlier TP score is not greater than "
                          "the later TP. They are %f and %f, respectively."
                          % (score1, score2))
        self.assertTrue(score1 > score2, failureMessage)

        windowTotal = windowSize * numWindows
        self._checkCounts(earlyScorer.counts, length - windowTotal, 1, 0,
                          windowTotal - 1)
        self._checkCounts(lateScorer.counts, length - windowTotal, 1, 0,
                          windowTotal - 1)
  def testWindowTimestampsNotInDataFileThrowsError(self):
    """
    Constructing a CorpusLabel should raise a ValueError when a label window
    uses timestamps that do not exist in the data file.
    """
    timestamps = generateTimestamps(strp("2014-01-01"), None, 1)
    data = pandas.DataFrame({"timestamp": timestamps})

    # The window lies in 2015 while the data's timestamps start in 2014.
    windows = [["2015-01-01", "2015-01-01"]]

    writeCorpus(self.tempCorpusPath, {"test_data_file.csv": data})
    writeCorpusLabel(self.tempCorpusLabelPath, {"test_data_file.csv": windows})

    corpus = nab.corpus.Corpus(self.tempCorpusPath)

    with self.assertRaises(ValueError):
      nab.labeler.CorpusLabel(self.tempCorpusLabelPath, corpus)
Beispiel #45
0
  def testNullCase(self):
    """With no windows and no detections the score must be exactly 0.0."""
    length = 10

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    noDetections = pandas.Series([0] * length)
    noLabels = pandas.Series([0] * length)
    noWindows = []

    scorer = Scorer(timestamps, noDetections, noLabels, noWindows,
                    self.costMatrix, probationaryPeriod=0)
    _, score = scorer.getScore()

    self.assertEqual(score, 0.0)
    # Only the first count is expected to be non-zero.
    self._checkCounts(scorer.counts, 10, 0, 0, 0)
Beispiel #46
0
  def testWindowTimestampsNotInDataFileThrowsError(self):
    """
    Constructing a CorpusLabel should raise a ValueError when a label window
    uses timestamps that do not exist in the data file.
    """
    timestamps = generateTimestamps(strp("2014-01-01"), None, 1)
    data = pandas.DataFrame({"timestamp": timestamps})

    # The window lies in 2015 while the data's timestamps start in 2014.
    windows = [["2015-01-01", "2015-01-01"]]

    writeCorpus(self.tempCorpusPath, {"test_data_file.csv": data})
    writeCorpusLabel(self.tempCorpusLabelPath, {"test_data_file.csv": windows})

    corpus = nab.corpus.Corpus(self.tempCorpusPath)

    with self.assertRaises(ValueError):
      nab.labeler.CorpusLabel(self.tempCorpusLabelPath, corpus)
    def testTruePositivesWithDifferentWindowSizes(self):
        """
        A single detection on the first timestamp of a window must receive
        the same score whether the window was generated with size 2 or 3.
        """
        length = 10
        numWindows = 1
        timestamps = generateTimestamps(datetime.datetime.now(),
                                        datetime.timedelta(minutes=5),
                                        length)

        def detectionAtWindowStart(windowSize):
            """Return (windows, labels, predictions) for one window size."""
            windows = generateWindows(timestamps, numWindows, windowSize)
            labels = generateLabels(timestamps, windows)
            position = timestamps[timestamps == windows[0][0]].index[0]
            predictions = pandas.Series([0] * length)
            predictions[position] = 1
            return windows, labels, predictions

        windowSize1 = 2
        windows1, labels1, predictions1 = detectionAtWindowStart(windowSize1)

        windowSize2 = 3
        windows2, labels2, predictions2 = detectionAtWindowStart(windowSize2)

        scorer1 = Scorer(timestamps, predictions1, labels1, windows1,
                         self.costMatrix, probationaryPeriod=0)
        _, score1 = scorer1.getScore()
        scorer2 = Scorer(timestamps, predictions2, labels2, windows2,
                         self.costMatrix, probationaryPeriod=0)
        _, score2 = scorer2.getScore()

        self.assertEqual(score1, score2)
        self._checkCounts(scorer1.counts, length - windowSize1 * numWindows,
                          1, 0, windowSize1 * numWindows - 1)
        self._checkCounts(scorer2.counts, length - windowSize2 * numWindows,
                          1, 0, windowSize2 * numWindows - 1)
    def testNullCase(self):
        """With no windows and no detections the score must be exactly 0.0."""
        length = 10
        threshold = 0.5

        timestamps = generateTimestamps(datetime.datetime.now(),
                                        datetime.timedelta(minutes=5),
                                        length)
        noDetections = pandas.Series([0] * length)
        noWindows = []

        sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
        _, matchingRow = sweeper.scoreDataSet(
            timestamps, noDetections, noWindows, "testData", threshold)

        self.assertEqual(matchingRow.score, 0.0)
        # Only the first count is expected to be non-zero.
        self._checkCounts(matchingRow, 10, 0, 0, 0)
Beispiel #49
0
  def testFalsePositiveScaling(self):
    """
    Test scaling the weight of false positives results in an approximate
    balance with the true positives.

    The contributions of TP and FP scores should approximately cancel; i.e.
    total score = 0. With x windows, this total score should on average
    decrease x/2 because of x FNs. Thus, the acceptable range for score should
    be centered about -x/2.
    """
    start = datetime.datetime.now()
    increment = datetime.timedelta(minutes=5)
    length = 100
    numWindows = 1
    windowSize = 10
    threshold = 0.5

    timestamps = generateTimestamps(start, increment, length)
    windows = generateWindows(timestamps, numWindows, windowSize)

    # Scale for 10% = windowSize/length
    # NOTE(review): this mutates the shared self.costMatrix in place;
    # presumably the fixture is rebuilt for every test -- confirm.
    self.costMatrix["fpWeight"] = 0.11
    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)

    # Make arbitrary detections, score, repeat
    scores = []
    for _ in range(20):
      anomalyScores = pandas.Series([0]*length)
      indices = random.sample(range(length), 10)
      anomalyScores[indices] = 1
      # Discard the first element of the result. Binding it to `scores`, as
      # was done before, replaced the accumulator on every iteration, so the
      # average below was not taken over the 20 trial scores.
      (_, matchingRow) = sweeper.scoreDataSet(
        timestamps,
        anomalyScores,
        windows,
        "testData",
        threshold
      )
      scores.append(matchingRow.score)

    avgScore = sum(scores)/float(len(scores))

    self.assertTrue(-1.5 <= avgScore <= 0.5, "The average score across 20 sets "
      "of random detections is %f, which is not within the acceptable range "
      "-1.5 to 0.5." % avgScore)
Beispiel #50
0
    def testFalsePositiveScaling(self):
        """
        Test scaling the weight of false positives results in an approximate
        balance with the true positives.

        The contributions of TP and FP scores should approximately cancel;
        i.e. total score = 0. With x windows, this total score should on
        average decrease x/2 because of x FNs. Thus, the acceptable range for
        score should be centered about -x/2.
        """
        start = datetime.datetime.now()
        increment = datetime.timedelta(minutes=5)
        length = 100
        numWindows = 1
        windowSize = 10

        timestamps = generateTimestamps(start, increment, length)
        windows = generateWindows(timestamps, numWindows, windowSize)
        labels = generateLabels(timestamps, windows)

        # Scale for 10% = windowSize/length
        # NOTE(review): this mutates the shared self.costMatrix in place;
        # presumably the fixture is rebuilt for every test -- confirm.
        self.costMatrix["fpWeight"] = 0.11

        # Make arbitrary detections, score, repeat.
        # range() is used instead of the Python 2-only xrange() so the loop
        # runs on both Python 2 and Python 3.
        scores = []
        for _ in range(20):
            predictions = pandas.Series([0] * length)
            indices = random.sample(range(length), 10)
            predictions[indices] = 1
            scorer = Scorer(timestamps,
                            predictions,
                            labels,
                            windows,
                            self.costMatrix,
                            probationaryPeriod=0)
            (_, score) = scorer.getScore()
            scores.append(score)

        avgScore = sum(scores) / float(len(scores))

        self.assertTrue(
            -1.5 <= avgScore <= 0.5, "The average score across 20 sets "
            "of random detections is %f, which is not within the acceptable range "
            "-1.5 to 0.5." % avgScore)
Beispiel #51
0
  def testEarlierFalsePositiveAfterWindowIsBetter(self):
    """
    A lone false positive on the record right after the window's end must
    score higher than a lone false positive one record later.
    """
    length = 10
    numWindows = 1
    windowSize = 2
    threshold = 0.5

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    earlyScores = pandas.Series([0] * length)
    lateScores = pandas.Series([0] * length)
    _, windowEnd = windows[0]

    # Index of the record that follows the window's end timestamp.
    afterWindow = timestamps[timestamps == windowEnd].index[0] + 1

    earlyScores[afterWindow] = 1
    earlySweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    _, earlyRow = earlySweeper.scoreDataSet(timestamps, earlyScores, windows,
                                            "testData", threshold)

    # A fresh Sweeper is used for the second series.
    lateScores[afterWindow + 1] = 1
    lateSweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    _, lateRow = lateSweeper.scoreDataSet(timestamps, lateScores, windows,
                                          "testData", threshold)

    self.assertTrue(earlyRow.score > lateRow.score)

    expectedTN = length - windowSize * numWindows - 1
    expectedFN = windowSize * numWindows
    self._checkCounts(earlyRow, expectedTN, 0, 1, expectedFN)
    self._checkCounts(lateRow, expectedTN, 0, 1, expectedFN)
    def testOnlyScoreFirstTruePositiveWithinWindow(self):
        """
        Adding a second detection at the end of a window that already has a
        detection at its start must leave the score unchanged, while the
        counts move from one true positive to two.
        """
        length = 10
        numWindows = 1
        windowSize = 2

        timestamps = generateTimestamps(datetime.datetime.now(),
                                        datetime.timedelta(minutes=5),
                                        length)
        windows = generateWindows(timestamps, numWindows, windowSize)
        labels = generateLabels(timestamps, windows)
        predictions = pandas.Series([0] * length)
        windowStart, windowEnd = windows[0]

        # First run: one detection on the window's first timestamp.
        predictions[timestamps[timestamps == windowStart].index[0]] = 1
        singleScorer = Scorer(timestamps, predictions, labels, windows,
                              self.costMatrix, probationaryPeriod=0)
        _, singleScore = singleScorer.getScore()

        # Second run: same series plus a detection on the last timestamp.
        predictions[timestamps[timestamps == windowEnd].index[0]] = 1
        doubleScorer = Scorer(timestamps, predictions, labels, windows,
                              self.costMatrix, probationaryPeriod=0)
        _, doubleScore = doubleScorer.getScore()

        self.assertEqual(singleScore, doubleScore)

        windowTotal = windowSize * numWindows
        self._checkCounts(singleScorer.counts, length - windowTotal, 1, 0,
                          windowTotal - 1)
        self._checkCounts(doubleScorer.counts, length - windowTotal, 2, 0,
                          windowTotal - 2)
Beispiel #53
0
  def test_secondTruePositiveWithinWindowIsIgnored(self):
    """
    Adding a second detection at the end of a window that already has a
    detection at its start must not change the getScore() result; only the
    earlier detection should decide the score.
    """
    length = 10
    numWindows = 1
    windowSize = 2

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    predictions = pandas.Series([0] * length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)
    windowStart, windowEnd = windows[0]

    # Each weight is distinct.
    costMatrix = dict(tpWeight=1.0, fnWeight=2.0, fpWeight=3.0, tnWeight=4.0)

    # First run: one detection on the window's first timestamp.
    predictions[timestamps[timestamps == windowStart].index[0]] = 1
    firstScorer = Scorer(timestamps, predictions, labels, windows, costMatrix,
                         probationaryPeriod=0)
    firstResult = firstScorer.getScore()

    # Second run: same series plus a detection on the window's last timestamp.
    predictions[timestamps[timestamps == windowEnd].index[0]] = 1
    secondScorer = Scorer(timestamps, predictions, labels, windows, costMatrix,
                          probationaryPeriod=0)
    secondResult = secondScorer.getScore()

    self.assertEqual(firstResult, secondResult)
    def testFalsePositiveScaling(self):
        """
        Test scaling the weight of false positives results in an approximate
        balance with the true positives.

        The contributions of TP and FP scores should approximately cancel;
        i.e. total score = 0. With x windows, this total score should on
        average decrease x/2 because of x FNs. Thus, the acceptable range for
        score should be centered about -x/2.

        The test scores 20 independent sets of 10 random detections and
        asserts that the mean of the per-run matching-row scores lies in
        [-1.5, 0.5].
        """
        start = datetime.datetime.now()
        increment = datetime.timedelta(minutes=5)
        length = 100
        numWindows = 1
        windowSize = 10
        threshold = 0.5

        timestamps = generateTimestamps(start, increment, length)
        windows = generateWindows(timestamps, numWindows, windowSize)

        # Scale for 10% = windowSize/length
        self.costMatrix["fpWeight"] = 0.11
        sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)

        # Make arbitrary detections, score, repeat
        scores = []
        for _ in range(20):
            anomalyScores = pandas.Series([0] * length)
            indices = random.sample(list(range(length)), 10)
            anomalyScores[indices] = 1
            # Only the matching row is needed. The first return value is
            # deliberately discarded: binding it to `scores` would replace
            # the accumulator on every iteration and corrupt the average.
            (_ignored,
             matchingRow) = sweeper.scoreDataSet(timestamps, anomalyScores,
                                                 windows, "testData",
                                                 threshold)
            scores.append(matchingRow.score)

        avgScore = sum(scores) / float(len(scores))

        self.assertTrue(
            -1.5 <= avgScore <= 0.5, "The average score across 20 sets "
            "of random detections is %f, which is not within the acceptable range "
            "-1.5 to 0.5." % avgScore)
Beispiel #55
0
  def test_earlierTruePositiveIsBetter(self):
    """
    Of two detectors that each flag exactly one point inside the same window,
    the one whose detection falls on the earlier window timestamp should
    receive the higher score.
    """
    length = 10
    numWindows = 1
    windowSize = 2
    costMatrix = {"tpWeight": 1.0,
                  "fnWeight": 2.0,
                  "fpWeight": 3.0,
                  "tnWeight": 4.0}

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)

    # The two timestamps that make up the single generated window.
    earlier, later = windows[0]

    def scoreForDetectionAt(moment):
      # Score an otherwise-silent detector that fires only at `moment`.
      predictions = pandas.Series([0]*length)
      predictions[timestamps[timestamps == moment].index[0]] = 1
      scorer = Scorer(timestamps, predictions, labels, windows, costMatrix,
        probationaryPeriod=0)
      return scorer.getScore()

    earlyScore = scoreForDetectionAt(earlier)
    lateScore = scoreForDetectionAt(later)

    self.assertTrue(earlyScore > lateScore)
Beispiel #56
0
  def testWindowTimestampsNonChronologicalThrowsError(self):
    """
    Building a CorpusLabel from a label file that contains a window whose
    first timestamp is later than its second should raise a ValueError.
    """
    fileName = "test_data_file.csv"
    timestamps = generateTimestamps(
      strp("2014-01-01"), datetime.timedelta(minutes=5), 10)
    frame = pandas.DataFrame({"timestamp" : timestamps})

    # One window with its endpoints reversed, one in chronological order.
    reversedWindow = ["2014-01-01 00:45", "2014-01-01 00:00"]
    orderedWindow = ["2014-01-01 10:15", "2014-01-01 11:15"]

    writeCorpus(self.tempCorpusPath, {fileName : frame})
    writeCorpusLabel(self.tempCorpusLabelPath,
                     {fileName: [reversedWindow, orderedWindow]})

    corpus = nab.corpus.Corpus(self.tempCorpusPath)

    with self.assertRaises(ValueError):
      nab.labeler.CorpusLabel(self.tempCorpusLabelPath, corpus)
Beispiel #57
0
  def testRedundantTimestampsRaiseException(self):
    """
    LabelCombiner.combine is expected to raise a ValueError for the raw label
    file written by this test (per the test name, because of redundant
    timestamps).
    """
    dataFileName = "test_data_file.csv"
    dailyTimestamps = generateTimestamps(
      strp("2015-01-01"), datetime.timedelta(days=1), 365)
    writeCorpus(
      self.tempCorpusPath,
      {dataFileName : pandas.DataFrame({"timestamp" : dailyTimestamps})})

    # The raw labels go into a "raw" subdirectory next to the label file.
    rawLabelPath = self.tempCorpusLabelPath.replace(
      "/label.json", "/raw/label.json")
    rawLabels = ["2015-12-25 00:00:00",
                 "2015-12-26 00:00:00",
                 "2015-12-31 00:00:00"]
    writeCorpusLabel(rawLabelPath, {dataFileName: rawLabels})

    corpus = nab.corpus.Corpus(self.tempCorpusPath)

    # Drop the file name so the combiner receives the containing directory.
    rawLabelDir = rawLabelPath.replace("/label.json", "")
    combiner = nab.labeler.LabelCombiner(
      rawLabelDir, corpus, 0.5, 0.10, 0.15, 0)

    with self.assertRaises(ValueError):
      combiner.combine()
  def testWindowTimestampsNonChronologicalThrowsError(self):
    """
    Constructing a CorpusLabel must fail with a ValueError when any label
    window lists a first timestamp that is later than its second.
    """
    start = strp("2014-01-01")
    step = datetime.timedelta(minutes=5)
    data = pandas.DataFrame({"timestamp" : generateTimestamps(start, step, 10)})

    # First window is deliberately reversed; the second is in order.
    windows = [
      ["2014-01-01 00:45", "2014-01-01 00:00"],
      ["2014-01-01 10:15", "2014-01-01 11:15"],
    ]

    corpusFiles = {"test_data_file.csv" : data}
    writeCorpus(self.tempCorpusPath, corpusFiles)

    labelFiles = {"test_data_file.csv": windows}
    writeCorpusLabel(self.tempCorpusLabelPath, labelFiles)

    corpus = nab.corpus.Corpus(self.tempCorpusPath)

    labelPath = self.tempCorpusLabelPath
    self.assertRaises(ValueError, nab.labeler.CorpusLabel, labelPath, corpus)