def testFourFalseNegatives(self):
    """
    Four label windows with no detections at all should cost four times the
    false-negative weight (checked to within 0.01).
    """
    length = 2000
    numWindows = 4
    windowSize = 10

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)
    # Nothing is ever flagged, so every window goes undetected.
    noDetections = pandas.Series([0] * length)

    scorer = Scorer(timestamps, noDetections, labels, windows,
                    self.costMatrix, probationaryPeriod=0)
    _, score = scorer.getScore()

    fnPenalty = 4 * self.costMatrix['fnWeight']
    self.assertTrue(abs(score + fnPenalty) < 0.01)

    # Count arguments are presumably (tn, tp, fp, fn) -- verify against
    # _checkCounts.
    windowRecords = windowSize * numWindows
    self._checkCounts(scorer.counts, length - windowRecords, 0, 0,
                      windowRecords)
def testRowsLabeledAnomalousWithinAWindow(self):
    """
    All timestamps labeled as anomalous should be within a label window.

    A ten-row corpus file and a single label window are written to the
    temporary corpus/label paths, loaded back through Corpus and CorpusLabel,
    and every row whose label equals 1 is checked against the windows of its
    own file.
    """
    data = pandas.DataFrame({
        "timestamp": generateTimestamps(strp("2014-01-01"),
                                        datetime.timedelta(minutes=5), 10)})
    windows = [["2014-01-01 00:15", "2014-01-01 00:30"]]

    writeCorpus(self.tempCorpusPath, {"test_data_file.csv": data})
    writeCorpusLabel(self.tempCorpusLabelPath,
                     {"test_data_file.csv": windows})
    corpus = nab.corpus.Corpus(self.tempCorpusPath)
    corpusLabel = nab.labeler.CorpusLabel(self.tempCorpusLabelPath, corpus)

    for relativePath, lab in corpusLabel.labels.iteritems():
        windows = corpusLabel.windows[relativePath]
        for _, row in lab[lab["label"] == 1].iterrows():
            timestamp = row["timestamp"]
            # A labeled row only has to fall inside ONE of the file's windows.
            # Requiring all of them would fail for any file with more than
            # one window.
            self.assertTrue(
                any(w[0] <= timestamp <= w[1] for w in windows),
                "The label at %s of file %s is not within a label window"
                % (timestamp, relativePath))
def testFalsePositiveMeansNegativeScore(self):
    """
    A lone false positive must drive the overall score below zero.

    The only detection is placed at index 0 of a 1000-record series that has
    one label window.
    """
    length = 1000
    numWindows = 1
    windowSize = 10

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)

    detections = pandas.Series([0] * length)
    detections[0] = 1

    scorer = Scorer(timestamps, detections, labels, windows,
                    self.costMatrix, probationaryPeriod=0)
    _, score = scorer.getScore()
    self.assertTrue(score < 0)

    windowRecords = windowSize * numWindows
    self._checkCounts(scorer.counts, length - windowRecords - 1, 0, 1,
                      windowRecords)
def testFourFalseNegatives(self):
    """
    With four windows and anomaly scores that are all zero (threshold 1), the
    matching row's score should equal four times the negated false-negative
    weight.
    """
    length = 2000
    numWindows = 4
    windowSize = 10
    threshold = 1

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    allZeroScores = pandas.Series([0] * length)

    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    # Only the row matching the threshold is inspected; the full score list
    # is discarded.
    _, matchingRow = sweeper.scoreDataSet(timestamps, allZeroScores, windows,
                                          "testData", threshold)

    self.assertEqual(matchingRow.score, 4 * -self.costMatrix["fnWeight"])

    windowRecords = windowSize * numWindows
    self._checkCounts(matchingRow, length - windowRecords, 0, 0,
                      windowRecords)
def testTwoFalsePositivesIsWorseThanOne(self):
    """
    Adding a second false positive to a file must lower the score compared
    with the same file containing only the first one.
    """
    length = 1000
    numWindows = 1
    windowSize = 10

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)
    detections = pandas.Series([0] * length)

    # First run: a single detection at index 0.
    detections[0] = 1
    scorerOneFp = Scorer(timestamps, detections, labels, windows,
                         self.costMatrix, probationaryPeriod=0)
    _, scoreOneFp = scorerOneFp.getScore()

    # Second run: the same series with an extra detection at index 1.
    detections[1] = 1
    scorerTwoFp = Scorer(timestamps, detections, labels, windows,
                         self.costMatrix, probationaryPeriod=0)
    _, scoreTwoFp = scorerTwoFp.getScore()

    self.assertTrue(scoreTwoFp < scoreOneFp)

    windowRecords = windowSize * numWindows
    self._checkCounts(scorerOneFp.counts, length - windowRecords - 1, 0, 1,
                      windowRecords)
    self._checkCounts(scorerTwoFp.counts, length - windowRecords - 2, 0, 2,
                      windowRecords)
def testFirstTruePositiveWithinWindow(self):
    """
    First record within window has a score approximately equal to
    self.costMatrix["tpWeight"]; within 4 decimal places is more than enough
    precision.
    """
    start = datetime.datetime.now()
    increment = datetime.timedelta(minutes=5)
    length = 10
    numWindows = 1
    windowSize = 2

    timestamps = generateTimestamps(start, increment, length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)
    predictions = pandas.Series([0] * length)

    # Flag the record whose timestamp equals the first element of the window.
    index = timestamps[timestamps == windows[0][0]].index[0]
    predictions[index] = 1

    scorer = Scorer(timestamps, predictions, labels, windows,
                    self.costMatrix, probationaryPeriod=0)
    (_, score) = scorer.getScore()

    # assertAlmostEqual replaces the deprecated assertAlmostEquals alias,
    # which no longer exists in Python 3.12+.
    self.assertAlmostEqual(score, self.costMatrix["tpWeight"], 4)
    self._checkCounts(scorer.counts, length - windowSize * numWindows, 1, 0,
                      windowSize * numWindows - 1)
def testNonexistentDatafileOrLabelsThrowsError(self):
    """
    Constructing a CorpusLabel should raise KeyError whenever data files and
    window labels do not correspond, whichever side has the unmatched entry.
    """
    data = pandas.DataFrame({
        "timestamp": generateTimestamps(strp("2014-01-01"),
                                        datetime.timedelta(minutes=5), 10)})
    windows = [["2014-01-01 00:15", "2014-01-01 00:30"]]

    # Case 1: the labels mention a data file that is absent from the corpus.
    writeCorpus(self.tempCorpusPath, {"test_data_file.csv": data})
    writeCorpusLabel(self.tempCorpusLabelPath,
                     {"test_data_file.csv": windows,
                      "non_existent_data_file.csv": windows})
    corpus = nab.corpus.Corpus(self.tempCorpusPath)
    with self.assertRaises(KeyError):
        nab.labeler.CorpusLabel(self.tempCorpusLabelPath, corpus)

    # Case 2: the corpus holds a data file that has no window labels.
    writeCorpus(self.tempCorpusPath,
                {"test_data_file.csv": data,
                 "non_existent_data_file.csv": data})
    writeCorpusLabel(self.tempCorpusLabelPath,
                     {"test_data_file.csv": windows})
    corpus = nab.corpus.Corpus(self.tempCorpusPath)
    with self.assertRaises(KeyError):
        nab.labeler.CorpusLabel(self.tempCorpusLabelPath, corpus)
def testFalsePositiveMeansNegativeScore(self):
    """
    A single false positive should leave the matching row with a negative
    score.

    The only non-zero anomaly score sits at index 0 and the sweep is read at
    a threshold of 0.5.
    """
    length = 1000
    numWindows = 1
    windowSize = 10
    threshold = 0.5

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    windows = generateWindows(timestamps, numWindows, windowSize)

    anomalyScores = pandas.Series([0] * length)
    anomalyScores[0] = 1

    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    _, matchingRow = sweeper.scoreDataSet(timestamps, anomalyScores, windows,
                                          "testData", threshold)

    self.assertTrue(matchingRow.score < 0)

    windowRecords = windowSize * numWindows
    self._checkCounts(matchingRow, length - windowRecords - 1, 0, 1,
                      windowRecords)
def testOneFalsePositiveNoWindow(self):
    """
    With zero windows requested, one false positive should yield a score of
    exactly the negated false-positive weight.
    """
    length = 1000
    numWindows = 0
    windowSize = 10
    threshold = 0.5

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    windows = generateWindows(timestamps, numWindows, windowSize)

    anomalyScores = pandas.Series([0] * length)
    anomalyScores[0] = 1

    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    _, matchingRow = sweeper.scoreDataSet(timestamps, anomalyScores, windows,
                                          "testData", threshold)

    self.assertEqual(matchingRow.score, -self.costMatrix["fpWeight"])

    # numWindows is 0, so this product is 0 as well.
    windowRecords = windowSize * numWindows
    self._checkCounts(matchingRow, length - windowRecords - 1, 0, 1,
                      windowRecords)
def testEarlierFalsePositiveAfterWindowIsBetter(self):
    """
    Of two false positives placed after the window, the earlier one should
    produce the higher score.

    Run 1 flags the record one index past windows[0][1]; run 2 flags the
    record after that.
    """
    length = 10
    numWindows = 1
    windowSize = 2
    threshold = 0.5

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    earlierScores = pandas.Series([0] * length)
    laterScores = pandas.Series([0] * length)

    _, windowEnd = windows[0]
    earlierIndex = timestamps[timestamps == windowEnd].index[0] + 1

    earlierScores[earlierIndex] = 1
    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    _, matchingRow1 = sweeper.scoreDataSet(timestamps, earlierScores, windows,
                                           "testData", threshold)

    laterScores[earlierIndex + 1] = 1
    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    _, matchingRow2 = sweeper.scoreDataSet(timestamps, laterScores, windows,
                                           "testData", threshold)

    self.assertTrue(matchingRow1.score > matchingRow2.score)

    windowRecords = windowSize * numWindows
    for row in (matchingRow1, matchingRow2):
        self._checkCounts(row, length - windowRecords - 1, 0, 1,
                          windowRecords)
def testTwoFalsePositivesIsWorseThanOne(self):
    """
    A file with false positives A and B must score lower than the same file
    with only A.
    """
    length = 1000
    numWindows = 1
    windowSize = 10
    threshold = 0.5

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    anomalyScores = pandas.Series([0] * length)

    # Run 1: one detection at index 0.
    anomalyScores[0] = 1
    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    _, rowOneFp = sweeper.scoreDataSet(timestamps, anomalyScores, windows,
                                       "testData", threshold)

    # Run 2: a fresh sweeper scores the series with index 1 flagged as well.
    anomalyScores[1] = 1
    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    _, rowTwoFp = sweeper.scoreDataSet(timestamps, anomalyScores, windows,
                                       "testData", threshold)

    self.assertTrue(rowTwoFp.score < rowOneFp.score)

    windowRecords = windowSize * numWindows
    self._checkCounts(rowOneFp, length - windowRecords - 1, 0, 1,
                      windowRecords)
    self._checkCounts(rowTwoFp, length - windowRecords - 2, 0, 2,
                      windowRecords)
def testGetLabels(self):
    """
    Labels dictionary generated by CorpusLabel.getLabels() should match the
    label windows: every timestamp lying inside a window must carry label 1.
    """
    data = pandas.DataFrame({
        "timestamp": generateTimestamps(strp("2014-01-01"),
                                        datetime.timedelta(minutes=5), 10)})
    windows = [["2014-01-01 00:00", "2014-01-01 00:10"],
               ["2014-01-01 00:10", "2014-01-01 00:15"]]

    writeCorpus(self.tempCorpusPath, {"test_data_file.csv": data})
    writeCorpusLabel(self.tempCorpusLabelPath,
                     {"test_data_file.csv": windows})
    corpus = nab.corpus.Corpus(self.tempCorpusPath)
    corpusLabel = nab.labeler.CorpusLabel(self.tempCorpusLabelPath, corpus)

    for relativePath, fileLabels in corpusLabel.labels.iteritems():
        windows = corpusLabel.windows[relativePath]

        # Check the labels of the file currently being iterated, paired with
        # that file's own windows, instead of re-reading one hard-coded file
        # on every pass.
        for t, lab in fileLabels.values:
            for w in windows:
                if w[0] <= t <= w[1]:
                    self.assertEqual(
                        lab, 1,
                        "Incorrect label value for timestamp %r" % t)
def testOneFalsePositiveNoWindow(self):
    """
    Even when no window is generated, a false positive must be penalized:
    the matching row's score should equal minus the false-positive weight.
    """
    threshold = 0.5
    length = 1000
    numWindows = 0
    windowSize = 10

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    windows = generateWindows(timestamps, numWindows, windowSize)

    # The only non-zero anomaly score is the very first record.
    anomalyScores = pandas.Series([0] * length)
    anomalyScores[0] = 1

    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    result = sweeper.scoreDataSet(timestamps, anomalyScores, windows,
                                  "testData", threshold)
    (_, matchingRow) = result

    self.assertEqual(matchingRow.score, -self.costMatrix["fpWeight"])

    inWindow = windowSize * numWindows
    self._checkCounts(matchingRow, length - inWindow - 1, 0, 1, inWindow)
def testScoringAllMetrics(self):
    """
    This tests an example set of detections, where all metrics have
    counts > 0.
    """
    start = datetime.datetime.now()
    increment = datetime.timedelta(minutes=5)
    length = 100
    numWindows = 2
    windowSize = 5
    threshold = 0.5

    timestamps = generateTimestamps(start, increment, length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    anomalyScores = pandas.Series([0] * length)

    index = timestamps[timestamps == windows[0][0]].index[0]

    # TP, add'l TP, and FP
    anomalyScores[index] = 1
    anomalyScores[index + 1] = 1
    anomalyScores[index + 7] = 1

    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    (scores, matchingRow) = sweeper.scoreDataSet(
        timestamps, anomalyScores, windows, "testData", threshold)

    # assertAlmostEqual replaces the deprecated assertAlmostEquals alias,
    # which no longer exists in Python 3.12+.
    self.assertAlmostEqual(matchingRow.score, -0.9540, 4)
    self._checkCounts(matchingRow, length - windowSize * numWindows - 1,
                      2, 1, 8)
def test_FourFalseNegatives(self):
    """
    Four windows with no detections should score four times the negated
    false-negative weight (to within 0.01), and the confusion counts should
    show every window record as a false negative.
    """
    length = 2000
    numWindows = 4
    windowSize = 10

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    predictions = pandas.Series([0] * length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)

    costMatrix = {"tpWeight": 1.0,
                  "fnWeight": 2.0,
                  "fpWeight": 3.0,
                  "tnWeight": 4.0}

    scorer = Scorer(timestamps, predictions, labels, windows, costMatrix,
                    probationaryPeriod=0)
    score = scorer.getScore()
    self.assertTrue(abs(score + 4 * costMatrix['fnWeight']) < 0.01)

    # Ensure counts are correct.
    windowRecords = windowSize * numWindows
    expectedCounts = (('tn', length - windowRecords),
                      ('tp', 0),
                      ('fp', 0),
                      ('fn', windowRecords))
    for key, expected in expectedCounts:
        self.assertEqual(scorer.counts[key], expected)
def test_firstTruePositiveWithinWindow(self):
    """
    A detection on the first record of the window should score close to
    costMatrix["tpWeight"]; the check only requires the shortfall to be at
    most 1.
    """
    length = 10
    numWindows = 1
    windowSize = 2

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    predictions = pandas.Series([0] * length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)

    costMatrix = {"tpWeight": 1.0,
                  "fnWeight": 2.0,
                  "fpWeight": 3.0,
                  "tnWeight": 4.0}

    # Flag the record whose timestamp equals the first element of the window.
    firstInWindow = timestamps[timestamps == windows[0][0]].index[0]
    predictions[firstInWindow] = 1

    scorer = Scorer(timestamps, predictions, labels, windows, costMatrix,
                    probationaryPeriod=0)

    shortfall = costMatrix["tpWeight"] - scorer.getScore()
    self.assertTrue(shortfall <= 1)
def test_oneFalsePositiveNoWindow(self):
    """
    With zero windows requested, a single false positive should give a score
    equal to minus the false-positive weight.
    """
    length = 1000
    numWindows = 0
    windowSize = 10

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    predictions = pandas.Series([0] * length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)

    costMatrix = {"tpWeight": 1.0,
                  "fnWeight": 1.0,
                  "fpWeight": 1.0,
                  "tnWeight": 1.0}

    predictions[0] = 1
    scorer = Scorer(timestamps, predictions, labels, windows, costMatrix,
                    probationaryPeriod=0)

    self.assertTrue(scorer.getScore() == -costMatrix["fpWeight"])

    # Ensure counts are correct.
    windowRecords = windowSize * numWindows
    expectedCounts = (('tn', length - windowRecords - 1),
                      ('tp', 0),
                      ('fp', 1),
                      ('fn', windowRecords))
    for key, expected in expectedCounts:
        self.assertEqual(scorer.counts[key], expected)
def test_falsePositiveMeansNegativeScore(self):
    """
    One false positive (a detection at index 0) should make the score
    negative.
    """
    length = 1000
    numWindows = 1
    windowSize = 10

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    predictions = pandas.Series([0] * length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)

    costMatrix = {"tpWeight": 1.0,
                  "fnWeight": 1.0,
                  "fpWeight": 1.0,
                  "tnWeight": 1.0}

    predictions[0] = 1
    scorer = Scorer(timestamps, predictions, labels, windows, costMatrix,
                    probationaryPeriod=0)

    score = scorer.getScore()
    self.assertTrue(score < 0)

    # Ensure counts are correct.
    windowRecords = windowSize * numWindows
    expectedCounts = (('tn', length - windowRecords - 1),
                      ('tp', 0),
                      ('fp', 1),
                      ('fn', windowRecords))
    for key, expected in expectedCounts:
        self.assertEqual(scorer.counts[key], expected)
def testBucketMerge(self):
    """
    Combining three raw label files should bucket and merge their timestamps
    into the two expected labels, 2015-12-25 and 2015-12-31.

    Each raw label list is written to its own raw/label<i>.json file next to
    the temporary label path, and LabelCombiner is then run over that
    directory.
    """
    dataFileName = "test_data_file.csv"
    data = pandas.DataFrame({
        "timestamp": generateTimestamps(strp("2015-12-01"),
                                        datetime.timedelta(days=1), 31)})
    writeCorpus(self.tempCorpusPath, {dataFileName: data})

    rawLabels = (
        ["2015-12-24 00:00:00", "2015-12-31 00:00:00"],
        ["2015-12-01 00:00:00", "2015-12-25 00:00:00",
         "2015-12-31 00:00:00"],
        ["2015-12-25 00:00:00"],
    )

    sep = os.path.sep
    for i, labels in enumerate(rawLabels):
        labelFile = sep + "label{}.json".format(i)
        labelsPath = self.tempCorpusLabelPath.replace(
            sep + "label.json", sep + "raw" + labelFile)
        writeCorpusLabel(labelsPath, {"test_data_file.csv": labels})
        # Dropping the file name leaves the directory holding the raw labels.
        labelsDir = labelsPath.replace(labelFile, "")

    corpus = nab.corpus.Corpus(self.tempCorpusPath)
    labelCombiner = nab.labeler.LabelCombiner(
        labelsDir, corpus, 0.5, 0.10, 0.15, 0)
    labelCombiner.getRawLabels()
    labelTimestamps, _ = labelCombiner.combineLabels()

    expectedLabels = ['2015-12-25 00:00:00', '2015-12-31 00:00:00']
    self.assertEqual(
        expectedLabels, labelTimestamps[dataFileName],
        "The combined labels did not bucket and merge as expected.")
def testScoringAllMetrics(self):
    """
    This tests an example set of detections, where all metrics have
    counts > 0.
    """
    start = datetime.datetime.now()
    increment = datetime.timedelta(minutes=5)
    length = 100
    numWindows = 2
    windowSize = 5
    threshold = 0.5

    timestamps = generateTimestamps(start, increment, length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    anomalyScores = pandas.Series([0] * length)

    index = timestamps[timestamps == windows[0][0]].index[0]

    # TP, add'l TP, and FP
    anomalyScores[index] = 1
    anomalyScores[index + 1] = 1
    anomalyScores[index + 7] = 1

    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    (scores, matchingRow) = sweeper.scoreDataSet(timestamps, anomalyScores,
                                                 windows, "testData",
                                                 threshold)

    # Use assertAlmostEqual: the assertAlmostEquals spelling is a deprecated
    # alias that was removed in Python 3.12.
    self.assertAlmostEqual(matchingRow.score, -0.9540, 4)
    self._checkCounts(matchingRow, length - windowSize * numWindows - 1,
                      2, 1, 8)
def testOnlyScoreFirstTruePositiveWithinWindow(self):
    """
    Several detections inside one window should be scored as if only the
    earliest were present: adding a second detection must not change the
    score.
    """
    length = 10
    numWindows = 1
    windowSize = 2

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)
    predictions = pandas.Series([0] * length)

    windowFirst, windowLast = windows[0]

    # Run 1: one detection, at the window's first timestamp.
    firstIndex = timestamps[timestamps == windowFirst].index[0]
    predictions[firstIndex] = 1
    scorer1 = Scorer(timestamps, predictions, labels, windows,
                     self.costMatrix, probationaryPeriod=0)
    _, score1 = scorer1.getScore()

    # Run 2: additionally detect at the window's last timestamp.
    lastIndex = timestamps[timestamps == windowLast].index[0]
    predictions[lastIndex] = 1
    scorer2 = Scorer(timestamps, predictions, labels, windows,
                     self.costMatrix, probationaryPeriod=0)
    _, score2 = scorer2.getScore()

    self.assertEqual(score1, score2)

    windowRecords = windowSize * numWindows
    self._checkCounts(scorer1.counts, length - windowRecords, 1, 0,
                      windowRecords - 1)
    self._checkCounts(scorer2.counts, length - windowRecords, 2, 0,
                      windowRecords - 2)
def testRewardLowFalsePositives(self):
    """
    Compare the standard cost matrix with a "reward low false positives"
    variant on a file containing one false positive and no windows.

    The variant is a copy of self.costMatrix with fpWeight set to 2.0 and
    fnWeight set to 0.5; the standard score is expected to be exactly half
    the variant's score.
    """
    length = 100
    numWindows = 0
    windowSize = 10

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    windows = []
    labels = generateLabels(timestamps, windows)
    predictions = pandas.Series([0] * length)

    # Work on a deep copy so self.costMatrix itself stays untouched.
    costMatrixFP = copy.deepcopy(self.costMatrix)
    costMatrixFP["fpWeight"] = 2.0
    costMatrixFP["fnWeight"] = 0.5

    # FP
    predictions[0] = 1

    standardScorer = Scorer(timestamps, predictions, labels, windows,
                            self.costMatrix, probationaryPeriod=0)
    _, standardScore = standardScorer.getScore()

    lowFpScorer = Scorer(timestamps, predictions, labels, windows,
                         costMatrixFP, probationaryPeriod=0)
    _, lowFpScore = lowFpScorer.getScore()

    self.assertEqual(standardScore, 0.5 * lowFpScore)

    expectedFirstCount = length - windowSize * numWindows - 1
    self._checkCounts(standardScorer.counts, expectedFirstCount, 0, 1, 0)
    self._checkCounts(lowFpScorer.counts, expectedFirstCount, 0, 1, 0)
def testOneFalsePositiveNoWindow(self):
    """
    With zero windows requested, a false positive should still be penalized:
    the score must equal minus the false-positive weight.
    """
    length = 1000
    numWindows = 0
    windowSize = 10

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)

    detections = pandas.Series([0] * length)
    detections[0] = 1

    scorer = Scorer(timestamps, detections, labels, windows,
                    self.costMatrix, probationaryPeriod=0)
    _, score = scorer.getScore()

    self.assertTrue(score == -self.costMatrix["fpWeight"])

    # numWindows is 0, so this product is 0 as well.
    windowRecords = windowSize * numWindows
    self._checkCounts(scorer.counts, length - windowRecords - 1, 0, 1,
                      windowRecords)
def testFirstTruePositiveWithinWindow(self):
    """
    First record within window has a score approximately equal to
    self.costMatrix["tpWeight"]; within 4 decimal places is more than enough
    precision.
    """
    start = datetime.datetime.now()
    increment = datetime.timedelta(minutes=5)
    length = 10
    numWindows = 1
    windowSize = 2

    timestamps = generateTimestamps(start, increment, length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)
    predictions = pandas.Series([0] * length)

    # Flag the record whose timestamp equals the first element of the window.
    index = timestamps[timestamps == windows[0][0]].index[0]
    predictions[index] = 1

    scorer = Scorer(timestamps, predictions, labels, windows,
                    self.costMatrix, probationaryPeriod=0)
    (_, score) = scorer.getScore()

    # Use assertAlmostEqual: the assertAlmostEquals spelling is a deprecated
    # alias that was removed in Python 3.12.
    self.assertAlmostEqual(score, self.costMatrix["tpWeight"], 4)
    self._checkCounts(scorer.counts, length - windowSize * numWindows, 1, 0,
                      windowSize * numWindows - 1)
def testFirstTruePositiveWithinWindow(self):
    """
    A single detection on the first record of the window should give the
    matching row a score equal to self.costMatrix["tpWeight"].
    """
    length = 10
    numWindows = 1
    windowSize = 2
    threshold = 0.5

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    anomalyScores = pandas.Series([0] * length)

    # Set a single true positive
    firstInWindow = timestamps[timestamps == windows[0][0]].index[0]
    anomalyScores[firstInWindow] = 1.0

    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    _, matchingRow = sweeper.scoreDataSet(timestamps, anomalyScores, windows,
                                          "testData", threshold)

    self.assertEqual(matchingRow.score, self.costMatrix["tpWeight"])

    windowRecords = windowSize * numWindows
    self._checkCounts(matchingRow, length - windowRecords, 1, 0,
                      windowRecords - 1)
def testScoringAllMetrics(self):
    """
    This tests an example set of detections, where all metrics have
    counts > 0.
    """
    start = datetime.datetime.now()
    increment = datetime.timedelta(minutes=5)
    length = 100
    numWindows = 2
    windowSize = 5

    timestamps = generateTimestamps(start, increment, length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)
    predictions = pandas.Series([0] * length)

    index = timestamps[timestamps == windows[0][0]].index[0]

    # TP, add'l TP, and FP
    predictions[index] = 1
    predictions[index + 1] = 1
    predictions[index + 7] = 1

    scorer = Scorer(timestamps, predictions, labels, windows,
                    self.costMatrix, probationaryPeriod=0)
    (_, score) = scorer.getScore()

    # assertAlmostEqual replaces the deprecated assertAlmostEquals alias,
    # which no longer exists in Python 3.12+.
    self.assertAlmostEqual(score, -0.9540, 4)
    self._checkCounts(scorer.counts, length - windowSize * numWindows - 1,
                      2, 1, 8)
def testEarlierFalsePositiveAfterWindowIsBetter(self):
    """
    Of two false positives placed after the window, the earlier one should
    produce the higher score.

    Run 1 flags the record one index past windows[0][1]; run 2 flags the
    record after that.
    """
    length = 10
    numWindows = 1
    windowSize = 2

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)
    earlierPredictions = pandas.Series([0] * length)
    laterPredictions = pandas.Series([0] * length)

    _, windowEnd = windows[0]
    earlierIndex = timestamps[timestamps == windowEnd].index[0] + 1

    earlierPredictions[earlierIndex] = 1
    scorer1 = Scorer(timestamps, earlierPredictions, labels, windows,
                     self.costMatrix, probationaryPeriod=0)
    _, score1 = scorer1.getScore()

    laterPredictions[earlierIndex + 1] = 1
    scorer2 = Scorer(timestamps, laterPredictions, labels, windows,
                     self.costMatrix, probationaryPeriod=0)
    _, score2 = scorer2.getScore()

    self.assertTrue(score1 > score2)

    windowRecords = windowSize * numWindows
    self._checkCounts(scorer1.counts, length - windowRecords - 1, 0, 1,
                      windowRecords)
    self._checkCounts(scorer2.counts, length - windowRecords - 1, 0, 1,
                      windowRecords)
def testScoringAllMetrics(self):
    """
    This tests an example set of detections, where all metrics have
    counts > 0.
    """
    start = datetime.datetime.now()
    increment = datetime.timedelta(minutes=5)
    length = 100
    numWindows = 2
    windowSize = 5

    timestamps = generateTimestamps(start, increment, length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)
    predictions = pandas.Series([0] * length)

    index = timestamps[timestamps == windows[0][0]].index[0]

    # TP, add'l TP, and FP
    predictions[index] = 1
    predictions[index + 1] = 1
    predictions[index + 7] = 1

    scorer = Scorer(timestamps, predictions, labels, windows,
                    self.costMatrix, probationaryPeriod=0)
    (_, score) = scorer.getScore()

    # Use assertAlmostEqual: the assertAlmostEquals spelling is a deprecated
    # alias that was removed in Python 3.12.
    self.assertAlmostEqual(score, -0.9540, 4)
    self._checkCounts(scorer.counts, length - windowSize * numWindows - 1,
                      2, 1, 8)
def test_earlierFalsePositiveAfterWindowIsBetter(self):
    """
    Given two false positives A and B after a window, with A earlier than B,
    the file containing only A should score higher than the file containing
    only B.
    """
    length = 10
    numWindows = 1
    windowSize = 2

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    predictions1 = pandas.Series([0] * length)
    predictions2 = pandas.Series([0] * length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)

    _, windowEnd = windows[0]

    costMatrix = {"tpWeight": 1.0,
                  "fnWeight": 1.0,
                  "fpWeight": 1.0,
                  "tnWeight": 1.0}

    # A: one index past the record matching the window's second bound.
    indexA = timestamps[timestamps == windowEnd].index[0] + 1
    predictions1[indexA] = 1
    scorer1 = Scorer(timestamps, predictions1, labels, windows, costMatrix,
                     probationaryPeriod=0)
    score1 = scorer1.getScore()

    # B: the record right after A, in an otherwise empty series.
    predictions2[indexA + 1] = 1
    scorer2 = Scorer(timestamps, predictions2, labels, windows, costMatrix,
                     probationaryPeriod=0)
    score2 = scorer2.getScore()

    self.assertTrue(score1 > score2)

    # Ensure counts are correct.
    windowRecords = windowSize * numWindows
    expectedCounts = (('tn', length - windowRecords - 1),
                      ('tp', 0),
                      ('fp', 1),
                      ('fn', windowRecords))
    for scorer in (scorer1, scorer2):
        for key, expected in expectedCounts:
            self.assertEqual(scorer.counts[key], expected)
def test_twoFalsePositivesIsWorseThanOne(self):
    """
    False positives accumulate: a file with false positives A and B should
    score lower than the same file with only A.
    """
    length = 1000
    numWindows = 1
    windowSize = 10

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    predictions = pandas.Series([0] * length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)

    costMatrix = {"tpWeight": 1.0,
                  "fnWeight": 1.0,
                  "fpWeight": 1.0,
                  "tnWeight": 1.0}

    # Run 1: only A (index 0) is flagged.
    predictions[0] = 1
    scorer1 = Scorer(timestamps, predictions, labels, windows, costMatrix,
                     probationaryPeriod=0)
    score1 = scorer1.getScore()

    # Run 2: B (index 1) is flagged in addition to A.
    predictions[1] = 1
    scorer2 = Scorer(timestamps, predictions, labels, windows, costMatrix,
                     probationaryPeriod=0)
    score2 = scorer2.getScore()

    self.assertTrue(score1 > score2)

    # Ensure counts are correct.
    windowRecords = windowSize * numWindows
    for scorer, falsePositives in ((scorer1, 1), (scorer2, 2)):
        self.assertEqual(scorer.counts['tn'],
                         length - windowRecords - falsePositives)
        self.assertEqual(scorer.counts['tp'], 0)
        self.assertEqual(scorer.counts['fp'], falsePositives)
        self.assertEqual(scorer.counts['fn'], windowRecords)
def testNonexistentDatafileForLabelsThrowsError(self):
    """
    Constructing a CorpusLabel should raise KeyError when the label file
    names a data file that was never written to the corpus.
    """
    timestampColumn = generateTimestamps(strp("2014-01-01"),
                                         datetime.timedelta(minutes=5), 10)
    data = pandas.DataFrame({"timestamp": timestampColumn})
    windows = [["2014-01-01 00:15", "2014-01-01 00:30"]]

    # The corpus holds one file, while the labels reference two.
    corpusFiles = {"test_data_file.csv": data}
    labelEntries = {"test_data_file.csv": windows,
                    "non_existent_data_file.csv": windows}
    writeCorpus(self.tempCorpusPath, corpusFiles)
    writeCorpusLabel(self.tempCorpusLabelPath, labelEntries)

    corpus = nab.corpus.Corpus(self.tempCorpusPath)

    self.assertRaises(KeyError, nab.labeler.CorpusLabel,
                      self.tempCorpusLabelPath, corpus)
def testTruePositiveAtRightEdgeOfWindow(self):
    """
    A true positive at the right edge of a window should score approximately
    zero: the scaled sigmoid scoring function crosses zero between the
    window's last timestamp and the timestamp immediately following the
    window.
    """
    length = 1000
    numWindows = 1
    windowSize = 100
    threshold = 0.5

    timestamps = generateTimestamps(datetime.datetime.now(),
                                    datetime.timedelta(minutes=5),
                                    length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    anomalyScores = pandas.Series([0] * length)

    # Make prediction at end of the window; TP
    edgeIndex = timestamps[timestamps == windows[0][1]].index[0]
    anomalyScores[edgeIndex] = 1
    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    _, matchingRow1 = sweeper.scoreDataSet(timestamps, anomalyScores, windows,
                                           "testData", threshold)

    # Make prediction just after the window; FP
    anomalyScores[edgeIndex] = 0
    edgeIndex += 1
    anomalyScores[edgeIndex] = 1
    _, matchingRow2 = sweeper.scoreDataSet(timestamps, anomalyScores, windows,
                                           "testData", threshold)

    # TP score + FP score + 1 should be very close to 0; the 1 is added to
    # account for the subsequent FN contribution.
    self.assertAlmostEqual(matchingRow1.score + matchingRow2.score + 1,
                           0.0, 3)

    windowRecords = windowSize * numWindows
    self._checkCounts(matchingRow1, length - windowRecords, 1, 0,
                      windowRecords - 1)
    self._checkCounts(matchingRow2, length - windowRecords - 1, 0, 1,
                      windowRecords)
def testTruePositiveAtRightEdgeOfWindow(self):
    """
    True positives at the right edge of a window should yield a score of
    approximately zero; the scaled sigmoid scoring function crosses the zero
    between a given window's last timestamp and the next timestamp
    (immediately following the window).
    """
    start = datetime.datetime.now()
    increment = datetime.timedelta(minutes=5)
    length = 1000
    numWindows = 1
    windowSize = 100

    timestamps = generateTimestamps(start, increment, length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)
    predictions = pandas.Series([0] * length)

    # Make prediction at end of the window; TP
    index = timestamps[timestamps == windows[0][1]].index[0]
    predictions[index] = 1
    scorer1 = Scorer(timestamps, predictions, labels, windows,
                     self.costMatrix, probationaryPeriod=0)
    (_, score1) = scorer1.getScore()

    # Make prediction just after the window; FP
    predictions[index] = 0
    index += 1
    predictions[index] = 1
    scorer2 = Scorer(timestamps, predictions, labels, windows,
                     self.costMatrix, probationaryPeriod=0)
    (_, score2) = scorer2.getScore()

    # TP score + FP score + 1 should be very close to 0; the 1 is added to
    # account for the subsequent FN contribution.
    # assertAlmostEqual replaces the deprecated assertAlmostEquals alias,
    # which no longer exists in Python 3.12+.
    self.assertAlmostEqual(score1 + score2 + 1, 0.0, 3)
    self._checkCounts(scorer1.counts, length - windowSize * numWindows, 1, 0,
                      windowSize * numWindows - 1)
    self._checkCounts(scorer2.counts, length - windowSize * numWindows - 1,
                      0, 1, windowSize * numWindows)
def testEarlierTruePositiveIsBetter(self):
    """
    Of two detectors that each flag one point inside the same window, the one
    that flags the window's first timestamp should out-score the one that
    flags the window's last timestamp.
    """
    length = 10
    numWindows = 1
    windowSize = 2
    threshold = 0.5
    windowRows = windowSize * numWindows

    timestamps = generateTimestamps(
        datetime.datetime.now(), datetime.timedelta(minutes=5), length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    windowStart, windowEnd = windows[0]

    earlyScores = pandas.Series([0] * length)
    lateScores = pandas.Series([0] * length)

    # Detector 1 fires on the window's first timestamp.
    earlyIndex = timestamps[timestamps == windowStart].index[0]
    earlyScores[earlyIndex] = 1
    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    (_, earlyRow) = sweeper.scoreDataSet(
        timestamps, earlyScores, windows, "testData", threshold)

    # Detector 2 fires on the window's last timestamp.
    lateIndex = timestamps[timestamps == windowEnd].index[0]
    lateScores[lateIndex] = 1
    (_, lateRow) = sweeper.scoreDataSet(
        timestamps, lateScores, windows, "testData", threshold)

    score1 = earlyRow.score
    score2 = lateRow.score
    self.assertTrue(
        score1 > score2,
        "The earlier TP score is not greater than "
        "the later TP. They are %f and %f, respectively." % (score1, score2))

    # Both runs have exactly one TP; the rest of the window counts as FN.
    for row in (earlyRow, lateRow):
        self._checkCounts(row, length - windowRows, 1, 0, windowRows - 1)
def testOnlyScoreFirstTruePositiveWithinWindow(self):
    """
    Adding a second detection inside a window that already has one should not
    change the score: only the earliest true positive in a window is scored.
    """
    length = 10
    numWindows = 1
    windowSize = 2
    threshold = 0.5
    windowRows = windowSize * numWindows

    timestamps = generateTimestamps(
        datetime.datetime.now(), datetime.timedelta(minutes=5), length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    windowStart, windowEnd = windows[0]
    anomalyScores = pandas.Series([0] * length)
    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)

    # First run: one detection, at the start of the window.
    firstHit = timestamps[timestamps == windowStart].index[0]
    anomalyScores[firstHit] = 1
    (_, singleRow) = sweeper.scoreDataSet(
        timestamps, anomalyScores, windows, "testData", threshold)

    # Second run: keep the first detection and add one at the window's end.
    secondHit = timestamps[timestamps == windowEnd].index[0]
    anomalyScores[secondHit] = 1
    (_, doubleRow) = sweeper.scoreDataSet(
        timestamps, anomalyScores, windows, "testData", threshold)

    self.assertEqual(singleRow.score, doubleRow.score)

    # Scores match, but the TP/FN counts still reflect every detection.
    self._checkCounts(singleRow, length - windowRows, 1, 0, windowRows - 1)
    self._checkCounts(doubleRow, length - windowRows, 2, 0, windowRows - 2)
def testRewardLowFalseNegatives(self):
    """
    With no detections at all (so the window is missed), score the same data
    under the fixture cost matrix and under a copy with fnWeight=2.0 and
    fpWeight=0.055. The first score must equal exactly half of the second,
    and the confusion counts must be the same for both.
    """
    length = 100
    numWindows = 1
    windowSize = 10
    threshold = 0.5
    windowRows = windowSize * numWindows

    timestamps = generateTimestamps(
        datetime.datetime.now(), datetime.timedelta(minutes=5), length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    anomalyScores = pandas.Series([0] * length)

    # Work on a deep copy so the shared fixture cost matrix is left untouched.
    costMatrixFN = copy.deepcopy(self.costMatrix)
    costMatrixFN["fnWeight"] = 2.0
    costMatrixFN["fpWeight"] = 0.055

    baseSweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    fnSweeper = Sweeper(probationPercent=0, costMatrix=costMatrixFN)

    (_, baseRow) = baseSweeper.scoreDataSet(
        timestamps, anomalyScores, windows, "testData", threshold)
    (_, fnRow) = fnSweeper.scoreDataSet(
        timestamps, anomalyScores, windows, "testData", threshold)

    self.assertEqual(baseRow.score, 0.5 * fnRow.score)

    # No detections: everything outside the window is TN, everything inside FN.
    for row in (baseRow, fnRow):
        self._checkCounts(row, length - windowRows, 0, 0, windowRows)
def testNullCase(self):
    """An empty window list with all-zero predictions should score 0.0."""
    length = 10
    timestamps = generateTimestamps(
        datetime.datetime.now(), datetime.timedelta(minutes=5), length)

    allZeros = [0] * length
    predictions = pandas.Series(allZeros)
    labels = pandas.Series(allZeros)
    windows = []

    scorer = Scorer(timestamps, predictions, labels, windows,
                    self.costMatrix, probationaryPeriod=0)
    (_, score) = scorer.getScore()

    self.assertEqual(score, 0.0)
    # Every one of the `length` rows must be counted as a true negative.
    self._checkCounts(scorer.counts, length, 0, 0, 0)
def testTruePositivesWithDifferentWindowSizes(self):
    """
    A detection on the first timestamp of a window should receive the same
    score whether the window is 2 or 3 rows wide.
    """
    length = 10
    numWindows = 1
    threshold = 0.5
    timestamps = generateTimestamps(
        datetime.datetime.now(), datetime.timedelta(minutes=5), length)

    # Narrow window, detection on its left edge.
    windowSize1 = 2
    windows1 = generateWindows(timestamps, numWindows, windowSize1)
    leftEdge1 = timestamps[timestamps == windows1[0][0]].index[0]
    anomalyScores1 = pandas.Series([0] * length)
    anomalyScores1[leftEdge1] = 1

    # Wider window, detection on its left edge.
    windowSize2 = 3
    windows2 = generateWindows(timestamps, numWindows, windowSize2)
    leftEdge2 = timestamps[timestamps == windows2[0][0]].index[0]
    anomalyScores2 = pandas.Series([0] * length)
    anomalyScores2[leftEdge2] = 1

    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    (_, narrowRow) = sweeper.scoreDataSet(
        timestamps, anomalyScores1, windows1, "testData", threshold)
    (_, wideRow) = sweeper.scoreDataSet(
        timestamps, anomalyScores2, windows2, "testData", threshold)

    self.assertEqual(narrowRow.score, wideRow.score)

    narrowRows = windowSize1 * numWindows
    wideRows = windowSize2 * numWindows
    self._checkCounts(narrowRow, length - narrowRows, 1, 0, narrowRows - 1)
    self._checkCounts(wideRow, length - wideRows, 1, 0, wideRows - 1)
def testEarlierTruePositiveIsBetter(self):
    """
    Of two prediction sets that each flag one point inside the same window,
    the one flagging the window's first timestamp should out-score the one
    flagging the window's last timestamp.
    """
    length = 10
    numWindows = 1
    windowSize = 2
    windowRows = windowSize * numWindows

    timestamps = generateTimestamps(
        datetime.datetime.now(), datetime.timedelta(minutes=5), length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)
    windowStart, windowEnd = windows[0]

    earlyPredictions = pandas.Series([0] * length)
    latePredictions = pandas.Series([0] * length)

    # Prediction set 1: detection on the window's first timestamp.
    earlyIndex = timestamps[timestamps == windowStart].index[0]
    earlyPredictions[earlyIndex] = 1
    earlyScorer = Scorer(timestamps, earlyPredictions, labels, windows,
                         self.costMatrix, probationaryPeriod=0)
    (_, score1) = earlyScorer.getScore()

    # Prediction set 2: detection on the window's last timestamp.
    lateIndex = timestamps[timestamps == windowEnd].index[0]
    latePredictions[lateIndex] = 1
    lateScorer = Scorer(timestamps, latePredictions, labels, windows,
                        self.costMatrix, probationaryPeriod=0)
    (_, score2) = lateScorer.getScore()

    self.assertTrue(
        score1 > score2,
        "The earlier TP score is not greater than "
        "the later TP. They are %f and %f, respectively." % (score1, score2))

    # Both runs have exactly one TP; the rest of the window counts as FN.
    for scorer in (earlyScorer, lateScorer):
        self._checkCounts(scorer.counts, length - windowRows, 1, 0,
                          windowRows - 1)
def testWindowTimestampsNotInDataFileThrowsError(self):
    """
    Constructing a CorpusLabel must raise ValueError when a label window
    refers to timestamps that do not exist in the data file.
    """
    dataFileName = "test_data_file.csv"

    # One data row dated 2014-01-01, labelled with a window dated 2015-01-01.
    stamps = generateTimestamps(strp("2014-01-01"), None, 1)
    data = pandas.DataFrame({"timestamp": stamps})
    windows = [["2015-01-01", "2015-01-01"]]

    writeCorpus(self.tempCorpusPath, {dataFileName: data})
    writeCorpusLabel(self.tempCorpusLabelPath, {dataFileName: windows})
    corpus = nab.corpus.Corpus(self.tempCorpusPath)

    with self.assertRaises(ValueError):
        nab.labeler.CorpusLabel(self.tempCorpusLabelPath, corpus)
def testTruePositivesWithDifferentWindowSizes(self):
    """
    A detection on the first timestamp of a window should receive the same
    score whether the window is 2 or 3 rows wide.
    """
    length = 10
    numWindows = 1
    timestamps = generateTimestamps(
        datetime.datetime.now(), datetime.timedelta(minutes=5), length)

    # Narrow window, detection on its left edge.
    windowSize1 = 2
    windows1 = generateWindows(timestamps, numWindows, windowSize1)
    labels1 = generateLabels(timestamps, windows1)
    leftEdge1 = timestamps[timestamps == windows1[0][0]].index[0]
    predictions1 = pandas.Series([0] * length)
    predictions1[leftEdge1] = 1

    # Wider window, detection on its left edge.
    windowSize2 = 3
    windows2 = generateWindows(timestamps, numWindows, windowSize2)
    labels2 = generateLabels(timestamps, windows2)
    leftEdge2 = timestamps[timestamps == windows2[0][0]].index[0]
    predictions2 = pandas.Series([0] * length)
    predictions2[leftEdge2] = 1

    narrowScorer = Scorer(timestamps, predictions1, labels1, windows1,
                          self.costMatrix, probationaryPeriod=0)
    (_, narrowScore) = narrowScorer.getScore()
    wideScorer = Scorer(timestamps, predictions2, labels2, windows2,
                        self.costMatrix, probationaryPeriod=0)
    (_, wideScore) = wideScorer.getScore()

    self.assertEqual(narrowScore, wideScore)

    narrowRows = windowSize1 * numWindows
    wideRows = windowSize2 * numWindows
    self._checkCounts(narrowScorer.counts, length - narrowRows, 1, 0,
                      narrowRows - 1)
    self._checkCounts(wideScorer.counts, length - wideRows, 1, 0,
                      wideRows - 1)
def testNullCase(self):
    """An empty window list with all-zero anomaly scores should score 0.0."""
    length = 10
    threshold = 0.5
    timestamps = generateTimestamps(
        datetime.datetime.now(), datetime.timedelta(minutes=5), length)
    anomalyScores = pandas.Series([0] * length)
    windows = []

    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    (_, matchingRow) = sweeper.scoreDataSet(
        timestamps, anomalyScores, windows, "testData", threshold)

    self.assertEqual(matchingRow.score, 0.0)
    # Every one of the `length` rows must be counted as a true negative.
    self._checkCounts(matchingRow, length, 0, 0, 0)
def testFalsePositiveScaling(self):
    """
    Test scaling the weight of false positives results in an approximate
    balance with the true positives.

    The contributions of TP and FP scores should approximately cancel; i.e.
    total score = 0. With x windows, this total score should on average
    decrease x/2 because of x FNs. Thus, the acceptable range for score
    should be centered about -x/2 (here x = 1, so -1.5 to 0.5).
    """
    start = datetime.datetime.now()
    increment = datetime.timedelta(minutes=5)
    length = 100
    numWindows = 1
    windowSize = 10
    threshold = 0.5

    timestamps = generateTimestamps(start, increment, length)
    windows = generateWindows(timestamps, numWindows, windowSize)

    # Scale for 10% = windowSize/length
    # NOTE: this writes to the shared fixture self.costMatrix.
    self.costMatrix["fpWeight"] = 0.11
    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)

    # Make arbitrary detections, score, repeat
    scores = []
    for _ in range(20):
        anomalyScores = pandas.Series([0] * length)
        indices = random.sample(range(length), 10)
        anomalyScores[indices] = 1
        # Discard the first element of the returned pair. Unpacking it into
        # `scores` (as before) replaced the accumulator on every iteration,
        # so the average below was not taken over the 20 trial scores.
        (_, matchingRow) = sweeper.scoreDataSet(
            timestamps, anomalyScores, windows, "testData", threshold)
        scores.append(matchingRow.score)

    avgScore = sum(scores) / float(len(scores))

    self.assertTrue(
        -1.5 <= avgScore <= 0.5,
        "The average score across 20 sets "
        "of random detections is %f, which is not within the acceptable range "
        "-1.5 to 0.5." % avgScore)
def testFalsePositiveScaling(self):
    """
    Test scaling the weight of false positives results in an approximate
    balance with the true positives.

    The contributions of TP and FP scores should approximately cancel; i.e.
    total score = 0. With x windows, this total score should on average
    decrease x/2 because of x FNs. Thus, the acceptable range for score
    should be centered about -x/2 (here x = 1, so -1.5 to 0.5).
    """
    start = datetime.datetime.now()
    increment = datetime.timedelta(minutes=5)
    length = 100
    numWindows = 1
    windowSize = 10

    timestamps = generateTimestamps(start, increment, length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)

    # Scale for 10% = windowSize/length
    # NOTE: this writes to the shared fixture self.costMatrix.
    self.costMatrix["fpWeight"] = 0.11

    # Make arbitrary detections, score, repeat.
    # range() is used instead of xrange() so the loop also runs on Python 3.
    scores = []
    for _ in range(20):
        predictions = pandas.Series([0] * length)
        indices = random.sample(range(length), 10)
        predictions[indices] = 1
        scorer = Scorer(timestamps, predictions, labels, windows,
                        self.costMatrix, probationaryPeriod=0)
        (_, score) = scorer.getScore()
        scores.append(score)

    avgScore = sum(scores) / float(len(scores))

    self.assertTrue(
        -1.5 <= avgScore <= 0.5,
        "The average score across 20 sets "
        "of random detections is %f, which is not within the acceptable range "
        "-1.5 to 0.5." % avgScore)
def testEarlierFalsePositiveAfterWindowIsBetter(self):
    """
    Of two false positives placed after the window, the one on the first
    timestamp past the window should score higher than the one a step later.
    """
    length = 10
    numWindows = 1
    windowSize = 2
    threshold = 0.5
    windowRows = windowSize * numWindows

    timestamps = generateTimestamps(
        datetime.datetime.now(), datetime.timedelta(minutes=5), length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    windowEnd = windows[0][1]

    nearScores = pandas.Series([0] * length)
    farScores = pandas.Series([0] * length)

    # FP "A": the first timestamp after the window closes.
    nearIndex = timestamps[timestamps == windowEnd].index[0] + 1
    nearScores[nearIndex] = 1
    nearSweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    (_, nearRow) = nearSweeper.scoreDataSet(
        timestamps, nearScores, windows, "testData", threshold)

    # FP "B": one timestamp later than A, scored with a fresh Sweeper.
    farScores[nearIndex + 1] = 1
    farSweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)
    (_, farRow) = farSweeper.scoreDataSet(
        timestamps, farScores, windows, "testData", threshold)

    self.assertTrue(nearRow.score > farRow.score)

    # Each run has one FP and misses the whole window.
    for row in (nearRow, farRow):
        self._checkCounts(row, length - windowRows - 1, 0, 1, windowRows)
def testOnlyScoreFirstTruePositiveWithinWindow(self):
    """
    Adding a second detection inside a window that already has one should not
    change the score: only the earliest true positive in a window is scored.
    """
    length = 10
    numWindows = 1
    windowSize = 2
    windowRows = windowSize * numWindows

    timestamps = generateTimestamps(
        datetime.datetime.now(), datetime.timedelta(minutes=5), length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)
    predictions = pandas.Series([0] * length)
    windowStart, windowEnd = windows[0]

    # First run: one detection, at the start of the window.
    firstHit = timestamps[timestamps == windowStart].index[0]
    predictions[firstHit] = 1
    singleScorer = Scorer(timestamps, predictions, labels, windows,
                          self.costMatrix, probationaryPeriod=0)
    (_, singleScore) = singleScorer.getScore()

    # Second run: keep the first detection and add one at the window's end.
    secondHit = timestamps[timestamps == windowEnd].index[0]
    predictions[secondHit] = 1
    doubleScorer = Scorer(timestamps, predictions, labels, windows,
                          self.costMatrix, probationaryPeriod=0)
    (_, doubleScore) = doubleScorer.getScore()

    self.assertEqual(singleScore, doubleScore)

    # Scores match, but the TP/FN counts still reflect every detection.
    self._checkCounts(singleScorer.counts, length - windowRows, 1, 0,
                      windowRows - 1)
    self._checkCounts(doubleScorer.counts, length - windowRows, 2, 0,
                      windowRows - 2)
def test_secondTruePositiveWithinWindowIsIgnored(self):
    """
    When a window contains two true positives, the result of getScore()
    should be the same as when it contains only the earlier one.
    """
    length = 10
    numWindows = 1
    windowSize = 2

    timestamps = generateTimestamps(
        datetime.datetime.now(), datetime.timedelta(minutes=5), length)
    predictions = pandas.Series([0] * length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)
    windowStart, windowEnd = windows[0]

    # Test-local cost matrix; the shared fixture is not used here.
    costMatrix = {"tpWeight": 1.0,
                  "fnWeight": 2.0,
                  "fpWeight": 3.0,
                  "tnWeight": 4.0}

    # First run: a single detection at the start of the window.
    firstHit = timestamps[timestamps == windowStart].index[0]
    predictions[firstHit] = 1
    singleScorer = Scorer(timestamps, predictions, labels, windows,
                          costMatrix, probationaryPeriod=0)
    score1 = singleScorer.getScore()

    # Second run: the same detection plus another at the window's end.
    secondHit = timestamps[timestamps == windowEnd].index[0]
    predictions[secondHit] = 1
    doubleScorer = Scorer(timestamps, predictions, labels, windows,
                          costMatrix, probationaryPeriod=0)
    score2 = doubleScorer.getScore()

    self.assertEqual(score1, score2)
def testFalsePositiveScaling(self):
    """
    Test scaling the weight of false positives results in an approximate
    balance with the true positives.

    The contributions of TP and FP scores should approximately cancel; i.e.
    total score = 0. With x windows, this total score should on average
    decrease x/2 because of x FNs. Thus, the acceptable range for score
    should be centered about -x/2 (here x = 1, so -1.5 to 0.5).
    """
    start = datetime.datetime.now()
    increment = datetime.timedelta(minutes=5)
    length = 100
    numWindows = 1
    windowSize = 10
    threshold = 0.5

    timestamps = generateTimestamps(start, increment, length)
    windows = generateWindows(timestamps, numWindows, windowSize)

    # Scale for 10% = windowSize/length
    # NOTE: this writes to the shared fixture self.costMatrix.
    self.costMatrix["fpWeight"] = 0.11
    sweeper = Sweeper(probationPercent=0, costMatrix=self.costMatrix)

    # Make arbitrary detections, score, repeat
    scores = []
    for _ in range(20):
        anomalyScores = pandas.Series([0] * length)
        indices = random.sample(list(range(length)), 10)
        anomalyScores[indices] = 1
        # Discard the first element of the returned pair. Unpacking it into
        # `scores` (as before) replaced the accumulator on every iteration,
        # so the average below was not taken over the 20 trial scores.
        (_, matchingRow) = sweeper.scoreDataSet(
            timestamps, anomalyScores, windows, "testData", threshold)
        scores.append(matchingRow.score)

    avgScore = sum(scores) / float(len(scores))

    self.assertTrue(
        -1.5 <= avgScore <= 0.5,
        "The average score across 20 sets "
        "of random detections is %f, which is not within the acceptable range "
        "-1.5 to 0.5." % avgScore)
def test_earlierTruePositiveIsBetter(self):
    """
    Of two prediction sets that each flag one point inside the same window,
    the one flagging the window's first timestamp should compare greater than
    the one flagging the window's last timestamp.
    """
    length = 10
    numWindows = 1
    windowSize = 2

    timestamps = generateTimestamps(
        datetime.datetime.now(), datetime.timedelta(minutes=5), length)
    earlyPredictions = pandas.Series([0] * length)
    latePredictions = pandas.Series([0] * length)
    windows = generateWindows(timestamps, numWindows, windowSize)
    labels = generateLabels(timestamps, windows)
    windowStart, windowEnd = windows[0]

    # Test-local cost matrix; the shared fixture is not used here.
    costMatrix = {"tpWeight": 1.0,
                  "fnWeight": 2.0,
                  "fpWeight": 3.0,
                  "tnWeight": 4.0}

    # Prediction set 1: detection on the window's first timestamp.
    earlyIndex = timestamps[timestamps == windowStart].index[0]
    earlyPredictions[earlyIndex] = 1
    earlyScorer = Scorer(timestamps, earlyPredictions, labels, windows,
                         costMatrix, probationaryPeriod=0)
    score1 = earlyScorer.getScore()

    # Prediction set 2: detection on the window's last timestamp.
    lateIndex = timestamps[timestamps == windowEnd].index[0]
    latePredictions[lateIndex] = 1
    lateScorer = Scorer(timestamps, latePredictions, labels, windows,
                        costMatrix, probationaryPeriod=0)
    score2 = lateScorer.getScore()

    self.assertTrue(score1 > score2)
def testWindowTimestampsNonChronologicalThrowsError(self):
    """
    Constructing a CorpusLabel must raise ValueError when a label window's
    start time comes after its end time.
    """
    dataFileName = "test_data_file.csv"

    stamps = generateTimestamps(
        strp("2014-01-01"), datetime.timedelta(minutes=5), 10)
    data = pandas.DataFrame({"timestamp": stamps})

    # The first window is reversed (start after end); the second is in order.
    # NOTE(review): the second window (10:15-11:15) looks like it falls
    # outside ten 5-minute rows starting at 00:00 -- confirm the ValueError
    # comes from the ordering check and not from that.
    reversedWindow = ["2014-01-01 00:45", "2014-01-01 00:00"]
    orderedWindow = ["2014-01-01 10:15", "2014-01-01 11:15"]
    windows = [reversedWindow, orderedWindow]

    writeCorpus(self.tempCorpusPath, {dataFileName: data})
    writeCorpusLabel(self.tempCorpusLabelPath, {dataFileName: windows})
    corpus = nab.corpus.Corpus(self.tempCorpusPath)

    with self.assertRaises(ValueError):
        nab.labeler.CorpusLabel(self.tempCorpusLabelPath, corpus)
def testRedundantTimestampsRaiseException(self):
    """
    LabelCombiner.combine() must raise ValueError for the raw labels written
    by this test (three label timestamps against 365 daily data rows).
    """
    dataFileName = "test_data_file.csv"

    # One row per day, 365 rows starting 2015-01-01.
    stamps = generateTimestamps(
        strp("2015-01-01"), datetime.timedelta(days=1), 365)
    data = pandas.DataFrame({"timestamp": stamps})
    writeCorpus(self.tempCorpusPath, {dataFileName: data})

    labels = ["2015-12-25 00:00:00",
              "2015-12-26 00:00:00",
              "2015-12-31 00:00:00"]

    # Raw labels are written to a "raw" subdirectory next to the usual label
    # file; the combiner is then given that directory, not the file.
    rawLabelPath = self.tempCorpusLabelPath.replace(
        "/label.json", "/raw/label.json")
    writeCorpusLabel(rawLabelPath, {dataFileName: labels})
    rawLabelDir = rawLabelPath.replace("/label.json", "")

    corpus = nab.corpus.Corpus(self.tempCorpusPath)
    labelCombiner = nab.labeler.LabelCombiner(
        rawLabelDir, corpus, 0.5, 0.10, 0.15, 0)

    with self.assertRaises(ValueError):
        labelCombiner.combine()