def checkWindows(self):
  """
  Check the anomaly windows for overlap both with each other and with the
  probationary period, mutating self.combinedWindows in place.

  Windows that begin before the end of the probationary period are deleted
  (records in that period are not scored).  Windows that overlap or touch one
  another are merged into a single window.
  """
  for relativePath, windows in self.combinedWindows.iteritems():
    numWindows = len(windows)
    if numWindows > 0:
      fileLength = self.corpus.dataFiles[relativePath].data.shape[0]
      probationIndex = getProbationPeriod(
        self.probationaryPercent, fileLength)

      probationTimestamp = self.corpus.dataFiles[relativePath].data[
        "timestamp"][probationIndex]

      # Delete every leading window that starts inside the probationary
      # period, not just the first one; a single `if` would leave any
      # subsequent probation-overlapping window in place.
      while windows and (pandas.to_datetime(windows[0][0])
                         - probationTimestamp).total_seconds() < 0:
        del windows[0]
        print ("The first window in {} overlaps with the probationary period "
               ", so we're deleting it.".format(relativePath))

      i = 0
      while len(windows) - 1 > i:
        if (pandas.to_datetime(windows[i+1][0])
            - pandas.to_datetime(windows[i][1])).total_seconds() <= 0:
          # Merge windows.  Do NOT advance i after a merge: the merged window
          # may itself overlap the following window, and an unconditional
          # increment would skip that check, leaving an unmerged pair.
          windows[i] = [windows[i][0], windows[i+1][1]]
          del windows[i+1]
        else:
          i += 1
def checkWindows(self):
  """
  Check the combined anomaly windows for overlap with each other and with the
  probationary period; self.combinedWindows is modified in place.

  Windows beginning inside the probationary period are removed (those records
  are never scored), and overlapping windows are merged into a single window.
  """
  for relativePath, windows in self.combinedWindows.iteritems():
    if not windows:
      continue

    fileLength = self.corpus.dataFiles[relativePath].data.shape[0]
    probationIndex = getProbationPeriod(self.probationaryPercent, fileLength)
    probationTimestamp = self.corpus.dataFiles[relativePath].data[
      "timestamp"][probationIndex]

    # Drop ALL leading windows that start before the end of the probationary
    # period; checking only windows[0] once would miss the case where the
    # next window also begins inside the probationary period.
    while windows and (pandas.to_datetime(windows[0][0])
                       - probationTimestamp).total_seconds() < 0:
      del windows[0]
      print(
        "The first window in {} overlaps with the probationary period "
        ", so we're deleting it.".format(relativePath))

    i = 0
    while len(windows) - 1 > i:
      if (pandas.to_datetime(windows[i + 1][0]) - pandas.to_datetime(
          windows[i][1])).total_seconds() <= 0:
        # Merge the overlapping windows, then re-test the merged window
        # against its new successor.  Incrementing i unconditionally (as the
        # original did) could leave a mergeable pair unmerged.
        windows[i] = [windows[i][0], windows[i + 1][1]]
        del windows[i + 1]
      else:
        i += 1
def __init__(self, dataSet, probationaryPercent):
  """
  @param dataSet             Object whose .data DataFrame holds the input
                             records, including a "value" column.
  @param probationaryPercent (float) Fraction of the records to treat as the
                             probationary period.
  """
  self.dataSet = dataSet

  values = self.dataSet.data["value"]
  self.inputMin = values.min()
  self.inputMax = values.max()

  self.probationaryPeriod = getProbationPeriod(
    probationaryPercent, dataSet.data.shape[0])
def testGetProbationPeriod(self):
  """
  getProbationPeriod returns the expected index for several file lengths,
  including the case where the index is capped at 750 records.
  """
  cases = ((1000, 150), (4032, 604), (5000, 750), (15000, 750))
  for fileLength, expected in cases:
    actual = getProbationPeriod(0.15, fileLength)
    self.assertEqual(expected, actual,
                     "Expected probation index of {} "
                     "got {}.".format(expected, actual))
def __init__(self, dataSet, probationaryPercent):
  """
  @param dataSet             Object whose .data DataFrame holds the input
                             records, including a "value" column.
  @param probationaryPercent (float) Fraction of the records to treat as the
                             probationary period.
  """
  self.dataSet = dataSet

  recordCount = dataSet.data.shape[0]
  self.probationaryPeriod = getProbationPeriod(probationaryPercent,
                                               recordCount)

  valueColumn = self.dataSet.data["value"]
  self.inputMin = valueColumn.min()
  self.inputMax = valueColumn.max()
def __init__(self, dataSet, probationaryPercent):
  """
  @param dataSet             Object whose .data DataFrame holds the input
                             records, including a "value" column.
  @param probationaryPercent (float) Fraction of the records to treat as the
                             probationary period.
  """
  self.dataSet = dataSet
  self.probationaryPeriod = getProbationPeriod(
    probationaryPercent, dataSet.data.shape[0])

  series = self.dataSet.data["value"]
  self.inputMin = series.min()
  self.inputMax = series.max()

  # Filled in by detectDataSet() below so detectors can tag their debug
  # output with the worker thread that produced it.
  self.threadId = None
def __init__(self, dataSet, probationaryPercent):
  """
  @param dataSet             (pandas.DataFrame) Multivariate input data, one
                             column per variable.
  @param probationaryPercent (float) Fraction of the records to treat as the
                             probationary period.

  @raises ValueError if any numeric column is constant (max == min), since
          detectors that normalize by the input range would divide by zero.
  """
  self.dataSet = dataSet
  self.cols = dataSet.shape[1]
  self.probationaryPeriod = getProbationPeriod(probationaryPercent,
                                               dataSet.shape[0])

  # Per-column summary statistics, exposed as numpy arrays for detectors.
  self.inputMin = numpy.asarray(dataSet.min(numeric_only=True))
  self.inputMax = numpy.asarray(dataSet.max(numeric_only=True))
  self.inputMean = numpy.asarray(dataSet.mean(numeric_only=True))
  self.inputStd = numpy.asarray(dataSet.std(numeric_only=True))

  # Check that no column has a zero range (max - min == 0); otherwise some
  # detectors might divide by 0 during normalization.  Raise explicitly
  # rather than `assert`, which is silently stripped under `python -O`.
  if numpy.count_nonzero((self.inputMax - self.inputMin) == 0) != 0:
    raise ValueError('0 found in diff')
def scoreCorpus(threshold, args):
  """Scores the corpus given a detector's results and a user profile.

  Scores the corpus in parallel.

  @param threshold  (float)   Threshold value to convert an anomaly score value
                              to a detection.

  @param args       (tuple)   Contains:

    pool                (multiprocessing.Pool)  Pool of processes to perform
                                                tasks in parallel.
    detectorName        (string)                Name of detector.
    profileName         (string)                Name of scoring profile.
    costMatrix          (dict)                  Cost matrix to weight the true
                                                positives, false negatives, and
                                                false positives during scoring.
    resultsDetectorDir  (string)                Directory for the results CSVs.
    resultsCorpus       (nab.Corpus)            Corpus object that holds the
                                                per record anomaly scores for a
                                                given detector.
    corpusLabel         (nab.CorpusLabel)       Ground truth anomaly labels for
                                                the NAB corpus.
    probationaryPercent (float)                 Percent of each data file not
                                                to be considered during
                                                scoring.

  @return (pandas.DataFrame)  One row per data file plus a "Totals" row, with
                              columns Detector, Profile, File, Threshold,
                              Score, TP, TN, FP, FN, Total_Count.
  """
  (pool,
   detectorName,
   profileName,
   costMatrix,
   resultsDetectorDir,
   resultsCorpus,
   corpusLabel,
   probationaryPercent,
   scoreFlag) = args

  args = []
  for relativePath, dataSet in resultsCorpus.dataFiles.iteritems():
    if "_scores.csv" in relativePath:
      continue

    # relativePath: raw dataset file,
    # e.g. 'artificialNoAnomaly/art_noisy.csv'
    relativePath = convertResultsPathToDataPath(
      os.path.join(detectorName, relativePath))

    # outputPath: dataset results file,
    # e.g. 'results/detector/artificialNoAnomaly/detector_art_noisy.csv'
    relativeDir, fileName = os.path.split(relativePath)
    fileName = detectorName + "_" + fileName
    outputPath = os.path.join(resultsDetectorDir, relativeDir, fileName)

    windows = corpusLabel.windows[relativePath]
    labels = corpusLabel.labels[relativePath]
    probationaryPeriod = getProbationPeriod(probationaryPercent,
                                            labels.shape[0])
    predicted = convertAnomalyScoresToDetections(
      dataSet.data["anomaly_score"], threshold)

    args.append((
      detectorName,
      profileName,
      relativePath,
      outputPath,
      threshold,
      predicted,
      windows,
      labels,
      costMatrix,
      probationaryPeriod,
      scoreFlag))

  # Using `map_async` instead of `map` so interrupts (KeyboardInterrupt) are
  # properly propagated instead of hanging the pool.
  # See: http://stackoverflow.com/a/1408476
  results = pool.map_async(scoreDataSet, args).get(99999999)

  # Total the 6 scoring metrics for all data files
  totals = [None]*3 + [0]*6
  for row in results:
    for i in xrange(6):
      totals[i+3] += row[i+4]

  results.append(["Totals"] + totals)

  resultsDF = pandas.DataFrame(data=results,
                               columns=("Detector", "Profile", "File",
                                        "Threshold", "Score", "TP", "TN",
                                        "FP", "FN", "Total_Count"))

  return resultsDF
def scoreCorpus(threshold, args):
  """Score a detector's entire result corpus, in parallel, for one profile.

  @param threshold  (float)   Anomaly-score threshold above which a record
                              counts as a detection.

  @param args       (tuple)   (pool, detectorName, profileName, costMatrix,
                              resultsDetectorDir, resultsCorpus, corpusLabel,
                              probationaryPercent, scoreFlag), where pool is a
                              multiprocessing.Pool, costMatrix weights the
                              TP/FN/FP outcomes, resultsCorpus holds the per
                              record anomaly scores, corpusLabel holds the
                              ground-truth windows/labels, and
                              probationaryPercent is the fraction of each file
                              excluded from scoring.

  @return (pandas.DataFrame)  One row per data file plus a final "Totals" row.
  """
  (pool, detectorName, profileName, costMatrix, resultsDetectorDir,
   resultsCorpus, corpusLabel, probationaryPercent, scoreFlag) = args

  scoringTasks = []
  for dataPath, dataSet in resultsCorpus.dataFiles.iteritems():
    if "_scores.csv" in dataPath:
      continue

    # Map the results path back to the raw dataset path,
    # e.g. 'artificialNoAnomaly/art_noisy.csv'
    dataPath = convertResultsPathToDataPath(
      os.path.join(detectorName, dataPath))

    # Per-dataset results file,
    # e.g. 'results/detector/artificialNoAnomaly/detector_art_noisy.csv'
    subdir, baseName = os.path.split(dataPath)
    outputPath = os.path.join(resultsDetectorDir, subdir,
                              detectorName + "_" + baseName)

    labels = corpusLabel.labels[dataPath]
    scoringTasks.append((detectorName,
                         profileName,
                         dataPath,
                         outputPath,
                         threshold,
                         convertAnomalyScoresToDetections(
                           dataSet.data["anomaly_score"], threshold),
                         corpusLabel.windows[dataPath],
                         labels,
                         costMatrix,
                         getProbationPeriod(probationaryPercent,
                                            labels.shape[0]),
                         scoreFlag))

  # Using `map_async` instead of `map` so interrupts are properly handled.
  # See: http://stackoverflow.com/a/1408476
  results = pool.map_async(scoreDataSet, scoringTasks).get(99999999)

  # Sum each of the 6 scoring metrics over every data file; the first three
  # columns of the totals row carry no value.
  totals = [None, None, None] + [sum(row[col + 4] for row in results)
                                 for col in xrange(6)]
  results.append(["Totals"] + totals)

  return pandas.DataFrame(data=results,
                          columns=("Detector", "Profile", "File",
                                   "Threshold", "Score", "TP", "TN",
                                   "FP", "FN", "Total_Count"))