def scoreCorpus(threshold, args):
    """Give a score to the corpus given a detector and a user profile.

    Scores the corpus in parallel: one scoreDataSet task is dispatched to the
    pool per raw data file.

    @param threshold (float) Threshold value to convert an anomaly score value
    to a detection.

    @param args (tuple) Contains: (pool, detector, username, costMatrix,
    resultsCorpus, corpusLabel, probationaryPercent) — everything needed to
    build the per-file argument tuples for scoreDataSet.

    @return (list) One scoreDataSet result per scored data file.
    """
    (pool,
     detector,
     username,
     costMatrix,
     resultsCorpus,
     corpusLabel,
     probationaryPercent) = args

    args = []
    for relativePath, dataSet in resultsCorpus.dataFiles.iteritems():
        # Skip the detector's own aggregate scores file; it is not a dataset.
        if relativePath == detector + "_scores.csv":
            continue

        # Map the results path back to the raw dataset path used as the key
        # into the corpus labels.
        relativePath = convertResultsPathToDataPath(
            os.path.join(detector, relativePath))
        windows = corpusLabel.windows[relativePath]
        labels = corpusLabel.labels[relativePath]

        # First probationaryPercent of each file's records is not scored.
        probationaryPeriod = math.floor(probationaryPercent * labels.shape[0])

        predicted = convertAnomalyScoresToDetections(
            dataSet.data["anomaly_score"], threshold)

        args.append((
            detector,
            username,
            relativePath,
            threshold,
            predicted,
            windows,
            labels,
            costMatrix,
            probationaryPeriod))

    results = pool.map(scoreDataSet, args)

    return results
def optimizeThreshold(args):
    """Find the anomaly threshold maximizing the objective for one detector/profile.

    @param args (tuple) Contains:

      detectorName        (string)          Name of detector.

      costMatrix          (dict)            Cost matrix to weight the true
                                            positives, false negatives, and
                                            false positives during scoring.

      resultsCorpus       (nab.Corpus)      Corpus object that holds the per
                                            record anomaly scores for a given
                                            detector.

      corpusLabel         (nab.CorpusLabel) Ground truth anomaly labels for
                                            the NAB corpus.

      probationaryPercent (float)           Percent of each data file not to
                                            be considered during scoring.

    @return (dict) Contains "threshold" (float), the argmax of the objective
    function, and "score" (float), the objective value at that threshold.
    """
    (detectorName,
     costMatrix,
     resultsCorpus,
     corpusLabel,
     probationaryPercent) = args

    sweeper = Sweeper(probationPercent=probationaryPercent,
                      costMatrix=costMatrix)

    # Accumulate sweep scores for every row of every raw data file.
    allAnomalyRows = []
    for relativePath, dataSet in resultsCorpus.dataFiles.iteritems():
        if "_scores.csv" in relativePath:
            continue

        # relativePath: raw dataset file,
        # e.g. 'artificialNoAnomaly/art_noisy.csv'
        dataPath = convertResultsPathToDataPath(
            os.path.join(detectorName, relativePath))

        allAnomalyRows.extend(
            sweeper.calcSweepScore(
                corpusLabel.labels[dataPath]['timestamp'],
                dataSet.data["anomaly_score"],
                corpusLabel.windows[dataPath],
                dataPath))

    # Score every candidate threshold over the whole corpus and keep the best.
    # max() returns the first maximal element, matching a stable descending
    # sort followed by taking index 0.
    bestParams = max(sweeper.calcScoreByThreshold(allAnomalyRows),
                     key=lambda candidate: candidate.score)

    print(
        "Optimizer found a max score of {} with anomaly threshold {}.".format(
            bestParams.score, bestParams.threshold))

    return {"threshold": bestParams.threshold, "score": bestParams.score}
def scoreCorpus(threshold, args):
    """Score the corpus given a detector's results and a user profile.

    Scores the corpus in parallel, one scoreDataSet task per raw data file,
    then appends a "Totals" row summing the numeric metrics.

    @param threshold (float) Threshold value to convert an anomaly score value
    to a detection.

    @param args (tuple) Contains:

      pool                (multiprocessing.Pool)  Pool of processes to perform
                                                  tasks in parallel.
      detectorName        (string)                Name of detector.

      profileName         (string)                Name of scoring profile.

      costMatrix          (dict)                  Cost matrix to weight the
                                                  true positives, false
                                                  negatives, and false
                                                  positives during scoring.

      resultsDetectorDir  (string)                Directory for the results
                                                  CSVs.

      resultsCorpus       (nab.Corpus)            Corpus object that holds the
                                                  per record anomaly scores
                                                  for a given detector.

      corpusLabel         (nab.CorpusLabel)       Ground truth anomaly labels
                                                  for the NAB corpus.

      probationaryPercent (float)                 Percent of each data file
                                                  not to be considered during
                                                  scoring.

    @return (pandas.DataFrame) Per-file scoring rows plus a final "Totals" row.
    """
    (pool,
     detectorName,
     profileName,
     costMatrix,
     resultsDetectorDir,
     resultsCorpus,
     corpusLabel,
     probationaryPercent,
     scoreFlag) = args

    # One argument tuple per raw data file for scoreDataSet.
    scoringArgs = []
    for relativePath, dataSet in resultsCorpus.dataFiles.iteritems():
        if "_scores.csv" in relativePath:
            continue

        # Map the results path back to the raw dataset path,
        # e.g. 'artificialNoAnomaly/art_noisy.csv'
        dataPath = convertResultsPathToDataPath(
            os.path.join(detectorName, relativePath))

        # Per-dataset results file,
        # e.g. 'results/detector/artificialNoAnomaly/detector_art_noisy.csv'
        relativeDir, fileName = os.path.split(dataPath)
        outputPath = os.path.join(resultsDetectorDir, relativeDir,
                                  detectorName + "_" + fileName)

        labels = corpusLabel.labels[dataPath]
        scoringArgs.append((
            detectorName,
            profileName,
            dataPath,
            outputPath,
            threshold,
            convertAnomalyScoresToDetections(
                dataSet.data["anomaly_score"], threshold),
            corpusLabel.windows[dataPath],
            labels,
            costMatrix,
            # First probationaryPercent of each file's records is not scored.
            math.floor(probationaryPercent * labels.shape[0]),
            scoreFlag))

    results = pool.map(scoreDataSet, scoringArgs)

    # Append a "Totals" row: columns 4-9 (Score, TP, TN, FP, FN, Total_Count)
    # summed over all files; the three identifier columns stay None.
    totalsRow = ["Totals", None, None, None]
    for col in xrange(4, 10):
        totalsRow.append(sum(row[col] for row in results))
    results.append(totalsRow)

    return pandas.DataFrame(data=results,
                            columns=("Detector", "Profile", "File",
                                     "Threshold", "Score", "TP", "TN",
                                     "FP", "FN", "Total_Count"))
def scoreCorpus(threshold, args):
    """Score the corpus given a detector's results and a user profile.

    Dispatches one scoreDataSet task per raw data file to the worker pool,
    then totals the numeric metrics into a final "Totals" row.

    @param threshold (float) Threshold value to convert an anomaly score value
    to a detection.

    @param args (tuple) Contains:

      pool                (multiprocessing.Pool)  Pool of processes to perform
                                                  tasks in parallel.
      detectorName        (string)                Name of detector.

      profileName         (string)                Name of scoring profile.

      costMatrix          (dict)                  Cost matrix to weight the
                                                  true positives, false
                                                  negatives, and false
                                                  positives during scoring.

      resultsDetectorDir  (string)                Directory for the results
                                                  CSVs.

      resultsCorpus       (nab.Corpus)            Corpus object that holds the
                                                  per record anomaly scores
                                                  for a given detector.

      corpusLabel         (nab.CorpusLabel)       Ground truth anomaly labels
                                                  for the NAB corpus.

      probationaryPercent (float)                 Percent of each data file
                                                  not to be considered during
                                                  scoring.

    @return (pandas.DataFrame) Per-file scoring rows plus a final "Totals" row.
    """
    (pool, detectorName, profileName, costMatrix, resultsDetectorDir,
     resultsCorpus, corpusLabel, probationaryPercent, scoreFlag) = args

    taskArgs = []
    for relativePath, dataSet in resultsCorpus.dataFiles.iteritems():
        if "_scores.csv" in relativePath:
            continue

        # Map the results path back to the raw dataset path,
        # e.g. 'artificialNoAnomaly/art_noisy.csv'
        dataPath = convertResultsPathToDataPath(
            os.path.join(detectorName, relativePath))

        # Per-dataset results file,
        # e.g. 'results/detector/artificialNoAnomaly/detector_art_noisy.csv'
        directory, baseName = os.path.split(dataPath)
        outputPath = os.path.join(
            resultsDetectorDir, directory,
            "%s_%s" % (detectorName, baseName))

        windows = corpusLabel.windows[dataPath]
        labels = corpusLabel.labels[dataPath]
        probationaryPeriod = getProbationPeriod(
            probationaryPercent, labels.shape[0])
        detections = convertAnomalyScoresToDetections(
            dataSet.data["anomaly_score"], threshold)

        taskArgs.append((detectorName, profileName, dataPath, outputPath,
                         threshold, detections, windows, labels, costMatrix,
                         probationaryPeriod, scoreFlag))

    results = pool.map(scoreDataSet, taskArgs)

    # Sum the six numeric metrics (columns 4-9: Score, TP, TN, FP, FN,
    # Total_Count) across all data files into a trailing "Totals" row.
    metricTotals = [0] * 6
    for row in results:
        for offset, value in enumerate(row[4:10]):
            metricTotals[offset] += value
    results.append(["Totals", None, None, None] + metricTotals)

    columnNames = ("Detector", "Profile", "File", "Threshold", "Score",
                   "TP", "TN", "FP", "FN", "Total_Count")
    return pandas.DataFrame(data=results, columns=columnNames)
def optimizeThreshold(args):
    """Optimize the anomaly threshold for one detector/profile combination.

    @param args (tuple) Contains:

      detectorName        (string)          Name of detector.

      costMatrix          (dict)            Cost matrix to weight the true
                                            positives, false negatives, and
                                            false positives during scoring.

      resultsCorpus       (nab.Corpus)      Corpus object that holds the per
                                            record anomaly scores for a given
                                            detector.

      corpusLabel         (nab.CorpusLabel) Ground truth anomaly labels for
                                            the NAB corpus.

      probationaryPercent (float)           Percent of each data file not to
                                            be considered during scoring.

    @return (dict) Contains "threshold" (float), the threshold that returns
    the largest score from the objective function, and "score" (float), the
    objective function's value at that threshold.
    """
    detectorName, costMatrix, resultsCorpus, corpusLabel, \
        probationaryPercent = args

    sweeper = Sweeper(probationPercent=probationaryPercent,
                      costMatrix=costMatrix)

    # Sweep-score every row of every raw data file in the corpus.
    sweepRows = []
    for relativePath, dataSet in resultsCorpus.dataFiles.iteritems():
        if "_scores.csv" in relativePath:
            continue

        # relativePath: raw dataset file,
        # e.g. 'artificialNoAnomaly/art_noisy.csv'
        dataPath = convertResultsPathToDataPath(
            os.path.join(detectorName, relativePath))

        labels = corpusLabel.labels[dataPath]
        sweepRows.extend(sweeper.calcSweepScore(
            labels['timestamp'],
            dataSet.data["anomaly_score"],
            corpusLabel.windows[dataPath],
            dataPath))

    # Rank candidate thresholds by corpus-wide score, best first.
    ranked = sorted(sweeper.calcScoreByThreshold(sweepRows),
                    key=lambda candidate: candidate.score,
                    reverse=True)
    best = ranked[0]

    print("Optimizer found a max score of {} with anomaly threshold {}.".format(
        best.score, best.threshold))

    return {"threshold": best.threshold, "score": best.score}