def main(args):
    """Combine the label files, write the result, and check that it loads.

    @param args Object carrying the attributes `dataDir`, `labelDir` and
                `destDir`. `dataDir` is joined onto the module-level `root`;
                the other two are used exactly as given.
    """
    corpusDataPath = os.path.join(root, args.dataDir)
    outputPath = args.destDir
    rawLabelPath = args.labelDir

    # Handed to LabelCombiner unchanged as its third argument.
    thresholdValue = 1

    combiner = LabelCombiner(rawLabelPath, corpusDataPath, thresholdValue)

    print("Combining Labels")
    combiner.combine()

    print("Writing combined labels")
    combiner.write(outputPath)

    # Sanity check: build a CorpusLabel from the directory that was just
    # written and initialize it, so a bad write surfaces immediately.
    print("Attempting to load objects as a test")
    CorpusLabel(outputPath, corpusDataPath).initialize()

    print("Success!")
class Runner(object):
    """Class to run a configured nab benchmark."""

    def __init__(self,
                 dataDir,
                 labelDir,
                 resultsDir,
                 profilesPath,
                 thresholdPath,
                 probationaryPercent=0.15,
                 numCPUs=None):
        """
        @param dataDir              (string)  Directory where all the raw
                                              datasets exist.

        @param labelDir             (string)  Directory where the labels of
                                              the datasets exist.

        @param resultsDir           (string)  Directory holding detector
                                              results. It is handed to the
                                              detection workers, and a
                                              per-detector subdirectory of it
                                              is read by optimize() and
                                              score().

        @param profilesPath         (string)  Path to user profiles
                                              prescribing the username and
                                              the cost matrix.

        @param thresholdPath        (string)  Path to thresholds dictionary
                                              containing the best thresholds
                                              (and their corresponding score)
                                              for a combination of detector
                                              and user profile.

        @param probationaryPercent  (float)   Percent of each dataset which
                                              will be ignored during the
                                              scoring process.

        @param numCPUs              (int)     Number of CPUs to be used for
                                              calls to
                                              multiprocessing.pool.map. It is
                                              passed directly to
                                              multiprocessing.Pool.
        """
        self.dataDir = dataDir
        self.labelDir = labelDir
        self.resultsDir = resultsDir
        self.profilesPath = profilesPath
        self.thresholdPath = thresholdPath
        self.probationaryPercent = probationaryPercent

        # One pool is shared by detect(), optimize() and score().
        self.pool = multiprocessing.Pool(numCPUs)

        # Populated by initialize().
        self.corpus = None
        self.corpusLabel = None
        self.profiles = None

    def initialize(self):
        """Initialize all the relevant objects for the run."""
        self.corpus = Corpus(self.dataDir)
        self.corpusLabel = CorpusLabel(self.labelDir, None, self.corpus)
        self.corpusLabel.initialize()

        with open(self.profilesPath) as profilesFile:
            # NOTE(review): yaml.load without an explicit Loader can construct
            # arbitrary Python objects and is rejected by recent PyYAML
            # releases. If the profiles file only holds plain mappings,
            # yaml.safe_load is the safer call -- confirm before switching.
            self.profiles = yaml.load(profilesFile)

    def detect(self, detectors):
        """Generate results file given a dictionary of detector classes

        Function that takes a set of detectors and a corpus of data and
        creates a set of files storing the alerts and anomaly scores given by
        the detectors

        @param detectors  (dict)  Dictionary with key value pairs of a
                                  detector name and its corresponding class
                                  constructor.
        """
        print("\nObtaining detections")

        # One job tuple per (detector, data set) pair. The leading element is
        # the job's running index, which equals the number of jobs queued so
        # far.
        jobs = []
        for detectorName, detectorConstructor in detectors.items():
            for relativePath, dataSet in self.corpus.dataSets.items():
                jobs.append((
                    len(jobs),
                    detectorConstructor(
                        dataSet=dataSet,
                        probationaryPercent=self.probationaryPercent),
                    detectorName,
                    self.corpusLabel.labels[relativePath]["label"],
                    self.resultsDir,
                    relativePath,
                ))

        print("calling multiprocessing pool")
        self.pool.map(detectDataSet, jobs)

    def optimize(self, detectorNames):
        """Optimize the threshold for each combination of detector and profile.

        @param detectorNames  (list)  List of detector names.

        @return thresholds    (dict)  Dictionary of dictionaries with
                                      detector names then usernames as keys
                                      followed by another dictionary
                                      containing the score and the threshold
                                      used to obtained that score.
        """
        print("\nOptimizing anomaly Scores")

        thresholds = dict()
        for detector in detectorNames:
            resultsDetectorDir = os.path.join(self.resultsDir, detector)
            resultsCorpus = Corpus(resultsDetectorDir)

            perUser = thresholds[detector] = dict()
            for username, profile in self.profiles.items():
                costMatrix = profile["CostMatrix"]
                perUser[username] = optimizeThreshold(
                    (self.pool,
                     detector,
                     username,
                     costMatrix,
                     resultsCorpus,
                     self.corpusLabel,
                     self.probationaryPercent))

        # Persist the complete table once every detector has been processed.
        updateThresholds(thresholds, self.thresholdPath)

        return thresholds

    def score(self, detectors, thresholds):
        """Score the performance of the detectors.

        Function that must be called only after detection result files have
        been generated and thresholds have been optimized. This looks at the
        result files and scores the performance of each detector specified
        and stores these results in a csv file.

        @param detectors   (iterable)  Detector names. Each name selects a
                                       subdirectory of the results directory
                                       and a key of `thresholds`.

        @param thresholds  (dict)      Dictionary of dictionaries with
                                       detector names then usernames as keys
                                       followed by another dictionary
                                       containing the score and the threshold
                                       used to obtained that score.
        """
        print("\nObtaining Scores")

        columnNames = ("Detector", "Username", "File",
                       "Threshold", "Score", "tp", "tn", "fp", "fn",
                       "Total_Count")

        for detector in detectors:
            ans = pandas.DataFrame(columns=columnNames)

            resultsDetectorDir = os.path.join(self.resultsDir, detector)
            resultsCorpus = Corpus(resultsDetectorDir)

            for username, profile in self.profiles.items():
                costMatrix = profile["CostMatrix"]
                threshold = thresholds[detector][username]["threshold"]

                results = scoreCorpus(threshold,
                                      (self.pool,
                                       detector,
                                       username,
                                       costMatrix,
                                       resultsCorpus,
                                       self.corpusLabel,
                                       self.probationaryPercent))

                # Append each result row at the next free integer label.
                for row in results:
                    ans.loc[len(ans)] = row

            # One csv per detector, written next to that detector's results.
            scorePath = os.path.join(resultsDetectorDir,
                                     detector + "_scores.csv")
            ans.to_csv(scorePath, index=False)