Esempio n. 1
0
def main(args):
  """Combine labels, write the result, then reload it as a sanity check.

  Builds a LabelCombiner from the label and data directories, runs
  `combine()`, writes the result to the destination directory and finally
  constructs and initializes a CorpusLabel from that destination.

  @param args  Object exposing the attributes `dataDir`, `destDir` and
               `labelDir`. `dataDir` is joined onto the module-level `root`
               (defined outside this function); the other two are used as
               given.
  """
  dataDir = os.path.join(root, args.dataDir)
  destDir = args.destDir
  labelDir = args.labelDir

  # Third positional argument of LabelCombiner; fixed at 1 here.
  # NOTE(review): its exact meaning is defined by LabelCombiner -- confirm.
  threshold = 1

  labelCombiner = LabelCombiner(labelDir, dataDir, threshold)

  # Single-argument print(...) emits the same text under the Python 2 print
  # statement and the Python 3 print function.
  print("Combining Labels")

  labelCombiner.combine()

  print("Writing combined labels")

  labelCombiner.write(destDir)

  print("Attempting to load objects as a test")

  # Loading what was just written is the check; an exception raised here
  # propagates to the caller before "Success!" is printed.
  corpusLabel = CorpusLabel(destDir, dataDir)

  corpusLabel.initialize()

  print("Success!")
Esempio n. 2
0
class Runner(object):
  """Class to run a configured nab benchmark.

  Typical call order, as implied by the methods below: `initialize()` loads
  the corpus, labels and profiles; `detect()` produces detector results;
  `optimize()` picks a threshold per detector/profile pair; `score()` writes
  one score file per detector.
  """

  def __init__(self,
               dataDir,
               labelDir,
               resultsDir,
               profilesPath,
               thresholdPath,
               probationaryPercent=0.15,
               numCPUs=None):
    """
    @param dataDir        (string)  Directory where all the raw datasets exist.

    @param labelDir       (string)  Directory where the labels of the datasets
                                    exist.

    @param resultsDir     (string)  Directory under which each detector has
                                    its own sub-directory of results; score
                                    files are written there as well.

    @param profilesPath   (string)  Path to user profiles prescribing the
                                    username and the cost matrix.

    @param thresholdPath  (string)  Path to thresholds dictionary containing the
                                    best thresholds (and their corresponding
                                    score) for a combination of detector and
                                    user profile.

    @param probationaryPercent (float) Percent of each dataset which will be
                                    ignored during the scoring process.

    @param numCPUs        (int)     Number of CPUs to be used for calls to
                                    multiprocessing.pool.map
    """
    self.dataDir = dataDir
    self.labelDir = labelDir
    self.resultsDir = resultsDir
    self.profilesPath = profilesPath
    self.thresholdPath = thresholdPath

    self.probationaryPercent = probationaryPercent
    # The worker pool is created eagerly, at construction time.
    self.pool = multiprocessing.Pool(numCPUs)

    # Populated by initialize().
    self.corpus = None
    self.corpusLabel = None
    self.profiles = None


  def initialize(self):
    """Initialize all the relevant objects for the run.

    Sets `self.corpus`, `self.corpusLabel` (already initialized) and
    `self.profiles` (the parsed contents of `self.profilesPath`).
    """
    self.corpus = Corpus(self.dataDir)
    self.corpusLabel = CorpusLabel(self.labelDir, None, self.corpus)
    self.corpusLabel.initialize()

    with open(self.profilesPath) as p:
      # NOTE(review): yaml.load without an explicit Loader can construct
      # arbitrary Python objects from the file and is rejected by recent
      # PyYAML releases. If profiles are plain mappings, yaml.safe_load is
      # the safer call -- confirm before switching.
      self.profiles = yaml.load(p)


  def detect(self, detectors):
    """Generate results file given a dictionary of detector classes

    Function that takes a set of detectors and a corpus of data and creates a
    set of files storing the alerts and anomaly scores given by the detectors

    One work item is built per (detector, dataset) pair and the whole list is
    handed to the multiprocessing pool, which calls `detectDataSet` on each.

    @param detectors     (dict)         Dictionary with key value pairs of a
                                        detector name and its corresponding
                                        class constructor.
    """
    print("\nObtaining detections")

    count = 0
    args = []
    # .items() behaves like .iteritems() for iteration on Python 2 and is the
    # only spelling available on Python 3.
    for detectorName, detectorConstructor in detectors.items():
      for relativePath, dataSet in self.corpus.dataSets.items():

        detectorInstance = detectorConstructor(
          dataSet=dataSet,
          probationaryPercent=self.probationaryPercent)
        labels = self.corpusLabel.labels[relativePath]["label"]

        # Tuple layout is the positional contract with detectDataSet; `count`
        # is a running index over all work items.
        args.append(
          (
            count,
            detectorInstance,
            detectorName,
            labels,
            self.resultsDir,
            relativePath
          )
        )

        count += 1

    print("calling multiprocessing pool")
    self.pool.map(detectDataSet, args)


  def optimize(self, detectorNames):
    """Optimize the threshold for each combination of detector and profile.

    The resulting dictionary is also passed to `updateThresholds` together
    with `self.thresholdPath` (presumably to persist it -- confirm against
    that helper).

    @param detectorNames  (list)  List of detector names.

    @return thresholds    (dict)  Dictionary of dictionaries with detector names
                                  then usernames as keys followed by another
                                  dictionary containing the score and the
                                  threshold used to obtained that score.
    """
    print("\nOptimizing anomaly Scores")

    thresholds = dict()

    for detector in detectorNames:
      # Each detector's results are read from <resultsDir>/<detector>.
      resultsDetectorDir = os.path.join(self.resultsDir, detector)
      resultsCorpus = Corpus(resultsDetectorDir)

      thresholds[detector] = dict()

      for username, profile in self.profiles.items():
        costMatrix = profile["CostMatrix"]

        # optimizeThreshold takes a single tuple argument.
        thresholds[detector][username] = optimizeThreshold(
          (self.pool,
           detector,
           username,
           costMatrix,
           resultsCorpus,
           self.corpusLabel,
           self.probationaryPercent))

    updateThresholds(thresholds, self.thresholdPath)

    return thresholds


  def score(self, detectors, thresholds):
    """Score the performance of the detectors.

    Function that must be called only after detection result files have been
    generated and thresholds have been optimized. This looks at the result files
    and scores the performance of each detector specified and stores these
    results in a csv file, one per detector, at
    <resultsDir>/<detector>/<detector>_scores.csv.

    @param detectors      (list)    Detector names. Only iterated, so a dict
                                    keyed by detector name works as well.

    @param thresholds     (dict)    Dictionary of dictionaries with detector
                                    names then usernames as keys followed by
                                    another dictionary containing the score and
                                    the threshold used to obtained that score.
    """
    print("\nObtaining Scores")

    for detector in detectors:
      ans = pandas.DataFrame(columns=("Detector", "Username", "File",
        "Threshold", "Score", "tp", "tn", "fp", "fn", "Total_Count"))

      resultsDetectorDir = os.path.join(self.resultsDir, detector)
      resultsCorpus = Corpus(resultsDetectorDir)

      for username, profile in self.profiles.items():

        costMatrix = profile["CostMatrix"]

        threshold = thresholds[detector][username]["threshold"]

        results = scoreCorpus(threshold,
                              (self.pool,
                               detector,
                               username,
                               costMatrix,
                               resultsCorpus,
                               self.corpusLabel,
                               self.probationaryPercent))

        # Append each result row at the next integer index.
        for row in results:
          ans.loc[len(ans)] = row

      scorePath = os.path.join(resultsDetectorDir, detector + "_scores.csv")
      ans.to_csv(scorePath, index=False)