Example #1
0
  def checkWindows(self):
    """
    This takes the anomaly windows and checks for overlap with both each other
    and with the probationary period. Overlapping windows are merged into a
    single window. Windows overlapping with the probationary period are deleted.
    """
    for relativePath, windows in self.combinedWindows.iteritems():
      numWindows = len(windows)
      if numWindows > 0:

        fileLength = self.corpus.dataFiles[relativePath].data.shape[0]
        probationIndex = getProbationPeriod(
          self.probationaryPercent, fileLength)

        probationTimestamp = self.corpus.dataFiles[relativePath].data[
          "timestamp"][probationIndex]

        # Drop every leading window that starts inside the probationary
        # period; checking only windows[0] once would leave later
        # probation-overlapping windows behind after a deletion.
        while windows and (pandas.to_datetime(windows[0][0])
            -probationTimestamp).total_seconds() < 0:
          del windows[0]
          print ("The first window in {} overlaps with the probationary period "
                 ", so we're deleting it.".format(relativePath))

        i = 0
        while len(windows)-1 > i:
          if (pandas.to_datetime(windows[i+1][0])
              - pandas.to_datetime(windows[i][1])).total_seconds() <= 0:
            # Merge the overlapping pair. Do NOT advance i here: the merged
            # window must be re-checked against the next window, otherwise a
            # chain of 3+ overlapping windows is only partially merged.
            windows[i] = [windows[i][0], windows[i+1][1]]
            del windows[i+1]
          else:
            i += 1
Example #2
0
    def checkWindows(self):
        """
        This takes the anomaly windows and checks for overlap with both each
        other and with the probationary period. Overlapping windows are merged
        into a single window. Windows overlapping with the probationary period
        are deleted.
        """
        for relativePath, windows in self.combinedWindows.iteritems():
            numWindows = len(windows)
            if numWindows > 0:

                fileLength = self.corpus.dataFiles[relativePath].data.shape[0]
                probationIndex = getProbationPeriod(self.probationaryPercent,
                                                    fileLength)

                probationTimestamp = self.corpus.dataFiles[relativePath].data[
                    "timestamp"][probationIndex]

                # Delete every leading window that begins before the end of
                # the probationary period, not just the first one; after a
                # deletion the new first window may overlap as well.
                while windows and (pandas.to_datetime(windows[0][0]) -
                        probationTimestamp).total_seconds() < 0:
                    del windows[0]
                    print(
                        "The first window in {} overlaps with the probationary period "
                        ", so we're deleting it.".format(relativePath))

                i = 0
                while len(windows) - 1 > i:
                    if (pandas.to_datetime(windows[i + 1][0]) -
                            pandas.to_datetime(
                                windows[i][1])).total_seconds() <= 0:
                        # Merge, but do not advance i: the merged window has
                        # to be re-checked against the following window so
                        # that chains of overlapping windows fully collapse.
                        windows[i] = [windows[i][0], windows[i + 1][1]]
                        del windows[i + 1]
                    else:
                        i += 1
Example #3
0
    def __init__(self, dataSet, probationaryPercent):
        """Store the data set and precompute the probation length and the
        observed range of its "value" column."""
        self.dataSet = dataSet

        # Number of leading records excluded from scoring.
        numRecords = dataSet.data.shape[0]
        self.probationaryPeriod = getProbationPeriod(
            probationaryPercent, numRecords)

        # Observed range of the input series.
        values = dataSet.data["value"]
        self.inputMin = values.min()
        self.inputMax = values.max()
 def testGetProbationPeriod(self):
   """getProbationPeriod is capped for long files: both 5000- and
   15000-record files yield an index of 750 at 15%."""
   cases = ((1000, 150), (4032, 604), (5000, 750), (15000, 750))
   for fileLength, expected in cases:
     actual = getProbationPeriod(0.15, fileLength)
     self.assertEqual(expected, actual, "Expected probation index of {} "
       "got {}.".format(expected, actual))
Example #5
0
File: base.py Project: Aleyasen/NAB
  def __init__(self, dataSet, probationaryPercent):
    """Keep a reference to the data set and derive the probation length and
    the min/max of its "value" column."""
    self.dataSet = dataSet

    recordCount = dataSet.data.shape[0]
    self.probationaryPeriod = getProbationPeriod(probationaryPercent,
                                                 recordCount)

    valueColumn = dataSet.data["value"]
    self.inputMin = valueColumn.min()
    self.inputMax = valueColumn.max()
    def __init__(self, dataSet, probationaryPercent):
        """Record the data set, its probation length, and the range of its
        "value" column."""
        self.dataSet = dataSet

        rowCount = dataSet.data.shape[0]
        self.probationaryPeriod = getProbationPeriod(probationaryPercent,
                                                     rowCount)

        series = self.dataSet.data["value"]
        self.inputMin = series.min()
        self.inputMax = series.max()

        # Filled in by detectDataSet() below so detectors can print the
        # worker thread id in their debug output.
        self.threadId = None
Example #7
0
File: base.py Project: rvorias/NAB
  def __init__(self, dataSet, probationaryPercent):
    """Store the data frame plus per-column statistics.

    Raises AssertionError if any numeric column is constant (max == min),
    since a zero range would cause a divide-by-zero during normalization.
    """
    self.dataSet = dataSet
    numRows, numCols = dataSet.shape
    self.cols = numCols
    self.probationaryPeriod = getProbationPeriod(probationaryPercent, numRows)

    # Per-column statistics over the numeric columns only.
    self.inputMin = numpy.asarray(dataSet.min(numeric_only=True))
    self.inputMax = numpy.asarray(dataSet.max(numeric_only=True))
    self.inputMean = numpy.asarray(dataSet.mean(numeric_only=True))
    self.inputStd = numpy.asarray(dataSet.std(numeric_only=True))

    # Reject constant columns up front so detectors never divide by a zero
    # range. NOTE(review): `assert` is stripped under `python -O`; a raised
    # ValueError would be safer -- kept as-is to preserve behavior.
    spread = self.inputMax - self.inputMin
    assert numpy.count_nonzero(spread == 0) == 0, '0 found in diff'
def scoreCorpus(threshold, args):
    """Scores the corpus given a detector's results and a user profile.

  Scores the corpus in parallel.

  @param threshold  (float)   Threshold value to convert an anomaly score value
                              to a detection.

  @param args       (tuple)   Contains:

    pool                (multiprocessing.Pool)  Pool of processes to perform
                                                tasks in parallel.
    detectorName        (string)                Name of detector.

    profileName         (string)                Name of scoring profile.

    costMatrix          (dict)                  Cost matrix to weight the
                                                true positives, false negatives,
                                                and false positives during
                                                scoring.
    resultsDetectorDir  (string)                Directory for the results CSVs.

    resultsCorpus       (nab.Corpus)            Corpus object that holds the per
                                                record anomaly scores for a
                                                given detector.
    corpusLabel         (nab.CorpusLabel)       Ground truth anomaly labels for
                                                the NAB corpus.
    probationaryPercent (float)                 Percent of each data file not
                                                to be considered during scoring.
    scoreFlag           (bool)                  Passed through unchanged to
                                                scoreDataSet() for each file.

  @return (pandas.DataFrame) One row of scoring metrics per data file plus a
                             final "Totals" row summing the numeric columns.
  """
    (pool, detectorName, profileName, costMatrix, resultsDetectorDir,
     resultsCorpus, corpusLabel, probationaryPercent, scoreFlag) = args

    # Build one scoreDataSet() argument tuple per raw data file. This rebinds
    # the `args` parameter, which is not used again.
    args = []
    for relativePath, dataSet in resultsCorpus.dataFiles.iteritems():
        # Skip entries that are themselves score files, not data files.
        if "_scores.csv" in relativePath:
            continue

        # relativePath: raw dataset file,
        # e.g. 'artificialNoAnomaly/art_noisy.csv'
        relativePath = convertResultsPathToDataPath(
            os.path.join(detectorName, relativePath))

        # outputPath: dataset results file,
        # e.g. 'results/detector/artificialNoAnomaly/detector_art_noisy.csv'
        relativeDir, fileName = os.path.split(relativePath)
        fileName = detectorName + "_" + fileName
        outputPath = os.path.join(resultsDetectorDir, relativeDir, fileName)

        windows = corpusLabel.windows[relativePath]
        labels = corpusLabel.labels[relativePath]

        # Number of leading records excluded from scoring for this file.
        probationaryPeriod = getProbationPeriod(probationaryPercent,
                                                labels.shape[0])

        # Convert the per-record anomaly scores into detections at this
        # threshold.
        predicted = convertAnomalyScoresToDetections(
            dataSet.data["anomaly_score"], threshold)

        args.append((detectorName, profileName, relativePath, outputPath,
                     threshold, predicted, windows, labels, costMatrix,
                     probationaryPeriod, scoreFlag))

    # Use `map_async` + `get` instead of `map` so interrupts (e.g.
    # KeyboardInterrupt) are handled properly while waiting on the pool.
    # See: http://stackoverflow.com/a/1408476
    results = pool.map_async(scoreDataSet, args).get(99999999)

    # Total the 6 scoring metrics for all data files. The three None
    # placeholders line up with the Profile, File and Threshold columns of
    # the "Totals" row appended below.
    totals = [None] * 3 + [0] * 6
    for row in results:
        for i in xrange(6):
            totals[i + 3] += row[i + 4]

    results.append(["Totals"] + totals)

    resultsDF = pandas.DataFrame(data=results,
                                 columns=("Detector", "Profile", "File",
                                          "Threshold", "Score", "TP", "TN",
                                          "FP", "FN", "Total_Count"))

    return resultsDF
Example #9
0
def scoreCorpus(threshold, args):
  """Scores the corpus given a detector's results and a user profile.

  Scores the corpus in parallel.

  @param threshold  (float)   Threshold value to convert an anomaly score value
                              to a detection.

  @param args       (tuple)   Contains:

    pool                (multiprocessing.Pool)  Pool of processes to perform
                                                tasks in parallel.
    detectorName        (string)                Name of detector.

    profileName         (string)                Name of scoring profile.

    costMatrix          (dict)                  Cost matrix to weight the
                                                true positives, false negatives,
                                                and false positives during
                                                scoring.
    resultsDetectorDir  (string)                Directory for the results CSVs.

    resultsCorpus       (nab.Corpus)            Corpus object that holds the per
                                                record anomaly scores for a
                                                given detector.
    corpusLabel         (nab.CorpusLabel)       Ground truth anomaly labels for
                                                the NAB corpus.
    probationaryPercent (float)                 Percent of each data file not
                                                to be considered during scoring.
    scoreFlag           (bool)                  Passed through unchanged to
                                                scoreDataSet() for each file.

  @return (pandas.DataFrame) One row of scoring metrics per data file plus a
                             final "Totals" row summing the numeric columns.
  """
  (pool,
   detectorName,
   profileName,
   costMatrix,
   resultsDetectorDir,
   resultsCorpus,
   corpusLabel,
   probationaryPercent,
   scoreFlag) = args

  # Build one scoreDataSet() argument tuple per raw data file. This rebinds
  # the `args` parameter, which is not used again.
  args = []
  for relativePath, dataSet in resultsCorpus.dataFiles.iteritems():
    # Skip entries that are themselves score files, not data files.
    if "_scores.csv" in relativePath:
      continue

    # relativePath: raw dataset file,
    # e.g. 'artificialNoAnomaly/art_noisy.csv'
    relativePath = convertResultsPathToDataPath( \
      os.path.join(detectorName, relativePath))

    # outputPath: dataset results file,
    # e.g. 'results/detector/artificialNoAnomaly/detector_art_noisy.csv'
    relativeDir, fileName = os.path.split(relativePath)
    fileName =  detectorName + "_" + fileName
    outputPath = os.path.join(resultsDetectorDir, relativeDir, fileName)

    windows = corpusLabel.windows[relativePath]
    labels = corpusLabel.labels[relativePath]

    # Number of leading records excluded from scoring for this file.
    probationaryPeriod = getProbationPeriod(
      probationaryPercent, labels.shape[0])

    # Convert the per-record anomaly scores into detections at this threshold.
    predicted = convertAnomalyScoresToDetections(
      dataSet.data["anomaly_score"], threshold)

    args.append((
      detectorName,
      profileName,
      relativePath,
      outputPath,
      threshold,
      predicted,
      windows,
      labels,
      costMatrix,
      probationaryPeriod,
      scoreFlag))

  # Using `map_async` instead of `map` so interrupts are properly handled.
  # See: http://stackoverflow.com/a/1408476
  results = pool.map_async(scoreDataSet, args).get(99999999)

  # Total the 6 scoring metrics for all data files. The three None
  # placeholders line up with the Profile, File and Threshold columns of the
  # "Totals" row appended below; row[4:10] holds Score..Total_Count.
  totals = [None]*3 + [0]*6
  for row in results:
    for i in xrange(6):
      totals[i+3] += row[i+4]

  results.append(["Totals"] + totals)

  resultsDF = pandas.DataFrame(data=results,
                               columns=("Detector", "Profile", "File",
                                        "Threshold", "Score", "TP", "TN",
                                        "FP", "FN", "Total_Count"))

  return resultsDF