def run(self):
    """Run the model: ingest the input metric rows and emit output messages
    containing anomaly scores.

    Rows are read from ``self._csvReader``. The first
    ``self._inputSpec["rowOffset"]`` rows are skipped as header rows. A
    remaining row is also skipped when it is too short to contain both the
    timestamp and the value column, or when ``na.isNA`` flags either of those
    two cells. Every other row is parsed into ``[timestamp, value]`` and fed
    to ``self._aggregator``; each non-None row the aggregator returns is
    emitted via ``self._emitOutputMessage`` together with the result of
    ``self._computeAnomalyProbability`` for that row.

    After the input is exhausted the aggregator is called one final time with
    ``None`` so that any row it is still holding is emitted as well.
    """
    numRowsToSkip = self._inputSpec["rowOffset"]
    datetimeFormat = self._inputSpec["datetimeFormat"]
    inputRowTimestampIndex = self._inputSpec["timestampIndex"]
    inputRowValueIndex = self._inputSpec["valueIndex"]

    # Both columns are indexed below, so a usable row has to reach the larger
    # of the two indexes. Checking only the value index would raise
    # IndexError on short rows when the timestamp column comes after the
    # value column.
    minRowLength = max(inputRowTimestampIndex, inputRowValueIndex) + 1

    g_log.info("Processing model=%s", self._modelId)

    for inputRow in self._csvReader:
        g_log.debug("Got inputRow=%r", inputRow)

        if numRowsToSkip > 0:
            numRowsToSkip -= 1
            g_log.debug("Skipping header row %s; %s rows left to skip",
                        inputRow, numRowsToSkip)
            continue

        # Ignore rows that cannot hold both the timestamp and the value
        if len(inputRow) < minRowLength:
            continue

        rawTimestamp = inputRow[inputRowTimestampIndex]
        rawValue = inputRow[inputRowValueIndex]

        # Ignore rows with a missing value or a missing timestamp
        if na.isNA(str(rawValue)) or na.isNA(str(rawTimestamp)):
            continue

        # Extract timestamp and value
        # NOTE: the order must match the `inputFields` that we passed to the
        # Aggregator constructor
        fields = [
            date_time_utils.parseDatetime(rawTimestamp, datetimeFormat),
            float(rawValue),
        ]

        # Aggregate
        aggRow, _ = self._aggregator.next(fields, None)
        g_log.debug("Aggregator returned %s for %s", aggRow, fields)
        if aggRow is not None:
            self._emitOutputMessage(
                dataRow=aggRow,
                anomalyProbability=self._computeAnomalyProbability(aggRow))

    # Reap remaining data from aggregator
    aggRow, _ = self._aggregator.next(None, curInputBookmark=None)
    g_log.debug("Aggregator reaped %s in final call", aggRow)
    if aggRow is not None:
        self._emitOutputMessage(
            dataRow=aggRow,
            anomalyProbability=self._computeAnomalyProbability(aggRow))
# Example #2
# 0
def _readCSVFile(fileName, rowOffset, timestampIndex, valueIndex,
                 datetimeFormat):
    """Read (timestamp, value) samples from a csv data file.

    The data file must have a column that contains time stamps and a column
    that contains data values. Rows that are too short to contain both
    columns, or whose timestamp or value cell is flagged by ``na.isNA``, are
    skipped. Reading stops as soon as ``MAX_NUM_ROWS`` samples have been
    collected.

    :param str fileName: path to input csv file
    :param int rowOffset: index of first data row in csv; this many leading
      rows are skipped (a file with fewer rows yields an empty result)
    :param int timestampIndex: column index of the timestamp
    :param int valueIndex: column index of the value
    :param str datetimeFormat: datetime format string for python's
      datetime.strptime
    :returns: list of two-tuples (timestamp, value), where timestamp is the
      timezone-aware result of ``date_time_utils.parseDatetime`` (UTC is
      assumed when the parsed timestamp carries no timezone) and value is a
      float
    """
    # Both columns are indexed below, so a usable row has to reach the larger
    # of the two indexes; checking only valueIndex would raise IndexError on
    # short rows when the timestamp column comes after the value column.
    minRowLength = max(timestampIndex, valueIndex) + 1

    samples = []

    # NOTE(review): "rU" is the Python 2 universal-newlines mode; Python 3.11
    # removed the "U" flag - confirm the target interpreter before changing it.
    with open(fileName, "rU") as csvFile:
        fileReader = _createCsvReader(csvFile)

        # Skip header lines. The default argument keeps a file that is
        # shorter than rowOffset from raising StopIteration.
        for _ in range(rowOffset):
            next(fileReader, None)

        for row in fileReader:
            if len(row) < minRowLength:
                continue

            if (na.isNA(str(row[valueIndex]))
                    or na.isNA(str(row[timestampIndex]))):
                continue

            timestamp = date_time_utils.parseDatetime(
                row[timestampIndex], datetimeFormat)

            # use utc timezone if timezone information is not provided
            if timestamp.tzinfo is None:
                timestamp = timestamp.replace(tzinfo=tz.tzutc())

            samples.append((timestamp, float(row[valueIndex])))

            if len(samples) >= MAX_NUM_ROWS:
                break

    return samples
def _readCSVFile(fileName, rowOffset, timestampIndex, valueIndex, datetimeFormat):
    """Load the (timestamp, value) samples stored in a csv data file.

    The data file must have a column that contains time stamps and a column
    that contains data values. A row contributes a sample only when it is
    long enough to contain both columns and neither cell is flagged by
    ``na.isNA``. At most ``MAX_NUM_ROWS`` samples are collected before
    reading stops.

    :param str fileName: path to input csv file
    :param int rowOffset: index of first data row in csv; this many leading
      rows are skipped (a file with fewer rows yields an empty result)
    :param int timestampIndex: column index of the timestamp
    :param int valueIndex: column index of the value
    :param str datetimeFormat: datetime format string for python's
      datetime.strptime
    :returns: list of two-tuples (timestamp, value), where timestamp is the
      timezone-aware result of ``date_time_utils.parseDatetime`` (UTC is
      assumed when the parsed timestamp carries no timezone) and value is a
      float
    """
    # A usable row must reach the larger of the two column indexes. Guarding
    # on valueIndex alone would raise IndexError on short rows whenever
    # timestampIndex > valueIndex.
    requiredColumns = 1 + max(timestampIndex, valueIndex)

    # NOTE(review): "rU" is the Python 2 universal-newlines mode; Python 3.11
    # removed the "U" flag - confirm the target interpreter before changing it.
    with open(fileName, "rU") as csvFile:
        fileReader = _createCsvReader(csvFile)

        # Skip header lines; next() with a default does not raise
        # StopIteration when the file has fewer than rowOffset rows.
        for _ in range(rowOffset):
            next(fileReader, None)

        samples = []
        for row in fileReader:
            hasBothColumns = len(row) >= requiredColumns
            if not hasBothColumns:
                continue

            rawValue = row[valueIndex]
            rawTimestamp = row[timestampIndex]
            if na.isNA(str(rawValue)) or na.isNA(str(rawTimestamp)):
                continue

            timestamp = date_time_utils.parseDatetime(rawTimestamp, datetimeFormat)

            # use utc timezone if timezone information is not provided
            if timestamp.tzinfo is None:
                timestamp = timestamp.replace(tzinfo=tz.tzutc())

            samples.append((timestamp, float(rawValue)))

            # Stop once the sample cap has been reached
            if len(samples) >= MAX_NUM_ROWS:
                break

        return samples