def run(self):
  """ Run the model: ingest and process the input metric data and emit output
  messages containing anomaly scores
  """
  numRowsToSkip = self._inputSpec["rowOffset"]
  datetimeFormat = self._inputSpec["datetimeFormat"]
  inputRowTimestampIndex = self._inputSpec["timestampIndex"]
  inputRowValueIndex = self._inputSpec["valueIndex"]

  g_log.info("Processing model=%s", self._modelId)

  for inputRow in self._csvReader:
    g_log.debug("Got inputRow=%r", inputRow)

    if numRowsToSkip > 0:
      numRowsToSkip -= 1
      g_log.debug("Skipping header row %s; %s rows left to skip",
                  inputRow, numRowsToSkip)
      continue

    if len(inputRow) > inputRowValueIndex:
      if not (na.isNA(str(inputRow[inputRowValueIndex])) or
              na.isNA(str(inputRow[inputRowTimestampIndex]))):
        # Extract timestamp and value
        # NOTE: the order must match the `inputFields` that we passed to the
        # Aggregator constructor
        fields = [
          date_time_utils.parseDatetime(inputRow[inputRowTimestampIndex],
                                        datetimeFormat),
          float(inputRow[inputRowValueIndex])
        ]

        # Aggregate
        aggRow, _ = self._aggregator.next(fields, None)
        g_log.debug("Aggregator returned %s for %s", aggRow, fields)
        if aggRow is not None:
          self._emitOutputMessage(
            dataRow=aggRow,
            anomalyProbability=self._computeAnomalyProbability(aggRow))

  # Reap remaining data from aggregator
  aggRow, _ = self._aggregator.next(None, curInputBookmark=None)
  g_log.debug("Aggregator reaped %s in final call", aggRow)
  if aggRow is not None:
    self._emitOutputMessage(
      dataRow=aggRow,
      anomalyProbability=self._computeAnomalyProbability(aggRow))
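# --- Illustrative sketch (not part of the original module) ---
# A minimal, self-contained sketch of the aggregate-then-reap control flow
# that run() relies on: each record is offered to the aggregator, which
# returns an aggregated row whenever a window closes, and a final call with
# fields=None flushes whatever remains. ToyAggregator is a hypothetical
# stand-in for the real Aggregator, not its actual implementation.

class ToyAggregator(object):
  """Averages values over fixed-size windows of `windowSize` records."""

  def __init__(self, windowSize):
    self._windowSize = windowSize
    self._buffer = []

  def next(self, fields, curInputBookmark):
    if fields is not None:
      self._buffer.append(fields)
      if len(self._buffer) < self._windowSize:
        return None, curInputBookmark  # window still open
    if not self._buffer:
      return None, curInputBookmark  # nothing left to reap
    # Emit (first timestamp, mean value) for the completed window
    timestamps = [ts for ts, _ in self._buffer]
    values = [value for _, value in self._buffer]
    self._buffer = []
    return (timestamps[0], sum(values) / len(values)), curInputBookmark


def _demoAggregateThenReap():
  aggregator = ToyAggregator(windowSize=2)
  for record in [("t0", 1.0), ("t1", 3.0), ("t2", 5.0)]:
    aggRow, _ = aggregator.next(list(record), None)
    if aggRow is not None:
      print(aggRow)  # ('t0', 2.0)
  # Reap remaining data, mirroring the final call in run()
  aggRow, _ = aggregator.next(None, curInputBookmark=None)
  if aggRow is not None:
    print(aggRow)  # ('t2', 5.0)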
def _readCSVFile(fileName,
                 rowOffset,
                 timestampIndex,
                 valueIndex,
                 datetimeFormat):
  """ Read csv data file; the data file must have two columns that contain
  timestamps and data values

  :param str fileName: path to input csv file
  :param int rowOffset: index of first data row in csv
  :param int timestampIndex: column index of the timestamp
  :param int valueIndex: column index of the value
  :param str datetimeFormat: datetime format string for python's
    datetime.strptime

  :returns: Sequence of two-tuples (timestamp, value), where timestamp is of
    type datetime.datetime and value is a number (int or float)
  """
  with open(fileName, "rU") as csvFile:
    fileReader = _createCsvReader(csvFile)
    for _ in xrange(rowOffset):
      fileReader.next()  # skip header row(s)

    samples = []
    numRows = 0
    for row in fileReader:
      if len(row) > valueIndex:
        if not (na.isNA(str(row[valueIndex])) or
                na.isNA(str(row[timestampIndex]))):
          timestamp = date_time_utils.parseDatetime(row[timestampIndex],
                                                    datetimeFormat)

          # use utc timezone if timezone information is not provided
          if timestamp.tzinfo is None:
            timestamp = timestamp.replace(tzinfo=tz.tzutc())

          samples.append((timestamp, float(row[valueIndex])))
          numRows += 1
          if numRows >= MAX_NUM_ROWS:
            break

    return samples
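# --- Illustrative sketch (not part of the original module) ---
# A minimal, self-contained sketch of the parse-and-default-to-UTC pattern
# used by _readCSVFile above, built on the csv/datetime stdlib modules plus
# dateutil.tz. The format-string default and the one-header-row assumption
# are illustrative; it parses directly with datetime.strptime and uses a
# crude emptiness check in place of na.isNA.

import csv
from datetime import datetime

from dateutil import tz


def readSamplesSketch(fileName, datetimeFormat="%Y-%m-%d %H:%M:%S"):
  """Yield (timestamp, value) two-tuples from a two-column csv with a header."""
  with open(fileName, "rU") as csvFile:
    reader = csv.reader(csvFile)
    next(reader)  # skip the single header row
    for row in reader:
      if len(row) < 2 or not row[0].strip() or not row[1].strip():
        continue  # crude NA handling for the sketch
      timestamp = datetime.strptime(row[0], datetimeFormat)
      # Mirror _readCSVFile: treat naive timestamps as UTC
      if timestamp.tzinfo is None:
        timestamp = timestamp.replace(tzinfo=tz.tzutc())
      yield timestamp, float(row[1])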