def testAboveMaxTimestampRaisesValueError(self):
  with self.assertRaises(ValueError) as errorCtx:
    date_time_utils.parseDatetime(
      str((date_time_utils._MAX_UNIX_SECONDS + 1) * 1000), "#t")

  self.assertIn("Unable to parse", errorCtx.exception.args[0])
def testBadFormatNotationRaisesException(self):
  with self.assertRaises(ValueError) as excCtx:
    date_time_utils.parseDatetime("01-29-2016 11:01:59.01 AM",
                                  "%m-%d-%Y %I:%M:%S.%f %W")

  self.assertEqual(
    excCtx.exception.args[0],
    "time data '01-29-2016 11:01:59.01 AM' does not match format "
    "'%m-%d-%Y %I:%M:%S.%f %W'")
def _getTrueAnomalyLabels(self, name):
  labelFileRelativePath = os.path.join("data", "anomaly_labels_nab.json")
  with resource_stream(__name__, labelFileRelativePath) as infile:
    nabLabels = json.load(infile)

  targetAnomalies = []
  for anomaly in nabLabels[name]:
    targetAnomalies.append(
      date_time_utils.parseDatetime(anomaly, "%Y-%m-%d %H:%M:%S"))

  return targetAnomalies
def run(self):
  """ Run the model: ingest and process the input metric data and emit output
  messages containing anomaly scores
  """
  numRowsToSkip = self._inputSpec["rowOffset"]
  datetimeFormat = self._inputSpec["datetimeFormat"]
  inputRowTimestampIndex = self._inputSpec["timestampIndex"]
  inputRowValueIndex = self._inputSpec["valueIndex"]

  g_log.info("Processing model=%s", self._modelId)

  for inputRow in self._csvReader:
    g_log.debug("Got inputRow=%r", inputRow)

    if numRowsToSkip > 0:
      numRowsToSkip -= 1
      g_log.debug("Skipping header row %s; %s rows left to skip",
                  inputRow, numRowsToSkip)
      continue

    if len(inputRow) > inputRowValueIndex:
      if not (na.isNA(str(inputRow[inputRowValueIndex])) or
              na.isNA(str(inputRow[inputRowTimestampIndex]))):
        # Extract timestamp and value
        # NOTE: the order must match the `inputFields` that we passed to the
        # Aggregator constructor
        fields = [
          date_time_utils.parseDatetime(inputRow[inputRowTimestampIndex],
                                        datetimeFormat),
          float(inputRow[inputRowValueIndex])
        ]

        # Aggregate
        aggRow, _ = self._aggregator.next(fields, None)
        g_log.debug("Aggregator returned %s for %s", aggRow, fields)
        if aggRow is not None:
          self._emitOutputMessage(
            dataRow=aggRow,
            anomalyProbability=self._computeAnomalyProbability(aggRow))

  # Reap remaining data from aggregator
  aggRow, _ = self._aggregator.next(None, curInputBookmark=None)
  g_log.debug("Aggregator reaped %s in final call", aggRow)
  if aggRow is not None:
    self._emitOutputMessage(
      dataRow=aggRow,
      anomalyProbability=self._computeAnomalyProbability(aggRow))
def run(self):
  """ Run the model: ingest and process the input metric data and emit output
  messages containing anomaly scores
  """
  numRowsToSkip = self._inputSpec["rowOffset"]
  datetimeFormat = self._inputSpec["datetimeFormat"]
  inputRowTimestampIndex = self._inputSpec["timestampIndex"]
  inputRowValueIndex = self._inputSpec["valueIndex"]

  g_log.info("Processing model=%s", self._modelId)

  for inputRow in self._csvReader:
    g_log.debug("Got inputRow=%r", inputRow)

    if numRowsToSkip > 0:
      numRowsToSkip -= 1
      g_log.debug("Skipping header row %s; %s rows left to skip",
                  inputRow, numRowsToSkip)
      continue

    # Extract timestamp and value
    # NOTE: the order must match the `inputFields` that we passed to the
    # Aggregator constructor
    fields = [
      date_time_utils.parseDatetime(inputRow[inputRowTimestampIndex],
                                    datetimeFormat),
      float(inputRow[inputRowValueIndex])
    ]

    # Aggregate
    aggRow, _ = self._aggregator.next(fields, None)
    g_log.debug("Aggregator returned %s for %s", aggRow, fields)
    if aggRow is not None:
      self._emitOutputMessage(
        dataRow=aggRow,
        anomalyProbability=self._computeAnomalyProbability(aggRow))

  # Reap remaining data from aggregator
  aggRow, _ = self._aggregator.next(None, curInputBookmark=None)
  g_log.debug("Aggregator reaped %s in final call", aggRow)
  if aggRow is not None:
    self._emitOutputMessage(
      dataRow=aggRow,
      anomalyProbability=self._computeAnomalyProbability(aggRow))
def _readCSVFile(fileName, rowOffset, timestampIndex, valueIndex,
                 datetimeFormat):
  """ Read csv data file; the file must have two columns that contain
  timestamps and data values

  :param str fileName: path to input csv file
  :param int rowOffset: index of first data row in csv
  :param int timestampIndex: column index of the timestamp
  :param int valueIndex: column index of the value
  :param str datetimeFormat: datetime format string for python's
    datetime.strptime
  :returns: Sequence of two-tuples (timestamp, value), where timestamp is of
    type datetime.datetime and value is a number (int or float)
  """
  with open(fileName, "rU") as csvFile:
    fileReader = _createCsvReader(csvFile)
    for _ in xrange(rowOffset):
      fileReader.next()  # skip header line

    samples = []
    numRows = 0
    for row in fileReader:
      timestamp = date_time_utils.parseDatetime(row[timestampIndex],
                                                datetimeFormat)
      # use utc timezone if timezone information is not provided
      if timestamp.tzinfo is None:
        timestamp = timestamp.replace(tzinfo=tz.tzutc())

      samples.append((timestamp, float(row[valueIndex])))
      numRows += 1
      if numRows >= MAX_NUM_ROWS:
        break

    return samples
def _readCSVFile(fileName, rowOffset, timestampIndex, valueIndex,
                 datetimeFormat):
  """ Read csv data file; the file must have two columns that contain
  timestamps and data values

  :param str fileName: path to input csv file
  :param int rowOffset: index of first data row in csv
  :param int timestampIndex: column index of the timestamp
  :param int valueIndex: column index of the value
  :param str datetimeFormat: datetime format string for python's
    datetime.strptime
  :returns: Sequence of two-tuples (timestamp, value), where timestamp is of
    type datetime.datetime and value is a number (int or float)
  """
  with open(fileName, "rU") as csvFile:
    fileReader = _createCsvReader(csvFile)
    for _ in xrange(rowOffset):
      fileReader.next()  # skip header line

    samples = []
    numRows = 0
    for row in fileReader:
      if len(row) > valueIndex:
        if not (na.isNA(str(row[valueIndex])) or
                na.isNA(str(row[timestampIndex]))):
          timestamp = date_time_utils.parseDatetime(row[timestampIndex],
                                                    datetimeFormat)
          # use utc timezone if timezone information is not provided
          if timestamp.tzinfo is None:
            timestamp = timestamp.replace(tzinfo=tz.tzutc())

          samples.append((timestamp, float(row[valueIndex])))
          numRows += 1
          if numRows >= MAX_NUM_ROWS:
            break

    return samples
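# For context, a hypothetical call to _readCSVFile might look like the
# following. The file name, column indices, and format string here are
# illustrative assumptions, not values taken from the source.
samples = _readCSVFile("metric_data.csv",
                       rowOffset=1,  # skip a single header row
                       timestampIndex=0,
                       valueIndex=1,
                       datetimeFormat="%Y-%m-%d %H:%M:%S")
for timestamp, value in samples[:5]:
  print timestamp.isoformat(), value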
"windowSize": 3000} modelSpec = {"timestampFieldName": "timestamp", "valueFieldName": "value"} dataAggregator = initializeAggregator(aggSpec, modelSpec) timeStampRaw = [] timeStampAgg = [] valueRaw = [] valueAgg = [] sliceEndTime = [] for inputRow in inputFile.readlines(): inputRow = inputRow.split(',') fields = [ date_time_utils.parseDatetime(inputRow[0], '%m/%d/%y %H:%M'), float(inputRow[1]) ] aggRow, _ = dataAggregator.next(fields, None) timeStampRaw.append(fields[0]) valueRaw.append(fields[1]) if aggRow is not None: sliceEndTime.append(dataAggregator._endTime) timeStampAgg.append(aggRow[0]) valueAgg.append(aggRow[1]) fig = plt.figure() plt.plot(timeStampRaw, valueRaw, '.') plt.plot(timeStampAgg, valueAgg, 'r+') yl = plt.ylim()
def testNonNumericTimestampRaisesValueError(self):
  with self.assertRaises(ValueError) as errorCtx:
    date_time_utils.parseDatetime("xyz", "#t")

  self.assertIn("xyz", errorCtx.exception.args[0])
def testBadTimezoneRaisesException(self):
  with self.assertRaises(ValueError) as excCtx:
    date_time_utils.parseDatetime("2016-01-29T23:00:00.123+000",
                                  "%Y-%m-%dT%H:%M:%S.%f%z")
  self.assertEqual(
    excCtx.exception.args[0],
    "time data '2016-01-29T23:00:00.123+000' does not match format "
    "'%Y-%m-%dT%H:%M:%S.%f%z'")

  with self.assertRaises(ValueError) as excCtx:
    date_time_utils.parseDatetime("2016-01-29T23:00:00.123+00:60",
                                  "%Y-%m-%dT%H:%M:%S.%f%z")
  self.assertEqual(
    excCtx.exception.args[0],
    "time data '2016-01-29T23:00:00.123+00:60' does not match format "
    "'%Y-%m-%dT%H:%M:%S.%f%z': UTC offset minutes exceed 59")

  with self.assertRaises(ValueError) as excCtx:
    date_time_utils.parseDatetime("2016-01-29T23:00:00.123+25:00",
                                  "%Y-%m-%dT%H:%M:%S.%f%z")
  self.assertEqual(
    excCtx.exception.args[0],
    "time data '2016-01-29T23:00:00.123+25:00' does not match format "
    "'%Y-%m-%dT%H:%M:%S.%f%z': UTC offset +25:0 is out of bounds; must be in "
    "-24:59 .. +24:59")

  with self.assertRaises(ValueError) as excCtx:
    date_time_utils.parseDatetime("2016-01-29T23:00:00.123+00:0",
                                  "%Y-%m-%dT%H:%M:%S.%f%z")
  self.assertEqual(
    excCtx.exception.args[0],
    "time data '2016-01-29T23:00:00.123+00:0' does not match format "
    "'%Y-%m-%dT%H:%M:%S.%f%z'")

  with self.assertRaises(ValueError) as excCtx:
    date_time_utils.parseDatetime("2016-01-29T23:00:00.123+0",
                                  "%Y-%m-%dT%H:%M:%S.%f%z")
  self.assertEqual(
    excCtx.exception.args[0],
    "time data '2016-01-29T23:00:00.123+0' does not match format "
    "'%Y-%m-%dT%H:%M:%S.%f%z'")

  with self.assertRaises(ValueError) as excCtx:
    date_time_utils.parseDatetime("2016-01-29T23:00:00.123+:00",
                                  "%Y-%m-%dT%H:%M:%S.%f%z")
  self.assertEqual(
    excCtx.exception.args[0],
    "time data '2016-01-29T23:00:00.123+:00' does not match format "
    "'%Y-%m-%dT%H:%M:%S.%f%z'")

  with self.assertRaises(ValueError) as excCtx:
    date_time_utils.parseDatetime("2016-01-29T23:00:00.123+:",
                                  "%Y-%m-%dT%H:%M:%S.%f%z")
  self.assertEqual(
    excCtx.exception.args[0],
    "time data '2016-01-29T23:00:00.123+:' does not match format "
    "'%Y-%m-%dT%H:%M:%S.%f%z'")

  with self.assertRaises(ValueError) as excCtx:
    date_time_utils.parseDatetime("2016-01-29T23:00:00.123+",
                                  "%Y-%m-%dT%H:%M:%S.%f%z")
  self.assertEqual(
    excCtx.exception.args[0],
    "time data '2016-01-29T23:00:00.123+' does not match format "
    "'%Y-%m-%dT%H:%M:%S.%f%z'")
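# A minimal sketch of the UTC-offset validation the test above exercises,
# inferred from the expected error messages; this is an assumption about the
# behavior, not the actual date_time_utils implementation. An offset must be
# +/-HHMM or +/-HH:MM, the minutes must not exceed 59, and the whole offset
# must lie in -24:59 .. +24:59.
import re

_UTC_OFFSET_RE = re.compile(r"^([+-])(\d{2}):?(\d{2})$")

def validateUtcOffset(offsetText):
  match = _UTC_OFFSET_RE.match(offsetText)
  if match is None:
    # Malformed offsets such as "+000", "+00:0", "+:", or "+" get the generic
    # "does not match format" error
    raise ValueError("time data does not match format")

  sign = match.group(1)
  hours, minutes = int(match.group(2)), int(match.group(3))
  if minutes > 59:
    raise ValueError("UTC offset minutes exceed 59")
  if hours > 24:
    raise ValueError("UTC offset %s%d:%d is out of bounds; must be in "
                     "-24:59 .. +24:59" % (sign, hours, minutes))
  return sign, hours, minutes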
def _testModelRunner(self, name, inputSpec, aggSpec, modelSpec):
  """ Make sure model runner returns correct anomaly likelihood

  :param str name: dataset name
  :param str inputSpec: JSON object describing the input metric data per
    input_opt_schema.json
  :param str aggSpec: JSON object describing aggregation of the input metric
    per agg_opt_schema.json
  :param str modelSpec: JSON object describing the model per
    model_opt_schema.json
  """
  with self._startModelRunnerSubprocess(
      inputSpec, aggSpec, modelSpec) as mrProcess:
    stdoutData, stderrData = mrProcess.communicate()
    out = stdoutData.splitlines()
    self.assertEqual(stderrData, "")

    results = self._loadCsvFile(
      os.path.join(RESULTS_DIR, "numenta_" + name + ".csv"))
    inputSpec = json.loads(inputSpec)

    trueResultTimestamp = []
    trueAnomalyLikelihood = []
    for trueResult in results:
      timestamp = date_time_utils.parseDatetime(trueResult[0],
                                                inputSpec["datetimeFormat"])
      trueResultTimestamp.append(timestamp)
      trueAnomalyLikelihood.append(float(trueResult[2]))

    dataDuration = trueResultTimestamp[-1] - trueResultTimestamp[0]
    duration = dataDuration.total_seconds()
    anomalyWindow = duration * 0.1
    probationaryPeriod = (trueResultTimestamp[0] +
                          datetime.timedelta(seconds=duration * 0.2))

    nabDetection = self._convertAnomalyScoresToDetections(
      trueResultTimestamp, trueAnomalyLikelihood, ANOMALY_THRESH)

    computedTimestamp = []
    computedAnomalyLikelihood = []
    for computedResult in out:
      outputRecord = json.loads(computedResult)
      timestamp = date_time_utils.parseDatetime(outputRecord[0],
                                                "%Y-%m-%dT%H:%M:%S")
      computedTimestamp.append(timestamp)
      computedAnomalyLikelihood.append(float(outputRecord[2]))

    computedDetection = self._convertAnomalyScoresToDetections(
      computedTimestamp, computedAnomalyLikelihood, ANOMALY_THRESH)

    nabLabels = self._getTrueAnomalyLabels(name)

    numTruePositiveComputed = self._checkForTruePositives(
      nabLabels, computedDetection, anomalyWindow)
    numTruePositiveNAB = self._checkForTruePositives(
      nabLabels, nabDetection, anomalyWindow)

    numFalsePositiveComputed = self._checkForFalsePositives(
      nabLabels, computedDetection, anomalyWindow, probationaryPeriod)
    numFalsePositiveNAB = self._checkForFalsePositives(
      nabLabels, nabDetection, anomalyWindow, probationaryPeriod)

    self.assertGreaterEqual(numTruePositiveComputed, numTruePositiveNAB)
    self.assertLessEqual(numFalsePositiveComputed, numFalsePositiveNAB)

  self.assertEqual(mrProcess.returncode, 0)
def testNegativeTimestampRaisesValueError(self):
  with self.assertRaises(ValueError) as errorCtx:
    date_time_utils.parseDatetime("-1465257536142.103", "#t")

  self.assertIn("Expected non-negative Unix Timestamp",
                errorCtx.exception.args[0])
def testGoodSamples(self):
  # Check for duplicate test cases
  self.assertEqual(
    len(self._GOOD_SAMPLES),
    len(set(self._GOOD_SAMPLES)),
    msg="There are duplicate test cases: {}".format(
      set(item for item in self._GOOD_SAMPLES
          if self._GOOD_SAMPLES.count(item) > 1)))

  # Verify the parser
  testedFormatSet = set()
  for fmt, timestamp, expectedIso in self._GOOD_SAMPLES:
    testedFormatSet.add(fmt)

    try:
      parsed = date_time_utils.parseDatetime(timestamp, fmt)
    except (TypeError, ValueError) as exc:
      self.fail(
        "Failed to parse ts={!r} using fmt={!r}; exc={!r}".format(
          timestamp, fmt, exc))

    try:
      isoEncoded = parsed.isoformat()
    except ValueError as exc:
      self.fail(
        "Failed to isoformat parsed datetime={!r}; ts={!r} using fmt={!r}; "
        "exc={!r}".format(parsed, timestamp, fmt, exc))

    self.assertEqual(
      isoEncoded,
      expectedIso,
      msg=("ISO result {!r} didn't match expected {!r}; ts={!r} using fmt={!r}"
           .format(isoEncoded, expectedIso, timestamp, fmt)))

  # Make sure all timestamp formats from
  # unicorn/app/config/momentjs_to_datetime_strptime.json are covered by our
  # test cases
  mappingsPath = os.path.join(os.path.abspath(os.path.dirname(__file__)),
                              os.path.pardir, os.path.pardir, os.path.pardir,
                              os.path.pardir, "app", "config",
                              "momentjs_to_datetime_strptime.json")
  with open(mappingsPath) as mappingsFile:
    mapList = json.load(mappingsFile)

  formatsToCategoryMap = dict()
  for bundle in mapList:
    for fmt in bundle["mappings"].itervalues():
      if fmt not in formatsToCategoryMap:
        formatsToCategoryMap[fmt] = bundle["category"]

  self.assertGreater(len(formatsToCategoryMap), 0)
  self.assertGreater(len(testedFormatSet), 0)

  untestedFormats = set(formatsToCategoryMap) - testedFormatSet
  self.assertFalse(
    untestedFormats,
    msg="{} format(s) not covered by GOOD SAMPLES test cases: {}".format(
      len(untestedFormats),
      [(fmt, formatsToCategoryMap[fmt]) for fmt in untestedFormats]))
def testMaxTimestampDoesNotRaise(self):  # pylint: disable=R0201
  date_time_utils.parseDatetime(
    str(date_time_utils._MAX_UNIX_SECONDS * 1000), "#t")
def testZeroTimestamp(self):
  result = date_time_utils.parseDatetime("0", "#t")
  self.assertEqual(result, datetime(1970, 1, 1, 0, 0))
def testNegativeTimestampRaisesValueError(self):
  with self.assertRaises(ValueError) as errorCtx:
    date_time_utils.parseDatetime("-5", "#T")

  self.assertIn("Expected non-negative Unix Timestamp",
                errorCtx.exception.args[0])
aggSpec = {"func": "mean", "windowSize": 3000} modelSpec = {"timestampFieldName": "timestamp", "valueFieldName": "value"} dataAggregator = initializeAggregator(aggSpec, modelSpec) timeStampRaw = [] timeStampAgg = [] valueRaw = [] valueAgg = [] sliceEndTime = [] for inputRow in inputFile.readlines(): inputRow = inputRow.split(',') fields = [ date_time_utils.parseDatetime(inputRow[0], '%m/%d/%y %H:%M'), float(inputRow[1]) ] aggRow, _ = dataAggregator.next(fields, None) timeStampRaw.append(fields[0]) valueRaw.append(fields[1]) if aggRow is not None: sliceEndTime.append(dataAggregator._endTime) timeStampAgg.append(aggRow[0]) valueAgg.append(aggRow[1]) fig = plt.figure() plt.plot(timeStampRaw, valueRaw, '.') plt.plot(timeStampAgg, valueAgg, 'r+') yl = plt.ylim()
def testNonNumericTimestampRaisesValueError(self):
  with self.assertRaises(ValueError) as errorCtx:
    date_time_utils.parseDatetime("xyz", "#T")

  self.assertIn("xyz", errorCtx.exception.args[0])
def testPositiveFloatingPointTimestamp(self):
  result = date_time_utils.parseDatetime("1465257536142.103", "#t")
  self.assertEqual(result, datetime(2016, 6, 6, 23, 58, 56, 142103))
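# Taken together, the "#t" cases above imply that this format code treats its
# input as a non-negative Unix timestamp in milliseconds (note the
# _MAX_UNIX_SECONDS * 1000 bound). A minimal sketch of that arithmetic, as an
# assumption rather than the actual date_time_utils implementation:
from datetime import datetime, timedelta

_EPOCH = datetime(1970, 1, 1)

def parseUnixMillis(text):
  millis = float(text)
  if millis < 0:
    raise ValueError("Expected non-negative Unix Timestamp")
  return _EPOCH + timedelta(milliseconds=millis)

# parseUnixMillis("0")             -> datetime(1970, 1, 1, 0, 0)
# parseUnixMillis("1465257536000") -> datetime(2016, 6, 6, 23, 58, 56)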