def sendBacklogDataToModel(conn, metricId, logger):
    """Feed the metric's stored (backlog) data rows to its OPF/CLA model.

    Do not call this before starting the model.

    :param conn: SQLAlchemy Connection object for executing SQL
    :type conn: sqlalchemy.engine.Connection
    :param metricId: unique identifier of the metric row
    :param logger: logger object
    """
    columns = [
        schema.metric_data.c.rowid,
        schema.metric_data.c.timestamp,
        schema.metric_data.c.metric_value,
    ]
    storedRows = repository.getMetricData(conn, metricId, fields=columns)

    # Materialize eagerly: the rows are both sent and counted below
    backlogData = tuple(
        model_swapper_interface.ModelInputRow(
            rowID=row.rowid, data=(row.timestamp, row.metric_value))
        for row in storedRows)

    if backlogData:
        with model_swapper_interface.ModelSwapperInterface() as modelSwapper:
            chunkSize = config.getint("metric_streamer", "chunk_size")
            # Profile when configured to, or whenever debug logging is on
            wantProfiling = (config.getboolean("debugging", "profiling")
                             or logger.isEnabledFor(logging.DEBUG))
            model_data_feeder.sendInputRowsToModel(
                modelId=metricId,
                inputRows=backlogData,
                batchSize=chunkSize,
                modelSwapper=modelSwapper,
                logger=logger,
                profiling=wantProfiling)

    logger.info("sendBacklogDataToModel: sent %d backlog data rows to model=%s",
                len(backlogData), metricId)
def _getTailMetricRowTimestamp(self, conn, metricID, lastDataRowID):
    """Look up the timestamp of the metric's tail (most recent) data row.

    The answer is served from an in-memory per-metric cache when possible;
    the whole cache is dropped once the GC interval has elapsed.

    :param sqlalchemy.engine.Connection conn: A sqlalchemy connection object
    :param metricID: unique metric id
    :param lastDataRowID: last metric data row identifier for metric with
      given metric id
    :returns: timestamp of the last metric data row that *we* stored in
      metric_data table for the given metric id, or None if none have been
      stored
    :rtype: datetime.datetime or None

    TODO: unit-test
    """
    secondsSinceGC = (
        time.time() - self._lastTailInputMetricDataTimestampsGCTime)
    if secondsSinceGC > self._TAIL_INPUT_TIMESTAMP_GC_INTERVAL_SEC:
        # Garbage-collect our cache
        # TODO: unit-test
        self._tailInputMetricDataTimestamps.clear()
        self._lastTailInputMetricDataTimestampsGCTime = time.time()
        self._log.info("Garbage-collected tailInputMetricDataTimestamps cache")

    cache = self._tailInputMetricDataTimestamps

    # Cache hit: no database access needed
    if metricID in cache:
        return cache[metricID]

    # Cache miss: load the row from the db
    rows = repository.getMetricData(conn, metricID, rowid=lastDataRowID)
    if rows.rowcount <= 0 or not rows.returns_rows:
        return None

    tailTimestamp = next(iter(rows)).timestamp
    cache[metricID] = tailTimestamp
    return tailTimestamp
def checkModelResultsDeleted(self, uid):
    """Assert that every data row of metric `uid` has no model results.

    :param uid: uid of the metric whose metric_data rows are checked
    """
    resultColumns = ("raw_anomaly_score", "anomaly_score", "display_value")

    engine = repository.engineFactory(config=self.__config)
    with engine.begin() as conn:
        rows = repository.getMetricData(
            conn,
            metricId=uid,
            sort=schema.metric_data.c.timestamp.desc())

        for row in rows:
            # Each model-result column must have been reset to NULL
            for columnName in resultColumns:
                self.assertIsNone(getattr(row, columnName))
def exportModel(self, metricId):
    """Build the model-export specification for the given metric.

    The spec is the metric's stored (JSON) parameters plus a "data" member
    holding the metric's (metric_value, timestamp) rows selected with
    fromTimestamp set to 14 days before the current UTC time.

    :param metricId: datasource-specific unique metric identifier

    :returns: model-export specification for HTM model
    :rtype: dict

    ::

        {
          "datasource": "custom",

          "metricSpec": {
            "metric": "prod.web.14.memory",
            "unit": "Count" # optional
          },

          # Optional model params
          "modelParams": {
            "min": min-value,  # optional
            "max": max-value  # optional
          },

          "data": [[value, datetime.datetime], ...]  # optional
        }

    :raises htmengine.exceptions.ObjectNotFoundError: if metric with the
      referenced metric uid doesn't exist
    :raises TypeError: if the metric's datasource is not this adapter's
      datasource
    """
    exportedColumns = [
        schema.metric_data.c.metric_value,
        schema.metric_data.c.timestamp,
    ]

    with self.connectionFactory() as conn:
        fetchMetric = repository.retryOnTransientErrors(repository.getMetric)
        metricObj = fetchMetric(conn, metricId)

        if metricObj.datasource != self._DATASOURCE:
            raise TypeError("exportModel: not an HTM metric=%r" % (metricObj, ))

        oldestExported = (
            datetime.datetime.utcnow() - datetime.timedelta(days=14))
        data = repository.getMetricData(
            conn,
            metricId,
            fields=exportedColumns,
            fromTimestamp=oldestExported)

        # Pull the rows out while the connection is still open
        exportedRows = list(data)

    modelSpec = htmengine.utils.jsonDecode(metricObj.parameters)
    modelSpec["data"] = exportedRows
    return modelSpec
def checkModelResultsDeleted(self, uid):
    """Verify that the model results of metric `uid` were wiped.

    Fails the test if any metric_data row of the metric still carries a raw
    anomaly score, an anomaly score or a display value.

    :param uid: uid of the metric to check
    """
    newestFirst = schema.metric_data.c.timestamp.desc()
    engine = repository.engineFactory(config=self.__config)

    with engine.begin() as conn:
        metricDataRows = repository.getMetricData(
            conn, metricId=uid, sort=newestFirst)

        for metricData in metricDataRows:
            self.assertIsNone(metricData.raw_anomaly_score)
            self.assertIsNone(metricData.anomaly_score)
            self.assertIsNone(metricData.display_value)
def exportModel(self, metricId):
    """Export the given model as a model-export specification.

    :param metricId: datasource-specific unique metric identifier

    :returns: model-export specification for HTM model; the metric's decoded
      parameters with the metric's recent rows added under "data"
      (rows are requested with fromTimestamp = UTC now minus 14 days)
    :rtype: dict

    ::

        {
          "datasource": "custom",

          "metricSpec": {
            "metric": "prod.web.14.memory",
            "unit": "Count" # optional
          },

          # Optional model params
          "modelParams": {
            "min": min-value,  # optional
            "max": max-value  # optional
          },

          "data": [[value, datetime.datetime], ...]  # optional
        }

    :raises htmengine.exceptions.ObjectNotFoundError: if metric with the
      referenced metric uid doesn't exist
    :raises TypeError: if the metric does not belong to this adapter's
      datasource
    """
    with self.connectionFactory() as conn:
        getMetricWithRetries = repository.retryOnTransientErrors(
            repository.getMetric)
        metricObj = getMetricWithRetries(conn, metricId)

        isOurMetric = metricObj.datasource == self._DATASOURCE
        if not isOurMetric:
            raise TypeError("exportModel: not an HTM metric=%r" % (metricObj,))

        windowStart = datetime.datetime.utcnow() - datetime.timedelta(days=14)
        data = repository.getMetricData(
            conn,
            metricId,
            fields=[schema.metric_data.c.metric_value,
                    schema.metric_data.c.timestamp],
            fromTimestamp=windowStart)

        # Read the rows before the connection is released
        rows = list(data)

    modelSpec = htmengine.utils.jsonDecode(metricObj.parameters)
    modelSpec["data"] = rows
    return modelSpec
def checkModelResults(self, uid, expectedResults):
    """Assert that the stored results of metric `uid` equal expectedResults.

    Rows are read newest-first and compared, in order, against
    expectedResults.

    :param uid: uid of the metric to check
    :param expectedResults: sequence of expected rows, each a sequence of
      (timestamp formatted as "%Y-%m-%d %H:%M:%S", metric_value,
      anomaly_score, rowid)
    """
    engine = repository.engineFactory(config=self.__config)

    with engine.begin() as conn:
        rows = repository.getMetricData(
            conn,
            metricId=uid,
            sort=schema.metric_data.c.timestamp.desc())

        self.assertEqual(rows.rowcount, len(expectedResults))

        for row, expected in zip(rows, expectedResults):
            actual = [
                row.timestamp.strftime("%Y-%m-%d %H:%M:%S"),
                row.metric_value,
                row.anomaly_score,
                row.rowid,
            ]
            self.assertSequenceEqual(actual, expected)
def checkModelResults(self, uid, expectedResults):
    """Check that the results for metric uid match expectedResults.

    The metric's rows are fetched in descending timestamp order; the row
    count must equal len(expectedResults) and every row must match its
    expected counterpart.

    :param uid: uid of the metric to check
    :param expectedResults: expected rows, each laid out as
      [formatted timestamp, metric_value, anomaly_score, rowid]
    """
    timestampFormat = "%Y-%m-%d %H:%M:%S"
    engine = repository.engineFactory(config=self.__config)

    with engine.begin() as conn:
        newestFirst = schema.metric_data.c.timestamp.desc()
        storedRows = repository.getMetricData(
            conn, metricId=uid, sort=newestFirst)

        self.assertEqual(storedRows.rowcount, len(expectedResults))

        for stored, expected in zip(storedRows, expectedResults):
            self.assertSequenceEqual(
                [stored.timestamp.strftime(timestampFormat),
                 stored.metric_value,
                 stored.anomaly_score,
                 stored.rowid],
                expected)
def _getTailMetricRowTimestamp(self, conn, metricID, lastDataRowID):
    """Return the timestamp of the last data row stored for a metric.

    A per-metric cache is consulted first and the database only on a miss.
    The cache is cleared wholesale whenever more than
    _TAIL_INPUT_TIMESTAMP_GC_INTERVAL_SEC seconds passed since the last
    clearing.

    :param sqlalchemy.engine.Connection conn: A sqlalchemy connection object
    :param metricID: unique metric id
    :param lastDataRowID: last metric data row identifier for metric with
      given metric id
    :returns: timestamp of the last metric data row that *we* stored in
      metric_data table for the given metric id, or None if none have been
      stored
    :rtype: datetime.datetime or None

    TODO: unit-test
    """
    gcDue = (time.time() - self._lastTailInputMetricDataTimestampsGCTime
             > self._TAIL_INPUT_TIMESTAMP_GC_INTERVAL_SEC)
    if gcDue:
        # Garbage-collect our cache
        # TODO: unit-test
        self._tailInputMetricDataTimestamps.clear()
        self._lastTailInputMetricDataTimestampsGCTime = time.time()
        self._log.info(
            "Garbage-collected tailInputMetricDataTimestamps cache")

    knownTimestamps = self._tailInputMetricDataTimestamps
    if metricID in knownTimestamps:
        # Served from cache
        return knownTimestamps[metricID]

    # Not in cache, so try to load it from db
    result = repository.getMetricData(conn, metricID, rowid=lastDataRowID)
    foundRow = result.rowcount > 0 and result.returns_rows
    if not foundRow:
        return None

    loaded = next(iter(result)).timestamp
    knownTimestamps[metricID] = loaded
    return loaded
def checkModelResultsSize(self, uid, size, atLeast=False):
    """Assert on the number of metric_data rows stored for metric `uid`.

    NOTE: the original documentation states that this check is not
    compatible with ManagedTempRepository since it makes an HTTP request that
    may be outside the temp repository process tree.
    NOTE(review): no HTTP request is visible in this method itself - confirm
    whether that caveat still applies.

    :param uid: the uid of the metric to check results for
    :param size: the expected number of results
    :param atLeast: if True, checks for at least that many results; if False,
      checks for exact match of the result count; defaults to False
    """
    engine = repository.engineFactory(config=self.__config)

    with engine.begin() as conn:
        rows = repository.getMetricData(conn, metricId=uid)

        # Pick the comparison once: lower bound vs. exact count
        checkCount = self.assertGreaterEqual if atLeast else self.assertEqual
        checkCount(rows.rowcount, size)

        for row in rows:
            self.assertIsNotNone(row)
def getModelResults(self, uid, resultCount):
    """Fetch the anomaly-result rows of a metric from the MySQL db.

    Rows are requested newest-first with ``score=0.0`` passed through to
    repository.getMetricData (presumably a lower bound on the anomaly score -
    TODO confirm against the repository API). The number of rows returned is
    asserted to equal resultCount.

    :param uid: uid of metric
    :param resultCount: number of rows expected
    :return: List of tuples containing timestamp, metric_value,
      anomaly_score, and rowid
    """
    engine = repository.engineFactory(config=self.__config)

    metricDataColumns = schema.metric_data.c
    fields = (
        metricDataColumns.timestamp,
        metricDataColumns.metric_value,
        metricDataColumns.anomaly_score,
        metricDataColumns.rowid,
    )

    with engine.begin() as conn:
        result = repository.getMetricData(
            conn,
            metricId=uid,
            fields=fields,
            sort=schema.metric_data.c.timestamp.desc(),
            score=0.0)

        self.assertEqual(result.rowcount, resultCount)
        return result.fetchall()
def getModelResults(self, uid, resultCount):
    """Queries MySQL db and returns rows with anomaly results.

    Asserts that exactly resultCount rows come back. The query passes
    ``score=0.0`` to repository.getMetricData (NOTE(review): looks like a
    minimum anomaly score filter - confirm) and sorts by descending
    timestamp.

    :param uid: uid of metric
    :param resultCount: number of rows expected
    :return: List of tuples containing timestamp, metric_value,
      anomaly_score, and rowid
    """
    engine = repository.engineFactory(config=self.__config)
    fields = tuple(
        getattr(schema.metric_data.c, columnName)
        for columnName in ("timestamp", "metric_value", "anomaly_score",
                           "rowid"))

    with engine.begin() as conn:
        newestFirst = schema.metric_data.c.timestamp.desc()
        rows = repository.getMetricData(
            conn, metricId=uid, fields=fields, sort=newestFirst, score=0.0)

        self.assertEqual(rows.rowcount, resultCount)
        fetched = rows.fetchall()

    return fetched
def _processModelInferenceResults(self, inferenceResults, metricID):
    """Process a batch of model inference results.

    Store the updated MetricData and anomaly likelihood parameters in the
    database.

    The batch is rejected (None is returned) when the metric is unknown, when
    it is not in ACTIVE state, when no metric_data rows match the batch's
    rowID range, when scrubbing the batch raises
    RejectedInferenceResultBatch, or when the final database update raises
    ObjectNotFoundError or MetricNotActiveError.

    A row's anomaly_score value will be set to and remain at 0 in the first
    self._statisticsMinSampleSize rows; once we get enough inference results
    to create an anomaly likelihood model, anomaly_score will be computed on
    the subsequent rows.

    :param inferenceResults: a sequence of ModelInferenceResult instances in
      the processed order (ascending by timestamp)

    :param metricID: metric/model ID of the model that emitted the results

    :returns: None if the batch was rejected; otherwise a pair:
      (metric, metricDataRows)
        metric: Metric RowProxy instance corresponding to the given metricID
        metricDataRows: a sequence of MutableMetricDataRow instances
          corresponding to the updated metric_data rows.
      TODO: unit-test return value
    :rtype: None or tuple

    *NOTE:*
      the processing must be idempotent due to the "at least once" delivery
      semantics of the message bus

    *NOTE:*
      the performance goal is to minimize costly database access and avoid
      falling behind while processing model results, especially during the
      model's initial "catch-up" phase when large inference result batches
      are prevalent.
    """
    engine = repository.engineFactory(config)

    # Validate model ID
    try:
        with engine.connect() as conn:
            metricObj = repository.getMetric(conn, metricID)
    except ObjectNotFoundError:
        # Ignore inferences for unknown models. Typically, this is the result
        # of a deleted model. Another scenario where this might occur is when
        # a developer resets the db while there are result messages still on
        # the message bus. It would be an error if this were to occur in
        # production environment.
        self._log.warning("Received inference results for unknown model=%s; "
                          "(model deleted?)", metricID, exc_info=True)
        return None

    # Reject the results if model is in non-ACTIVE state (e.g., if HTM Metric
    # was unmonitored after the results were generated)
    if metricObj.status != MetricStatus.ACTIVE:
        self._log.warning("Received inference results for a non-ACTIVE "
                          "model=%s; metric=<%s>; (metric unmonitored?)",
                          metricID, getMetricLogPrefix(metricObj))
        return None

    # Load the MetricData instances corresponding to the results; the batch's
    # first and last rowIDs delimit the requested range
    with engine.connect() as conn:
        metricDataRows = repository.getMetricData(conn, metricID, start=inferenceResults[0].rowID, stop=inferenceResults[-1].rowID)

        # metricDataRows must be mutable, as the data is massaged in
        # _scrubInferenceResultsAndInitMetricData()
        metricDataRows = list(metricDataRows)

    if not metricDataRows:
        self._log.error("Rejected inference result batch=[%s..%s] of model=%s "
                        "due to no matching metric_data rows",
                        inferenceResults[0].rowID, inferenceResults[-1].rowID,
                        metricID)
        return None

    try:
        self._scrubInferenceResultsAndInitMetricData(
            engine=engine,
            inferenceResults=inferenceResults,
            metricDataRows=metricDataRows,
            metricObj=metricObj)
    except RejectedInferenceResultBatch as e:
        # TODO: unit-test
        self._log.error(
            "Rejected inference result batch=[%s..%s] corresponding to "
            "rows=[%s..%s] of model=%s due to error=%r",
            inferenceResults[0].rowID, inferenceResults[-1].rowID,
            metricDataRows[0].rowid, metricDataRows[-1].rowid, metricID, e)
        return None

    # Update anomaly scores based on the new results
    anomalyLikelihoodParams = (
        self.likelihoodHelper.updateModelAnomalyScores(
            engine=engine,
            metricObj=metricObj,
            metricDataRows=metricDataRows))

    # Update metric data rows with rescaled display values
    # NOTE: doing this outside the updateColumns loop to avoid holding row
    # locks any longer than necessary
    for metricData in metricDataRows:
        metricData.display_value = rescaleForDisplay(
            metricData.anomaly_score,
            active=(metricObj.status == MetricStatus.ACTIVE))

    # Update database once via transaction!
    startTime = time.time()
    try:
        # The whole write is wrapped in one function so that the retry
        # decorator re-runs the complete transaction, not a part of it
        @retryOnTransientErrors
        def runSQL(engine):
            with engine.begin() as conn:
                for metricData in metricDataRows:
                    fields = {"raw_anomaly_score": metricData.raw_anomaly_score,
                              "anomaly_score": metricData.anomaly_score,
                              "display_value": metricData.display_value,
                              "multi_step_best_predictions": json.dumps(metricData.multi_step_best_predictions)}
                    repository.updateMetricDataColumns(conn, metricData, fields)

                # Persist the likelihood params in the same transaction as
                # the row updates
                self._updateAnomalyLikelihoodParams(
                    conn,
                    metricObj.uid,
                    metricObj.model_params,
                    anomalyLikelihoodParams)

        runSQL(engine)
    except (ObjectNotFoundError, MetricNotActiveError):
        self._log.warning("Rejected inference result batch=[%s..%s] of model=%s",
                          inferenceResults[0].rowID, inferenceResults[-1].rowID,
                          metricID, exc_info=True)
        return None

    self._log.debug("Updated HTM metric_data rows=[%s..%s] "
                    "of model=%s: duration=%ss",
                    metricDataRows[0].rowid, metricDataRows[-1].rowid,
                    metricID, time.time() - startTime)

    return (metricObj, metricDataRows,)
def handler(environ, start_response):
    """WSGI application: create models, accept samples and report results.

    The request path (``PATH_INFO``) is used verbatim as the custom metric
    name; the request method selects the operation:

    * ``PUT`` - start monitoring the metric. The request body must be JSON;
      its members are merged into the model spec's ``modelParams``. Answers
      ``201 Created``, or ``400 Bad Request`` if the body cannot be used or
      the metric is already monitored.
    * ``POST`` - send samples. Every line of the request body has the form
      ``"<value> <epoch-timestamp>"`` (single space separator).
    * ``GET`` - stream one line per stored row, ascending by timestamp:
      ``"<metric> <value> <epoch-timestamp> <anomaly-score>"``.

    Any other method is answered with ``405 Method Not Allowed``.

    :param environ: WSGI environment dict
    :param start_response: WSGI ``start_response`` callable
    :returns: generator yielding the chunks of the response body
    """
    metricName = environ["PATH_INFO"]
    method = environ["REQUEST_METHOD"]
    htmlHeaders = [("Content-Type", "text/html")]

    if method == "PUT":
        # Trigger model creation...
        modelSpec = {"datasource": "custom",
                     "metricSpec": {"metric": metricName},
                     "modelParams": {}}
        try:
            modelSpec["modelParams"].update(json.load(environ["wsgi.input"]))
        except Exception as e:  # deliberately broad: any unusable body is a 400
            print(e)
            start_response("400 Bad Request", htmlHeaders)
            yield "Unable to parse request"
            # Stop here. Falling through would create the model despite the
            # error and call start_response a second time.
            return

        adapter = createDatasourceAdapter(modelSpec["datasource"])
        try:
            modelId = adapter.monitorMetric(modelSpec)
        except MetricAlreadyMonitored:
            start_response("400 Bad Request", htmlHeaders)
            yield "Model already exists for %s" % metricName
        else:
            start_response("201 Created", htmlHeaders)
            yield "Created %s\n" % modelId

    elif method == "POST":
        # Send data...
        start_response("200 OK", htmlHeaders)
        for sample in environ["wsgi.input"]:
            rawValue, rawTimestamp = sample.split(" ")
            value = float(rawValue)
            epochTimestamp = int(rawTimestamp)
            sendSample(bus,
                       metricName=metricName,
                       value=value,
                       epochTimestamp=epochTimestamp)
            yield "Saved %s %f @ %d\n" % (metricName, value, epochTimestamp)

    elif method == "GET":
        with repository.engineFactory(appConfig).connect() as conn:
            fields = (schema.metric_data.c.metric_value,
                      schema.metric_data.c.timestamp,
                      schema.metric_data.c.rowid,
                      schema.metric_data.c.anomaly_score)
            sort = schema.metric_data.c.timestamp.asc()

            metricObj = repository.getCustomMetricByName(
                conn, metricName, fields=[schema.metric.c.uid])
            result = repository.getMetricData(conn,
                                              metricId=metricObj.uid,
                                              fields=fields,
                                              sort=sort)

            start_response("200 OK", htmlHeaders)

            # Rows are streamed while the connection is still open
            for row in result:
                yield " ".join((
                    metricName,
                    str(row.metric_value),
                    str(calendar.timegm(row.timestamp.timetuple())),
                    str(row.anomaly_score))) + "\n"

    else:
        # Without this branch no response would be started at all
        start_response("405 Method Not Allowed",
                       htmlHeaders + [("Allow", "GET, POST, PUT")])
        yield "Unsupported method %s" % method
def handler(environ, start_response):
    """WSGI application exposing custom metrics over HTTP.

    ``PATH_INFO`` is taken as the metric name. Supported request methods:

    * ``PUT`` - create a model for the metric. The JSON request body is
      merged into ``modelParams`` of the model spec. Responds with
      ``201 Created`` on success and ``400 Bad Request`` when the body
      cannot be used or the metric is already monitored.
    * ``POST`` - submit samples, one ``"<value> <epoch-timestamp>"`` line
      each; every accepted sample is acknowledged with a ``Saved ...`` line.
    * ``GET`` - return the stored rows ordered by ascending timestamp, one
      ``"<metric> <value> <epoch-timestamp> <anomaly-score>"`` line each.

    Requests with any other method get ``405 Method Not Allowed``.

    :param environ: WSGI environment dict
    :param start_response: WSGI ``start_response`` callable
    :returns: generator yielding the chunks of the response body
    """
    metricName = environ["PATH_INFO"]
    requestMethod = environ["REQUEST_METHOD"]
    headers = [("Content-Type", "text/html")]

    if requestMethod == "PUT":
        # Trigger model creation...
        modelSpec = {
            "datasource": "custom",
            "metricSpec": {
                "metric": metricName
            },
            "modelParams": {}
        }

        try:
            modelSpec["modelParams"].update(json.load(environ["wsgi.input"]))
        except Exception:  # deliberately broad: any unusable body is a 400
            start_response("400 Bad Request", headers)
            yield "Unable to parse request"
            # The request is finished: do not go on to create the model and
            # start a second response.
            return

        adapter = createDatasourceAdapter(modelSpec["datasource"])
        try:
            modelId = adapter.monitorMetric(modelSpec)
        except MetricAlreadyMonitored:
            start_response("400 Bad Request", headers)
            yield "Model already exists for %s" % metricName
        else:
            start_response("201 Created", headers)
            yield "Created %s\n" % modelId

    elif requestMethod == "POST":
        # Send data...
        start_response("200 OK", headers)

        for sample in environ["wsgi.input"]:
            valueText, timestampText = sample.split(" ")
            # Convert once; the same numbers are sent and echoed back
            value = float(valueText)
            timestamp = int(timestampText)
            sendSample(bus,
                       metricName=metricName,
                       value=value,
                       epochTimestamp=timestamp)
            yield "Saved %s %f @ %d\n" % (metricName, value, timestamp)

    elif requestMethod == "GET":
        with repository.engineFactory(appConfig).connect() as conn:
            fields = (schema.metric_data.c.metric_value,
                      schema.metric_data.c.timestamp,
                      schema.metric_data.c.rowid,
                      schema.metric_data.c.anomaly_score)
            sort = schema.metric_data.c.timestamp.asc()

            metricObj = repository.getCustomMetricByName(
                conn, metricName, fields=[schema.metric.c.uid])
            result = repository.getMetricData(conn,
                                              metricId=metricObj.uid,
                                              fields=fields,
                                              sort=sort)

            start_response("200 OK", headers)

            # Stream the rows while the connection is still open
            for row in result:
                epochSeconds = calendar.timegm(row.timestamp.timetuple())
                yield " ".join((metricName,
                                str(row.metric_value),
                                str(epochSeconds),
                                str(row.anomaly_score))) + "\n"

    else:
        # Every request must get a response, including unsupported methods
        start_response("405 Method Not Allowed",
                       headers + [("Allow", "GET, POST, PUT")])
        yield "Unsupported method %s" % requestMethod
def _processModelInferenceResults(self, inferenceResults, metricID):
    """Process a batch of model inference results.

    Store the updated MetricData and anomaly likelihood parameters in the
    database.

    None is returned (the batch is rejected) when the metric cannot be found,
    when it is not ACTIVE, when no metric_data rows fall into the batch's
    rowID range, when scrubbing raises RejectedInferenceResultBatch, or when
    the database update raises ObjectNotFoundError or MetricNotActiveError.

    A row's anomaly_score value will be set to and remain at 0 in the first
    self._statisticsMinSampleSize rows; once we get enough inference results
    to create an anomaly likelihood model, anomaly_score will be computed on
    the subsequent rows.

    :param inferenceResults: a sequence of ModelInferenceResult instances in
      the processed order (ascending by timestamp)

    :param metricID: metric/model ID of the model that emitted the results

    :returns: None if the batch was rejected; otherwise a pair:
      (metric, metricDataRows)
        metric: Metric RowProxy instance corresponding to the given metricID
        metricDataRows: a sequence of MutableMetricDataRow instances
          corresponding to the updated metric_data rows.
      TODO: unit-test return value
    :rtype: None or tuple

    *NOTE:*
      the processing must be idempotent due to the "at least once" delivery
      semantics of the message bus

    *NOTE:*
      the performance goal is to minimize costly database access and avoid
      falling behind while processing model results, especially during the
      model's initial "catch-up" phase when large inference result batches
      are prevalent.
    """
    engine = repository.engineFactory(config)

    # Validate model ID
    try:
        with engine.connect() as conn:
            metricObj = repository.getMetric(conn, metricID)
    except ObjectNotFoundError:
        # Ignore inferences for unknown models. Typically, this is the result
        # of a deleted model. Another scenario where this might occur is when
        # a developer resets db while there are result messages still on the
        # message bus. It would be an error if this were to occur in
        # production environment.
        self._log.warning(
            "Received inference results for unknown model=%s; "
            "(model deleted?)", metricID, exc_info=True)
        return None

    # Reject the results if model is in non-ACTIVE state (e.g., if HTM Metric
    # was unmonitored after the results were generated)
    if metricObj.status != MetricStatus.ACTIVE:
        self._log.warning(
            "Received inference results for a non-ACTIVE "
            "model=%s; metric=<%s>; (metric unmonitored?)", metricID,
            getMetricLogPrefix(metricObj))
        return None

    # Load the MetricData instances corresponding to the results; the rowIDs
    # of the first and last inference result bound the requested range
    with engine.connect() as conn:
        metricDataRows = repository.getMetricData(
            conn,
            metricID,
            start=inferenceResults[0].rowID,
            stop=inferenceResults[-1].rowID)

        # metricDataRows must be mutable, as the data is massaged in
        # _scrubInferenceResultsAndInitMetricData()
        metricDataRows = list(metricDataRows)

    if not metricDataRows:
        self._log.error(
            "Rejected inference result batch=[%s..%s] of model=%s "
            "due to no matching metric_data rows", inferenceResults[0].rowID,
            inferenceResults[-1].rowID, metricID)
        return None

    try:
        self._scrubInferenceResultsAndInitMetricData(
            engine=engine,
            inferenceResults=inferenceResults,
            metricDataRows=metricDataRows,
            metricObj=metricObj)
    except RejectedInferenceResultBatch as e:
        # TODO: unit-test
        self._log.error(
            "Rejected inference result batch=[%s..%s] corresponding to "
            "rows=[%s..%s] of model=%s due to error=%r",
            inferenceResults[0].rowID, inferenceResults[-1].rowID,
            metricDataRows[0].rowid, metricDataRows[-1].rowid, metricID, e)
        return None

    # Update anomaly scores based on the new results
    anomalyLikelihoodParams = (
        self.likelihoodHelper.updateModelAnomalyScores(
            engine=engine,
            metricObj=metricObj,
            metricDataRows=metricDataRows))

    # Update metric data rows with rescaled display values
    # NOTE: doing this outside the updateColumns loop to avoid holding row
    # locks any longer than necessary
    for metricData in metricDataRows:
        metricData.display_value = rescaleForDisplay(
            metricData.anomaly_score,
            active=(metricObj.status == MetricStatus.ACTIVE))

    # Update database once via transaction!
    startTime = time.time()
    try:
        # Wrapping the complete write in one function lets the retry
        # decorator re-run the whole transaction
        @retryOnTransientErrors
        def runSQL(engine):
            with engine.begin() as conn:
                for metricData in metricDataRows:
                    fields = {
                        "raw_anomaly_score": metricData.raw_anomaly_score,
                        "anomaly_score": metricData.anomaly_score,
                        "display_value": metricData.display_value
                    }
                    repository.updateMetricDataColumns(
                        conn, metricData, fields)

                # Likelihood params are written in the same transaction as
                # the row updates
                self._updateAnomalyLikelihoodParams(
                    conn,
                    metricObj.uid,
                    metricObj.model_params,
                    anomalyLikelihoodParams)

        runSQL(engine)
    except (ObjectNotFoundError, MetricNotActiveError):
        self._log.warning(
            "Rejected inference result batch=[%s..%s] of model=%s",
            inferenceResults[0].rowID, inferenceResults[-1].rowID, metricID,
            exc_info=True)
        return None

    self._log.debug(
        "Updated HTM metric_data rows=[%s..%s] "
        "of model=%s: duration=%ss", metricDataRows[0].rowid,
        metricDataRows[-1].rowid, metricID, time.time() - startTime)

    return (
        metricObj,
        metricDataRows,
    )