class AnomalyService(object):
  """ Anomaly Service for processing CLA model results, calculating Anomaly
  Likelihood scores, and updating the associated metric data records

  Records are processed in batches from
  ``ModelSwapperInterface().consumeResults()`` and the associated
  ``MetricData`` rows are updated with the results of applying
  ``AnomalyLikelihoodHelper().updateModelAnomalyScores()``; finally, the
  results are packaged up as objects compliant with
  ``model_inference_results_msg_schema.json`` and published to the model
  results exchange, as identified by the ``results_exchange_name``
  configuration directive from the ``metric_streamer`` section of ``config``.

  Other services may be subscribed to the model results fanout exchange for
  subsequent (and parallel) processing;
  ``htmengine.runtime.notification_service.NotificationService`` is one
  example of a use case for that exchange. Consumers must deserialize inbound
  messages with ``AnomalyService.deserializeModelResult()``.
  """

  def __init__(self):
    self._log = _getLogger()

    self._profiling = (
      config.getboolean("debugging", "profiling") or
      self._log.isEnabledFor(logging.DEBUG))

    self._modelResultsExchange = (
      config.get("metric_streamer", "results_exchange_name"))

    self._statisticsSampleSize = (
      config.getint("anomaly_likelihood", "statistics_sample_size"))

    self.likelihoodHelper = AnomalyLikelihoodHelper(self._log, config)
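
  # Illustrative sketch (not part of the service): how a downstream consumer
  # of the model results fanout exchange might decode inbound messages. The
  # queue name and the MessageBusConnector-based consumption pattern shown
  # here are assumptions, not an API contract:
  #
  #   with MessageBusConnector() as bus:
  #     with bus.consume("downstream.example.queue") as consumer:
  #       for message in consumer:
  #         modelResults = AnomalyService.deserializeModelResult(message.body)
  #         message.ack()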
self._log.warn("Received command result=%r for unknown model=%s " "(model deleted?)", result, metricID) return if result.status != 0: self._log.error(result.errorMessage) if metricObj.status != MetricStatus.ERROR: self._log.error("Placing model=<%s> in ERROR state due to " "commandResult=%s", getMetricLogPrefix(metricObj), result) with engine.connect() as conn: repository.setMetricStatus(conn, metricID, MetricStatus.ERROR, result.errorMessage) else: # NOTE: could be a race condition between app-layer and Model Swapper # or a side-effect of the at-least-once delivery guarantee self._log.warn("Received command result=%r for metricID=%s of " "metric=<%s> that was already in ERROR state", result, metricID, getMetricLogPrefix(metricObj)) return # Create Model if result.method == "defineModel": self._log.info("Model was created for <%s>", getMetricLogPrefix(metricObj)) if metricObj.status == MetricStatus.CREATE_PENDING: with engine.connect() as conn: repository.setMetricStatus(conn, metricID, MetricStatus.ACTIVE) else: # NOTE: could be a race condition between app-layer and Model Swapper # or a side-effect of the at-least-once delivery guarantee self._log.warn("Received command result=%r for model=%s of metric=<%s> " "that was not in CREATE_PENDING state", result, metricID, getMetricLogPrefix(metricObj)) return self._log.error("Unexpected model result=%r", result) def _processModelInferenceResults(self, inferenceResults, metricID): """ Process a batch of model inference results Store the updated MetricData and anomaly likelihood parameters in the database. A row's anomaly_score value will be set to and remain at 0 in the first self._statisticsMinSampleSize rows; once we get enough inference results to create an anomaly likelihood model, anomaly_score will be computed on the subsequent rows. :param inferenceResults: a sequence of ModelInferenceResult instances in the processed order (ascending by timestamp) :param metricID: metric/model ID of the model that emitted the results :returns: None if the batch was rejected; otherwise a pair: (metric, metricDataRows) metric: Metric RowProxy instance corresponding to the given metricID metricDataRows: a sequence of MutableMetricDataRow instances corresponding to the updated metric_data rows. TODO: unit-test return value :rtype: None or tuple *NOTE:* the processing must be idempotent due to the "at least once" delivery semantics of the message bus *NOTE:* the performance goal is to minimize costly database access and avoid falling behind while processing model results, especially during the model's initial "catch-up" phase when large inference result batches are prevalent. """ engine = repository.engineFactory(config) # Validate model ID try: with engine.connect() as conn: metricObj = repository.getMetric(conn, metricID) except ObjectNotFoundError: # Ignore inferences for unknown models. Typically, this is is the result # of a deleted model. Another scenario where this might occur is when a # developer resets the db while there are result messages still on the # message bus. It would be an error if this were to occur in production # environment. 
self._log.warning("Received inference results for unknown model=%s; " "(model deleted?)", metricID, exc_info=True) return None # Reject the results if model is in non-ACTIVE state (e.g., if HTM Metric # was unmonitored after the results were generated) if metricObj.status != MetricStatus.ACTIVE: self._log.warning("Received inference results for a non-ACTIVE " "model=%s; metric=<%s>; (metric unmonitored?)", metricID, getMetricLogPrefix(metricObj)) return None # Load the MetricData instances corresponding to the results with engine.connect() as conn: metricDataRows = repository.getMetricData(conn, metricID, start=inferenceResults[0].rowID, stop=inferenceResults[-1].rowID) # metricDataRows must be mutable, as the data is massaged in # _scrubInferenceResultsAndInitMetricData() metricDataRows = list(metricDataRows) if not metricDataRows: self._log.error("Rejected inference result batch=[%s..%s] of model=%s " "due to no matching metric_data rows", inferenceResults[0].rowID, inferenceResults[-1].rowID, metricID) return None try: self._scrubInferenceResultsAndInitMetricData( engine=engine, inferenceResults=inferenceResults, metricDataRows=metricDataRows, metricObj=metricObj) except RejectedInferenceResultBatch as e: # TODO: unit-test self._log.error( "Rejected inference result batch=[%s..%s] corresponding to " "rows=[%s..%s] of model=%s due to error=%r", inferenceResults[0].rowID, inferenceResults[-1].rowID, metricDataRows[0].rowid, metricDataRows[-1].rowid, metricID, e) return None # Update anomaly scores based on the new results anomalyLikelihoodParams = ( self.likelihoodHelper.updateModelAnomalyScores( engine=engine, metricObj=metricObj, metricDataRows=metricDataRows)) # Update metric data rows with rescaled display values # NOTE: doing this outside the updateColumns loop to avoid holding row locks # any longer than necessary for metricData in metricDataRows: metricData.display_value = rescaleForDisplay( metricData.anomaly_score, active=(metricObj.status == MetricStatus.ACTIVE)) # Update database once via transaction! startTime = time.time() try: @retryOnTransientErrors def runSQL(engine): with engine.begin() as conn: for metricData in metricDataRows: fields = {"raw_anomaly_score": metricData.raw_anomaly_score, "anomaly_score": metricData.anomaly_score, "display_value": metricData.display_value, "multi_step_best_predictions": json.dumps(metricData.multi_step_best_predictions)} repository.updateMetricDataColumns(conn, metricData, fields) self._updateAnomalyLikelihoodParams( conn, metricObj.uid, metricObj.model_params, anomalyLikelihoodParams) runSQL(engine) except (ObjectNotFoundError, MetricNotActiveError): self._log.warning("Rejected inference result batch=[%s..%s] of model=%s", inferenceResults[0].rowID, inferenceResults[-1].rowID, metricID, exc_info=True) return None self._log.debug("Updated HTM metric_data rows=[%s..%s] " "of model=%s: duration=%ss", metricDataRows[0].rowid, metricDataRows[-1].rowid, metricID, time.time() - startTime) return (metricObj, metricDataRows,) @classmethod def _updateAnomalyLikelihoodParams(cls, conn, metricId, modelParamsJson, likelihoodParams): """Update and save anomaly_params with the given likelihoodParams if the metric is ACTIVE. 

  @classmethod
  def _updateAnomalyLikelihoodParams(cls, conn, metricId, modelParamsJson,
                                     likelihoodParams):
    """Update and save anomaly_params with the given likelihoodParams if the
    metric is ACTIVE.

    :param conn: Transactional SQLAlchemy connection object
    :type conn: sqlalchemy.engine.base.Connection
    :param metricId: Metric uid
    :param modelParamsJson: Model params JSON object (from model_params
      metric column)
    :param likelihoodParams: anomaly likelihood params dict

    :raises: htmengine.exceptions.MetricNotActiveError if metric's status is
      not MetricStatus.ACTIVE
    """
    lockedRow = repository.getMetricWithUpdateLock(
      conn,
      metricId,
      fields=[schema.metric.c.status])

    if lockedRow.status != MetricStatus.ACTIVE:
      raise MetricNotActiveError(
        "_updateAnomalyLikelihoodParams failed because metric=%s is not "
        "ACTIVE; status=%s" % (metricId, lockedRow.status,))

    modelParams = json.loads(modelParamsJson)
    modelParams["anomalyLikelihoodParams"] = likelihoodParams

    repository.updateMetricColumns(conn,
                                   metricId,
                                   {"model_params": json.dumps(modelParams)})

  @classmethod
  def _composeModelInferenceResultsMessage(cls, metricRow, dataRows):
    """ Create a message body for publishing from the result of
    _processModelInferenceResults

    :param metricRow: Metric instance corresponding to the given metricID
    :param dataRows: a sequence of MutableMetricDataRow instances
      corresponding to the updated metric_data rows.
    :returns: JSON-ifiable dict conforming to
      model_inference_results_msg_schema.json
    :rtype: dict
    """
    return dict(
      metric=dict(
        uid=metricRow.uid,
        name=metricRow.name,
        description=metricRow.description,
        resource=metricRow.server,
        location=metricRow.location,
        datasource=metricRow.datasource,
        spec=json.loads(metricRow.parameters)["metricSpec"]),

      results=[
        dict(
          rowid=row.rowid,
          ts=epochFromNaiveUTCDatetime(row.timestamp),
          value=row.metric_value,
          rawAnomaly=row.raw_anomaly_score,
          anomaly=row.anomaly_score,
          multiStepBestPredictions=row.multi_step_best_predictions)
        for row in dataRows])
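
  # Example of a composed inference results message; the keys are those set
  # above (per model_inference_results_msg_schema.json), while the values
  # shown are hypothetical:
  #
  #   {"metric": {"uid": "2a123bb1dd4d46e7",
  #               "name": "cpu_percent",
  #               "description": "CPU usage",
  #               "resource": "server1",
  #               "location": "us-west",
  #               "datasource": "custom",
  #               "spec": {"metric": "cpu_percent"}},
  #    "results": [{"rowid": 111,
  #                 "ts": 1438470557,
  #                 "value": 42.0,
  #                 "rawAnomaly": 0.25,
  #                 "anomaly": 0.1,
  #                 "multiStepBestPredictions": {"1": 42.5}}]}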

  @classmethod
  def _composeModelCommandResultMessage(cls, modelID, cmdResult):
    """ Compose message corresponding to the completion of a model command
    for publishing to downstream services.

    :param modelID: model identifier
    :param model_swapper_interface.ModelCommandResult cmdResult: model
      command result
    :returns: JSON-ifiable message contents object per
      model_command_result_amqp_message.json
    :rtype: dict
    :raises ObjectNotFoundError: when attempting to request additional info
      about a model that is not in the repository
    :raises MetricNotMonitoredError: when required info about a model is not
      available, because it's no longer monitored
    """
    commandResultMessage = dict(
      method=cmdResult.method,
      modelId=modelID,
      commandId=cmdResult.commandID,
      status=cmdResult.status,
      errorMessage=cmdResult.errorMessage)

    if (cmdResult.method == "defineModel" and
        cmdResult.status == htmengineerrno.SUCCESS):
      # Add modelInfo for successfully-completed "defineModel" commands
      engine = repository.engineFactory(config)
      fields = [
        schema.metric.c.name,
        schema.metric.c.server,
        schema.metric.c.parameters
      ]
      try:
        with engine.connect() as conn:
          metricObj = repository.getMetric(conn, modelID, fields=fields)
      except ObjectNotFoundError:
        g_log.warning("_composeModelCommandResultMessage: method=%s; "
                      "model=%s not found", cmdResult.method, modelID)
        raise

      if not metricObj.parameters:
        g_log.warning("_composeModelCommandResultMessage: method=%s; "
                      "model=%s not monitored", cmdResult.method, modelID)
        raise MetricNotMonitoredError

      commandResultMessage["modelInfo"] = dict(
        metricName=metricObj.name,
        resource=metricObj.server,
        modelSpec=json.loads(metricObj.parameters))

    return commandResultMessage
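
  # Example of a composed command result message for a successful
  # "defineModel" (keys per model_command_result_amqp_message.json; values
  # hypothetical):
  #
  #   {"method": "defineModel",
  #    "modelId": "2a123bb1dd4d46e7",
  #    "commandId": "cmd-1",
  #    "status": 0,
  #    "errorMessage": None,
  #    "modelInfo": {"metricName": "cpu_percent",
  #                  "resource": "server1",
  #                  "modelSpec": {"datasource": "custom"}}}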

  def _scrubInferenceResultsAndInitMetricData(self, engine, inferenceResults,
                                              metricDataRows, metricObj):
    """ Validate the given inferenceResults against metricDataRows, update
    corresponding MetricData instances by initializing their
    `raw_anomaly_score` property from results and the `anomaly_score` property
    with 0. Replace elements in metricDataRows with MutableMetricDataRow
    objects.

    *NOTE:* does NOT save the MetricData instances to the database (we do
    that once after we process the batch for efficiency)

    :param engine: SQLAlchemy engine object
    :type engine: sqlalchemy.engine.Engine
    :param inferenceResults: a sequence of ModelInferenceResult instances
      representing the inference result batch ordered by row id

    :param metricDataRows: a mutable list of MetricData instances with row ids
      in the range of inferenceResults[0].rowID to inferenceResults[-1].rowID

    :param metricObj: a Metric instance associated with the given
      inferenceResults

    :raises RejectedInferenceResultBatch: if the given result batch is
      rejected
    """
    for result, enumeratedMetricData in itertools.izip_longest(
        inferenceResults,
        enumerate(metricDataRows)):

      if enumeratedMetricData is None:
        raise RejectedInferenceResultBatch(
          "No MetricData row for inference result=%r of model=<%r>" % (
            result, metricObj))

      index, metricData = enumeratedMetricData

      if result is None:
        raise RejectedInferenceResultBatch(
          "Truncated inference result batch; no result for metric data "
          "row=%r of model=<%r>" % (metricData, metricObj))

      if metricData is None:
        raise RejectedInferenceResultBatch(
          "No MetricData row for inference result=%r of model=<%r>" % (
            result, metricObj))

      if result.rowID != metricData.rowid:
        raise RejectedInferenceResultBatch(
          "RowID mismatch between inference result=%r and ModelData row=%r "
          "of model=<%r>" % (result, metricData, metricObj))

      if metricData.raw_anomaly_score is not None:
        # Side-effect of at-least-once delivery guarantee?
        self._log.error(
          "Anomaly was already processed on data row=%s; new result=%r",
          metricData, result)

      # Validate the result
      if result.status != 0:
        self._log.error(result.errorMessage)
        if metricObj.status == MetricStatus.ERROR:
          raise RejectedInferenceResultBatch(
            "inferenceResult=%r failed and model=<%s> was in ERROR state" % (
              result, getMetricLogPrefix(metricObj)))
        else:
          self._log.error("Placing model=<%r> in ERROR state due to "
                          "inferenceResult=%r", metricObj, result)
          with engine.connect() as conn:
            repository.setMetricStatus(conn,
                                       metricObj.uid,
                                       MetricStatus.ERROR,
                                       result.errorMessage)
          raise RejectedInferenceResultBatch(
            "inferenceResult=%r failed and model=<%s> promoted to ERROR "
            "state" % (result, getMetricLogPrefix(metricObj)))

      #self._log.info("{TAG:ANOM.METRIC} metric=%s:%s:%s",
      #               metricObj.name,
      #               calendar.timegm(metricData.timestamp.timetuple()),
      #               metricData.metric_value)

      mutableMetricData = MutableMetricDataRow(**dict(metricData.items()))
      mutableMetricData.raw_anomaly_score = result.anomalyScore
      mutableMetricData.anomaly_score = 0
      mutableMetricData.multi_step_best_predictions = (
        result.multiStepBestPredictions)
      metricDataRows[index] = mutableMetricData

  @staticmethod
  def _serializeModelResult(modelResults):
    """ Serializes a model result into a message suitable for delivery
    to RabbitMQ/AMQP model result exchange

    :param modelResults: a JSON-ifiable object
    """
    return zlib.compress(json.dumps(modelResults))

  @staticmethod
  def deserializeModelResult(payload):
    """ Deserialize model result batch """
    return json.loads(zlib.decompress(payload))
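
  # Round-trip sketch for the wire format (zlib-compressed JSON), using the
  # two static methods above:
  #
  #   payload = AnomalyService._serializeModelResult({"rows": [1, 2, 3]})
  #   assert AnomalyService.deserializeModelResult(payload) == \
  #       {"rows": [1, 2, 3]}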

  def run(self):
    """ Consumes pending results. Once a result batch arrives, it will be
    dispatched to the correct model command result handler.

    :see: `_processModelCommandResult` and `_processModelInferenceResults`
    """
    # Properties for publishing model command results on RabbitMQ exchange
    modelCommandResultProperties = MessageProperties(
      deliveryMode=amqp.constants.AMQPDeliveryModes.PERSISTENT_MESSAGE,
      headers=dict(dataType="model-cmd-result"))

    # Properties for publishing model inference results on RabbitMQ exchange
    modelInferenceResultProperties = MessageProperties(
      deliveryMode=amqp.constants.AMQPDeliveryModes.PERSISTENT_MESSAGE)

    # Declare an exchange for forwarding our results
    with amqp.synchronous_amqp_client.SynchronousAmqpClient(
        amqp.connection.getRabbitmqConnectionParameters()) as amqpClient:
      amqpClient.declareExchange(self._modelResultsExchange,
                                 exchangeType="fanout",
                                 durable=True)

    with ModelSwapperInterface() as modelSwapper, \
        MessageBusConnector() as bus:
      with modelSwapper.consumeResults() as consumer:
        for batch in consumer:
          if self._profiling:
            batchStartTime = time.time()

          inferenceResults = []
          for result in batch.objects:
            try:
              if isinstance(result, ModelCommandResult):
                self._processModelCommandResult(batch.modelID, result)
                # Construct model command result message for consumption by
                # downstream processes
                try:
                  cmdResultMessage = self._composeModelCommandResultMessage(
                    modelID=batch.modelID,
                    cmdResult=result)
                except (ObjectNotFoundError, MetricNotMonitoredError):
                  pass
                else:
                  bus.publishExg(
                    exchange=self._modelResultsExchange,
                    routingKey="",
                    body=self._serializeModelResult(cmdResultMessage),
                    properties=modelCommandResultProperties)
              elif isinstance(result, ModelInferenceResult):
                inferenceResults.append(result)
              else:
                self._log.error("Unsupported ModelResult=%r", result)
            except ObjectNotFoundError:
              self._log.exception("Error processing result=%r "
                                  "from model=%s", result, batch.modelID)

          if inferenceResults:
            result = self._processModelInferenceResults(
              inferenceResults,
              metricID=batch.modelID)

            if result is not None:
              # Construct model results payload for consumption by
              # downstream processes
              metricRow, dataRows = result
              resultsMessage = self._composeModelInferenceResultsMessage(
                metricRow,
                dataRows)

              payload = self._serializeModelResult(resultsMessage)

              bus.publishExg(
                exchange=self._modelResultsExchange,
                routingKey="",
                body=payload,
                properties=modelInferenceResultProperties)

          batch.ack()

          if self._profiling:
            if inferenceResults:
              if result is not None:
                # pylint: disable=W0633
                metricRow, rows = result
                rowIdRange = (
                  "%s..%s" % (rows[0].rowid, rows[-1].rowid)
                  if len(rows) > 1
                  else str(rows[0].rowid))
                self._log.info(
                  "{TAG:ANOM.BATCH.INF.DONE} model=%s; "
                  "numItems=%d; rows=[%s]; tailRowTS=%s; duration=%.4fs; "
                  "ds=%s; name=%s",
                  batch.modelID, len(batch.objects), rowIdRange,
                  rows[-1].timestamp.isoformat() + "Z",
                  time.time() - batchStartTime,
                  metricRow.datasource, metricRow.name)
            else:
              self._log.info(
                "{TAG:ANOM.BATCH.CMD.DONE} model=%s; "
                "numItems=%d; duration=%.4fs",
                batch.modelID, len(batch.objects),
                time.time() - batchStartTime)

    self._log.info("Stopped processing model results")
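
# Usage sketch (assuming htmengine config, logging, and RabbitMQ are
# available; an entry point like this is an assumption, not shown in this
# section):
#
#   if __name__ == "__main__":
#     AnomalyService().run()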