def _scrubInferenceResultsAndInitMetricData(self, engine, inferenceResults,
                                            metricDataRows, metricObj):
    """ Validate inference results against their metric data rows and seed the
    anomaly fields.

    Walks inferenceResults and metricDataRows in lockstep, rejecting the whole
    batch on any length or row-id mismatch (or on a failed result), then
    replaces each element of metricDataRows with a MutableMetricDataRow whose
    `raw_anomaly_score` comes from the corresponding result and whose
    `anomaly_score` starts at 0.

    *NOTE:* does NOT update the MetricData instances to the database (we do
    that once after we process the batch for efficiency)

    :param engine: SQLAlchemy engine object
    :type engine: sqlalchemy.engine.Engine
    :param inferenceResults: a sequence of ModelInferenceResult instances
      representing the inference result batch ordered by row id
    :param metricDataRows: a mutable list of MetricData instances with row ids
      in the range of inferenceResults[0].rowID to inferenceResults[-1].rowID
    :param metricObj: a Metric instance associated with the given
      inferenceResults
    :raises RejectedInferenceResultBatch: if the given result batch is rejected
    """
    pairs = itertools.izip_longest(inferenceResults, enumerate(metricDataRows))
    for inference, indexedRow in pairs:
        # izip_longest pads the shorter side with None; a missing row means
        # there were more results than data rows
        if indexedRow is None:
            raise RejectedInferenceResultBatch(
                "No MetricData row for inference result=%r of model=<%r>" % (
                    inference, metricObj))

        rowIndex, dataRow = indexedRow

        # A missing result means the batch was truncated
        if inference is None:
            raise RejectedInferenceResultBatch(
                "Truncated inference result batch; no result for metric data row=%r "
                "of model=<%r>" % (dataRow, metricObj))

        if dataRow is None:
            raise RejectedInferenceResultBatch(
                "No MetricData row for inference result=%r of model=<%r>" %
                (inference, metricObj))

        # Results and rows must align one-to-one by row id
        if inference.rowID != dataRow.rowid:
            raise RejectedInferenceResultBatch(
                "RowID mismatch between inference result=%r and ModelData row=%r of "
                "model=<%r>" % (inference, dataRow, metricObj))

        if dataRow.raw_anomaly_score is not None:
            # Side-effect of at-least-once delivery guarantee?
            self._log.error(
                "Anomaly was already processed on data row=%s; new result=%r",
                dataRow, inference)

        # Validate the result; a failed result demotes the model to ERROR
        # (unless it is already there) and rejects the batch
        if inference.status != 0:
            self._log.error(inference.errorMessage)

            if metricObj.status == MetricStatus.ERROR:
                raise RejectedInferenceResultBatch(
                    "inferenceResult=%r failed and model=<%s> was in ERROR state" % (
                        inference, getMetricLogPrefix(metricObj)))

            self._log.error("Placing model=<%r> in ERROR state due to "
                            "inferenceResult=%r", metricObj, inference)
            with engine.connect() as conn:
                repository.setMetricStatus(conn, metricObj.uid,
                                           MetricStatus.ERROR,
                                           inference.errorMessage)
            raise RejectedInferenceResultBatch(
                "inferenceResult=%r failed and model=<%s> promoted to ERROR state"
                % (inference, getMetricLogPrefix(metricObj)))

        # Wrap the row in a mutable copy and seed the anomaly fields; caller
        # flushes the whole batch to the database later
        scrubbed = MutableMetricDataRow(**dict(dataRow.items()))
        scrubbed.raw_anomaly_score = inference.anomalyScore
        scrubbed.anomaly_score = 0
        scrubbed.multi_step_best_predictions = (
            inference.multiStepBestPredictions)
        metricDataRows[rowIndex] = scrubbed
def startMonitoring(conn, metricId, swarmParams, logger):
    """ Start monitoring an UNMONITORED metric.

    NOTE: typically called either inside a transaction and/or with locked
    tables

    Starts the CLA model if provided non-None swarmParams; otherwise defers
    model creation to a later time and places the metric in
    MetricStatus.PENDING_DATA state.

    :param conn: SQLAlchemy Connection object for executing SQL
    :type conn: sqlalchemy.engine.Connection
    :param metricId: unique identifier of the metric row
    :param swarmParams: swarmParams generated via
      scalar_metric_utils.generateSwarmParams() or None.
    :param logger: logger object
    :returns: True if model was started; False if not
    :raises htmengine.exceptions.ObjectNotFoundError: if metric with the
      referenced metric uid doesn't exist
    :raises htmengine.exceptions.MetricStatusChangedError: if Metric status
      was changed by someone else (most likely another process) before this
      operation could complete
    """
    started = False
    began = time.time()

    metricObj = repository.getMetric(conn, metricId)

    assert metricObj.status == MetricStatus.UNMONITORED, (
        "startMonitoring: metric=%s is already monitored; status=%s" % (
            metricId, metricObj.status,))

    if swarmParams is not None:
        # We have swarmParams, so start the model
        started = _startModelHelper(conn=conn,
                                    metricObj=metricObj,
                                    swarmParams=swarmParams,
                                    logger=logger)
    else:
        # Put the metric into the PENDING_DATA state until enough data arrives
        # for stats
        refStatus = metricObj.status

        repository.setMetricStatus(conn, metricId, MetricStatus.PENDING_DATA,
                                   refStatus=refStatus)

        # refresh
        metricStatus = repository.getMetric(
            conn, metricId, fields=[schema.metric.c.status]).status

        # Guard against a concurrent status change between our update and the
        # refresh above
        if metricStatus != MetricStatus.PENDING_DATA:
            raise app_exceptions.MetricStatusChangedError(
                "startMonitoring: unable to promote metric=%s to model as "
                "PENDING_DATA; metric status morphed from %s to %s" % (
                    metricId, refStatus, metricStatus,))

        logger.info(
            "startMonitoring: promoted metric to model in PENDING_DATA; "
            "metric=%s; duration=%.4fs", metricId, time.time() - began)

    return started
def _processModelCommandResult(self, metricID, result):
    """ Process a single model command result.

    Handles "deleteModel" and "defineModel" command results, transitioning the
    metric's status as needed: CREATE_PENDING -> ACTIVE on successful
    defineModel, or -> ERROR when the command result reports failure.

    :param metricID: unique identifier of the metric/model
    :param result: model command result (carries method, status, errorMessage)
    """
    engine = repository.engineFactory(config)

    # Check if deleting model
    if result.method == "deleteModel":
        self._log.info("Model=%s was deleted", metricID)
        return

    # Validate model ID
    try:
        # NOTE: use shared lock to prevent race condition with adapter's
        # monitorMetric, whereby adapter creates and/or activates a metric
        # inside a transaction, and we might get the defineModel command
        # before the metric row updates are committed
        with engine.connect() as lockConn:
            metricObj = repository.getMetricWithSharedLock(lockConn, metricID)
    except ObjectNotFoundError:
        # This may occur if the user deletes the model before the result was
        # delivered while there are result messages still on the message bus.
        self._log.warn("Received command result=%r for unknown model=%s "
                       "(model deleted?)", result, metricID)
        return

    if result.status != 0:
        self._log.error(result.errorMessage)
        if metricObj.status == MetricStatus.ERROR:
            # NOTE: could be a race condition between app-layer and Model
            # Swapper or a side-effect of the at-least-once delivery guarantee
            self._log.warn("Received command result=%r for metricID=%s of "
                           "metric=<%s> that was already in ERROR state",
                           result, metricID, getMetricLogPrefix(metricObj))
        else:
            self._log.error("Placing model=<%s> in ERROR state due to "
                            "commandResult=%s",
                            getMetricLogPrefix(metricObj), result)
            with engine.connect() as updateConn:
                repository.setMetricStatus(updateConn, metricID,
                                           MetricStatus.ERROR,
                                           result.errorMessage)
        return

    # Create Model
    if result.method == "defineModel":
        self._log.info("Model was created for <%s>",
                       getMetricLogPrefix(metricObj))
        if metricObj.status == MetricStatus.CREATE_PENDING:
            with engine.connect() as updateConn:
                repository.setMetricStatus(updateConn, metricID,
                                           MetricStatus.ACTIVE)
        else:
            # NOTE: could be a race condition between app-layer and Model
            # Swapper or a side-effect of the at-least-once delivery guarantee
            self._log.warn("Received command result=%r for model=%s of "
                           "metric=<%s> that was not in CREATE_PENDING state",
                           result, metricID, getMetricLogPrefix(metricObj))
        return

    self._log.error("Unexpected model result=%r", result)
def _startModelHelper(conn, metricObj, swarmParams, logger):
    """ Start the model

    :param conn: SQLAlchemy Connection object for executing SQL
    :type conn: sqlalchemy.engine.Connection
    :param metricObj: metric, freshly-loaded
    :type metricObj: sqlalchemy.engine.RowProxy (see repository.getMetric())
    :param swarmParams: non-None swarmParams generated via
      scalar_metric_utils.generateSwarmParams().
    :param logger: logger object
    :returns: True if model was started; False if not
    :raises htmengine.exceptions.ObjectNotFoundError: if the metric doesn't
      exist; this may happen if it got deleted by another process in the
      meantime.
    :raises htmengine.exceptions.MetricStatusChangedError: if Metric status
      was changed by someone else (most likely another process) before this
      operation could complete
    """
    if swarmParams is None:
        raise ValueError(
            "startModel: 'swarmParams' must be non-None: metric=%s"
            % (metricObj.uid,))

    okToStart = metricObj.status in (MetricStatus.UNMONITORED,
                                     MetricStatus.PENDING_DATA)
    if not okToStart:
        if metricObj.status in (MetricStatus.CREATE_PENDING,
                                MetricStatus.ACTIVE):
            # Model already started (or starting); nothing to do
            return False

        logger.error("Unexpected metric status; metric=%r", metricObj)
        raise ValueError("startModel: unexpected metric status; metric=%r"
                         % (metricObj,))

    began = time.time()

    # Save swarm parameters and update metric status
    refStatus = metricObj.status
    repository.updateMetricColumnsForRefStatus(
        conn, metricObj.uid, refStatus,
        {"status": MetricStatus.CREATE_PENDING,
         "model_params": htmengine.utils.jsonEncode(swarmParams)})

    # refresh
    metricObj = repository.getMetric(
        conn, metricObj.uid,
        fields=[schema.metric.c.uid, schema.metric.c.status])

    # Guard against a concurrent status change between the update and refresh
    if metricObj.status != MetricStatus.CREATE_PENDING:
        raise app_exceptions.MetricStatusChangedError(
            "startModel: unable to start model=%s; "
            "metric status morphed from %s to %s" % (
                metricObj.uid, refStatus, metricObj.status,))

    # Request to create the CLA model
    try:
        model_swapper_utils.createHTMModel(metricObj.uid, swarmParams)
    except Exception:
        # Record the failure on the metric, then re-raise the original error
        logger.exception("startModel: createHTMModel failed.")
        repository.setMetricStatus(conn, metricObj.uid,
                                   status=MetricStatus.ERROR,
                                   message=repr(sys.exc_info()[1]))
        raise

    logger.info("startModel: started model=%r; duration=%.4fs",
                metricObj, time.time() - began)

    return True
def startMonitoring(conn, metricId, swarmParams, logger):
    """ Start monitoring an UNMONITORED metric.

    NOTE: typically called either inside a transaction and/or with locked
    tables

    Starts the CLA model if provided non-None swarmParams; otherwise defers
    model creation to a later time and places the metric in
    MetricStatus.PENDING_DATA state.

    :param conn: SQLAlchemy Connection object for executing SQL
    :type conn: sqlalchemy.engine.Connection
    :param metricId: unique identifier of the metric row
    :param swarmParams: swarmParams generated via
      scalar_metric_utils.generateSwarmParams() or None.
    :param logger: logger object
    :returns: True if model was started; False if not
    :raises htmengine.exceptions.ObjectNotFoundError: if metric with the
      referenced metric uid doesn't exist
    :raises htmengine.exceptions.MetricStatusChangedError: if Metric status
      was changed by someone else (most likely another process) before this
      operation could complete
    """
    startTime = time.time()
    modelStarted = False

    metricObj = repository.getMetric(conn, metricId)

    assert metricObj.status == MetricStatus.UNMONITORED, (
        "startMonitoring: metric=%s is already monitored; status=%s" % (
            metricId, metricObj.status,))

    if swarmParams is None:
        # No stats yet: park the metric in PENDING_DATA until enough data
        # arrives
        refStatus = metricObj.status
        repository.setMetricStatus(conn, metricId, MetricStatus.PENDING_DATA,
                                   refStatus=refStatus)

        # refresh
        metricStatus = repository.getMetric(
            conn, metricId, fields=[schema.metric.c.status]).status

        if metricStatus == MetricStatus.PENDING_DATA:
            logger.info("startMonitoring: promoted metric to model in "
                        "PENDING_DATA; metric=%s; duration=%.4fs",
                        metricId, time.time() - startTime)
        else:
            # Someone else changed the status between our update and refresh
            raise app_exceptions.MetricStatusChangedError(
                "startMonitoring: unable to promote metric=%s to model as "
                "PENDING_DATA; metric status morphed from %s to %s" % (
                    metricId, refStatus, metricStatus,))
    else:
        # We have swarmParams, so start the model right away
        modelStarted = _startModelHelper(conn=conn,
                                         metricObj=metricObj,
                                         swarmParams=swarmParams,
                                         logger=logger)

    return modelStarted
def _startModelHelper(conn, metricObj, swarmParams, logger):
    """ Start the model

    :param conn: SQLAlchemy Connection object for executing SQL
    :type conn: sqlalchemy.engine.Connection
    :param metricObj: metric, freshly-loaded
    :type metricObj: sqlalchemy.engine.RowProxy (see repository.getMetric())
    :param swarmParams: non-None swarmParams generated via
      scalar_metric_utils.generateSwarmParams().
    :param logger: logger object
    :returns: True if model was started; False if not
    :raises htmengine.exceptions.ObjectNotFoundError: if the metric doesn't
      exist; this may happen if it got deleted by another process in the
      meantime.
    :raises htmengine.exceptions.MetricStatusChangedError: if Metric status
      was changed by someone else (most likely another process) before this
      operation could complete
    """
    if swarmParams is None:
        raise ValueError(
            "startModel: 'swarmParams' must be non-None: metric=%s"
            % (metricObj.uid,))

    if metricObj.status not in (MetricStatus.UNMONITORED,
                                MetricStatus.PENDING_DATA):
        if metricObj.status in (MetricStatus.CREATE_PENDING,
                                MetricStatus.ACTIVE):
            # Already started (or in the process of starting); no-op
            return False

        logger.error("Unexpected metric status; metric=%r", metricObj)
        raise ValueError("startModel: unexpected metric status; metric=%r"
                         % (metricObj,))

    startedAt = time.time()

    # Save swarm parameters and update metric status
    priorStatus = metricObj.status
    repository.updateMetricColumnsForRefStatus(
        conn, metricObj.uid, priorStatus,
        {"status": MetricStatus.CREATE_PENDING,
         "model_params": htmengine.utils.jsonEncode(swarmParams)})

    # refresh
    metricObj = repository.getMetric(
        conn, metricObj.uid,
        fields=[schema.metric.c.uid, schema.metric.c.status])

    if metricObj.status != MetricStatus.CREATE_PENDING:
        # Concurrent status change detected; abort
        raise app_exceptions.MetricStatusChangedError(
            "startModel: unable to start model=%s; "
            "metric status morphed from %s to %s" % (
                metricObj.uid, priorStatus, metricObj.status,))

    # Request to create the CLA model
    try:
        model_swapper_utils.createHTMModel(metricObj.uid, swarmParams)
    except Exception:
        # Mark the metric as failed, then let the caller see the original
        # exception
        logger.exception("startModel: createHTMModel failed.")
        repository.setMetricStatus(conn, metricObj.uid,
                                   status=MetricStatus.ERROR,
                                   message=repr(sys.exc_info()[1]))
        raise

    logger.info("startModel: started model=%r; duration=%.4fs",
                metricObj, time.time() - startedAt)

    return True
def _scrubInferenceResultsAndInitMetricData(self, engine, inferenceResults,
                                            metricDataRows, metricObj):
    """ Validate the given inferenceResults against metricDataRows, update
    corresponding MetricData instances by initializing their
    `raw_anomaly_score` property from results and the `anomaly_score` property
    with 0. Replace elements in metricDataRows with MutableMetricDataRow
    objects.

    *NOTE:* does NOT update the MetricData instances to the database (we do
    that once after we process the batch for efficiency)

    :param engine: SQLAlchemy engine object
    :type engine: sqlalchemy.engine.Engine
    :param inferenceResults: a sequence of ModelInferenceResult instances
      representing the inference result batch ordered by row id
    :param metricDataRows: a mutable list of MetricData instances with row ids
      in the range of inferenceResults[0].rowID to inferenceResults[-1].rowID
    :param metricObj: a Metric instance associated with the given
      inferenceResults
    :raises RejectedInferenceResultBatch: if the given result batch is rejected
    """
    for result, enumeratedMetricData in itertools.izip_longest(
            inferenceResults, enumerate(metricDataRows)):
        # izip_longest pads the shorter sequence with None; a None
        # enumeratedMetricData means there are more results than data rows
        if enumeratedMetricData is None:
            raise RejectedInferenceResultBatch(
                "No MetricData row for inference result=%r of model=<%r>"
                % (result, metricObj))

        index, metricData = enumeratedMetricData

        # A None result means there are fewer results than data rows
        if result is None:
            raise RejectedInferenceResultBatch(
                "Truncated inference result batch; no result for metric data row=%r "
                "of model=<%r>" % (metricData, metricObj))

        if metricData is None:
            raise RejectedInferenceResultBatch(
                "No MetricData row for inference result=%r of model=<%r>"
                % (result, metricObj))

        # Results and rows must align one-to-one by row id
        if result.rowID != metricData.rowid:
            raise RejectedInferenceResultBatch(
                "RowID mismatch between inference result=%r and ModelData row=%r of "
                "model=<%r>" % (result, metricData, metricObj))

        if metricData.raw_anomaly_score is not None:
            # Side-effect of at-least-once delivery guarantee?
            self._log.error(
                "Anomaly was already processed on data row=%s; new result=%r",
                metricData, result)

        # Validate the result
        if result.status != 0:
            self._log.error(result.errorMessage)
            if metricObj.status == MetricStatus.ERROR:
                raise RejectedInferenceResultBatch(
                    "inferenceResult=%r failed and model=<%s> was in ERROR state"
                    % (result, getMetricLogPrefix(metricObj)))
            else:
                # Demote the model to ERROR state before rejecting the batch
                self._log.error(
                    "Placing model=<%r> in ERROR state due to "
                    "inferenceResult=%r", metricObj, result)
                with engine.connect() as conn:
                    repository.setMetricStatus(conn, metricObj.uid,
                                               MetricStatus.ERROR,
                                               result.errorMessage)
                raise RejectedInferenceResultBatch(
                    "inferenceResult=%r failed and model=<%s> promoted to ERROR state"
                    % (result, getMetricLogPrefix(metricObj)))

        #self._log.info("{TAG:ANOM.METRIC} metric=%s:%s:%s",
        #               metricObj.name,
        #               calendar.timegm(metricData.timestamp.timetuple()),
        #               metricData.metric_value)

        # Wrap the row in a mutable copy and seed the anomaly fields; the
        # caller persists the whole batch to the database afterwards
        mutableMetricData = MutableMetricDataRow(**dict(metricData.items()))
        mutableMetricData.raw_anomaly_score = result.anomalyScore
        mutableMetricData.anomaly_score = 0
        metricDataRows[index] = mutableMetricData
def _processModelCommandResult(self, metricID, result):
    """ Process a single model command result.

    Handles "deleteModel" and "defineModel" command results, transitioning the
    metric's status as needed: CREATE_PENDING -> ACTIVE on successful
    defineModel, or -> ERROR when the command result reports failure.

    :param metricID: unique identifier of the metric/model
    :param result: model command result (carries method, status, errorMessage)
    """
    engine = repository.engineFactory(config)

    # Check if deleting model
    if result.method == "deleteModel":
        self._log.info("Model=%s was deleted", metricID)
        return

    # Validate model ID
    try:
        # NOTE: use shared lock to prevent race condition with adapter's
        # monitorMetric, whereby adapter creates and/or activates a metric
        # inside a transaction, and we might get the defineModel command
        # before the metric row updates are committed
        with engine.connect() as conn:
            metricObj = repository.getMetricWithSharedLock(conn, metricID)
    except ObjectNotFoundError:
        # This may occur if the user deletes the model before the result was
        # delivered while there are result messages still on the message bus.
        self._log.warn(
            "Received command result=%r for unknown model=%s "
            "(model deleted?)", result, metricID)
        return

    if result.status != 0:
        self._log.error(result.errorMessage)
        if metricObj.status != MetricStatus.ERROR:
            self._log.error(
                "Placing model=<%s> in ERROR state due to "
                "commandResult=%s", getMetricLogPrefix(metricObj), result)
            with engine.connect() as conn:
                repository.setMetricStatus(conn, metricID, MetricStatus.ERROR,
                                           result.errorMessage)
        else:
            # NOTE: could be a race condition between app-layer and Model
            # Swapper or a side-effect of the at-least-once delivery guarantee
            self._log.warn(
                "Received command result=%r for metricID=%s of "
                "metric=<%s> that was already in ERROR state",
                result, metricID, getMetricLogPrefix(metricObj))
        return

    # Create Model
    if result.method == "defineModel":
        # FIX: pass the prefix as a lazy logging argument instead of eagerly
        # %-formatting the message; this defers formatting until the record is
        # actually emitted and matches every other log call in this method
        self._log.info("Model was created for <%s>",
                       getMetricLogPrefix(metricObj))
        if metricObj.status == MetricStatus.CREATE_PENDING:
            with engine.connect() as conn:
                repository.setMetricStatus(conn, metricID, MetricStatus.ACTIVE)
        else:
            # NOTE: could be a race condition between app-layer and Model
            # Swapper or a side-effect of the at-least-once delivery guarantee
            self._log.warn(
                "Received command result=%r for model=%s of metric=<%s> "
                "that was not in CREATE_PENDING state",
                result, metricID, getMetricLogPrefix(metricObj))
        return

    self._log.error("Unexpected model result=%r", result)