def updateModelAnomalyScores(self, engine, metricObj, metricDataRows):
  """ Calculate the anomaly scores based on the anomaly likelihoods. Update
  anomaly scores in the given metricDataRows MetricData instances, and
  calculate new anomaly likelihood params for the model.

  :param engine: SQLAlchemy engine object
  :type engine: sqlalchemy.engine.Engine

  :param metricObj: the model's Metric instance

  :param metricDataRows: a sequence of MetricData instances in the processed
    order (ascending by timestamp) with updated raw_anomaly_score and zeroed
    out anomaly_score corresponding to the new model inference results, but
    not yet updated in the database. Will update their anomaly_score
    properties, as needed.

  :returns: new anomaly likelihood params for the model

  *NOTE:* the processing must be idempotent due to the "at least once"
  delivery semantics of the message bus

  *NOTE:* the performance goal is to minimize costly database access and
  avoid falling behind while processing model results, especially during the
  model's initial "catch-up" phase when large inference result batches are
  prevalent.
  """
  # When populated, a cached list of MetricData instances for updating
  # anomaly likelihood params
  statsSampleCache = None

  # Index into metricDataRows where processing is to resume
  startRowIndex = 0

  statisticsRefreshInterval = self._getStatisticsRefreshInterval(
    batchSize=len(metricDataRows))

  if metricObj.status != MetricStatus.ACTIVE:
    raise MetricNotActiveError(
      "getAnomalyLikelihoodParams failed because metric=%s is not ACTIVE; "
      "status=%s; resource=%s" % (metricObj.uid,
                                  metricObj.status,
                                  metricObj.server,))

  modelParams = jsonDecode(metricObj.model_params)
  anomalyParams = modelParams.get("anomalyLikelihoodParams", None)
  if not anomalyParams:
    # We don't have a likelihood model yet. Create one if we have sufficient
    # records with raw anomaly scores
    (anomalyParams, statsSampleCache, startRowIndex) = (
      self._initAnomalyLikelihoodModel(engine=engine,
                                       metricObj=metricObj,
                                       metricDataRows=metricDataRows))

  # Do anomaly likelihood processing on the rest of the new samples
  # NOTE: this loop will be skipped if there are still not enough samples for
  #  creating the anomaly likelihood params
  while startRowIndex < len(metricDataRows):
    # Determine where to stop processing rows prior to the next statistics
    # refresh
    if (statsSampleCache is None or
        len(statsSampleCache) >= self._statisticsMinSampleSize):
      # We're here if:
      #   a. We haven't tried updating anomaly likelihood stats yet
      #                 OR
      #   b. We already updated anomaly likelihood stats (we had sufficient
      #      samples for it)
      # TODO: unit-test
      endRowID = (anomalyParams["last_rowid_for_stats"] +
                  statisticsRefreshInterval)

      if endRowID < metricDataRows[startRowIndex].rowid:
        # We're here if:
        #   a. Statistics refresh interval is smaller than during the last
        #      stats update; this is the typical/normal case when backlog
        #      catch-up is tapering off, and refresh interval is reduced for
        #      smaller batches. OR
        #   b. There is a gap of anomaly scores preceding the start of the
        #      current chunk. OR
        #   c. Statistics config changed.
        # TODO: unit-test
        self._log.warning(
          "Anomaly run cutoff precedes samples (smaller stats "
          "refreshInterval or gap in anomaly scores or statistics config "
          "changed): model=%s; rows=[%s..%s]",
          metricObj.uid, metricDataRows[startRowIndex].rowid, endRowID)

        if statsSampleCache is not None:
          # We already attempted to update anomaly likelihood params, so fix
          # up endRowID to make sure we make progress and don't get stuck in
          # an infinite loop
          endRowID = metricDataRows[startRowIndex].rowid
          self._log.warning(
            "Advanced anomaly run cutoff to make progress: "
            "model=%s; rows=[%s..%s]",
            metricObj.uid, metricDataRows[startRowIndex].rowid, endRowID)
    else:
      # During the prior iteration, there were not enough samples in cache
      # for updating anomaly params

      # We extend the end row so that there will be enough samples
      # to avoid getting stuck in this rut in the current and following
      # iterations
      # TODO: unit-test this
      endRowID = metricDataRows[startRowIndex].rowid + (
        self._statisticsMinSampleSize - len(statsSampleCache) - 1)

    # Translate endRowID into metricDataRows limitIndex for the current run
    if endRowID < metricDataRows[startRowIndex].rowid:
      # Cut-off precedes the remaining samples
      # Normally shouldn't be here (unless statistics config changed or there
      # is a gap in anomaly scores in metric_data table)
      # TODO: unit-test this

      # Set limit to bypass processing of samples for immediate refresh of
      # anomaly likelihood params
      limitIndex = startRowIndex
      self._log.warning(
        "Anomaly run cutoff precedes samples, so forcing refresh of anomaly "
        "likelihood params: modelInfo=<%s>; rows=[%s..%s]",
        getMetricLogPrefix(metricObj),
        metricDataRows[startRowIndex].rowid, endRowID)
    else:
      # Cutoff is either inside or after the remaining samples
      # TODO: unit-test this
      limitIndex = startRowIndex + min(
        len(metricDataRows) - startRowIndex,
        endRowID + 1 - metricDataRows[startRowIndex].rowid)

    # Process the next new sample run
    self._log.debug(
      "Starting anomaly run: model=%s; "
      "startRowIndex=%s; limitIndex=%s; rows=[%s..%s]; "
      "last_rowid_for_stats=%s; refreshInterval=%s; batchSize=%s",
      metricObj.uid,
      startRowIndex, limitIndex,
      metricDataRows[startRowIndex].rowid, endRowID,
      anomalyParams["last_rowid_for_stats"],
      statisticsRefreshInterval, len(metricDataRows))

    consumedSamples = []

    for md in itertools.islice(metricDataRows, startRowIndex, limitIndex):
      consumedSamples.append(md)

      (likelihood,), _, anomalyParams["params"] = (
        algorithms.updateAnomalyLikelihoods(
          ((md.timestamp, md.metric_value, md.raw_anomaly_score),),
          anomalyParams["params"]))

      # TODO: the float "cast" here seems redundant
      md.anomaly_score = float(1.0 - likelihood)

      # If anomaly score > 0.99 then we greedily update the statistics. 0.99
      # should not repeat too often, but to be safe we wait a few more
      # records before updating again, in order to avoid overloading the DB.
      #
      # TODO: the magic 0.99 and the magic 3 value below should either
      #  be constants or config settings. Where should they be defined?
      if (md.anomaly_score > 0.99 and
          (anomalyParams["last_rowid_for_stats"] + 3) < md.rowid):
        if statsSampleCache is None or (
            len(statsSampleCache) + len(consumedSamples) >=
            self._statisticsMinSampleSize):
          # TODO: unit-test this
          self._log.info("Forcing refresh of anomaly params for model=%s due "
                         "to exceeded anomaly_score threshold in sample=%r",
                         metricObj.uid, md)
          break

    if startRowIndex + len(consumedSamples) < len(metricDataRows) or (
        consumedSamples[-1].rowid >= endRowID):
      # We stopped before the end of new samples, including a bypass-run,
      # or stopped after processing the last item and need one final refresh
      # of anomaly params
      anomalyParams, statsSampleCache = self._refreshAnomalyParams(
        engine=engine,
        metricID=metricObj.uid,
        statsSampleCache=statsSampleCache,
        consumedSamples=consumedSamples,
        defaultAnomalyParams=anomalyParams)

    startRowIndex += len(consumedSamples)
  # <--- while

  return anomalyParams
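
# Illustrative sketch (not part of the service): how the anomaly-run cutoff
# logic above translates endRowID into a slice limit over metricDataRows.
# The row ids and interval values below are made up for demonstration; only
# the arithmetic mirrors the limitIndex computation in
# updateModelAnomalyScores.
def _exampleComputeLimitIndex(rowids, startRowIndex, lastRowidForStats,
                              refreshInterval):
  """Return (endRowID, limitIndex) for the next anomaly run."""
  endRowID = lastRowidForStats + refreshInterval
  if endRowID < rowids[startRowIndex]:
    # Cutoff precedes the remaining samples: bypass processing so the
    # anomaly likelihood params get refreshed immediately
    limitIndex = startRowIndex
  else:
    limitIndex = startRowIndex + min(
      len(rowids) - startRowIndex,
      endRowID + 1 - rowids[startRowIndex])
  return endRowID, limitIndex


if __name__ == "__main__":
  # e.g., rows 100..109 pending, stats last refreshed at row 103, interval 5:
  # rows 100..108 are processed before the next stats refresh
  assert _exampleComputeLimitIndex(range(100, 110), 0, 103, 5) == (108, 9)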
def _processModelInferenceResults(self, inferenceResults, metricID):
  """ Process a batch of model inference results

  Store the updated MetricData and anomaly likelihood parameters in the
  database.

  A row's anomaly_score value will be set to and remain at 0 in the first
  self._statisticsMinSampleSize rows; once we get enough inference results
  to create an anomaly likelihood model, anomaly_score will be computed on
  the subsequent rows.

  :param inferenceResults: a sequence of ModelInferenceResult instances in
    the processed order (ascending by timestamp)

  :param metricID: metric/model ID of the model that emitted the results

  :returns: None if the batch was rejected; otherwise a pair:
    (metric, metricDataRows)
      metric: Metric RowProxy instance corresponding to the given metricID
      metricDataRows: a sequence of MutableMetricDataRow instances
        corresponding to the updated metric_data rows.
    TODO: unit-test return value
  :rtype: None or tuple

  *NOTE:* the processing must be idempotent due to the "at least once"
  delivery semantics of the message bus

  *NOTE:* the performance goal is to minimize costly database access and
  avoid falling behind while processing model results, especially during the
  model's initial "catch-up" phase when large inference result batches are
  prevalent.
  """
  engine = repository.engineFactory(config)

  # Validate model ID
  try:
    with engine.connect() as conn:
      metricObj = repository.getMetric(conn, metricID)
  except ObjectNotFoundError:
    # Ignore inferences for unknown models. Typically, this is the result of
    # a deleted model. Another scenario where this might occur is when a
    # developer resets the db while there are result messages still on the
    # message bus. It would be an error if this were to occur in a production
    # environment.
    self._log.warning("Received inference results for unknown model=%s; "
                      "(model deleted?)", metricID, exc_info=True)
    return None

  # Reject the results if model is in non-ACTIVE state (e.g., if HTM Metric
  # was unmonitored after the results were generated)
  if metricObj.status != MetricStatus.ACTIVE:
    self._log.warning("Received inference results for a non-ACTIVE "
                      "model=%s; metric=<%s>; (metric unmonitored?)",
                      metricID, getMetricLogPrefix(metricObj))
    return None

  # Load the MetricData instances corresponding to the results
  with engine.connect() as conn:
    metricDataRows = repository.getMetricData(
      conn,
      metricID,
      start=inferenceResults[0].rowID,
      stop=inferenceResults[-1].rowID)

  # metricDataRows must be mutable, as the data is massaged in
  # _scrubInferenceResultsAndInitMetricData()
  metricDataRows = list(metricDataRows)

  if not metricDataRows:
    self._log.error("Rejected inference result batch=[%s..%s] of model=%s "
                    "due to no matching metric_data rows",
                    inferenceResults[0].rowID, inferenceResults[-1].rowID,
                    metricID)
    return None

  try:
    self._scrubInferenceResultsAndInitMetricData(
      engine=engine,
      inferenceResults=inferenceResults,
      metricDataRows=metricDataRows,
      metricObj=metricObj)
  except RejectedInferenceResultBatch as e:
    # TODO: unit-test
    self._log.error(
      "Rejected inference result batch=[%s..%s] corresponding to "
      "rows=[%s..%s] of model=%s due to error=%r",
      inferenceResults[0].rowID, inferenceResults[-1].rowID,
      metricDataRows[0].rowid, metricDataRows[-1].rowid, metricID, e)
    return None

  # Update anomaly scores based on the new results
  anomalyLikelihoodParams = (
    self.likelihoodHelper.updateModelAnomalyScores(
      engine=engine,
      metricObj=metricObj,
      metricDataRows=metricDataRows))

  # Update metric data rows with rescaled display values
  # NOTE: doing this outside the updateColumns loop to avoid holding row
  #  locks any longer than necessary
  for metricData in metricDataRows:
    metricData.display_value = rescaleForDisplay(
      metricData.anomaly_score,
      active=(metricObj.status == MetricStatus.ACTIVE))

  # Update database once via transaction!
  startTime = time.time()
  try:
    @retryOnTransientErrors
    def runSQL(engine):
      with engine.begin() as conn:
        for metricData in metricDataRows:
          fields = {"raw_anomaly_score": metricData.raw_anomaly_score,
                    "anomaly_score": metricData.anomaly_score,
                    "display_value": metricData.display_value,
                    "multi_step_best_predictions":
                      json.dumps(metricData.multi_step_best_predictions)}
          repository.updateMetricDataColumns(conn, metricData, fields)

        self._updateAnomalyLikelihoodParams(conn,
                                            metricObj.uid,
                                            metricObj.model_params,
                                            anomalyLikelihoodParams)

    runSQL(engine)
  except (ObjectNotFoundError, MetricNotActiveError):
    self._log.warning("Rejected inference result batch=[%s..%s] of model=%s",
                      inferenceResults[0].rowID, inferenceResults[-1].rowID,
                      metricID, exc_info=True)
    return None

  self._log.debug("Updated HTM metric_data rows=[%s..%s] "
                  "of model=%s: duration=%ss",
                  metricDataRows[0].rowid, metricDataRows[-1].rowid,
                  metricID, time.time() - startTime)

  return (metricObj, metricDataRows,)
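
# Illustrative sketch only: the runSQL helper above is wrapped with
# retryOnTransientErrors, which is imported from elsewhere in the codebase.
# Its exact retry policy (retried exception types, attempt count, backoff) is
# not shown in this module; the decorator below is a hypothetical stand-in
# demonstrating the general retry-with-backoff shape, not the real
# implementation.
import functools
import time


def _exampleRetryOnTransientErrors(func, _attempts=3, _initialDelaySec=0.1,
                                   _transientExceptions=(Exception,)):
  """Retry func on (assumed) transient exceptions with exponential backoff."""
  @functools.wraps(func)
  def wrapper(*args, **kwargs):
    delay = _initialDelaySec
    for attempt in range(_attempts):
      try:
        return func(*args, **kwargs)
      except _transientExceptions:
        if attempt == _attempts - 1:
          # Out of retries; let the caller handle the error
          raise
        time.sleep(delay)
        delay *= 2
  return wrapper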
def _scrubInferenceResultsAndInitMetricData(self, engine, inferenceResults,
                                            metricDataRows, metricObj):
  """ Validate the given inferenceResults against metricDataRows, update
  corresponding MetricData instances by initializing their
  `raw_anomaly_score` property from results and the `anomaly_score` property
  with 0. Replace elements in metricDataRows with MutableMetricDataRow
  objects.

  *NOTE:* does NOT update the MetricData instances to the database (we do
  that once after we process the batch for efficiency)

  :param engine: SQLAlchemy engine object
  :type engine: sqlalchemy.engine.Engine

  :param inferenceResults: a sequence of ModelInferenceResult instances
    representing the inference result batch ordered by row id

  :param metricDataRows: a mutable list of MetricData instances with row ids
    in the range of inferenceResults[0].rowID to inferenceResults[-1].rowID

  :param metricObj: a Metric instance associated with the given
    inferenceResults

  :raises RejectedInferenceResultBatch: if the given result batch is rejected
  """
  for result, enumeratedMetricData in itertools.izip_longest(
      inferenceResults, enumerate(metricDataRows)):

    if enumeratedMetricData is None:
      raise RejectedInferenceResultBatch(
        "No MetricData row for inference result=%r of model=<%r>" % (
          result, metricObj))
    index, metricData = enumeratedMetricData

    if result is None:
      raise RejectedInferenceResultBatch(
        "Truncated inference result batch; no result for metric data row=%r "
        "of model=<%r>" % (metricData, metricObj))

    if metricData is None:
      raise RejectedInferenceResultBatch(
        "No MetricData row for inference result=%r of model=<%r>" % (
          result, metricObj))

    if result.rowID != metricData.rowid:
      raise RejectedInferenceResultBatch(
        "RowID mismatch between inference result=%r and ModelData row=%r of "
        "model=<%r>" % (result, metricData, metricObj))

    if metricData.raw_anomaly_score is not None:
      # Side-effect of at-least-once delivery guarantee?
      self._log.error(
        "Anomaly was already processed on data row=%s; new result=%r",
        metricData, result)

    # Validate the result
    if result.status != 0:
      self._log.error(result.errorMessage)

      if metricObj.status == MetricStatus.ERROR:
        raise RejectedInferenceResultBatch(
          "inferenceResult=%r failed and model=<%s> was in ERROR state" % (
            result, getMetricLogPrefix(metricObj)))
      else:
        self._log.error("Placing model=<%r> in ERROR state due to "
                        "inferenceResult=%r", metricObj, result)

        with engine.connect() as conn:
          repository.setMetricStatus(conn,
                                     metricObj.uid,
                                     MetricStatus.ERROR,
                                     result.errorMessage)
        raise RejectedInferenceResultBatch(
          "inferenceResult=%r failed and model=<%s> promoted to ERROR state"
          % (result, getMetricLogPrefix(metricObj)))

    #self._log.info("{TAG:ANOM.METRIC} metric=%s:%s:%s",
    #               metricObj.name,
    #               calendar.timegm(metricData.timestamp.timetuple()),
    #               metricData.metric_value)
    mutableMetricData = MutableMetricDataRow(**dict(metricData.items()))
    mutableMetricData.raw_anomaly_score = result.anomalyScore
    mutableMetricData.anomaly_score = 0
    mutableMetricData.multi_step_best_predictions = (
      result.multiStepBestPredictions)
    metricDataRows[index] = mutableMetricData
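
# Illustrative sketch: the izip_longest pairing above pads the shorter
# sequence with None, which is how a truncated result batch (result is None)
# or a missing metric_data row (enumeratedMetricData is None) is detected.
# The strings below are stand-ins for ModelInferenceResult / MetricData rows,
# used only to demonstrate the pairing behavior.
import itertools

_exampleResults = ["r1", "r2", "r3"]   # three inference results...
_exampleRows = ["row1", "row2"]        # ...but only two metric_data rows

_examplePairs = list(itertools.izip_longest(_exampleResults,
                                            enumerate(_exampleRows)))
# -> [("r1", (0, "row1")), ("r2", (1, "row2")), ("r3", None)]
# The trailing None is what triggers RejectedInferenceResultBatch in the
# validation loop above.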
def _processModelCommandResult(self, metricID, result):
  """ Process a single model command result
  """
  engine = repository.engineFactory(config)

  # Check if deleting model
  if result.method == "deleteModel":
    self._log.info("Model=%s was deleted", metricID)
    return

  # Validate model ID
  try:
    # NOTE: use shared lock to prevent race condition with adapter's
    # monitorMetric, whereby adapter creates and/or activates a metric inside
    # a transaction, and we might get the defineModel command before the
    # metric row updates are committed
    with engine.connect() as conn:
      metricObj = repository.getMetricWithSharedLock(conn, metricID)
  except ObjectNotFoundError:
    # This may occur if the user deletes the model before the result was
    # delivered while there are result messages still on the message bus.
    self._log.warn("Received command result=%r for unknown model=%s "
                   "(model deleted?)", result, metricID)
    return

  if result.status != 0:
    self._log.error(result.errorMessage)
    if metricObj.status != MetricStatus.ERROR:
      self._log.error("Placing model=<%s> in ERROR state due to "
                      "commandResult=%s",
                      getMetricLogPrefix(metricObj), result)
      with engine.connect() as conn:
        repository.setMetricStatus(conn,
                                   metricID,
                                   MetricStatus.ERROR,
                                   result.errorMessage)
    else:
      # NOTE: could be a race condition between app-layer and Model Swapper
      # or a side-effect of the at-least-once delivery guarantee
      self._log.warn("Received command result=%r for metricID=%s of "
                     "metric=<%s> that was already in ERROR state",
                     result, metricID, getMetricLogPrefix(metricObj))
    return

  # Create Model
  if result.method == "defineModel":
    self._log.info("Model was created for <%s>",
                   getMetricLogPrefix(metricObj))
    if metricObj.status == MetricStatus.CREATE_PENDING:
      with engine.connect() as conn:
        repository.setMetricStatus(conn, metricID, MetricStatus.ACTIVE)
    else:
      # NOTE: could be a race condition between app-layer and Model Swapper
      # or a side-effect of the at-least-once delivery guarantee
      self._log.warn("Received command result=%r for model=%s of metric=<%s> "
                     "that was not in CREATE_PENDING state",
                     result, metricID, getMetricLogPrefix(metricObj))
    return

  self._log.error("Unexpected model result=%r", result)
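
# Illustrative sketch: _processModelCommandResult only relies on the command
# result exposing `method`, `status`, and `errorMessage`. The real
# ModelCommandResult class is defined by the model swapper interface and is
# not shown here; the namedtuple below is a hypothetical stand-in for
# demonstration only.
import collections

_ExampleCommandResult = collections.namedtuple(
  "_ExampleCommandResult", ["method", "status", "errorMessage"])

# e.g., a successful defineModel acknowledgment; status 0 means success,
# matching the `result.status != 0` checks above
_exampleCommandResult = _ExampleCommandResult(method="defineModel",
                                              status=0,
                                              errorMessage=None)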