def _updateAnomalyLikelihoodParams(cls, conn, metricId, modelParamsJson,
                                   likelihoodParams):
    """Store new anomaly likelihood params in the metric's model_params.

    The metric row is first fetched via repository.getMetricWithUpdateLock
    (status column only); the update proceeds only when that status is
    MetricStatus.ACTIVE. The given model params JSON is decoded, its
    "anomalyLikelihoodParams" entry is replaced with likelihoodParams, and
    the re-encoded JSON is written to the metric's "model_params" column.

    :param conn: Transactional SQLAlchemy connection object
    :type conn: sqlalchemy.engine.base.Connection
    :param metricId: Metric uid
    :param modelParamsJson: Model params as a JSON string (from the
      model_params metric column)
    :param likelihoodParams: anomaly likelihood params dict

    :raises: htmengine.exceptions.MetricNotActiveError if the metric's status
      is not MetricStatus.ACTIVE
    """
    metricRow = repository.getMetricWithUpdateLock(
        conn, metricId, fields=[schema.metric.c.status])

    currentStatus = metricRow.status
    if currentStatus != MetricStatus.ACTIVE:
        errorMessage = (
            "_updateAnomalyLikelihoodParams failed because metric=%s is not "
            "ACTIVE; status=%s" % (metricId, currentStatus))
        raise MetricNotActiveError(errorMessage)

    # Only the likelihood entry is replaced; every other model param that was
    # present in the incoming JSON is written back unchanged.
    updatedModelParams = json.loads(modelParamsJson)
    updatedModelParams["anomalyLikelihoodParams"] = likelihoodParams
    columnUpdates = {"model_params": json.dumps(updatedModelParams)}

    repository.updateMetricColumns(conn, metricId, columnUpdates)
def updateModelAnomalyScores(self, engine, metricObj, metricDataRows):
    """
    Calculate the anomaly scores based on the anomaly likelihoods. Update
    anomaly scores in the given metricDataRows MetricData instances, and
    calculate new anomaly likelihood params for the model.

    :param engine: SQLAlchemy engine object
    :type engine: sqlalchemy.engine.Engine
    :param metricObj: the model's Metric instance
    :param metricDataRows: a sequence of MetricData instances in the
      processed order (ascending by timestamp) with updated raw_anomaly_score
      and zeroed out anomaly_score corresponding to the new model inference
      results, but not yet updated in the database. Will update their
      anomaly_score properties, as needed.

    :returns: new anomaly likelihood params for the model; when there are
      still too few samples to create the likelihood model, the falsy value
      produced by _initAnomalyLikelihoodModel is returned and no row's
      anomaly_score is touched.

    :raises: MetricNotActiveError if metricObj.status is not
      MetricStatus.ACTIVE

    *NOTE:* the processing must be idempotent due to the "at least once"
    delivery semantics of the message bus

    *NOTE:* the performance goal is to minimize costly database access and
    avoid falling behind while processing model results, especially during
    the model's initial "catch-up" phase when large inference result batches
    are prevalent.
    """
    # Greedy-refresh tuning: a sample whose anomaly_score exceeds the
    # threshold forces a statistics refresh, provided it is more than
    # greedyRefreshMinRowGap rows past the last refresh.
    # TODO: these should either be module constants or config settings.
    # Where should they be defined?
    greedyRefreshScoreThreshold = 0.99
    greedyRefreshMinRowGap = 3

    # When populated, a cached list of MetricData instances for updating
    # anomaly likelihood params
    statsSampleCache = None

    # Index into metricDataRows where processing is to resume
    startRowIndex = 0

    # metricDataRows is not resized here, so its length is loop-invariant
    numRows = len(metricDataRows)

    statisticsRefreshInterval = self._getStatisticsRefreshInterval(
        batchSize=numRows)

    if metricObj.status != MetricStatus.ACTIVE:
        raise MetricNotActiveError(
            "updateModelAnomalyScores failed because metric=%s is not ACTIVE; "
            "status=%s; resource=%s" % (
                metricObj.uid, metricObj.status, metricObj.server,))

    modelParams = jsonDecode(metricObj.model_params)
    anomalyParams = modelParams.get("anomalyLikelihoodParams", None)
    if not anomalyParams:
        # We don't have a likelihood model yet. Create one if we have
        # sufficient records with raw anomaly scores
        (anomalyParams, statsSampleCache, startRowIndex) = (
            self._initAnomalyLikelihoodModel(
                engine=engine,
                metricObj=metricObj,
                metricDataRows=metricDataRows))

    # Do anomaly likelihood processing on the rest of the new samples
    # NOTE: this loop will be skipped if there are still not enough samples
    # for creating the anomaly likelihood params
    while startRowIndex < numRows:
        # rowid of the first not-yet-processed sample; startRowIndex only
        # changes at the very end of the iteration, so this stays valid
        # throughout.
        firstRowID = metricDataRows[startRowIndex].rowid

        # Determine where to stop processing rows prior to next statistics
        # refresh
        if (statsSampleCache is None or
                len(statsSampleCache) >= self._statisticsMinSampleSize):
            # We're here if:
            #   a. We haven't tried updating anomaly likelihood stats yet
            #                 OR
            #   b. We already updated anomaly likelihood stats (we had
            #      sufficient samples for it)
            # TODO: unit-test
            endRowID = (anomalyParams["last_rowid_for_stats"] +
                        statisticsRefreshInterval)

            if endRowID < firstRowID:
                # We're here if:
                #   a. Statistics refresh interval is smaller than during
                #      last stats update; this is the typical/normal case
                #      when backlog catch-up is tapering off, and refresh
                #      interval is reduced for smaller batches.
                #                 OR
                #   b. There is a gap of anomaly scores preceding the start
                #      of the current chunk.
                #                 OR
                #   c. Statistics config changed.
                # TODO: unit-test
                self._log.warning(
                    "Anomaly run cutoff precedes samples (smaller stats "
                    "refreshInterval or gap in anomaly scores or statistics "
                    "config changed) : model=%s; rows=[%s..%s]",
                    metricObj.uid, firstRowID, endRowID)

                if statsSampleCache is not None:
                    # We already attempted to update anomaly likelihood
                    # params, so fix up endRowID to make sure we make
                    # progress and don't get stuck in an infinite loop
                    endRowID = firstRowID
                    self._log.warning(
                        "Advanced anomaly run cutoff to make progress: "
                        "model=%s; rows=[%s..%s]",
                        metricObj.uid, firstRowID, endRowID)
        else:
            # During prior iteration, there were not enough samples in cache
            # for updating anomaly params

            # We extend the end row so that there will be enough samples
            # to avoid getting stuck in this rut in the current and following
            # iterations
            # TODO: unit-test this
            endRowID = firstRowID + (
                self._statisticsMinSampleSize - len(statsSampleCache) - 1)

        # Translate endRowID into metricDataRows limitIndex for current run
        if endRowID < firstRowID:
            # Cut-off precedes the remaining samples
            # Normally shouldn't be here (unless statistics config changed or
            # there is a gap in anomaly scores in metric_data table)
            # TODO: unit-test this

            # Set limit to bypass processing of samples for immediate refresh
            # of anomaly likelihood params
            limitIndex = startRowIndex
            self._log.warning(
                "Anomaly run cutoff precedes samples, so forcing refresh of "
                "anomaly likelihood params: modelInfo=<%s>; rows=[%s..%s]",
                getMetricLogPrefix(metricObj), firstRowID, endRowID)
        else:
            # Cutoff is either inside or after the remaining samples
            # TODO: unit-test this
            limitIndex = startRowIndex + min(
                numRows - startRowIndex,
                endRowID + 1 - firstRowID)

        # Process the next new sample run
        self._log.debug(
            "Starting anomaly run: model=%s; "
            "startRowIndex=%s; limitIndex=%s; rows=[%s..%s]; "
            "last_rowid_for_stats=%s; refreshInterval=%s; batchSize=%s",
            metricObj.uid,
            startRowIndex, limitIndex, firstRowID, endRowID,
            anomalyParams["last_rowid_for_stats"],
            statisticsRefreshInterval, numRows)

        consumedSamples = []
        for md in itertools.islice(metricDataRows, startRowIndex, limitIndex):
            consumedSamples.append(md)

            # Feed one sample at a time so the running likelihood state in
            # anomalyParams["params"] is advanced per row.
            (likelihood,), _, anomalyParams["params"] = (
                algorithms.updateAnomalyLikelihoods(
                    ((md.timestamp, md.metric_value, md.raw_anomaly_score),),
                    anomalyParams["params"]))

            # TODO: the float "cast" here seems redundant
            md.anomaly_score = float(1.0 - likelihood)

            # If the anomaly score exceeds the threshold then we greedily
            # update the statistics. Such scores should not repeat too often,
            # but to be safe we wait a few more records before updating
            # again, in order to avoid overloading the DB.
            if (md.anomaly_score > greedyRefreshScoreThreshold and
                    (anomalyParams["last_rowid_for_stats"] +
                     greedyRefreshMinRowGap) < md.rowid):
                if statsSampleCache is None or (
                        len(statsSampleCache) + len(consumedSamples) >=
                        self._statisticsMinSampleSize):
                    # TODO: unit-test this
                    self._log.info(
                        "Forcing refresh of anomaly params for model=%s due "
                        "to exceeded anomaly_score threshold in sample=%r",
                        metricObj.uid, md)
                    break

        # NOTE: when consumedSamples is empty (bypass-run), the first operand
        # is necessarily true because startRowIndex < numRows, so the
        # consumedSamples[-1] lookup is never reached on an empty list.
        if startRowIndex + len(consumedSamples) < numRows or (
                consumedSamples[-1].rowid >= endRowID):
            # We stopped before the end of new samples, including a
            # bypass-run, or stopped after processing the last item and need
            # one final refresh of anomaly params
            anomalyParams, statsSampleCache = self._refreshAnomalyParams(
                engine=engine,
                metricID=metricObj.uid,
                statsSampleCache=statsSampleCache,
                consumedSamples=consumedSamples,
                defaultAnomalyParams=anomalyParams)

        startRowIndex += len(consumedSamples)
    # <--- while

    return anomalyParams
def _initAnomalyLikelihoodModel(self, engine, metricObj, metricDataRows):
    """
    Create the anomaly likelihood model for the given Metric instance.
    Assumes that the metric doesn't have anomaly params yet.

    :param engine: SQLAlchemy engine object
    :type engine: sqlalchemy.engine.Engine

    :param metricObj: Metric instance with no anomaly likelihood params
    :param metricDataRows: a sequence of MetricData instances corresponding
      to the inference results batch in the processed order (ascending by
      rowid and timestamp) with updated raw_anomaly_score and zeroed out
      anomaly_score corresponding to the new model inference results, but
      not yet updated in the database. Will not alter this sequence.

    :returns: the tuple (anomalyParams, statsSampleCache, startRowIndex)

      anomalyParams: falsy (normally None) if there are too few samples;
        otherwise, the anomaly likelihood params as returned by
        self._refreshAnomalyParams
      statsSampleCache: None, if there are too few samples; otherwise, the
        list of MetricData instances returned by self._refreshAnomalyParams.
        Per the original documentation it is a concatenation of rows sourced
        from the metric_data tail, topped off with items from metricDataRows,
        for a minimum of self._statisticsMinSampleSize and a maximum of
        self._statisticsSampleSize total items (that helper is not visible
        here -- confirm against it).
      startRowIndex: Index into the given metricDataRows where processing of
        anomaly scores is to start; if there are too few samples to generate
        the anomaly likelihood params, then startRowIndex will reference
        past the last item in the given metricDataRows sequence.

    :raises: MetricNotActiveError if metricObj.status is not
      MetricStatus.ACTIVE
    """
    if metricObj.status != MetricStatus.ACTIVE:
        raise MetricNotActiveError(
            "_initAnomalyLikelihoodModel failed because metric=%s is not "
            "ACTIVE; status=%s; resource=%s" % (
                metricObj.uid, metricObj.status, metricObj.server,))

    modelParams = jsonDecode(metricObj.model_params)
    anomalyParams = modelParams.get("anomalyLikelihoodParams", None)
    # Precondition: the caller only invokes us when no likelihood params
    # exist yet.
    assert not anomalyParams, anomalyParams

    statsSampleCache = None

    # Index into metricDataRows where processing of anomaly scores is to
    # start
    startRowIndex = 0

    with engine.connect() as conn:
        numProcessedRows = repository.getProcessedMetricDataCount(
            conn, metricObj.uid)

    if (numProcessedRows + len(metricDataRows) >=
            self._statisticsMinSampleSize):
        # We have enough samples to initialize the anomaly likelihood model
        # TODO: unit-test

        # Determine how many samples will be used from metricDataRows: only
        # as many as are needed to top the already-processed rows up to the
        # minimum sample size (none if the processed rows already suffice).
        numToConsume = max(
            0, self._statisticsMinSampleSize - numProcessedRows)
        consumedSamples = metricDataRows[:numToConsume]
        startRowIndex += numToConsume

        # Create the anomaly likelihood model
        anomalyParams, statsSampleCache = self._refreshAnomalyParams(
            engine=engine,
            metricID=metricObj.uid,
            statsSampleCache=None,
            consumedSamples=consumedSamples,
            defaultAnomalyParams=anomalyParams)

        # If this assertion fails, it implies that the count retrieved by our
        # call to repository.getProcessedMetricDataCount above is no longer
        # correct
        assert anomalyParams, (
            "_refreshAnomalyParams returned no anomaly params for model=%s; "
            "numProcessedRows=%s" % (metricObj.uid, numProcessedRows))

        self._log.info(
            "Generated initial anomaly params for model=%s: "
            "numSamples=%d; firstRowID=%s; lastRowID=%s; ",
            metricObj.uid, len(statsSampleCache),
            statsSampleCache[0].rowid, statsSampleCache[-1].rowid)
    else:
        # Not enough raw scores yet to begin anomaly likelihoods processing
        # TODO: unit-test
        startRowIndex = len(metricDataRows)

    return anomalyParams, statsSampleCache, startRowIndex