Example #1
    def _updateAnomalyLikelihoodParams(cls, conn, metricId, modelParamsJson,
        """Update and save anomaly_params with the given likelyhoodParams if the
       metric is ACTIVE.

    :param conn: Transactional SQLAlchemy connection object
    :type conn: sqlalchemy.engine.base.Connection
    :param metricId: Metric uid
    :param modelParamsJson: Model params JSON object (from model_params metric
    :param likelihoodParams: anomaly likelihood params dict

    :raises: htmengine.exceptions.MetricNotActiveError if metric's status is not
        # NOTE(review): the end of the signature, the tail of the docstring and
        # its closing quotes are not visible in this excerpt.
        # Fetch only the status column of the metric row; judging by the helper's
        # name, the row is presumably locked for update within the caller's
        # transaction -- confirm against repository.getMetricWithUpdateLock.
        lockedRow = repository.getMetricWithUpdateLock(
            conn, metricId, fields=[schema.metric.c.status])

        # Refuse to touch the params of a metric that is not ACTIVE.
        # NOTE(review): the format arguments and closing parentheses of this raise
        # are not visible in this excerpt.
        if lockedRow.status != MetricStatus.ACTIVE:
            raise MetricNotActiveError(
                "_updateAnomalyLikelihoodParams failed because metric=%s is not "
                "ACTIVE; status=%s" % (

        # Decode the model params JSON and set (or overwrite) its
        # "anomalyLikelihoodParams" entry with the given likelihoodParams.
        modelParams = json.loads(modelParamsJson)
        modelParams["anomalyLikelihoodParams"] = likelihoodParams

        # NOTE(review): the opening line of the call that receives these
        # arguments is not visible in this excerpt; it is passed the re-serialized
        # model params under the "model_params" key.
            conn, metricId, {"model_params": json.dumps(modelParams)})
Example #2
    def updateModelAnomalyScores(self, engine, metricObj, metricDataRows):
    Calculate the anomaly scores based on the anomaly likelihoods. Update
    anomaly scores in the given metricDataRows MetricData instances, and
    calculate new anomaly likelihood params for the model.

    :param engine: SQLAlchemy engine object
    :type engine: sqlalchemy.engine.Engine
    :param metricObj: the model's Metric instance
    :param metricDataRows: a sequence of MetricData instances in the
      processed order (ascending by timestamp) with updated raw_anomaly_score
      and zeroed out anomaly_score corresponding to the new model inference
      results, but not yet updated in the database. Will update their
      anomaly_score properties, as needed.

    :returns: new anomaly likelihood params for the model

      the processing must be idempotent due to the "at least once" delivery
      semantics of the message bus

      the performance goal is to minimize costly database access and avoid
      falling behind while processing model results, especially during the
      model's initial "catch-up" phase when large inference result batches are
        # NOTE(review): the docstring's opening/closing quotes and part of its
        # text are not visible in this excerpt; several statements below are
        # likewise cut short (unclosed calls, missing branch headers).
        # When populated, a cached list of MetricData instances for updating
        # anomaly likelihood params
        statsSampleCache = None

        # Index into metricDataRows where processing is to resume
        startRowIndex = 0

        # NOTE(review): this call is not closed in this excerpt; its arguments
        # are not visible here.
        statisticsRefreshInterval = self._getStatisticsRefreshInterval(

        # Anomaly likelihood processing is refused for metrics that are not
        # ACTIVE.
        # NOTE(review): the message below names getAnomalyLikelihoodParams rather
        # than this method, and its format arguments are not visible in this
        # excerpt.
        if metricObj.status != MetricStatus.ACTIVE:
            raise MetricNotActiveError(
                "getAnomalyLikelihoodParams failed because metric=%s is not ACTIVE; "
                "status=%s; resource=%s" % (

        # Existing likelihood params, if any, live under the
        # "anomalyLikelihoodParams" key of the metric's decoded model_params.
        modelParams = jsonDecode(metricObj.model_params)
        anomalyParams = modelParams.get("anomalyLikelihoodParams", None)
        if not anomalyParams:
            # We don't have a likelihood model yet. Create one if we have sufficient
            # records with raw anomaly scores
            # NOTE(review): the arguments of this call are not visible in this
            # excerpt.
            (anomalyParams, statsSampleCache,
             startRowIndex) = (self._initAnomalyLikelihoodModel(

        # Do anomaly likelihood processing on the rest of the new samples
        # NOTE: this loop will be skipped if there are still not enough samples for
        #  creating the anomaly likelihood params
        while startRowIndex < len(metricDataRows):
            # Determine where to stop processing rows prior to next statistics refresh

            if (statsSampleCache is None
                    or len(statsSampleCache) >= self._statisticsMinSampleSize):
                # We're here if:
                #   a. We haven't tried updating anomaly likelihood stats yet
                #                 OR
                #   b. We already updated anomaly likelihood stats (we had sufficient
                #      samples for it)
                # TODO: unit-test
                # NOTE(review): the right-hand operand of this sum is not visible
                # in this excerpt.
                endRowID = (anomalyParams["last_rowid_for_stats"] +

                if endRowID < metricDataRows[startRowIndex].rowid:
                    # We're here if:
                    #   a. Statistics refresh interval is smaller than during last stats
                    #      update; this is the typical/normal case when backlog catch-up
                    #      is tapering off, and refresh interval is reduced for smaller
                    #      batches. OR
                    #   b. There is a gap of anomaly scores preceding the start of the
                    #      current chunk. OR
                    #   c. Statistics config changed.
                    # TODO: unit-test
                    # NOTE(review): the opening line of the call (presumably a
                    # logging call) that takes the arguments below is not visible
                    # in this excerpt.

                        "Anomaly run cutoff precedes samples (smaller stats "
                        "refreshInterval or gap in anomaly scores or statistics config "
                        "changed) : model=%s; rows=[%s..%s]", metricObj.uid,
                        metricDataRows[startRowIndex].rowid, endRowID)

                    if statsSampleCache is not None:
                        # We already attempted to update anomaly likelihood params, so fix
                        # up endRowID to make sure we make progress and don't get stuck in
                        # an infinite loop
                        endRowID = metricDataRows[startRowIndex].rowid
                        # NOTE(review): the opening line of the call that takes
                        # the arguments below is not visible in this excerpt.
                            "Advanced anomaly run cutoff to make progress: "
                            "model=%s; rows=[%s..%s]", metricObj.uid,
                            metricDataRows[startRowIndex].rowid, endRowID)
                # NOTE(review): presumably the body of an else branch whose header
                # is not visible in this excerpt -- confirm.
                # During prior iteration, there were not enough samples in cache for
                # updating anomaly params

                # We extend the end row so that there will be enough samples
                # to avoid getting stuck in this rut in the current and following
                # iterations
                # TODO: unit-test this
                endRowID = metricDataRows[startRowIndex].rowid + (
                    self._statisticsMinSampleSize - len(statsSampleCache) - 1)

            # Translate endRowID into metricDataRows limitIndex for current run
            if endRowID < metricDataRows[startRowIndex].rowid:
                # Cut-off precedes the remaining samples
                # Normally shouldn't be here (unless statistics config changed or there
                # is a gap in anomaly scores in metric_data table)
                # TODO: unit-test this

                # Set limit to bypass processing of samples for immediate refresh of
                # anomaly likelihood params
                limitIndex = startRowIndex
                # NOTE(review): the opening line of the call that takes the
                # arguments below is not visible in this excerpt.
                    "Anomaly run cutoff precedes samples, so forcing refresh of anomaly "
                    "likelihood params: modelInfo=<%s>; rows=[%s..%s]",
                    metricDataRows[startRowIndex].rowid, endRowID)
                # NOTE(review): presumably the body of an else branch whose header
                # is not visible in this excerpt -- confirm.
                # Cutoff is either inside or after the remaining samples
                # TODO: unit-test this
                # The run covers rows up to and including endRowID, capped at the
                # number of rows remaining in metricDataRows.
                limitIndex = startRowIndex + min(
                    len(metricDataRows) - startRowIndex,
                    endRowID + 1 - metricDataRows[startRowIndex].rowid)

            # Process the next new sample run
            # NOTE(review): the opening line of the call that takes the arguments
            # below is not visible in this excerpt.
                "Starting anomaly run: model=%s; "
                "startRowIndex=%s; limitIndex=%s; rows=[%s..%s]; "
                "last_rowid_for_stats=%s; refreshInterval=%s; batchSize=%s",
                metricObj.uid, startRowIndex, limitIndex,
                metricDataRows[startRowIndex].rowid, endRowID,
                statisticsRefreshInterval, len(metricDataRows))

            # Samples handled in this run; its length advances startRowIndex at the
            # bottom of the loop.
            # NOTE(review): the statements that add to this list, the end of the
            # islice() call and the name of the function called in the assignment
            # below are not visible in this excerpt.
            consumedSamples = []
            for md in itertools.islice(metricDataRows, startRowIndex,

                (likelihood, ), _, anomalyParams["params"] = (
                        ((md.timestamp, md.metric_value,
                          md.raw_anomaly_score), ), anomalyParams["params"]))

                # The stored anomaly score is the complement of the returned
                # likelihood value.
                # TODO: the float "cast" here seems redundant
                md.anomaly_score = float(1.0 - likelihood)

                # If anomaly score > 0.99 then we greedily update the statistics. 0.99
                # should not repeat too often, but to be safe we wait a few more
                # records before updating again, in order to avoid overloading the DB.
                # TODO: the magic 0.99 and the magic 3 value below should either
                #  be constants or config settings. Where should they be defined?
                # NOTE(review): the rest of the inner condition and the statements
                # it guards are not visible in this excerpt.
                if (md.anomaly_score > 0.99 and
                    (anomalyParams["last_rowid_for_stats"] + 3) < md.rowid):
                    if statsSampleCache is None or (
                            len(statsSampleCache) + len(consumedSamples) >=
                        # TODO: unit-test this
                            "Forcing refresh of anomaly params for model=%s due "
                            "to exceeded anomaly_score threshold in sample=%r",
                            metricObj.uid, md)

            if startRowIndex + len(consumedSamples) < len(metricDataRows) or (
                    consumedSamples[-1].rowid >= endRowID):
                # We stopped before the end of new samples, including a bypass-run,
                # or stopped after processing the last item and need one final refresh
                # of anomaly params
                # NOTE(review): the arguments of this call are not visible in this
                # excerpt.
                anomalyParams, statsSampleCache = self._refreshAnomalyParams(

            startRowIndex += len(consumedSamples)
        # <--- while

        return anomalyParams
Example #3
    def _initAnomalyLikelihoodModel(self, engine, metricObj, metricDataRows):
        """ Create the anomaly likelihood model for the given Metric instance.
    Assumes that the metric doesn't have anomaly params yet.

    :param engine: SQLAlchemy engine object
    :type engine: sqlalchemy.engine.Engine

    :param metricObj: Metric instance with no anomaly likelihood params
    :param metricDataRows: a sequence of MetricData instances
      corresponding to the inference results batch in the processed order
      (ascending by rowid and timestamp) with updated raw_anomaly_score and
      zeroed out anomaly_score corresponding to the new model inference results,
      but not yet updated in the database. Will not alter this sequence.

    :returns: the tuple (anomalyParams, statsSampleCache, startRowIndex)
      anomalyParams: None, if there are too few samples; otherwise, the anomaly
        likelyhood objects as returned by algorithms.estimateAnomalyLikelihoods
      statsSampleCache: None, if there are too few samples; otherwise, a list of
        MetricData instances comprising of a concatenation of rows sourced
        from metric_data tail and topped off with necessary items from the
        given metricDataRows for a minimum of self._statisticsMinSampleSize and
        a maximum of self._statisticsSampleSize total items.
      startRowIndex: Index into the given metricDataRows where processing of
        anomaly scores is to start; if there are too few samples to generate
        the anomaly likelihood params, then startRowIndex will reference past
        the last item in the given metricDataRows sequence.
        # NOTE(review): the docstring's closing quotes are not visible in this
        # excerpt.
        # NOTE(review): the message below names getAnomalyLikelihoodParams rather
        # than this method, and its format arguments are not visible in this
        # excerpt.
        if metricObj.status != MetricStatus.ACTIVE:
            raise MetricNotActiveError(
                "getAnomalyLikelihoodParams failed because metric=%s is not ACTIVE; "
                "status=%s; resource=%s" % (

        modelParams = jsonDecode(metricObj.model_params)
        anomalyParams = modelParams.get("anomalyLikelihoodParams", None)

        # Precondition: the metric must not have likelihood params yet.
        # NOTE(review): assert statements are skipped when Python runs with -O.
        assert not anomalyParams, anomalyParams

        statsSampleCache = None

        # Index into metricDataRows where processing of anomaly scores is to start
        startRowIndex = 0

        # Count of this metric's rows as reported by
        # repository.getProcessedMetricDataCount; the connection is only held for
        # the duration of this query.
        with engine.connect() as conn:
            numProcessedRows = repository.getProcessedMetricDataCount(
                conn, metricObj.uid)

        if numProcessedRows + len(
                metricDataRows) >= self._statisticsMinSampleSize:
            # We have enough samples to initialize the anomaly likelihood model
            # TODO: unit-test

            # Determine how many samples will be used from metricDataRows
            # Only as many new rows as are needed to reach the minimum sample size
            # on top of numProcessedRows; zero if that count already suffices.
            numToConsume = max(
                0, self._statisticsMinSampleSize - numProcessedRows)
            consumedSamples = metricDataRows[:numToConsume]
            startRowIndex += numToConsume

            # Create the anomaly likelihood model
            # NOTE(review): the arguments of this call are not visible in this
            # excerpt.
            anomalyParams, statsSampleCache = self._refreshAnomalyParams(

            # If this assertion fails, it implies that the count retrieved by our
            # call to repository.getProcessedMetricDataCount above is no longer
            # correct
            assert anomalyParams

            # NOTE(review): the opening line of the call (presumably a logging
            # call) that takes the arguments below, and its final argument, are
            # not visible in this excerpt.
                "Generated initial anomaly params for model=%s: "
                "numSamples=%d; firstRowID=%s; lastRowID=%s; ", metricObj.uid,
                len(statsSampleCache), statsSampleCache[0].rowid,
            # NOTE(review): presumably the body of an else branch whose header is
            # not visible in this excerpt -- confirm.
            # Not enough raw scores yet to begin anomaly likelihoods processing
            # TODO: unit-test
            # Point past the last row so the caller has nothing left to process.
            startRowIndex = len(metricDataRows)

        return anomalyParams, statsSampleCache, startRowIndex