コード例 #1
0
ファイル: anomaly_service.py プロジェクト: bopopescu/what
    def _updateAnomalyLikelihoodParams(cls, conn, metricId, modelParamsJson,
                                       likelihoodParams):
        """Store the given anomaly likelihood params in the metric's model_params.

        The metric's status is first fetched via
        ``repository.getMetricWithUpdateLock``; the update only proceeds when that
        status equals ``MetricStatus.ACTIVE``. The likelihood params are placed
        under the ``"anomalyLikelihoodParams"`` key of the decoded model params,
        which are then re-serialized and written to the ``model_params`` column.

        :param conn: Transactional SQLAlchemy connection object
        :type conn: sqlalchemy.engine.base.Connection
        :param metricId: Metric uid
        :param modelParamsJson: Model params as a JSON string (from the
          model_params metric column)
        :param likelihoodParams: anomaly likelihood params dict

        :raises: htmengine.exceptions.MetricNotActiveError if the metric's status
          is not MetricStatus.ACTIVE
        """
        statusRow = repository.getMetricWithUpdateLock(
            conn, metricId, fields=[schema.metric.c.status])
        currentStatus = statusRow.status

        # Guard: refuse to touch the params of a metric that is not ACTIVE
        if currentStatus != MetricStatus.ACTIVE:
            errorFormat = (
                "_updateAnomalyLikelihoodParams failed because metric=%s is not "
                "ACTIVE; status=%s")
            raise MetricNotActiveError(errorFormat % (metricId, currentStatus))

        # Merge the likelihood params into the existing model params
        updatedParams = json.loads(modelParamsJson)
        updatedParams["anomalyLikelihoodParams"] = likelihoodParams
        serializedParams = json.dumps(updatedParams)

        repository.updateMetricColumns(
            conn, metricId, {"model_params": serializedParams})
コード例 #2
0
    def updateModelAnomalyScores(self, engine, metricObj, metricDataRows):
        """
        Calculate the anomaly scores based on the anomaly likelihoods. Update
        anomaly scores in the given metricDataRows MetricData instances, and
        calculate new anomaly likelihood params for the model.

        The rows are processed in consecutive "runs". Each run scores rows one at
        a time via ``algorithms.updateAnomalyLikelihoods`` and ends either at a
        computed row-id cutoff (``endRowID``), at the end of the batch, or early
        when a high anomaly score forces a refresh. After a run, the anomaly
        likelihood params are refreshed via ``self._refreshAnomalyParams`` when
        the run stopped short of the batch end or reached the cutoff.

        :param engine: SQLAlchemy engine object
        :type engine: sqlalchemy.engine.Engine
        :param metricObj: the model's Metric instance
        :param metricDataRows: a sequence of MetricData instances in the
          processed order (ascending by timestamp) with updated raw_anomaly_score
          and zeroed out anomaly_score corresponding to the new model inference
          results, but not yet updated in the database. Will update their
          anomaly_score properties, as needed.

        :returns: new anomaly likelihood params for the model. If the model had no
          likelihood params and ``self._initAnomalyLikelihoodModel`` could not
          create them, the value it returned is passed through unchanged
          (presumably None or another falsy value -- confirm against that method).

        :raises: MetricNotActiveError if ``metricObj.status`` is not
          MetricStatus.ACTIVE

        *NOTE:*
          the processing must be idempotent due to the "at least once" delivery
          semantics of the message bus

        *NOTE:*
          the performance goal is to minimize costly database access and avoid
          falling behind while processing model results, especially during the
          model's initial "catch-up" phase when large inference result batches are
          prevalent.
        """
        # When populated, a cached list of MetricData instances for updating
        # anomaly likelihood params
        statsSampleCache = None

        # Index into metricDataRows where processing is to resume
        startRowIndex = 0

        # The refresh interval is derived from the size of the current batch
        statisticsRefreshInterval = self._getStatisticsRefreshInterval(
            batchSize=len(metricDataRows))

        # NOTE(review): the error message below names "getAnomalyLikelihoodParams"
        # although it is raised from updateModelAnomalyScores; it is a runtime
        # string and is left unchanged here.
        if metricObj.status != MetricStatus.ACTIVE:
            raise MetricNotActiveError(
                "getAnomalyLikelihoodParams failed because metric=%s is not ACTIVE; "
                "status=%s; resource=%s" % (
                    metricObj.uid,
                    metricObj.status,
                    metricObj.server,
                ))

        modelParams = jsonDecode(metricObj.model_params)
        anomalyParams = modelParams.get("anomalyLikelihoodParams", None)
        if not anomalyParams:
            # We don't have a likelihood model yet. Create one if we have sufficient
            # records with raw anomaly scores
            (anomalyParams, statsSampleCache,
             startRowIndex) = (self._initAnomalyLikelihoodModel(
                 engine=engine,
                 metricObj=metricObj,
                 metricDataRows=metricDataRows))

        # Do anomaly likelihood processing on the rest of the new samples
        # NOTE: this loop will be skipped if there are still not enough samples for
        #  creating the anomaly likelihood params
        while startRowIndex < len(metricDataRows):
            # Determine where to stop processing rows prior to next statistics refresh

            if (statsSampleCache is None
                    or len(statsSampleCache) >= self._statisticsMinSampleSize):
                # We're here if:
                #   a. We haven't tried updating anomaly likelihood stats yet
                #                 OR
                #   b. We already updated anomaly likelihood stats (we had sufficient
                #      samples for it)
                # TODO: unit-test
                endRowID = (anomalyParams["last_rowid_for_stats"] +
                            statisticsRefreshInterval)

                if endRowID < metricDataRows[startRowIndex].rowid:
                    # We're here if:
                    #   a. Statistics refresh interval is smaller than during last stats
                    #      update; this is the typical/normal case when backlog catch-up
                    #      is tapering off, and refresh interval is reduced for smaller
                    #      batches. OR
                    #   b. There is a gap of anomaly scores preceding the start of the
                    #      current chunk. OR
                    #   c. Statistics config changed.
                    # TODO: unit-test

                    self._log.warning(
                        "Anomaly run cutoff precedes samples (smaller stats "
                        "refreshInterval or gap in anomaly scores or statistics config "
                        "changed) : model=%s; rows=[%s..%s]", metricObj.uid,
                        metricDataRows[startRowIndex].rowid, endRowID)

                    if statsSampleCache is not None:
                        # We already attempted to update anomaly likelihood params, so fix
                        # up endRowID to make sure we make progress and don't get stuck in
                        # an infinite loop
                        endRowID = metricDataRows[startRowIndex].rowid
                        self._log.warning(
                            "Advanced anomaly run cutoff to make progress: "
                            "model=%s; rows=[%s..%s]", metricObj.uid,
                            metricDataRows[startRowIndex].rowid, endRowID)
            else:
                # During prior iteration, there were not enough samples in cache for
                # updating anomaly params

                # We extend the end row so that there will be enough samples
                # to avoid getting stuck in this rut in the current and following
                # iterations
                # TODO: unit-test this
                endRowID = metricDataRows[startRowIndex].rowid + (
                    self._statisticsMinSampleSize - len(statsSampleCache) - 1)

            # Translate endRowID into metricDataRows limitIndex for current run
            if endRowID < metricDataRows[startRowIndex].rowid:
                # Cut-off precedes the remaining samples
                # Normally shouldn't be here (unless statistics config changed or there
                # is a gap in anomaly scores in metric_data table)
                # TODO: unit-test this

                # Set limit to bypass processing of samples for immediate refresh of
                # anomaly likelihood params
                limitIndex = startRowIndex
                self._log.warning(
                    "Anomaly run cutoff precedes samples, so forcing refresh of anomaly "
                    "likelihood params: modelInfo=<%s>; rows=[%s..%s]",
                    getMetricLogPrefix(metricObj),
                    metricDataRows[startRowIndex].rowid, endRowID)
            else:
                # Cutoff is either inside or after the remaining samples
                # NOTE(review): converting a rowid distance into an index distance
                # assumes rowids within metricDataRows are contiguous -- TODO confirm
                # TODO: unit-test this
                limitIndex = startRowIndex + min(
                    len(metricDataRows) - startRowIndex,
                    endRowID + 1 - metricDataRows[startRowIndex].rowid)

            # Process the next new sample run
            self._log.debug(
                "Starting anomaly run: model=%s; "
                "startRowIndex=%s; limitIndex=%s; rows=[%s..%s]; "
                "last_rowid_for_stats=%s; refreshInterval=%s; batchSize=%s",
                metricObj.uid, startRowIndex, limitIndex,
                metricDataRows[startRowIndex].rowid, endRowID,
                anomalyParams["last_rowid_for_stats"],
                statisticsRefreshInterval, len(metricDataRows))

            # Rows scored during this run; stays empty for a bypass-run
            # (limitIndex == startRowIndex)
            consumedSamples = []
            for md in itertools.islice(metricDataRows, startRowIndex,
                                       limitIndex):
                consumedSamples.append(md)

                # Feed a single (timestamp, value, raw score) record; the updated
                # likelihood state is carried forward in anomalyParams["params"]
                (likelihood, ), _, anomalyParams["params"] = (
                    algorithms.updateAnomalyLikelihoods(
                        ((md.timestamp, md.metric_value,
                          md.raw_anomaly_score), ), anomalyParams["params"]))

                # TODO: the float "cast" here seems redundant
                md.anomaly_score = float(1.0 - likelihood)

                # If anomaly score > 0.99 then we greedily update the statistics. 0.99
                # should not repeat too often, but to be safe we wait a few more
                # records before updating again, in order to avoid overloading the DB.
                #
                # TODO: the magic 0.99 and the magic 3 value below should either
                #  be constants or config settings. Where should they be defined?
                if (md.anomaly_score > 0.99 and
                    (anomalyParams["last_rowid_for_stats"] + 3) < md.rowid):
                    # Only cut the run short if a refresh has not been attempted yet,
                    # or if cache plus this run's samples reach the minimum sample size
                    if statsSampleCache is None or (
                            len(statsSampleCache) + len(consumedSamples) >=
                            self._statisticsMinSampleSize):
                        # TODO: unit-test this
                        self._log.info(
                            "Forcing refresh of anomaly params for model=%s due "
                            "to exceeded anomaly_score threshold in sample=%r",
                            metricObj.uid, md)
                        # The triggering sample has already been scored and appended
                        # to consumedSamples at this point
                        break

            # NOTE: when consumedSamples is empty (bypass-run), the first operand is
            # always true because of the while condition, so consumedSamples[-1] is
            # never evaluated on an empty list
            if startRowIndex + len(consumedSamples) < len(metricDataRows) or (
                    consumedSamples[-1].rowid >= endRowID):
                # We stopped before the end of new samples, including a bypass-run,
                # or stopped after processing the last item and need one final refresh
                # of anomaly params
                anomalyParams, statsSampleCache = self._refreshAnomalyParams(
                    engine=engine,
                    metricID=metricObj.uid,
                    statsSampleCache=statsSampleCache,
                    consumedSamples=consumedSamples,
                    defaultAnomalyParams=anomalyParams)

            # Resume the next run right after the rows consumed by this one
            startRowIndex += len(consumedSamples)
        # <--- while

        return anomalyParams
コード例 #3
0
    def _initAnomalyLikelihoodModel(self, engine, metricObj, metricDataRows):
        """Build the initial anomaly likelihood model for a metric that has none.

        The number of already-processed rows is fetched via
        ``repository.getProcessedMetricDataCount``. If that count plus the size of
        the new batch reaches ``self._statisticsMinSampleSize``, just enough
        leading rows of ``metricDataRows`` are consumed to reach the minimum and
        the params are generated via ``self._refreshAnomalyParams``.

        :param engine: SQLAlchemy engine object
        :type engine: sqlalchemy.engine.Engine

        :param metricObj: Metric instance with no anomaly likelihood params
        :param metricDataRows: a sequence of MetricData instances corresponding to
          the inference results batch in the processed order (ascending by rowid
          and timestamp) with updated raw_anomaly_score and zeroed out
          anomaly_score, but not yet updated in the database. This method only
          reads its length and takes a leading slice of it.

        :returns: the tuple (anomalyParams, statsSampleCache, startRowIndex)
          anomalyParams: with too few samples, the (falsy) value read from the
            metric's model_params, normally None; otherwise the anomaly likelihood
            params returned by self._refreshAnomalyParams
          statsSampleCache: None with too few samples; otherwise the sample cache
            returned by self._refreshAnomalyParams
          startRowIndex: index into metricDataRows where anomaly score processing
            is to start; with too few samples this is len(metricDataRows), i.e.
            past the last item.

        :raises: MetricNotActiveError if metricObj.status is not
          MetricStatus.ACTIVE
        """
        if metricObj.status != MetricStatus.ACTIVE:
            notActiveFormat = (
                "getAnomalyLikelihoodParams failed because metric=%s is not ACTIVE; "
                "status=%s; resource=%s")
            raise MetricNotActiveError(
                notActiveFormat % (metricObj.uid, metricObj.status,
                                   metricObj.server))

        anomalyParams = jsonDecode(metricObj.model_params).get(
            "anomalyLikelihoodParams", None)

        # Precondition: the metric must not have likelihood params yet
        assert not anomalyParams, anomalyParams

        with engine.connect() as conn:
            processedCount = repository.getProcessedMetricDataCount(
                conn, metricObj.uid)

        batchLength = len(metricDataRows)
        minSampleSize = self._statisticsMinSampleSize

        if processedCount + batchLength < minSampleSize:
            # Too few raw scores so far: skip the whole batch and report that no
            # model could be created
            # TODO: unit-test
            return anomalyParams, None, batchLength

        # Enough samples are available to initialize the anomaly likelihood model
        # TODO: unit-test

        # Take from the new batch only what is needed to reach the minimum
        numFromBatch = max(0, minSampleSize - processedCount)
        leadingSamples = metricDataRows[:numFromBatch]

        anomalyParams, statsSampleCache = self._refreshAnomalyParams(
            engine=engine,
            metricID=metricObj.uid,
            statsSampleCache=None,
            consumedSamples=leadingSamples,
            defaultAnomalyParams=anomalyParams)

        # A failure here presumably means the processed-row count fetched above
        # was no longer accurate by the time the params were refreshed
        assert anomalyParams

        self._log.info(
            "Generated initial anomaly params for model=%s: "
            "numSamples=%d; firstRowID=%s; lastRowID=%s; ", metricObj.uid,
            len(statsSampleCache), statsSampleCache[0].rowid,
            statsSampleCache[-1].rowid)

        return anomalyParams, statsSampleCache, numFromBatch