  def updateModelAnomalyScores(self, engine, metricObj, metricDataRows):
    """
    Calculate the anomaly scores based on the anomaly likelihoods. Update
    anomaly scores in the given metricDataRows MetricData instances, and
    calculate new anomaly likelihood params for the model.

    :param engine: SQLAlchemy engine object
    :type engine: sqlalchemy.engine.Engine
    :param metricObj: the model's Metric instance
    :param metricDataRows: a sequence of MetricData instances in the
      processed order (ascending by timestamp) with updated raw_anomaly_score
      and zeroed out anomaly_score corresponding to the new model inference
      results, but not yet updated in the database. Will update their
      anomaly_score properties, as needed.

    :returns: new anomaly likelihood params for the model

    *NOTE:*
      the processing must be idempotent due to the "at least once" delivery
      semantics of the message bus

    *NOTE:*
      the performance goal is to minimize costly database access and avoid
      falling behind while processing model results, especially during the
      model's initial "catch-up" phase when large inference result batches are
      prevalent.
    """
    # When populated, a cached list of MetricData instances for updating
    # anomaly likelihood params
    statsSampleCache = None

    # Index into metricDataRows where processing is to resume
    startRowIndex = 0

    statisticsRefreshInterval = self._getStatisticsRefreshInterval(
      batchSize=len(metricDataRows))

    if metricObj.status != MetricStatus.ACTIVE:
      raise MetricNotActiveError(
        "getAnomalyLikelihoodParams failed because metric=%s is not ACTIVE; "
        "status=%s; resource=%s" % (metricObj.uid,
                                    metricObj.status,
                                    metricObj.server,))

    modelParams = jsonDecode(metricObj.model_params)
    anomalyParams = modelParams.get("anomalyLikelihoodParams", None)
    if not anomalyParams:
      # We don't have a likelihood model yet. Create one if we have sufficient
      # records with raw anomaly scores
      (anomalyParams, statsSampleCache, startRowIndex) = (
        self._initAnomalyLikelihoodModel(engine=engine,
                                         metricObj=metricObj,
                                         metricDataRows=metricDataRows))

    # Do anomaly likelihood processing on the rest of the new samples
    # NOTE: this loop will be skipped if there are still not enough samples for
    #  creating the anomaly likelihood params
    while startRowIndex < len(metricDataRows):
      # Determine where to stop processing rows prior to next statistics refresh

      if (statsSampleCache is None or
          len(statsSampleCache) >= self._statisticsMinSampleSize):
        # We're here if:
        #   a. We haven't tried updating anomaly likelihood stats yet
        #                 OR
        #   b. We already updated anomaly likelihood stats (we had sufficient
        #      samples for it)
        # TODO: unit-test
        endRowID = (anomalyParams["last_rowid_for_stats"] +
                    statisticsRefreshInterval)
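        # Illustrative arithmetic (values hypothetical): with
        # last_rowid_for_stats=1000 and statisticsRefreshInterval=250, the
        # cutoff endRowID is 1250, so rows up to and including rowid 1250 are
        # consumed before the next likelihood-stats refresh.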

        if endRowID < metricDataRows[startRowIndex].rowid:
          # We're here if:
          #   a. Statistics refresh interval is smaller than during last stats
          #      update; this is the typical/normal case when backlog catch-up
          #      is tapering off, and refresh interval is reduced for smaller
          #      batches. OR
          #   b. There is a gap of anomaly scores preceding the start of the
          #      current chunk. OR
          #   c. Statistics config changed.
          # TODO: unit-test

          self._log.warning(
            "Anomaly run cutoff precedes samples (smaller stats "
            "refreshInterval or gap in anomaly scores or statistics config "
            "changed) : model=%s; rows=[%s..%s]",
            metricObj.uid, metricDataRows[startRowIndex].rowid, endRowID)

          if statsSampleCache is not None:
            # We already attempted to update anomaly likelihood params, so fix
            # up endRowID to make sure we make progress and don't get stuck in
            # an infinite loop
            endRowID = metricDataRows[startRowIndex].rowid
            self._log.warning(
              "Advanced anomaly run cutoff to make progress: "
              "model=%s; rows=[%s..%s]",
              metricObj.uid, metricDataRows[startRowIndex].rowid, endRowID)
      else:
        # During prior iteration, there were not enough samples in cache for
        # updating anomaly params

        # We extend the end row so that there will be enough samples
        # to avoid getting stuck in this rut in the current and following
        # iterations
        # TODO: unit-test this
        endRowID = metricDataRows[startRowIndex].rowid + (
          self._statisticsMinSampleSize - len(statsSampleCache) - 1)
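        # Illustrative arithmetic (values hypothetical): with a current rowid
        # of 500, _statisticsMinSampleSize=200 and 140 cached samples, the
        # cutoff is 500 + (200 - 140 - 1) = 559; consuming rows 500..559 adds
        # 60 samples and brings the cache up to the 200-sample minimum.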

      # Translate endRowID into metricDataRows limitIndex for current run
      if endRowID < metricDataRows[startRowIndex].rowid:
        # Cut-off precedes the remaining samples
        # Normally shouldn't be here (unless statistics config changed or there
        # is a gap in anomaly scores in metric_data table)
        # TODO: unit-test this

        # Set limit to bypass processing of samples for immediate refresh of
        # anomaly likelihood params
        limitIndex = startRowIndex
        self._log.warning(
          "Anomaly run cutoff precedes samples, so forcing refresh of anomaly "
          "likelihood params: modelInfo=<%s>; rows=[%s..%s]",
          getMetricLogPrefix(metricObj),
          metricDataRows[startRowIndex].rowid, endRowID)
      else:
        # Cutoff is either inside or after the remaining samples
        # TODO: unit-test this
        limitIndex = startRowIndex + min(
          len(metricDataRows) - startRowIndex,
          endRowID + 1 - metricDataRows[startRowIndex].rowid)
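        # Illustrative arithmetic (values hypothetical): with startRowIndex=10,
        # 40 rows remaining, the first remaining rowid=310 and endRowID=324,
        # limitIndex = 10 + min(40, 324 + 1 - 310) = 25, i.e., this run covers
        # indexes 10..24 (rowids 310..324).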

      # Process the next new sample run
      self._log.debug(
        "Starting anomaly run: model=%s; "
        "startRowIndex=%s; limitIndex=%s; rows=[%s..%s]; "
        "last_rowid_for_stats=%s; refreshInterval=%s; batchSize=%s",
        metricObj.uid,
        startRowIndex, limitIndex, metricDataRows[startRowIndex].rowid,
        endRowID, anomalyParams["last_rowid_for_stats"],
        statisticsRefreshInterval, len(metricDataRows))

      consumedSamples = []
      for md in itertools.islice(metricDataRows, startRowIndex, limitIndex):
        consumedSamples.append(md)

        (likelihood,), _, anomalyParams["params"] = (
          algorithms.updateAnomalyLikelihoods(
            ((md.timestamp, md.metric_value, md.raw_anomaly_score),),
            anomalyParams["params"]))

        # TODO: the float "cast" here seems redundant
        md.anomaly_score = float(1.0 - likelihood)

        # If anomaly score > 0.99 then we greedily update the statistics. 0.99
        # should not repeat too often, but to be safe we wait a few more
        # records before updating again, in order to avoid overloading the DB.
        #
        # TODO: the magic 0.99 and the magic 3 value below should either
        #  be constants or config settings. Where should they be defined?
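        # Illustrative example (values hypothetical): with
        # last_rowid_for_stats=500, a score above 0.99 at rowids 501..503 does
        # not force a refresh, but at rowid 504 or later it does, provided the
        # cache plus this run's consumed samples reach _statisticsMinSampleSize.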
        if (md.anomaly_score > 0.99 and
            (anomalyParams["last_rowid_for_stats"] + 3) < md.rowid):
          if statsSampleCache is None or (
              len(statsSampleCache) + len(consumedSamples) >=
              self._statisticsMinSampleSize):
            # TODO: unit-test this
            self._log.info("Forcing refresh of anomaly params for model=%s due "
                           "to exceeded anomaly_score threshold in sample=%r",
                           metricObj.uid, md)
            break

      if startRowIndex + len(consumedSamples) < len(metricDataRows) or (
          consumedSamples[-1].rowid >= endRowID):
        # We stopped before the end of new samples, including a bypass-run,
        # or stopped after processing the last item and need one final refresh
        # of anomaly params
        anomalyParams, statsSampleCache = self._refreshAnomalyParams(
          engine=engine,
          metricID=metricObj.uid,
          statsSampleCache=statsSampleCache,
          consumedSamples=consumedSamples,
          defaultAnomalyParams=anomalyParams)

      startRowIndex += len(consumedSamples)
    # <--- while

    return anomalyParams

  def _processModelInferenceResults(self, inferenceResults, metricID):
    """
    Process a batch of model inference results

    Store the updated MetricData and anomaly likelihood parameters in the
    database.

    A row's anomaly_score value will be set to and remain at 0 in the
    first self._statisticsMinSampleSize rows; once we get enough inference
    results to create an anomaly likelihood model, anomaly_score will be
    computed on the subsequent rows.

    :param inferenceResults: a sequence of ModelInferenceResult instances in the
      processed order (ascending by timestamp)

    :param metricID: metric/model ID of the model that emitted the results

    :returns: None if the batch was rejected; otherwise a pair:
      (metric, metricDataRows)
        metric: Metric RowProxy instance corresponding to the given metricID
        metricDataRows: a sequence of MutableMetricDataRow instances
          corresponding to the updated metric_data rows.
      TODO: unit-test return value
    :rtype: None or tuple

    *NOTE:*
      the processing must be idempotent due to the "at least once" delivery
      semantics of the message bus

    *NOTE:*
      the performance goal is to minimize costly database access and avoid
      falling behind while processing model results, especially during the
      model's initial "catch-up" phase when large inference result batches are
      prevalent.
    """
    engine = repository.engineFactory(config)

    # Validate model ID
    try:
      with engine.connect() as conn:
        metricObj = repository.getMetric(conn, metricID)
    except ObjectNotFoundError:
      # Ignore inferences for unknown models. Typically, this is the result
      # of a deleted model. Another scenario where this might occur is when a
      # developer resets the db while there are result messages still on the
      # message bus. It would be an error if this were to occur in a production
      # environment.
      self._log.warning("Received inference results for unknown model=%s; "
                        "(model deleted?)", metricID, exc_info=True)
      return None

    # Reject the results if model is in non-ACTIVE state (e.g., if HTM Metric
    # was unmonitored after the results were generated)
    if metricObj.status != MetricStatus.ACTIVE:
      self._log.warning("Received inference results for a non-ACTIVE "
                        "model=%s; metric=<%s>; (metric unmonitored?)",
                        metricID, getMetricLogPrefix(metricObj))
      return None

    # Load the MetricData instances corresponding to the results
    with engine.connect() as conn:
      metricDataRows = repository.getMetricData(conn,
                                                metricID,
                                                start=inferenceResults[0].rowID,
                                                stop=inferenceResults[-1].rowID)

    # metricDataRows must be mutable, as the data is massaged in
    # _scrubInferenceResultsAndInitMetricData()
    metricDataRows = list(metricDataRows)

    if not metricDataRows:
      self._log.error("Rejected inference result batch=[%s..%s] of model=%s "
                      "due to no matching metric_data rows",
                      inferenceResults[0].rowID, inferenceResults[-1].rowID,
                      metricID)
      return None

    try:
      self._scrubInferenceResultsAndInitMetricData(
        engine=engine,
        inferenceResults=inferenceResults,
        metricDataRows=metricDataRows,
        metricObj=metricObj)
    except RejectedInferenceResultBatch as e:
      # TODO: unit-test
      self._log.error(
        "Rejected inference result batch=[%s..%s] corresponding to "
        "rows=[%s..%s] of model=%s due to error=%r",
        inferenceResults[0].rowID, inferenceResults[-1].rowID,
        metricDataRows[0].rowid, metricDataRows[-1].rowid, metricID, e)
      return None

    # Update anomaly scores based on the new results
    anomalyLikelihoodParams = (
      self.likelihoodHelper.updateModelAnomalyScores(
        engine=engine,
        metricObj=metricObj,
        metricDataRows=metricDataRows))

    # Update metric data rows with rescaled display values
    # NOTE: doing this outside the updateColumns loop to avoid holding row locks
    #  any longer than necessary
    for metricData in metricDataRows:
      metricData.display_value = rescaleForDisplay(
        metricData.anomaly_score,
        active=(metricObj.status == MetricStatus.ACTIVE))

    # Update database once via transaction!
    startTime = time.time()
    try:
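      # NOTE: retryOnTransientErrors presumably re-invokes runSQL when the
      # database reports a transient failure; engine.begin() commits the whole
      # batch atomically on success and rolls it back on exception, so a retry
      # never sees partially written rows.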
      @retryOnTransientErrors
      def runSQL(engine):
        with engine.begin() as conn:
          for metricData in metricDataRows:
            fields = {"raw_anomaly_score": metricData.raw_anomaly_score,
                      "anomaly_score": metricData.anomaly_score,
                      "display_value": metricData.display_value,
                      "multi_step_best_predictions":
                        json.dumps(metricData.multi_step_best_predictions)}
            repository.updateMetricDataColumns(conn, metricData, fields)

          self._updateAnomalyLikelihoodParams(
            conn,
            metricObj.uid,
            metricObj.model_params,
            anomalyLikelihoodParams)

      runSQL(engine)
    except (ObjectNotFoundError, MetricNotActiveError):
      self._log.warning("Rejected inference result batch=[%s..%s] of model=%s",
                        inferenceResults[0].rowID, inferenceResults[-1].rowID,
                        metricID, exc_info=True)
      return None

    self._log.debug("Updated HTM metric_data rows=[%s..%s] "
                    "of model=%s: duration=%ss",
                    metricDataRows[0].rowid, metricDataRows[-1].rowid,
                    metricID, time.time() - startTime)

    return (metricObj, metricDataRows,)

  def _scrubInferenceResultsAndInitMetricData(self, engine, inferenceResults,
                                              metricDataRows, metricObj):
    """ Validate the given inferenceResults against metricDataRows, update
    corresponding MetricData instances by initializing their
    `raw_anomaly_score` property from results and the `anomaly_score` property
    with 0. Replace elements in metricDataRows with MutableMetricDataRow
    objects.

    *NOTE:* does NOT update the MetricData instances to the database (we do that
    once after we process the batch for efficiency)

    :param engine: SQLAlchemy engine object
    :type engine: sqlalchemy.engine.Engine

    :param inferenceResults: a sequence of ModelInferenceResult instances
      representing the inference result batch ordered by row id

    :param metricDataRows: a mutable list of MetricData instances with row ids
      in the range of inferenceResults[0].rowID to inferenceResults[-1].rowID

    :param metricObj: a Metric instance associated with the given
      inferenceResults

    :raises RejectedInferenceResultBatch: if the given result batch is rejected
    """
    for result, enumeratedMetricData in itertools.izip_longest(
        inferenceResults, enumerate(metricDataRows)):

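      # izip_longest pads the shorter sequence with None, so a None result or
      # a None metric data row signals a length mismatch between the inference
      # result batch and the loaded metric_data rows.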
      if enumeratedMetricData is None:
        raise RejectedInferenceResultBatch(
          "No MetricData row for inference result=%r of model=<%r>" % (
            result, metricObj))
      index, metricData = enumeratedMetricData

      if result is None:
        raise RejectedInferenceResultBatch(
          "Truncated inference result batch; no result for metric data row=%r "
          "of model=<%r>" % (metricData, metricObj))

      if metricData is None:
        raise RejectedInferenceResultBatch(
          "No MetricData row for inference result=%r of model=<%r>" %
          (result, metricObj))

      if result.rowID != metricData.rowid:
        raise RejectedInferenceResultBatch(
          "RowID mismatch between inference result=%r and ModelData row=%r of "
          "model=<%r>" % (result, metricData, metricObj))

      if metricData.raw_anomaly_score is not None:
        # Side-effect of at-least-once delivery guarantee?
        self._log.error(
          "Anomaly was already processed on data row=%s; new result=%r",
          metricData, result)

      # Validate the result
      if result.status != 0:
        self._log.error(result.errorMessage)
        if metricObj.status == MetricStatus.ERROR:
          raise RejectedInferenceResultBatch(
            "inferenceResult=%r failed and model=<%s> was in ERROR state" %
            (result, getMetricLogPrefix(metricObj)))
        else:
          self._log.error("Placing model=<%r> in ERROR state due to "
                          "inferenceResult=%r", metricObj, result)
          with engine.connect() as conn:
            repository.setMetricStatus(conn,
                                       metricObj.uid,
                                       MetricStatus.ERROR,
                                       result.errorMessage)
          raise RejectedInferenceResultBatch(
            "inferenceResult=%r failed and model=<%s> promoted to ERROR state" %
            (result, getMetricLogPrefix(metricObj)))

      #self._log.info("{TAG:ANOM.METRIC} metric=%s:%s:%s",
      #               metricObj.name,
      #               calendar.timegm(metricData.timestamp.timetuple()),
      #               metricData.metric_value)

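      # Replace the read-only row with a mutable copy so that raw_anomaly_score,
      # anomaly_score, and the prediction fields can be set in place here, and
      # display_value later, without touching the database yet.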
      mutableMetricData = MutableMetricDataRow(**dict(metricData.items()))
      mutableMetricData.raw_anomaly_score = result.anomalyScore
      mutableMetricData.anomaly_score = 0
      mutableMetricData.multi_step_best_predictions = (
        result.multiStepBestPredictions)
      metricDataRows[index] = mutableMetricData

  def _processModelCommandResult(self, metricID, result):
    """
    Process a single model command result
    """
    engine = repository.engineFactory(config)

    # Check if deleting model
    if result.method == "deleteModel":
      self._log.info("Model=%s was deleted", metricID)
      return

    # Validate model ID
    try:
      # NOTE: use shared lock to prevent race condition with adapter's
      # monitorMetric, whereby adapter creates and/or activates a metric inside
      # a transaction, and we might get the defineModel command before the
      # metric row updates are committed
      with engine.connect() as conn:
        metricObj = repository.getMetricWithSharedLock(conn, metricID)
    except ObjectNotFoundError:
      # This may occur if the user deletes the model before the result was
      # delivered while there are result messages still on the message bus.
      self._log.warn("Received command result=%r for unknown model=%s "
                     "(model deleted?)", result, metricID)
      return

    if result.status != 0:
      self._log.error(result.errorMessage)
      if metricObj.status != MetricStatus.ERROR:
        self._log.error("Placing model=<%s> in ERROR state due to "
                        "commandResult=%s",
                        getMetricLogPrefix(metricObj),
                        result)
        with engine.connect() as conn:
          repository.setMetricStatus(conn, metricID, MetricStatus.ERROR,
                                     result.errorMessage)
      else:
        # NOTE: could be a race condition between app-layer and Model Swapper
        #   or a side-effect of the at-least-once delivery guarantee
        self._log.warn("Received command result=%r for metricID=%s of "
                       "metric=<%s> that was already in ERROR state",
                       result, metricID, getMetricLogPrefix(metricObj))
      return

    # Create Model
    if result.method == "defineModel":
      self._log.info("Model was created for <%s>",
                     getMetricLogPrefix(metricObj))

      if metricObj.status == MetricStatus.CREATE_PENDING:
        with engine.connect() as conn:
          repository.setMetricStatus(conn, metricID, MetricStatus.ACTIVE)
      else:
        # NOTE: could be a race condition between app-layer and Model Swapper
        #   or a side-effect of the at-least-once delivery guarantee
        self._log.warn("Received command result=%r for model=%s of metric=<%s> "
                       "that was not in CREATE_PENDING state",
                       result, metricID, getMetricLogPrefix(metricObj))
      return

    self._log.error("Unexpected model result=%r", result)