Example #1
    def _handleMetricCollectionError(self, engine, metricObj, startTime,
                                     error):
        """ Update the metric's collector_error member and promote the metric
    to ERROR state if needed

    :param engine: SQLAlchemy engine object
    :type engine: sqlalchemy.engine.Engine

    :param metricObj: Metric instance

    :param startTime: metric collection start time; unix epoch timestamp

    :param error: exception object that is the result of metric data collection
      or (possibly pending) resource status check
    """
        # Quarantine the metric temporarily
        self._metricInfoCache[metricObj.uid].quarantineEndTime = (
            startTime +
            metricObj.poll_interval * self._METRIC_QUARANTINE_DURATION_RATIO)

        try:
            if metricObj.collector_error is None:
                # TODO: unit-test
                # Begin error grace period for this metric
                deadline = time.time() + self._metricErrorGracePeriod
                with engine.connect() as conn:
                    repository.retryOnTransientErrors(
                        repository.setMetricCollectorError)(
                            conn, metricObj.uid,
                            utils.jsonEncode(
                                dict(deadline=deadline, message=repr(error))))
                self._log.error(
                    "Started error grace period on metric=<%r> through %sZ due to "
                    "error=%r", metricObj,
                    datetime.utcfromtimestamp(deadline).isoformat(), error)
            elif (time.time() > utils.jsonDecode(
                    metricObj.collector_error)["deadline"]):
                # TODO: unit-test
                # Error grace period expired: promote the metric to ERROR state
                with engine.connect() as conn:
                    repository.retryOnTransientErrors(
                        repository.setMetricStatus)(conn, metricObj.uid,
                                                    MetricStatus.ERROR,
                                                    repr(error))
                self._log.error(
                    "Metric Collector: grace period expired; placed metric=<%r> in "
                    "ERROR state due to error=%r", metricObj, error)
        except app_exceptions.ObjectNotFoundError:
            # TODO: unit-test
            self._log.warning("Metric deleted? metric=%r",
                              metricObj,
                              exc_info=True)
Example #2
    def _handleMetricCollectionError(self, engine, metricObj, startTime, error):
        """ Update the metric's collector_error member and promote the metric
    to ERROR state if needed

    :param engine: SQLAlchemy engine object
    :type engine: sqlalchemy.engine.Engine

    :param metricObj: Metric instance

    :param startTime: metric collection start time; unix epoch timestamp

    :param error: exception object that is the result of metric data collection
      or (possibly pending) resource status check
    """
        # Quarantine the metric temporarily
        self._metricInfoCache[metricObj.uid].quarantineEndTime = (
            startTime + metricObj.poll_interval * self._METRIC_QUARANTINE_DURATION_RATIO
        )

        try:
            if metricObj.collector_error is None:
                # TODO: unit-test
                # Begin error grace period for this metric
                deadline = time.time() + self._metricErrorGracePeriod
                with engine.connect() as conn:
                    repository.retryOnTransientErrors(repository.setMetricCollectorError)(
                        conn, metricObj.uid, utils.jsonEncode(dict(deadline=deadline, message=repr(error)))
                    )
                self._log.error(
                    "Started error grace period on metric=<%r> through %sZ due to " "error=%r",
                    metricObj,
                    datetime.utcfromtimestamp(deadline).isoformat(),
                    error,
                )
            elif time.time() > utils.jsonDecode(metricObj.collector_error)["deadline"]:
                # TODO: unit-test
                # Error grace period expired: promote the metric to ERROR state
                with engine.connect() as conn:
                    repository.retryOnTransientErrors(repository.setMetricStatus)(
                        conn, metricObj.uid, MetricStatus.ERROR, repr(error)
                    )
                self._log.error(
                    "Metric Collector: grace period expired; placed metric=<%r> in " "ERROR state due to error=%r",
                    metricObj,
                    error,
                )
        except app_exceptions.ObjectNotFoundError:
            # TODO: unit-test
            self._log.warning("Metric deleted? metric=%r", metricObj, exc_info=True)
Example #3
    def unmonitorMetric(self, metricId):
        """ Unmonitor a metric

    :param metricId: unique identifier of the metric row

    :raises grok.app.exceptions.ObjectNotFoundError: if the metric with the
      referenced uid doesn't exist
    """
        # Delete the metric from the database
        with self.connectionFactory() as conn:
            repository.retryOnTransientErrors(repository.deleteMetric)(conn, metricId)

        # Send request to delete HTM model
        model_swapper_utils.deleteHTMModel(metricId)

        self._log.info("Cloudwatch Metric unmonitored: metric=%r", metricId)
Example #4
  def _getCandidateMetrics(self, engine):
    """ Return the metrics that are due for an update

    :param engine: SQLAlchemy engine object
    :type engine: sqlalchemy.engine.Engine

    :returns: a (possibly empty) OrderedDict of Metric instances, keyed by
      metric uid, that are due for an update (after removing quarantined
      metrics)

    TODO: unit-test
    """
    with engine.connect() as conn:
      metricsToUpdate = repository.retryOnTransientErrors(
        repository.getCloudwatchMetricsPendingDataCollection)(conn)

    # Remove quarantined metrics
    quarantinedIndexes = None
    if metricsToUpdate:
      now = time.time()
      quarantinedIndexes = set(
        i for i, m in enumerate(metricsToUpdate)
        if now < self._metricInfoCache[m.uid].quarantineEndTime)

      metricsToUpdate = OrderedDict((m.uid, m)
                                    for i, m in enumerate(metricsToUpdate)
                                    if i not in quarantinedIndexes)

      if not metricsToUpdate:
        # TODO: unit-test
        self._log.debug("All candidate numMetrics=%d are quarantined",
                        len(quarantinedIndexes))

    return metricsToUpdate
Example #5
    def _getCandidateMetrics(self, engine):
        """ Return the metrics that are due for an update

    :param engine: SQLAlchemy engine object
    :type engine: sqlalchemy.engine.Engine

    :returns: a (possibly empty) OrderedDict of Metric instances, keyed by
      metric uid, that are due for an update (after removing quarantined
      metrics)

    TODO: unit-test
    """
        with engine.connect() as conn:
            metricsToUpdate = repository.retryOnTransientErrors(repository.getCloudwatchMetricsPendingDataCollection)(
                conn
            )

        # Remove quarantined metrics
        quarantinedIndexes = None
        if metricsToUpdate:
            now = time.time()
            quarantinedIndexes = set(
                i for i, m in enumerate(metricsToUpdate) if now < self._metricInfoCache[m.uid].quarantineEndTime
            )

            metricsToUpdate = OrderedDict(
                (m.uid, m) for i, m in enumerate(metricsToUpdate) if i not in quarantinedIndexes
            )

            if not metricsToUpdate:
                # TODO: unit-test
                self._log.debug("All candidate numMetrics=%d are quarantined", len(quarantinedIndexes))

        return metricsToUpdate
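The quarantine filter in _getCandidateMetrics is pure data manipulation and can be exercised in isolation. A sketch under assumed names, with a plain dict of uid-to-quarantineEndTime standing in for self._metricInfoCache:

import time
from collections import OrderedDict, namedtuple

Metric = namedtuple("Metric", "uid")  # hypothetical stand-in for a metric row


def filterQuarantined(metrics, quarantineEndTimes, now=None):
    # Keep only metrics whose quarantine window has already elapsed,
    # preserving order and keying by uid as the collector expects
    now = time.time() if now is None else now
    return OrderedDict((m.uid, m) for m in metrics
                       if now >= quarantineEndTimes.get(m.uid, 0))


metrics = [Metric("m1"), Metric("m2")]
ends = {"m1": time.time() + 60}  # m1 is quarantined for another minute
assert list(filterQuarantined(metrics, ends)) == ["m2"]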
Example #6
    def run(self):
        with ModelSwapperInterface() as modelSwapper:
            engine = repository.engineFactory()
            while True:
                with engine.connect() as conn:
                    pendingStacks = repository.retryOnTransientErrors(
                        repository.getAutostackMetricsPendingDataCollection)(
                            conn)

                if not pendingStacks:
                    time.sleep(self._NOTHING_READY_SLEEP_TIME_SEC)
                    continue

                # Build a sequence of autostack metric requests
                requests = []
                for autostack, metrics in pendingStacks:
                    refBase = len(requests)
                    requests.extend(
                        AutostackMetricRequest(refID=refBase + i,
                                               autostack=autostack,
                                               metric=metric)
                        for i, metric in enumerate(metrics))

                # Collect, aggregate, and stream metric data
                self._processAutostackMetricRequests(engine, requests,
                                                     modelSwapper)
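The refBase arithmetic in the request-building loop guarantees that each request's refID equals its index in the requests list, which is what lets the processing step recover the originating request with requests[metricCollection.refID]. A small sketch of that invariant, using a namedtuple stand-in for AutostackMetricRequest:

from collections import namedtuple

AutostackMetricRequest = namedtuple("AutostackMetricRequest",
                                    "refID autostack metric")

pendingStacks = [("stackA", ["m1", "m2"]), ("stackB", ["m3"])]

requests = []
for autostack, metrics in pendingStacks:
    refBase = len(requests)  # continue numbering where the last stack ended
    requests.extend(
        AutostackMetricRequest(refID=refBase + i, autostack=autostack,
                               metric=metric)
        for i, metric in enumerate(metrics))

# Invariant: every request sits at the list index equal to its refID
assert all(request.refID == i for i, request in enumerate(requests))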
Example #7
    def unmonitorMetric(self, metricId):
        """ Unmonitor a metric

    :param metricId: unique identifier of the metric row

    :raises grok.app.exceptions.ObjectNotFoundError: if the metric with the
      referenced uid doesn't exist
    """
        # Delete the metric from the database
        with self.connectionFactory() as conn:
            repository.retryOnTransientErrors(repository.deleteMetric)(
                conn, metricId)

        # Send request to delete HTM model
        model_swapper_utils.deleteHTMModel(metricId)

        self._log.info("Cloudwatch Metric unmonitored: metric=%r", metricId)
Example #8
    def exportModel(self, metricId):
        """ Export the given model.

    :param metricId: datasource-specific unique metric identifier

    :returns: Model-export specification for Cloudwatch model
    :rtype: dict

    ::
        {
          "datasource": "cloudwatch",

          "metricSpec": {
            "region": "us-west-2",
            "namespace": "AWS/EC2",
            "metric": "CPUUtilization",
            "dimensions": {
              "InstanceId": "i-12d67826"
            }
          },

          # Same modelParams with which model was created, if any
          "modelParams": {
            "min": 0,  # optional
            "max": 100  # optional
          }
        }

    :raises grok.app.exceptions.ObjectNotFoundError: if the referenced metric
      doesn't exist
    """
        with self.connectionFactory() as conn:
            metricObj = repository.retryOnTransientErrors(
                repository.getMetric)(conn,
                                      metricId,
                                      fields=[schema.metric.c.parameters])

        parameters = htmengine.utils.jsonDecode(metricObj.parameters)

        return parameters
Example #9
    def exportModel(self, metricId):
        """ Export the given model.

    :param metricId: datasource-specific unique metric identifier

    :returns: Model-export specification for Cloudwatch model
    :rtype: dict

    ::
        {
          "datasource": "cloudwatch",

          "metricSpec": {
            "region": "us-west-2",
            "namespace": "AWS/EC2",
            "metric": "CPUUtilization",
            "dimensions": {
              "InstanceId": "i-12d67826"
            }
          },

          # Same modelParams with which model was created, if any
          "modelParams": {
            "min": 0,  # optional
            "max": 100  # optional
          }
        }

    :raises grok.app.exceptions.ObjectNotFoundError: if the referenced metric
      doesn't exist
    """
        with self.connectionFactory() as conn:
            metricObj = repository.retryOnTransientErrors(repository.getMetric)(
                conn, metricId, fields=[schema.metric.c.parameters]
            )

        parameters = htmengine.utils.jsonDecode(metricObj.parameters)

        return parameters
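Because the export specification is stored verbatim in the metric row's parameters column, exportModel amounts to a JSON decode of what monitoring originally wrote. A sketch of the documented spec shape round-tripping through JSON, with stdlib json standing in for the htmengine.utils encode/decode helpers:

import json

exportSpec = {
    "datasource": "cloudwatch",
    "metricSpec": {
        "region": "us-west-2",
        "namespace": "AWS/EC2",
        "metric": "CPUUtilization",
        "dimensions": {"InstanceId": "i-12d67826"},
    },
    "modelParams": {"min": 0, "max": 100},
}

# What gets stored in metric.parameters is exactly what exportModel returns
assert json.loads(json.dumps(exportSpec)) == exportSpec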
Example #10
  def run(self):
    with ModelSwapperInterface() as modelSwapper:
      engine = repository.engineFactory()
      while True:
        with engine.connect() as conn:
          pendingStacks = repository.retryOnTransientErrors(
            repository.getAutostackMetricsPendingDataCollection)(conn)

        if not pendingStacks:
          time.sleep(self._NOTHING_READY_SLEEP_TIME_SEC)
          continue

        # Build a sequence of autostack metric requests
        requests = []
        for autostack, metrics in pendingStacks:
          refBase = len(requests)
          requests.extend(
            AutostackMetricRequest(refID=refBase + i,
                                   autostack=autostack,
                                   metric=metric)
            for i, metric in enumerate(metrics))

        # Collect, aggregate, and stream metric data
        self._processAutostackMetricRequests(engine, requests, modelSwapper)
Example #11
  def _processAutostackMetricRequests(self, engine, requests, modelSwapper):
    """ Execute autostack metric requests, aggregate and stream
    collected metric data

    :param engine: SQLAlchemy engine object
    :type engine: sqlalchemy.engine.Engine
    :param requests: sequence of AutostackMetricRequest objects
    :param modelSwapper: Model Swapper
    """
    # Start collecting requested metric data
    collectionIter = self._metricGetter.collectMetricData(requests)

    # Aggregate each collection and dispatch to app MetricStreamer
    for metricCollection in collectionIter:
      request = requests[metricCollection.refID]

      metricObj = request.metric
      data = None

      if metricCollection.slices:
        aggregationFn = getAggregationFn(metricObj)
        if aggregationFn:
          data = aggregate(metricCollection.slices,
                           aggregationFn=aggregationFn)
        else:
          data = aggregate(metricCollection.slices)

      try:
        with engine.connect() as conn:
          repository.retryOnTransientErrors(repository.setMetricLastTimestamp)(
            conn, metricObj.uid, metricCollection.nextMetricTime)
      except ObjectNotFoundError:
        self._log.warning("Processing autostack data collection results for "
                          "unknown model=%s (model deleted?)", metricObj.uid)
        continue

      if data:
        try:
          self.metricStreamer.streamMetricData(data,
                                               metricID=metricObj.uid,
                                               modelSwapper=modelSwapper)
        except ObjectNotFoundError:
          # We expect that the model exists but in the odd case that it has
          # already been deleted we don't want to crash the process.
          self._log.info("Metric not found when adding data. metric=%s" %
                         metricObj.uid)

        self._log.debug(
          "{TAG:APP.AGG.DATA.PUB} Published numItems=%d for metric=%s;"
          "timeRange=[%sZ-%sZ]; headTS=%sZ; tailTS=%sZ",
          len(data), getMetricLogPrefix(metricObj),
          metricCollection.timeRange.start.isoformat(),
          metricCollection.timeRange.end.isoformat(),
          data[0][0].isoformat(), data[-1][0].isoformat())

      else:
        self._log.info(
          "{TAG:APP.AGG.DATA.NONE} No data for metric=%s;"
          "timeRange=[%sZ-%sZ]", getMetricLogPrefix(metricObj),
          metricCollection.timeRange.start.isoformat(),
          metricCollection.timeRange.end.isoformat())
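The aggregation step above applies a per-metric aggregation function when one is configured and otherwise lets aggregate() fall back to its default. A toy sketch of that dispatch; the real aggregate() merges per-instance time slices, while here each slice is just a list of numbers reduced column-wise:

def aggregate(slices, aggregationFn=sum):
    # Toy stand-in: reduce corresponding elements across slices
    return [aggregationFn(values) for values in zip(*slices)]


slices = [[1, 2, 3], [4, 5, 6]]
assert aggregate(slices) == [5, 7, 9]                      # default
assert aggregate(slices, aggregationFn=max) == [4, 5, 6]   # per-metric override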
Example #12
  def messageHandler(self, message):
    """ Inspect all inbound model results in a batch for anomaly thresholds and
        trigger notifications where applicable.

        :param amqp.messages.ConsumerMessage message: ``message.body`` is a
          serialized batch of model inference results generated in
          ``AnomalyService`` and must be deserialized using
          ``AnomalyService.deserializeModelResult()``. The message conforms to
          htmengine/runtime/json_schema/model_inference_results_msg_schema.json
    """
    if message.properties.headers and "dataType" in message.properties.headers:
      # Not a model inference result
      message.ack()
      return

    grok.app.config.loadConfig() # reload config on every batch
    engine = repository.engineFactory()
    # Cache minimum threshold to trigger any notification to avoid permuting
    # settings x metricDataRows
    try:
      try:
        batch = AnomalyService.deserializeModelResult(message.body)
      except Exception:
        self._log.exception("Error deserializing model result")
        raise

      # Load all settings for all users (once per incoming batch)
      with engine.connect() as conn:
        settings = repository.retryOnTransientErrors(
            repository.getAllNotificationSettings)(conn)

      self._log.debug("settings: %r" % settings)

      if settings:
        minThreshold = min(setting.sensitivity for setting in settings)
      else:
        minThreshold = 0.99999

      metricInfo = batch["metric"]
      metricId = metricInfo["uid"]
      resource = metricInfo["resource"]


      for row in batch["results"]:

        if row["anomaly"] >= minThreshold:
          for settingObj in settings:
            if row["rowid"] <= 1000:
              continue # Not enough data

            rowDatetime = datetime.utcfromtimestamp(row["ts"])

            if rowDatetime < datetime.utcnow() - timedelta(seconds=3600):
              continue # Skip old

            if row["anomaly"] >= settingObj.sensitivity:
              # First let's clear any old users out of the database.
              with engine.connect() as conn:
                repository.retryOnTransientErrors(
                    repository.deleteStaleNotificationDevices)(
                        conn, _NOTIFICATION_DEVICE_STALE_DAYS)

              # If anomaly_score meets or exceeds any of the device
              # notification sensitivity settings, trigger notification.
              # repository.addNotification() will handle throttling.
              notificationId = str(uuid.uuid4())

              with engine.connect() as conn:
                result = repository.retryOnTransientErrors(
                    repository.addNotification)(conn,
                                                uid=notificationId,
                                                server=resource,
                                                metric=metricId,
                                                rowid=row["rowid"],
                                                device=settingObj.uid,
                                                windowsize=(
                                                  settingObj.windowsize),
                                                timestamp=rowDatetime,
                                                acknowledged=0,
                                                seen=0)

              self._log.info("NOTIFICATION=%s SERVER=%s METRICID=%s DEVICE=%s "
                             "Notification generated. " % (notificationId,
                             resource, metricId,
                             settingObj.uid))

              if (result is not None and
                  result.rowcount > 0 and
                  settingObj.email_addr):
                # Notification was generated.  Attempt to send email
                with engine.connect() as conn:
                  notificationObj = repository.getNotification(conn,
                                                               notificationId)

                self.sendNotificationEmail(engine,
                                           settingObj,
                                           notificationObj)

          if not settings:
            # There are no device notification settings stored on this
            # server, so no notifications will be generated.  However, log
            # that an anomaly was detected and that a notification would have
            # been sent if any devices were configured.
            self._log.info("<%r>" % (metricInfo) + (
                                          "{TAG:APP.NOTIFICATION} Anomaly "
                                          "detected at %s, but no devices are "
                                          "configured.") % rowDatetime)

    finally:
      message.ack()

    # Do cleanup
    with engine.connect() as conn:
      repository.clearOldNotifications(conn) # Delete all notifications outside
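Computing minThreshold once per batch is the optimization the "settings x metricDataRows" comment describes: rows that no device is sensitive enough to notice are skipped before the per-setting loop runs. A standalone sketch with hypothetical setting objects:

from collections import namedtuple

Setting = namedtuple("Setting", "uid sensitivity")

settings = [Setting("dev1", 0.99), Setting("dev2", 0.90)]

# The most sensitive device sets the pre-filter bar; with no devices, a
# near-1.0 sentinel ensures essentially nothing passes
minThreshold = (min(s.sensitivity for s in settings)
                if settings else 0.99999)

rows = [{"rowid": 1001, "anomaly": 0.95}, {"rowid": 1002, "anomaly": 0.50}]
candidates = [row for row in rows if row["anomaly"] >= minThreshold]
assert [row["rowid"] for row in candidates] == [1001]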
Example #13
  def _processCollectedData(self, engine, metricsToUpdate, modelSwapper,
                            collectResult):
    """ Process collected metrics data; publish it on the destination
    message queue, and update metric instance status and metric collection
    state.

    :param engine: SQLAlchemy engine object
    :type engine: sqlalchemy.engine.Engine

    :param metricsToUpdate: Metric instances which were due for an update,
      keyed by metric uid
    :type metricsToUpdate: collections.OrderedDict

    :param modelSwapper: ModelSwapperInterface object for running models

    :param collectResult: _DataCollectionResult object

    :returns: 2-tuple (numEmpty, numErrors), where numEmpty is the count of
      metrics with empty data (not ready or data gap) and numErrors is the
      count of metrics with errors
    """

    numEmpty = numErrors = 0

    metricObj = metricsToUpdate[collectResult.metricID]
    # Update cached resource status
    resourceInfoCacheItem = self._resourceInfoCache[metricObj.server]
    newResourceStatus = None
    if collectResult.resourceStatus is not None:
      if isinstance(collectResult.resourceStatus, Exception):
        resourceInfoCacheItem.error = collectResult.resourceStatus
      else:
        newResourceStatus = collectResult.resourceStatus
        resourceInfoCacheItem.error = None

    if isinstance(collectResult.data, Exception):
      dataError = collectResult.data
      newData = None
      nextStartTime = None
    else:
      dataError = None
      newData = collectResult.data
      nextStartTime = collectResult.nextCallStart

    if newData is not None or newResourceStatus is not None:
      rangeStart = metricObj.last_timestamp

      if not newData:
        # Data is not ready yet or data gap
        numEmpty += 1
        self._metricInfoCache[metricObj.uid].quarantineEndTime = (
          collectResult.creationTime +
          metricObj.poll_interval * self._METRIC_QUARANTINE_DURATION_RATIO)
      else:
        headTS = newData[0][0]
        tailTS = newData[-1][0]

      startTime = time.time()

      # Update resource status and next metric start time
      @repository.retryOnTransientErrors
      def runSQL(engine):
        with engine.begin() as conn:
          try:
            # Save resource status
            if newResourceStatus is not None:
              repository.saveMetricInstanceStatus(conn,
                                                  metricObj.server,
                                                  newResourceStatus)

            # Save starting time for the next data query in metric row
            if nextStartTime is not None:
              repository.setMetricLastTimestamp(conn,
                                                metricObj.uid,
                                                nextStartTime)
          except app_exceptions.ObjectNotFoundError:
            self._log.warning("Metric deleted?", exc_info=True)

      if newResourceStatus is not None or nextStartTime is not None:
        runSQL(engine)

      if newData:
        # This is done outside the cursor context to minimize lock duration.
        #
        # NOTE: It is still possible for duplicate data to be sent to the
        # exchange so the recipient will need to de-dupe it. It is also
        # possible for the metric to be updated in the database but data not
        # sent to the exchange.
        try:
          self.metricStreamer.streamMetricData(newData,
                                               metricID=metricObj.uid,
                                               modelSwapper=modelSwapper)
        except ValueError:
          self._log.exception("Failed to stream metric data %r" % newData)
        except app_exceptions.ObjectNotFoundError:
          # We expect that the model exists but in the odd case that it has
          # already been deleted we don't want to crash the process.
          self._log.info("Metric not found when adding data.")

        if self._profiling:
          self._log.info(
            "{TAG:APP.MC.DATA.TO_STRM.DONE} numItems=%d for metric=%r; "
            "rangeStart=%s; ts=[%s]; duration=%.4fs; ",
            len(newData), metricObj,
            rangeStart.isoformat() + "Z" if rangeStart is not None else None,
            (("%sZ..%sZ" % (headTS.isoformat(), tailTS.isoformat()))
              if len(newData) > 1 else (headTS.isoformat() + "Z")),
            time.time() - startTime + collectResult.duration)
      else:
        if self._log.isEnabledFor(logging.DEBUG):
          self._log.debug(
            "{TAG:APP.MC.DATA.NONE} No data for metric=%r; rangeStart=%s; "
            "duration=%.4fs", metricObj,
            rangeStart.isoformat() + "Z" if rangeStart is not None else None,
            time.time() - startTime + collectResult.duration)

    # Take care of errors
    error = dataError or resourceInfoCacheItem.error
    if error is not None:
      # Metric data or resource status collection error
      numErrors += 1
      # Update metric error grace period and quarantine info
      self._handleMetricCollectionError(engine,
                                        metricObj,
                                        startTime=collectResult.creationTime,
                                        error=error)
    else:
      if metricObj.collector_error is not None:
        oldErrorInfo = metricObj.collector_error

        # There was pending collector error before, but no error this time, so
        # clear metric's collector_error
        try:
          with engine.connect() as conn:
            repository.retryOnTransientErrors(
              repository.setMetricCollectorError)(conn, metricObj.uid, None)
        except app_exceptions.ObjectNotFoundError:
          self._log.warning("Metric deleted?", exc_info=True)

        self._log.info("metric=<%r> exited error grace state %s",
                       metricObj, oldErrorInfo)

    return (numEmpty, numErrors)
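repository.retryOnTransientErrors shows up in two equivalent spellings throughout these examples: wrapping a repository function at the call site, and decorating a locally defined closure such as runSQL above. A minimal sketch of a decorator with that shape, assuming the real one retries a specific class of transient database errors with backoff:

import functools
import time


class TransientError(Exception):
    """Stand-in for the transient database errors the real decorator retries."""


def retryOnTransientErrors(func, attempts=3, initialDelaySec=0.1):
    # Both call styles reduce to wrapping a callable:
    #   retryOnTransientErrors(repository.setMetricStatus)(conn, ...)
    #   @retryOnTransientErrors
    #   def runSQL(engine): ...
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        for attempt in range(attempts):
            try:
                return func(*args, **kwargs)
            except TransientError:
                if attempt == attempts - 1:
                    raise
                time.sleep(initialDelaySec * (2 ** attempt))  # backoff
    return wrapper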
Example #14
    def messageHandler(self, message):
        """ Inspect all inbound model results in a batch for anomaly thresholds and
        trigger notifications where applicable.

        :param amqp.messages.ConsumerMessage message: ``message.body`` is a
          serialized batch of model inference results generated in
          ``AnomalyService`` and must be deserialized using
          ``AnomalyService.deserializeModelResult()``. The message conforms to
          htmengine/runtime/json_schema/model_inference_results_msg_schema.json
    """
        if message.properties.headers and "dataType" in message.properties.headers:
            # Not a model inference result
            message.ack()
            return

        grok.app.config.loadConfig()  # reload config on every batch
        engine = repository.engineFactory()
        # Cache minimum threshold to trigger any notification to avoid permuting
        # settings x metricDataRows
        try:
            try:
                batch = AnomalyService.deserializeModelResult(message.body)
            except Exception:
                self._log.exception("Error deserializing model result")
                raise

            # Load all settings for all users (once per incoming batch)
            with engine.connect() as conn:
                settings = repository.retryOnTransientErrors(
                    repository.getAllNotificationSettings)(conn)

            self._log.debug("settings: %r" % settings)

            if settings:
                minThreshold = min(setting.sensitivity for setting in settings)
            else:
                minThreshold = 0.99999

            metricInfo = batch["metric"]
            metricId = metricInfo["uid"]
            resource = metricInfo["resource"]

            for row in batch["results"]:

                if row["anomaly"] >= minThreshold:
                    rowDatetime = datetime.utcfromtimestamp(row["ts"])

                    if not settings:
                        # There are no device notification settings stored on
                        # this server, so no notifications will be generated.
                        # However, log that an anomaly was detected and that a
                        # notification would have been sent if any devices
                        # were configured.
                        self._log.info("<%r>" % (metricInfo) +
                                       ("{TAG:APP.NOTIFICATION} Anomaly "
                                        "detected at %s, but no devices are "
                                        "configured.") % rowDatetime)
                        continue

                    for settingObj in settings:
                        if row["rowid"] <= 1000:
                            continue  # Not enough data

                        if rowDatetime < datetime.utcnow() - timedelta(
                                seconds=3600):
                            continue  # Skip old

                        if row["anomaly"] >= settingObj.sensitivity:
                            # First let's clear any old users out of the database.
                            with engine.connect() as conn:
                                repository.retryOnTransientErrors(
                                    repository.deleteStaleNotificationDevices)(
                                        conn, _NOTIFICATION_DEVICE_STALE_DAYS)

                            # If anomaly_score meets or exceeds any of the device
                            # notification sensitivity settings, trigger notification.
                            # repository.addNotification() will handle throttling.
                            notificationId = str(uuid.uuid4())

                            with engine.connect() as conn:
                                result = repository.retryOnTransientErrors(
                                    repository.addNotification)(
                                        conn,
                                        uid=notificationId,
                                        server=resource,
                                        metric=metricId,
                                        rowid=row["rowid"],
                                        device=settingObj.uid,
                                        windowsize=(settingObj.windowsize),
                                        timestamp=rowDatetime,
                                        acknowledged=0,
                                        seen=0)

                            self._log.info(
                                "NOTIFICATION=%s SERVER=%s METRICID=%s DEVICE=%s "
                                "Notification generated. " %
                                (notificationId, resource, metricId,
                                 settingObj.uid))

                            if (result is not None and result.rowcount > 0
                                    and settingObj.email_addr):
                                # Notification was generated.  Attempt to send email
                                with engine.connect() as conn:
                                    notificationObj = repository.getNotification(
                                        conn, notificationId)

                                self.sendNotificationEmail(
                                    engine, settingObj, notificationObj)
        finally:
            message.ack()

        # Do cleanup
        with engine.connect() as conn:
            repository.clearOldNotifications(
                conn)  # Delete all notifications outside
Example #15
    def _processCollectedData(self, engine, metricsToUpdate, modelSwapper, collectResult):
        """ Process collected metrics data; publish it on the destination
    message queue, and update metric instance status and metric collection
    state.

    :param engine: SQLAlchemy engine object
    :type engine: sqlalchemy.engine.Engine

    :param metricsToUpdate: Metric instances which were due for an update,
      keyed by metric uid
    :type metricsToUpdate: collections.OrderedDict

    :param modelSwapper: ModelSwapperInterface object for running models

    :param collectResult: _DataCollectionResult object

    :returns: 2-tuple (numEmpty, numErrors), where numEmpty is the count of
      metrics with empty data (not ready or data gap) and numErrors is the
      count of metrics with errors
    """

        numEmpty = numErrors = 0

        metricObj = metricsToUpdate[collectResult.metricID]
        # Update cached resource status
        resourceInfoCacheItem = self._resourceInfoCache[metricObj.server]
        newResourceStatus = None
        if collectResult.resourceStatus is not None:
            if isinstance(collectResult.resourceStatus, Exception):
                resourceInfoCacheItem.error = collectResult.resourceStatus
            else:
                newResourceStatus = collectResult.resourceStatus
                resourceInfoCacheItem.error = None

        if isinstance(collectResult.data, Exception):
            dataError = collectResult.data
            newData = None
            nextStartTime = None
        else:
            dataError = None
            newData = collectResult.data
            nextStartTime = collectResult.nextCallStart

        if newData is not None or newResourceStatus is not None:
            rangeStart = metricObj.last_timestamp

            if not newData:
                # Data is not ready yet or data gap
                numEmpty += 1
                self._metricInfoCache[metricObj.uid].quarantineEndTime = (
                    collectResult.creationTime + metricObj.poll_interval * self._METRIC_QUARANTINE_DURATION_RATIO
                )
            else:
                headTS = newData[0][0]
                tailTS = newData[-1][0]

            startTime = time.time()

            # Update resource status and next metric start time
            @repository.retryOnTransientErrors
            def runSQL(engine):
                with engine.begin() as conn:
                    try:
                        # Save resource status
                        if newResourceStatus is not None:
                            repository.saveMetricInstanceStatus(conn, metricObj.server, newResourceStatus)

                        # Save starting time for the next data query in metric row
                        if nextStartTime is not None:
                            repository.setMetricLastTimestamp(conn, metricObj.uid, nextStartTime)
                    except app_exceptions.ObjectNotFoundError:
                        self._log.warning("Metric deleted?", exc_info=True)

            if newResourceStatus is not None or nextStartTime is not None:
                runSQL(engine)

            if newData:
                # This is done outside the cursor context to minimize lock duration.
                #
                # NOTE: It is still possible for duplicate data to be sent to the
                # exchange so the recipient will need to de-dupe it. It is also
                # possible for the metric to be updated in the database but data not
                # sent to the exchange.
                try:
                    self.metricStreamer.streamMetricData(newData, metricID=metricObj.uid, modelSwapper=modelSwapper)
                except ValueError:
                    self._log.exception("Failed to stream metric data %r" % newData)
                except app_exceptions.ObjectNotFoundError:
                    # We expect that the model exists but in the odd case that it has
                    # already been deleted we don't want to crash the process.
                    self._log.info("Metric not found when adding data.")

                if self._profiling:
                    self._log.info(
                        "{TAG:APP.MC.DATA.TO_STRM.DONE} numItems=%d for metric=%r; "
                        "rangeStart=%s; ts=[%s]; duration=%.4fs; ",
                        len(newData),
                        metricObj,
                        rangeStart.isoformat() + "Z" if rangeStart is not None else None,
                        (
                            ("%sZ..%sZ" % (headTS.isoformat(), tailTS.isoformat()))
                            if len(newData) > 1
                            else (headTS.isoformat() + "Z")
                        ),
                        time.time() - startTime + collectResult.duration,
                    )
            else:
                if self._log.isEnabledFor(logging.DEBUG):
                    self._log.debug(
                        "{TAG:APP.MC.DATA.NONE} No data for metric=%r; rangeStart=%s; " "duration=%.4fs",
                        metricObj,
                        rangeStart.isoformat() + "Z" if rangeStart is not None else None,
                        time.time() - startTime + collectResult.duration,
                    )

        # Take care of errors
        error = dataError or resourceInfoCacheItem.error
        if error is not None:
            # Metric data or resource status collection error
            numErrors += 1
            # Update metric error grace period and quarantine info
            self._handleMetricCollectionError(engine, metricObj, startTime=collectResult.creationTime, error=error)
        else:
            if metricObj.collector_error is not None:
                oldErrorInfo = metricObj.collector_error

                # There was pending collector error before, but no error this time, so
                # clear metric's collector_error
                try:
                    with engine.connect() as conn:
                        repository.retryOnTransientErrors(repository.setMetricCollectorError)(conn, metricObj.uid, None)
                except app_exceptions.ObjectNotFoundError:
                    self._log.warning("Metric deleted?", exc_info=True)

                self._log.info("metric=<%r> exited error grace state %s", metricObj, oldErrorInfo)

        return (numEmpty, numErrors)