Exemple #1
0
    def getMetricData(self, metricSpec, start, end):  # pylint: disable=R0201
        """ Fetch the datapoints of a Cloudwatch metric for a time range

        :param metricSpec: metric specification for Cloudwatch-based model
        :type metricSpec: dict (see monitorMetric())

        :param start: UTC start time of the metric data range, inclusive:
          datapoints stamped exactly at this time are part of the result. If
          None, the implementation picks the start time automatically based on
          the Cloudwatch metric data expiration policy (14 days at the time of
          this writing)
        :type start: datetime.datetime

        :param end: UTC end time of the metric data range, exclusive: only
          datapoints predating this time are part of the result. If None, the
          current UTC time is used as end
        :type end: datetime.datetime

        :returns: A two-tuple (<data-sequence>, <next-start-time>).
          <data-sequence> is a possibly empty sequence of data points sorted by
          timestamp in ascending order. Each data point is a two-tuple of
          (<datetime timestamp>, <value>).
          <next-start-time> is a datetime.datetime object indicating the UTC
          start time to use in the next call to this method.
        :rtype: tuple
        """
        # The adapter created for metricSpec performs the actual retrieval
        return AWSResourceAdapterBase.createMetricAdapter(
            metricSpec).getMetricData(start, end)
Exemple #2
0
  def describeResources(self, region, resourceType):  # pylint: disable=R0201
    """ List the AWS resources of a given resource type, within a given
    region, that are supported by Grok.

    :param region: AWS region
    :param resourceType: type name of the resource (per
      aws_base.ResourceTypeNames)

    :returns: description of available AWS resources for the given resource
      type in the given region

      ::

        describeResources("us-west-2", ResourceTypeNames.AUTOSCALING_GROUP)
        -->

        [
          {   # NOTE: grn = "grok resource name"
              "grn": "aws://us-west-2/auto-scaling-group/webserver-asg",
              "resID": "webserver-asg-micros01",
              "name": value-of-name-tag-or-empty-str
          },

          ...
        ]
    """
    resources = AWSResourceAdapterBase.describeResourcesByRegionAndType(
        region, resourceType)
    return resources
Exemple #3
0
    def getDefaultModelSpecs(resourceType, region, instanceId, dimension=None):
        """Build one Cloudwatch model spec per default metric of a resource type.

        :param resourceType: the resource type of the instance
        :param region: the region the instance is in
        :param instanceId: the resource-type-specific identifier for the instance
        :param dimension: optional dimension name under which instanceId is
            placed; when omitted (or otherwise falsy), the first dimension of
            each metric adapter's first dimension group is used instead
        :returns: a list of model spec dicts, one for each default metric
        """

        def specForAdapter(adapter):
            """ :returns: model spec dict for a single default metric adapter """
            dimensionName = dimension or adapter.DIMENSION_GROUPS[0][0]
            return {
                "region": region,
                "namespace": adapter.NAMESPACE,
                "datasource": "cloudwatch",
                "metric": adapter.METRIC_NAME,
                # TODO: Is there a method for getting this or one we can make public?
                # TODO: Is this right?
                "dimensions": {dimensionName: instanceId},
            }

        # The resource type's default metrics are exposed as metric adapters
        return [
            specForAdapter(adapter)
            for adapter in AWSResourceAdapterBase.getDefaultResourceMetrics(resourceType)
        ]
Exemple #4
0
    def describeSupportedMetrics(self):  # pylint: disable=R0201
        """ Describe supported metrics, grouped by resource type (per
        aws_base.ResourceTypeNames)

        :returns: description of supported metrics, grouped by resource type.
        :rtype: dict

        ::

            {
                AWS::EC2::Instance: {
                  "CPUUtilization": {
                    "namespace": "AWS/EC2",
                    "dimensionGroups": (("InstanceId",),)
                  },
                  ...
                },
                ...
            }

        NOTE: this differs from the legacy getMetrics() primarily in that this
          new API is resource-oriented, while the legacy getMetrics() grouped
          results by Cloudwatch namespace. The new organization permits grouping
          of related metrics, such as AWS/EC2 and AWS/AutoScale metrics on an
          Autoscaling region, by resource
        """
        supportedMetrics = AWSResourceAdapterBase.describeSupportedMetrics()
        return supportedMetrics
Exemple #5
0
    def describeResources(self, region, resourceType):  # pylint: disable=R0201
        """ List the AWS resources of a given resource type, within a given
        region, that are supported by Grok.

        :param region: AWS region
        :param resourceType: type name of the resource (per
          aws_base.ResourceTypeNames)

        :returns: description of available AWS resources for the given resource
          type in the given region

          ::

            describeResources("us-west-2", ResourceTypeNames.AUTOSCALING_GROUP)
            -->

            [
              {   # NOTE: grn = "grok resource name"
                  "grn": "aws://us-west-2/auto-scaling-group/webserver-asg",
                  "resID": "webserver-asg-micros01",
                  "name": value-of-name-tag-or-empty-str
              },

              ...
            ]
        """
        resources = AWSResourceAdapterBase.describeResourcesByRegionAndType(
            region, resourceType)
        return resources
Exemple #6
0
  def describeSupportedMetrics(self):  # pylint: disable=R0201
    """ Describe supported metrics, grouped by resource type (per
    aws_base.ResourceTypeNames)

    :returns: description of supported metrics, grouped by resource type.
    :rtype: dict

    ::

        {
            AWS::EC2::Instance: {
              "CPUUtilization": {
                "namespace": "AWS/EC2",
                "dimensionGroups": (("InstanceId",),)
              },
              ...
            },
            ...
        }

    NOTE: this differs from the legacy getMetrics() primarily in that this new
      API is resource-oriented, while the legacy getMetrics() grouped results by
      Cloudwatch namespace. The new organization permits grouping of related
      metrics, such as AWS/EC2 and AWS/AutoScale metrics on an Autoscaling
      region, by resource
    """
    supportedMetrics = AWSResourceAdapterBase.describeSupportedMetrics()
    return supportedMetrics
Exemple #7
0
  def getMetricData(self, metricSpec, start, end):  # pylint: disable=R0201
    """ Fetch the datapoints of a Cloudwatch metric for a time range

    :param metricSpec: metric specification for Cloudwatch-based model
    :type metricSpec: dict (see monitorMetric())

    :param start: UTC start time of the metric data range, inclusive:
      datapoints stamped exactly at this time are part of the result. If None,
      the implementation picks the start time automatically based on the
      Cloudwatch metric data expiration policy (14 days at the time of this
      writing)
    :type start: datetime.datetime

    :param end: UTC end time of the metric data range, exclusive: only
      datapoints predating this time are part of the result. If None, the
      current UTC time is used as end
    :type end: datetime.datetime

    :returns: A two-tuple (<data-sequence>, <next-start-time>).
      <data-sequence> is a possibly empty sequence of data points sorted by
      timestamp in ascending order. Each data point is a two-tuple of
      (<datetime timestamp>, <value>).
      <next-start-time> is a datetime.datetime object indicating the UTC start
      time to use in the next call to this method.
    :rtype: tuple
    """
    # The adapter created for metricSpec performs the actual retrieval
    return AWSResourceAdapterBase.createMetricAdapter(
        metricSpec).getMetricData(start, end)
Exemple #8
0
  def getDefaultModelSpecs(resourceType, region, instanceId,
                           dimension=None):
    """Build one Cloudwatch model spec per default metric of a resource type.

    :param resourceType: the resource type of the instance
    :param region: the region the instance is in
    :param instanceId: the resource-type-specific identifier for the instance
    :param dimension: optional dimension name under which instanceId is
        placed; when omitted (or otherwise falsy), the first dimension of each
        metric adapter's first dimension group is used instead
    :returns: a list of model spec dicts, one for each default metric
    """

    def specForAdapter(adapter):
      """ :returns: model spec dict for a single default metric adapter """
      dimensionName = dimension or adapter.DIMENSION_GROUPS[0][0]
      return {
          "region": region,
          "namespace": adapter.NAMESPACE,
          "datasource": "cloudwatch",
          "metric": adapter.METRIC_NAME,
          # TODO: Is there a method for getting this or one we can make public?
          # TODO: Is this right?
          "dimensions": {dimensionName: instanceId},
      }

    # The resource type's default metrics are exposed as metric adapters
    return [
        specForAdapter(adapter)
        for adapter in AWSResourceAdapterBase.getDefaultResourceMetrics(
            resourceType)
    ]
    def testCreateModelForImportModel(self, createDatasourceAdapterMock,
                                      ctxMock, repositoryMock,
                                      quotaRepositoryMock, _engineMock):
        """ ModelHandler.createModel() on a native Cloudwatch metric returns
        the row from repository.getMetric, which must have been called once
        with the connection and the datasource adapter's importModel result.
        """
        nativeMetric = dict(
            type="metric",
            region="us-west-2",
            namespace="AWS/EC2",
            datasource="cloudwatch",
            metric="CPUUtilization",
            dimensions={"InstanceId": "i-ab15a19d"},
        )

        # The metric spec is the native metric minus "type" and "datasource"
        metricSpec = dict(
            (key, nativeMetric[key])
            for key in ("region", "namespace", "metric", "dimensions"))

        # Make the mocked datasource adapter report the canonical resource
        # name that a real metric adapter computes for this spec
        canonicalName = AWSResourceAdapterBase.createMetricAdapter(
            metricSpec).getCanonicalResourceName()
        adapterMock = createDatasourceAdapterMock.return_value
        adapterMock.getInstanceNameForModelSpec.return_value = canonicalName

        quotaRepositoryMock.getInstanceCount.return_value = 0

        result = models_api.ModelHandler.createModel(nativeMetric)

        self.assertIs(result, repositoryMock.getMetric.return_value)

        expectedConn = ctxMock.connFactory.return_value.__enter__.return_value
        importResult = adapterMock.importModel.return_value
        repositoryMock.getMetric.assert_called_once_with(expectedConn,
                                                         importResult)
Exemple #10
0
  def getMetricResourceStatus(self, metricSpec):  # pylint: disable=R0201
    """ Query AWS for the status of the resource that a metric belongs to

    :param metricSpec: metric specification from which the metric adapter is
      created

    :returns: AWS/resource-specific status string if supported and available or
      None if not
    :rtype: string or NoneType
    """
    return AWSResourceAdapterBase.createMetricAdapter(
        metricSpec).getResourceStatus()
Exemple #11
0
    def getMetricResourceStatus(self, metricSpec):  # pylint: disable=R0201
        """ Query AWS for the status of the resource that a metric belongs to

        :param metricSpec: metric specification from which the metric adapter
          is created

        :returns: AWS/resource-specific status string if supported and available
          or None if not
        :rtype: string or NoneType
        """
        return AWSResourceAdapterBase.createMetricAdapter(
            metricSpec).getResourceStatus()
Exemple #12
0
    def listSupportedResourceTypes(self):  # pylint: disable=R0201
        """ Enumerate the resource types that are supported

        :returns: sequence of resource type names (per
          aws_base.ResourceTypeNames)

          ::

            ("AWS::AutoScaling::AutoScalingGroup", "AWS::EC2::Instance", ...)
        """
        supportedTypes = AWSResourceAdapterBase.listSupportedResourceTypes()
        return supportedTypes
Exemple #13
0
  def listSupportedResourceTypes(self):  # pylint: disable=R0201
    """ Enumerate the resource types that are supported

    :returns: sequence of resource type names (per aws_base.ResourceTypeNames)

      ::

        ("AWS::AutoScaling::AutoScalingGroup", "AWS::EC2::Instance", ...)
    """
    supportedTypes = AWSResourceAdapterBase.listSupportedResourceTypes()
    return supportedTypes
Exemple #14
0
    def getMatchingResources(self, aggSpec):
        """ Look up the resources selected by an aggregation specification.

        The lookup is delegated to the resource adapter class registered for
        aggSpec["resourceType"].

        :param aggSpec: Autostack aggregation specification
        :type aggSpec: dict (see _AutostackDatasourceAdapter.createAutostack)

        :returns: sequence of matching resources
        """
        requestedType = aggSpec["resourceType"]
        adapterClass = AWSResourceAdapterBase.getResourceAdapterClass(
            requestedType)
        matches = adapterClass.getMatchingResources(aggSpec)
        return matches
Exemple #15
0
  def getMatchingResources(self, aggSpec):
    """ Look up the resources selected by an aggregation specification.

    The lookup is delegated to the resource adapter class registered for
    aggSpec["resourceType"].

    :param aggSpec: Autostack aggregation specification
    :type aggSpec: dict (see _AutostackDatasourceAdapter.createAutostack)

    :returns: sequence of matching resources
    """
    requestedType = aggSpec["resourceType"]
    adapterClass = AWSResourceAdapterBase.getResourceAdapterClass(requestedType)
    matches = adapterClass.getMatchingResources(aggSpec)
    return matches
Exemple #16
0
    def getInstanceNameForModelSpec(self, spec):
        """ Derive the canonical instance name from a model spec

        :param spec: Datasource-specific model specification; its "metricSpec"
          entry is used to create the metric adapter
        :type spec: JSONifiable dict

        :returns: Canonical instance name
        :rtype: str
        """
        adapter = AWSResourceAdapterBase.createMetricAdapter(spec["metricSpec"])
        canonicalName = adapter.getCanonicalResourceName()
        return canonicalName
Exemple #17
0
  def getInstanceNameForModelSpec(self, spec):
    """ Derive the canonical instance name from a model spec

    :param spec: Datasource-specific model specification; its "metricSpec"
      entry is used to create the metric adapter
    :type spec: JSONifiable dict

    :returns: Canonical instance name
    :rtype: str
    """
    adapter = AWSResourceAdapterBase.createMetricAdapter(spec["metricSpec"])
    canonicalName = adapter.getCanonicalResourceName()
    return canonicalName
  def testCreateModelForImportModel(self,
                                    createDatasourceAdapterMock,
                                    ctxMock,
                                    repositoryMock,
                                    quotaRepositoryMock,
                                    _engineMock):
    """ ModelHandler.createModel() on a native Cloudwatch metric returns the
    row from repository.getMetric, which must have been called once with the
    connection and the datasource adapter's importModel result.
    """
    nativeMetric = dict(
        type="metric",
        region="us-west-2",
        namespace="AWS/EC2",
        datasource="cloudwatch",
        metric="CPUUtilization",
        dimensions={"InstanceId": "i-ab15a19d"},
    )

    # The metric spec is the native metric minus "type" and "datasource"
    metricSpec = dict(
        (key, nativeMetric[key])
        for key in ("region", "namespace", "metric", "dimensions"))

    # Make the mocked datasource adapter report the canonical resource name
    # that a real metric adapter computes for this spec
    canonicalName = AWSResourceAdapterBase.createMetricAdapter(
        metricSpec).getCanonicalResourceName()
    adapterMock = createDatasourceAdapterMock.return_value
    adapterMock.getInstanceNameForModelSpec.return_value = canonicalName

    quotaRepositoryMock.getInstanceCount.return_value = 0

    result = models_api.ModelHandler.createModel(nativeMetric)

    self.assertIs(result, repositoryMock.getMetric.return_value)

    expectedConn = ctxMock.connFactory.return_value.__enter__.return_value
    importResult = adapterMock.importModel.return_value
    repositoryMock.getMetric.assert_called_once_with(expectedConn,
                                                     importResult)
Exemple #19
0
    def _getMetricStatistics(self, metricSpec):  # pylint: disable=R0201
        """ Retrieve metric data statistics

        The time range is not configurable here: the metric adapter is always
        asked with start=None and end=None.

        :param metricSpec: metric specification for Cloudwatch-based model
        :type metricSpec: dict (see monitorMetric())

        :returns: a dictionary with the metric's statistics
        :rtype: dict; {"min": <min-value>, "max": <max-value>}
        """
        adapter = AWSResourceAdapterBase.createMetricAdapter(metricSpec)
        statistics = adapter.getMetricStatistics(start=None, end=None)
        return statistics
Exemple #20
0
  def _getMetricStatistics(self, metricSpec):  # pylint: disable=R0201
    """ Retrieve metric data statistics

    The time range is not configurable here: the metric adapter is always
    asked with start=None and end=None.

    :param metricSpec: metric specification for Cloudwatch-based model
    :type metricSpec: dict (see monitorMetric())

    :returns: a dictionary with the metric's statistics
    :rtype: dict; {"min": <min-value>, "max": <max-value>}
    """
    adapter = AWSResourceAdapterBase.createMetricAdapter(metricSpec)
    statistics = adapter.getMetricStatistics(start=None, end=None)
    return statistics
Exemple #21
0
  def monitorMetric(self, modelSpec):
    """ Start monitoring a metric; create a "cloudwatch model" DAO object for
    the given model specification.

    Min/max resolution: explicit "min"/"max" values from
    modelSpec["modelParams"] take precedence over the metric adapter's
    defaults. The adapter defaults are only used as a pair; if either default
    is None, both are treated as None.

    :param modelSpec: model specification for Cloudwatch-based model
    :type modelSpec: dict

    ::

        {
          "datasource": "cloudwatch",

          "metricSpec": {
            "region": "us-west-2",
            "namespace": "AWS/EC2",
            "metric": "CPUUtilization",
            "dimensions": {
              "InstanceId": "i-12d67826"
            }
          },

          # optional
          "modelParams": {
            "min": 0,  # optional
            "max": 100  # optional
          }
        }

    :returns: datasource-specific unique model identifier (the "uid" of the
      newly added metric row)

    :raises ValueError: if exactly one of "min" and "max" is provided in
      "modelParams"

    :raises grok.app.exceptions.ObjectNotFoundError: if referenced metric
      doesn't exist

    :raises grok.app.exceptions.MetricNotSupportedError: if requested metric
      isn't supported

    :raises grok.app.exceptions.MetricAlreadyMonitored: if the metric is already
      being monitored
    """
    metricSpec = modelSpec["metricSpec"]
    metricAdapter = AWSResourceAdapterBase.createMetricAdapter(metricSpec)

    # NOTE: getResourceName may be slow (AWS query)
    # TODO MER-3430: would be handy to use caching to speed things up a lot
    resourceName = metricAdapter.getResourceName()

    # Collect the values that are stored in the new metric row further below
    canonicalResourceName = self.getInstanceNameForModelSpec(modelSpec)
    resourceLocation = metricAdapter.getResourceLocation()
    metricName = metricAdapter.getMetricName()
    metricPeriod = metricAdapter.getMetricPeriod()
    metricDescription = metricAdapter.getMetricSummary()
    nameColumnValue = self._composeMetricNameColumnValue(
      metricName=metricName,
      metricNamespace=metricSpec["namespace"])

    # Resolve the min/max stats passed to generateSwarmParams() below.
    # Adapter-provided defaults are only usable as a pair: if either one is
    # missing, both are discarded.
    # NOTE(review): presumably the swarm params decide whether the model can be
    # started right away - confirm against scalar_metric_utils
    defaultMin = metricAdapter.getMetricDefaultMin()
    defaultMax = metricAdapter.getMetricDefaultMax()
    if defaultMin is None or defaultMax is None:
      defaultMin = defaultMax = None

    # Get user-provided min/max, if any; providing only one of them is an error
    modelParams = modelSpec.get("modelParams", dict())
    explicitMin = modelParams.get("min")
    explicitMax = modelParams.get("max")
    if (explicitMin is None) != (explicitMax is None):
      raise ValueError(
        "min and max params must both be None or non-None; modelSpec=%r"
        % (modelSpec,))

    # Explicit values win over the adapter defaults; both may still be None
    minVal = explicitMin if explicitMin is not None else defaultMin
    maxVal = explicitMax if explicitMax is not None else defaultMax
    stats = {"min": minVal, "max": maxVal}

    swarmParams = scalar_metric_utils.generateSwarmParams(stats)

    # Perform the start-monitoring operation atomically/reliably

    @repository.retryOnTransientErrors
    def startMonitoringWithRetries():
      """ Check for an already-monitored duplicate, add the metric row and start
      monitoring it, all inside one conn.begin() block.

      :returns: metricId

      :raises grok.app.exceptions.MetricAlreadyMonitored: if a matching metric
        row with the same dimensions already exists
      """
      with self.connectionFactory() as conn:
        with conn.begin():
          # Take the exclusive METRICS operation lock before the duplicate
          # check and the insert
          repository.lockOperationExclusive(conn,
                                            repository.OperationLock.METRICS)

          # Check if the metric is already monitored
          matchingMetrics = repository.getCloudwatchMetricsForNameAndServer(
            conn,
            nameColumnValue,
            canonicalResourceName,
            fields=[schema.metric.c.uid, schema.metric.c.parameters])

          # A row with the same name and server is a duplicate only when the
          # dimensions stored in its parameters equal the requested dimensions
          for m in matchingMetrics:
            parameters = htmengine.utils.jsonDecode(m.parameters)
            if (parameters["metricSpec"]["dimensions"] ==
                metricSpec["dimensions"]):
              msg = ("monitorMetric: Cloudwatch modelId=%s is already "
                     "monitoring metric=%s on resource=%s; model=%r"
                     % (m.uid, nameColumnValue, canonicalResourceName, m))
              self._log.warning(msg)
              raise grok.app.exceptions.MetricAlreadyMonitored(msg, uid=m.uid)

          # Add a metric row for the requested metric; it is created with
          # UNMONITORED status and the full modelSpec JSON-encoded as parameters
          metricDict = repository.addMetric(
            conn,
            name=nameColumnValue,
            description=metricDescription,
            server=canonicalResourceName,
            location=resourceLocation,
            poll_interval=metricPeriod,
            status=MetricStatus.UNMONITORED,
            datasource=self._DATASOURCE,
            parameters=htmengine.utils.jsonEncode(modelSpec),
            tag_name=resourceName)

          metricId = metricDict["uid"]

          self._log.info("monitorMetric: metric=%s, stats=%r", metricId, stats)

          # Start monitoring
          scalar_metric_utils.startMonitoring(
            conn=conn,
            metricId=metricId,
            swarmParams=swarmParams,
            logger=self._log)

          return metricId

    return startMonitoringWithRetries()
Exemple #22
0
    def monitorMetric(self, modelSpec):
        """ Start monitoring a metric; create a "cloudwatch model" DAO object for
        the given model specification.

        Min/max resolution: explicit "min"/"max" values from
        modelSpec["modelParams"] take precedence over the metric adapter's
        defaults. The adapter defaults are only used as a pair; if either
        default is None, both are treated as None.

        :param modelSpec: model specification for Cloudwatch-based model
        :type modelSpec: dict

        ::

            {
              "datasource": "cloudwatch",

              "metricSpec": {
                "region": "us-west-2",
                "namespace": "AWS/EC2",
                "metric": "CPUUtilization",
                "dimensions": {
                  "InstanceId": "i-12d67826"
                }
              },

              # optional
              "modelParams": {
                "min": 0,  # optional
                "max": 100  # optional
              }
            }

        :returns: datasource-specific unique model identifier (the "uid" of the
          newly added metric row)

        :raises ValueError: if exactly one of "min" and "max" is provided in
          "modelParams"

        :raises grok.app.exceptions.ObjectNotFoundError: if referenced metric
          doesn't exist

        :raises grok.app.exceptions.MetricNotSupportedError: if requested metric
          isn't supported

        :raises grok.app.exceptions.MetricAlreadyMonitored: if the metric is
          already being monitored
        """
        metricSpec = modelSpec["metricSpec"]
        metricAdapter = AWSResourceAdapterBase.createMetricAdapter(metricSpec)

        # NOTE: getResourceName may be slow (AWS query)
        # TODO MER-3430: would be handy to use caching to speed things up a lot
        resourceName = metricAdapter.getResourceName()

        # Collect the values that are stored in the new metric row further below
        canonicalResourceName = self.getInstanceNameForModelSpec(modelSpec)
        resourceLocation = metricAdapter.getResourceLocation()
        metricName = metricAdapter.getMetricName()
        metricPeriod = metricAdapter.getMetricPeriod()
        metricDescription = metricAdapter.getMetricSummary()
        nameColumnValue = self._composeMetricNameColumnValue(
            metricName=metricName, metricNamespace=metricSpec["namespace"]
        )

        # Resolve the min/max stats passed to generateSwarmParams() below.
        # Adapter-provided defaults are only usable as a pair: if either one is
        # missing, both are discarded.
        # NOTE(review): presumably the swarm params decide whether the model can
        # be started right away - confirm against scalar_metric_utils
        defaultMin = metricAdapter.getMetricDefaultMin()
        defaultMax = metricAdapter.getMetricDefaultMax()
        if defaultMin is None or defaultMax is None:
            defaultMin = defaultMax = None

        # Get user-provided min/max, if any; providing only one of them is an
        # error
        modelParams = modelSpec.get("modelParams", dict())
        explicitMin = modelParams.get("min")
        explicitMax = modelParams.get("max")
        if (explicitMin is None) != (explicitMax is None):
            raise ValueError("min and max params must both be None or non-None; modelSpec=%r" % (modelSpec,))

        # Explicit values win over the adapter defaults; both may still be None
        minVal = explicitMin if explicitMin is not None else defaultMin
        maxVal = explicitMax if explicitMax is not None else defaultMax
        stats = {"min": minVal, "max": maxVal}

        swarmParams = scalar_metric_utils.generateSwarmParams(stats)

        # Perform the start-monitoring operation atomically/reliably

        @repository.retryOnTransientErrors
        def startMonitoringWithRetries():
            """ Check for an already-monitored duplicate, add the metric row and
            start monitoring it, all inside one conn.begin() block.

            :returns: metricId

            :raises grok.app.exceptions.MetricAlreadyMonitored: if a matching
              metric row with the same dimensions already exists
            """
            with self.connectionFactory() as conn:
                with conn.begin():
                    # Take the exclusive METRICS operation lock before the
                    # duplicate check and the insert
                    repository.lockOperationExclusive(conn, repository.OperationLock.METRICS)

                    # Check if the metric is already monitored
                    matchingMetrics = repository.getCloudwatchMetricsForNameAndServer(
                        conn,
                        nameColumnValue,
                        canonicalResourceName,
                        fields=[schema.metric.c.uid, schema.metric.c.parameters],
                    )

                    # A row with the same name and server is a duplicate only
                    # when the dimensions stored in its parameters equal the
                    # requested dimensions
                    for m in matchingMetrics:
                        parameters = htmengine.utils.jsonDecode(m.parameters)
                        if parameters["metricSpec"]["dimensions"] == metricSpec["dimensions"]:
                            msg = (
                                "monitorMetric: Cloudwatch modelId=%s is already "
                                "monitoring metric=%s on resource=%s; model=%r"
                                % (m.uid, nameColumnValue, canonicalResourceName, m)
                            )
                            self._log.warning(msg)
                            raise grok.app.exceptions.MetricAlreadyMonitored(msg, uid=m.uid)

                    # Add a metric row for the requested metric; it is created
                    # with UNMONITORED status and the full modelSpec
                    # JSON-encoded as parameters
                    metricDict = repository.addMetric(
                        conn,
                        name=nameColumnValue,
                        description=metricDescription,
                        server=canonicalResourceName,
                        location=resourceLocation,
                        poll_interval=metricPeriod,
                        status=MetricStatus.UNMONITORED,
                        datasource=self._DATASOURCE,
                        parameters=htmengine.utils.jsonEncode(modelSpec),
                        tag_name=resourceName,
                    )

                    metricId = metricDict["uid"]

                    self._log.info("monitorMetric: metric=%s, stats=%r", metricId, stats)

                    # Start monitoring
                    scalar_metric_utils.startMonitoring(
                        conn=conn, metricId=metricId, swarmParams=swarmParams, logger=self._log
                    )

                    return metricId

        return startMonitoringWithRetries()