Ejemplo n.º 1
0
    def cloudwatch_timeframe(self):
        """
        Guess the window of time in which the CloudWatch errors occurred.

        The alarm's ``state_reason`` is parsed as a threshold message, and
        the returned ``Interval`` runs from 300 seconds before the parsed
        date to 300 seconds after it.

        No handling is done for a message without a date: if the parsed
        date is ``None``, the arithmetic below raises ``TypeError``.
        """
        parsed = ThresholdMessage.from_message(self.state_reason)

        # Five minutes of slack either side of the reported time.
        margin = dt.timedelta(seconds=300)
        window_start = parsed.date - margin
        window_end = parsed.date + margin

        return Interval(start=window_start, end=window_end)
Ejemplo n.º 2
0
    def cloudwatch_timeframe(self):
        """
        Guess the window of time in which the CloudWatch errors occurred.

        The alarm's ``state_reason`` is parsed as a threshold message, and
        the returned ``Interval`` runs from 300 seconds before the parsed
        date to 300 seconds after it.

        Raises ``MessageHasNoDateError`` if building the interval fails
        with a ``TypeError``, which is what happens when the parsed
        message carries no date.
        """
        parsed = ThresholdMessage.from_message(self.state_reason)

        # Five minutes of slack either side of the reported time.
        margin = dt.timedelta(seconds=300)

        try:
            window = Interval(start=parsed.date - margin,
                              end=parsed.date + margin)
        except TypeError:
            # ``None - timedelta`` is a TypeError, i.e. the message had no date.
            raise MessageHasNoDateError()

        return window
def get_human_message(alarm_name, state_reason):
    """
    Build a friendlier sentence for an alarm, where we know how to.

    The alarm name is checked against a handful of known naming patterns
    (DLQs, ALB target group health, ALB 500 errors, Lambda errors).  When
    one matches, a sentence is assembled from the values parsed out of
    ``state_reason``; otherwise ``state_reason`` is returned unchanged.

    ``state_reason`` is parsed with ``ThresholdMessage.from_message`` before
    any pattern is checked, so anything that call raises reaches the caller
    whatever the alarm name is.
    """
    parsed = ThresholdMessage.from_message(state_reason)

    # Dead-letter queues: the threshold is always zero, so the queue depth
    # alone tells the story, e.g.
    #
    #   There is 1 item on the ID minter DLQ.
    #
    dlq_suffix = "_dlq_not_empty"
    if alarm_name.endswith(dlq_suffix):
        queue = alarm_name[:-len(dlq_suffix)]
        depth = parsed.actual_value
        lead = "There is 1 item" if depth == 1 else f"There are {depth} items"
        return f"{lead} on the {queue} DLQ."

    # Unhealthy hosts: again the threshold is always zero, e.g.
    #
    #   There are 2 unhealthy targets in the id_minter ALB target group.
    #
    unhealthy_suffix = "-alb-unhealthy-hosts"
    if alarm_name.endswith(unhealthy_suffix):
        group = alarm_name[:-len(unhealthy_suffix)]
        unhealthy = parsed.actual_value
        lead = (
            "There is 1 unhealthy target"
            if unhealthy == 1
            else f"There are {unhealthy} unhealthy targets"
        )
        return f"{lead} in the {group} ALB target group."

    # Not enough healthy hosts: here the threshold varies, so both numbers
    # are reported, e.g.
    #
    #   There aren't enough healthy targets in the ingestor
    #   (saw 2, expected at least 3).
    #
    too_few_suffix = "-alb-not-enough-healthy-hosts"
    if alarm_name.endswith(too_few_suffix):
        group = alarm_name[:-len(too_few_suffix)]

        if parsed.is_breaching:
            return f"There are no healthy hosts in the {group} ALB target group."

        wanted = parsed.desired_value
        seen = parsed.actual_value
        return (
            f"There aren't enough healthy targets in the {group} ALB target group "
            f"(saw {seen}, expected at least {wanted}).")

    # 500 errors from a target group, e.g.
    #
    #   There were multiple 500 errors (3) from the ingestor ALB target group.
    #
    # The count goes in brackets purely to keep the sentence readable.
    http_500_suffix = "-alb-target-500-errors"
    if alarm_name.endswith(http_500_suffix):
        group = alarm_name[:-len(http_500_suffix)]
        failures = parsed.actual_value
        if failures == 1:
            return f"There was a 500 error from the {group} ALB target group."
        return f"There were multiple 500 errors ({failures}) from the {group} ALB target group."

    # Lambda errors, e.g.
    #
    #   There was an error in the post_to_slack Lambda.
    #
    lambda_prefix = "lambda-"
    errors_suffix = "-errors"
    if alarm_name.startswith(lambda_prefix) and alarm_name.endswith(errors_suffix):
        function_name = alarm_name[len(lambda_prefix):-len(errors_suffix)]
        failures = parsed.actual_value
        if failures == 1:
            return f"There was an error in the {function_name} Lambda."
        return f"There were {failures} errors in the {function_name} Lambda."

    # Nothing matched: fall back to the raw CloudWatch text.
    return state_reason
 def test_is_breaking(self, message, is_breaching):
     """The parsed message reports the expected ``is_breaching`` flag."""
     assert ThresholdMessage.from_message(message).is_breaching == is_breaching
 def test_unexpected_message_is_valueerror(self, bad_message):
     """Parsing ``bad_message`` must raise ``ValueError``."""
     with pytest.raises(ValueError):
         ThresholdMessage.from_message(bad_message)
 def test_operator(self, message, expected_operator):
     """The parsed message exposes the expected comparison operator."""
     assert ThresholdMessage.from_message(message).operator == expected_operator
 def test_date(self, message, expected_date):
     """The parsed message exposes the expected date."""
     assert ThresholdMessage.from_message(message).date == expected_date
 def test_desired_value(self, message, desired_value):
     """The parsed message exposes the expected desired (threshold) value."""
     assert ThresholdMessage.from_message(message).desired_value == desired_value
 def test_actual_value(self, message, actual_value):
     """The parsed message exposes the expected actual (observed) value."""
     assert ThresholdMessage.from_message(message).actual_value == actual_value
Ejemplo n.º 10
0
def get_human_message(alarm_name, state_reason):
    """
    Build a friendlier sentence for an alarm, where we know how to.

    The alarm name is checked against a handful of known naming patterns
    (DLQs, ALB target group health, ALB 500 errors, Lambda errors, and the
    snapshot scheduler queue).  When one matches, a sentence is assembled
    from the values parsed out of ``state_reason``; otherwise
    ``state_reason`` is returned unchanged.

    ``state_reason`` is parsed with ``ThresholdMessage.from_message`` before
    any pattern is checked, so anything that call raises reaches the caller
    whatever the alarm name is.
    """
    parsed = ThresholdMessage.from_message(state_reason)

    # Dead-letter queues: the threshold is always zero, so the queue depth
    # alone tells the story, e.g.
    #
    #   There is 1 item on the ID minter DLQ.
    #
    dlq_suffix = '_dlq_not_empty'
    if alarm_name.endswith(dlq_suffix):
        queue = alarm_name[:-len(dlq_suffix)]
        depth = parsed.actual_value
        lead = 'There is 1 item' if depth == 1 else f'There are {depth} items'
        return f'{lead} on the {queue} DLQ.'

    # Unhealthy hosts: again the threshold is always zero, e.g.
    #
    #   There are 2 unhealthy targets in the id_minter ALB target group.
    #
    unhealthy_suffix = '-alb-unhealthy-hosts'
    if alarm_name.endswith(unhealthy_suffix):
        group = alarm_name[:-len(unhealthy_suffix)]
        unhealthy = parsed.actual_value
        lead = (
            'There is 1 unhealthy target'
            if unhealthy == 1
            else f'There are {unhealthy} unhealthy targets'
        )
        return f'{lead} in the {group} ALB target group.'

    # Not enough healthy hosts: here the threshold varies, so both numbers
    # are reported, e.g.
    #
    #   There aren't enough healthy targets in the ingestor
    #   (saw 2, expected at least 3).
    #
    too_few_suffix = '-alb-not-enough-healthy-hosts'
    if alarm_name.endswith(too_few_suffix):
        group = alarm_name[:-len(too_few_suffix)]

        if parsed.is_breaching:
            return f'There are no healthy hosts in the {group} ALB target group.'

        wanted = parsed.desired_value
        seen = parsed.actual_value
        return (
            f"There aren't enough healthy targets in the {group} ALB target group "
            f"(saw {seen}, expected at least {wanted}).")

    # 500 errors from a target group, e.g.
    #
    #   There were multiple 500 errors (3) from the ingestor ALB target group.
    #
    # The count goes in brackets purely to keep the sentence readable.
    http_500_suffix = '-alb-target-500-errors'
    if alarm_name.endswith(http_500_suffix):
        group = alarm_name[:-len(http_500_suffix)]
        failures = parsed.actual_value
        if failures == 1:
            return f'There was a 500 error from the {group} ALB target group.'
        return f'There were multiple 500 errors ({failures}) from the {group} ALB target group.'

    # Lambda errors, e.g.
    #
    #   There was an error in the post_to_slack Lambda.
    #
    lambda_prefix = 'lambda-'
    errors_suffix = '-errors'
    if alarm_name.startswith(lambda_prefix) and alarm_name.endswith(errors_suffix):
        function_name = alarm_name[len(lambda_prefix):-len(errors_suffix)]
        failures = parsed.actual_value
        if failures == 1:
            return f'There was an error in the {function_name} Lambda.'
        return f'There were {failures} errors in the {function_name} Lambda.'

    # Snapshot scheduler queue: items piling up here suggest the snapshot
    # generator isn't running correctly.  Only the number of unprocessed
    # items is reported; this function does not look up when the last
    # snapshot was taken.
    if alarm_name == 'snapshot_scheduler_queue_not_empty':
        backlog = parsed.actual_value
        if backlog == 1:
            return 'The snapshot generator queue has 1 unprocessed item.'
        return f'The snapshot generator queue has {backlog} unprocessed items.'

    # Nothing matched: fall back to the raw CloudWatch text.
    return state_reason