Example #1
0
class AlertForwarder(object):
    """Sends alerts to the Alert Processor and the alerts DynamoDB table."""

    def __init__(self):
        """Set up the forwarder's handle to the alerts Dynamo table."""
        self._table = AlertTable(os.environ['ALERTS_TABLE'])

    def send_alerts(self, alerts):
        """Persist a batch of alerts to the Dynamo table.

        Args:
            alerts (list): Alert instances to save to Dynamo.
        """
        if not alerts:
            return  # empty batch - nothing to write

        try:
            self._table.add_alerts(alerts)
        except ClientError:
            # add_alerts() already retries transient failures internally, so a
            # ClientError surfacing here is most likely permanent. Record the
            # failure (log + metric) and give up on this batch.
            LOGGER.exception('An error occurred when sending alerts to DynamoDB')
            MetricLogger.log_metric(FUNCTION_NAME, MetricLogger.FAILED_DYNAMO_WRITES, 1)
        else:
            LOGGER.info('Successfully sent %d alert(s) to dynamo:%s', len(alerts), self._table.name)
Example #2
0
class AlertMerger(object):
    """Dispatch alerts to the alert processor."""
    ALERT_MERGER = None  # Cached AlertMerger, re-used across Lambda invocations

    # Async invocations of Lambda functions are capped at 128KB.
    # Set the max payload size to slightly under that to account for the rest of the message.
    MAX_LAMBDA_PAYLOAD_SIZE = 126000

    @classmethod
    def get_instance(cls):
        """Return the shared AlertMerger, creating it on first use."""
        if not cls.ALERT_MERGER:
            cls.ALERT_MERGER = AlertMerger()
        return cls.ALERT_MERGER

    def __init__(self):
        """Read configuration from the environment and build the AWS clients."""
        self.table = AlertTable(os.environ['ALERTS_TABLE'])
        self.alert_proc = os.environ['ALERT_PROCESSOR']
        self.alert_proc_timeout = int(os.environ['ALERT_PROCESSOR_TIMEOUT_SEC'])
        self.lambda_client = boto3.client('lambda')

    def _get_alerts(self, rule_name):
        """Build a list of Alert instances triggered from the given rule name."""
        results = []
        records = self.table.get_alert_records(rule_name, self.alert_proc_timeout)
        for record in records:
            try:
                results.append(Alert.create_from_dynamo_record(record))
            except AlertCreationError:
                # Skip malformed records rather than aborting the whole rule.
                LOGGER.exception('Invalid alert record %s', record)
        return results

    @staticmethod
    def _merge_groups(alerts):
        """Gather alerts into groupings which can be merged together and sent now.

        Args:
            alerts (list): List of Alert instances with defined merge configuration.

        Returns:
            list<AlertMergeGroup>: Each returned merge group has the following properties:
                (1) The oldest alert is older than its merge window (i.e. should be sent now), AND
                (2) All alerts in the merge group fit within a single merge window, AND
                (3) All alerts in the merge group have the same values for all of their merge keys.

            Alerts which are too recent to fit in any merge group are excluded from the results.
        """
        groups = []

        # Walk alerts oldest-first, trying each against the existing groups.
        for alert in sorted(alerts):
            if any(group.add(alert) for group in groups):
                continue  # an existing group accepted this alert

            # No group accepted it - it can only anchor a brand-new group if it
            # has already aged past its own merge window.
            if datetime.utcnow() < alert.created + alert.merge_window:
                # Too recent; every remaining alert is newer still, so stop here.
                break
            groups.append(AlertMergeGroup(alert))

        return groups

    def _dispatch_alert(self, alert):
        """Dispatch a single alert to the alert processor."""
        alert.attempts += 1
        LOGGER.info('Dispatching %s to %s (attempt %d)', alert, self.alert_proc, alert.attempts)
        MetricLogger.log_metric(ALERT_MERGER_NAME, MetricLogger.ALERT_ATTEMPTS, alert.attempts)

        full_record = json.dumps(
            alert.dynamo_record(), cls=Alert.AlertEncoder, separators=(',', ':'))

        # Ship the whole record when it fits in the Lambda payload; otherwise send
        # only the Dynamo key and let the alert processor pull the rest from Dynamo.
        too_big = len(full_record) > self.MAX_LAMBDA_PAYLOAD_SIZE
        payload = json.dumps(alert.dynamo_key) if too_big else full_record

        self.lambda_client.invoke(
            FunctionName=self.alert_proc,
            InvocationType='Event',
            Payload=payload,
            Qualifier='production'
        )

        alert.dispatched = datetime.utcnow()
        self.table.mark_as_dispatched(alert)

    def dispatch(self):
        """Find and dispatch all pending alerts to the alert processor."""
        # Batch all Dynamo additions and deletions until the end to cut API calls.
        new_merged_alerts = []  # Newly created merge alerts
        deletable_alerts = []   # Alerts which can be deleted

        # TODO: Find a way to avoid a full table scan just to get rule names
        for rule_name in self.table.rule_names():
            pending = self._get_alerts(rule_name)
            if not pending:
                continue

            merge_candidates = []
            for alert in pending:
                if alert.remaining_outputs:
                    # Alerts with outstanding outputs go out right away. For example,
                    # every alert is sent to the default firehose immediately, even
                    # ones that will later be merged for their other outputs.
                    self._dispatch_alert(alert)
                elif alert.merge_enabled:
                    # Finished its non-merged outputs; now a candidate for merging.
                    merge_candidates.append(alert)
                else:
                    # Fully sent and not mergeable. The alert processor normally
                    # deletes these, but clean them up here if it hasn't yet.
                    deletable_alerts.append(alert)

            for group in self._merge_groups(merge_candidates):
                # Collapse the group into a single new merged Alert.
                merged = Alert.merge(group.alerts)
                LOGGER.info('Merged %d alerts into a new alert with ID %s',
                            len(group.alerts), merged.alert_id)
                new_merged_alerts.append(merged)

                # The originals already reached their unmerged outputs (e.g. the
                # default firehose), so they can safely be marked for deletion.
                deletable_alerts.extend(group.alerts)

        if new_merged_alerts:
            # Persist the merged alerts, then hand them to the alert processor.
            self.table.add_alerts(new_merged_alerts)
            for alert in new_merged_alerts:
                self._dispatch_alert(alert)

        if deletable_alerts:
            self.table.delete_alerts(
                [(alert.rule_name, alert.alert_id) for alert in deletable_alerts])