def auto_resolve_snapshot_incidents(alert_rule_id, **kwargs):
    """Close all open incidents attached to a snapshotted alert rule.

    Runs as an async task (it re-enqueues itself via ``apply_async``; the task
    decorator is presumably applied elsewhere — confirm at the definition site).
    Processes at most ``batch_size`` incidents per invocation and schedules a
    follow-up task when more remain, so one rule change cannot monopolize a worker.

    :param alert_rule_id: id of the (snapshot) AlertRule whose incidents to resolve.
    :param kwargs: ignored; accepted for task-signature compatibility.
    """
    from sentry.incidents.logic import update_incident_status
    from sentry.incidents.models import AlertRule

    try:
        alert_rule = AlertRule.objects_with_snapshots.get(id=alert_rule_id)
    except AlertRule.DoesNotExist:
        # Rule deleted between scheduling and execution; nothing to do.
        return

    # Only snapshot rules are auto-resolved; live rules keep their incidents.
    if alert_rule.status != AlertRuleStatus.SNAPSHOT.value:
        return

    batch_size = 50
    # Fetch one row beyond the batch so we can tell whether more work remains
    # without issuing a separate unbounded COUNT query.
    incidents = Incident.objects.filter(alert_rule=alert_rule).exclude(
        status=IncidentStatus.CLOSED.value)[:batch_size + 1]
    has_more = incidents.count() > batch_size
    if incidents:
        # Trim back to the real batch before processing.
        incidents = incidents[:batch_size]
        for incident in incidents:
            update_incident_status(
                incident,
                IncidentStatus.CLOSED,
                comment=
                "This alert has been auto-resolved because the rule that triggered it has been modified or deleted.",
                status_method=IncidentStatusMethod.RULE_UPDATED,
            )
        if has_more:
            # Re-enqueue to handle the next batch; small countdown to avoid
            # hammering the database in a tight loop.
            auto_resolve_snapshot_incidents.apply_async(
                kwargs={"alert_rule_id": alert_rule_id}, countdown=1)
def test_build_incident_attachment(self):
    """Verify the PagerDuty event payload built for a critical incident."""
    from sentry.integrations.pagerduty.utils import build_incident_attachment

    rule = self.create_alert_rule()
    incident = self.create_incident(alert_rule=rule)
    update_incident_status(
        incident,
        IncidentStatus.CRITICAL,
        status_method=IncidentStatusMethod.RULE_TRIGGERED,
    )
    trigger_action = self.create_alert_rule_trigger_action(
        target_identifier=self.service.id,
        type=AlertRuleTriggerAction.Type.PAGERDUTY,
        target_type=AlertRuleTriggerAction.TargetType.SPECIFIC,
        integration=self.integration,
    )

    attachment = build_incident_attachment(
        trigger_action, incident, self.integration_key, 1000
    )

    # Top-level event fields.
    assert attachment["routing_key"] == self.integration_key
    assert attachment["event_action"] == "trigger"
    assert attachment[
        "dedup_key"] == f"incident_{incident.organization_id}_{incident.identifier}"

    # Alert payload.
    payload = attachment["payload"]
    assert payload["summary"] == rule.name
    assert payload["severity"] == "critical"
    assert payload["source"] == str(incident.identifier)
    assert payload["custom_details"] == {
        "details": "1000 events in the last 10 minutes\nFilter: level:error"
    }

    # Link back to the incident in Sentry.
    link = attachment["links"][0]
    assert link["text"] == f"Critical: {rule.name}"
    assert link["href"] == "http://testserver/organizations/baz/alerts/1/"
def test(self):
    """A manually closed incident should fire the "resolve" action path."""
    rule = self.create_alert_rule()
    incident = self.create_incident(alert_rule=rule)
    update_incident_status(
        incident,
        IncidentStatus.CLOSED,
        status_method=IncidentStatusMethod.MANUAL,
    )
    self.run_test(incident, "resolve")
def run_test(self, incident, status, expected_date_closed, user=None, comment=None):
    """Drive a status update and assert the resulting incident state,
    activity record, and analytics event."""
    old_status = incident.status
    self.record_event.reset_mock()

    update_incident_status(incident, status, user=user, comment=comment)

    # Reload from the database to observe persisted changes.
    incident = Incident.objects.get(id=incident.id)
    assert incident.status == status.value
    assert incident.date_closed == expected_date_closed

    # The change should be recorded as a STATUS_CHANGE activity.
    activity = self.get_most_recent_incident_activity(incident)
    assert activity.type == IncidentActivityType.STATUS_CHANGE.value
    assert activity.user == user
    if user:
        # Acting on an incident subscribes the user to it.
        assert IncidentSubscription.objects.filter(incident=incident,
                                                   user=user).exists()
    assert activity.value == six.text_type(status.value)
    assert activity.previous_value == six.text_type(old_status)
    assert activity.comment == comment

    # Exactly one analytics event, carrying the before/after status.
    assert len(self.record_event.call_args_list) == 1
    recorded = self.record_event.call_args[0][0]
    assert isinstance(recorded, IncidentStatusUpdatedEvent)
    expected_data = {
        "organization_id": six.text_type(self.organization.id),
        "incident_id": six.text_type(incident.id),
        "incident_type": six.text_type(incident.type),
        "prev_status": six.text_type(old_status),
        "status": six.text_type(incident.status),
    }
    assert recorded.data == expected_data
def test(self):
    """Bulk incident stats must match the per-incident stat helpers for both
    closed and open incidents."""
    closed_incident = create_incident(
        self.organization,
        IncidentType.CREATED,
        "Closed",
        "",
        groups=[self.group],
        date_started=timezone.now() - timedelta(days=30),
    )
    update_incident_status(closed_incident, IncidentStatus.CLOSED)
    open_incident = create_incident(
        self.organization,
        IncidentType.CREATED,
        "Open",
        "",
        groups=[self.group],
        date_started=timezone.now() - timedelta(days=30),
    )

    incidents = [closed_incident, open_incident]
    for incident, stats in zip(incidents, bulk_get_incident_stats(incidents)):
        # Event stats from the bulk path must agree with the single-incident path.
        expected_event_stats = get_incident_event_stats(incident)
        bulk_event_stats = stats["event_stats"]
        assert bulk_event_stats.data["data"] == expected_event_stats.data["data"]
        assert bulk_event_stats.start == expected_event_stats.start
        assert bulk_event_stats.end == expected_event_stats.end
        assert bulk_event_stats.rollup == expected_event_stats.rollup

        # Same for the aggregate counters.
        expected_aggregates = get_incident_aggregates(incident)
        assert stats["total_events"] == expected_aggregates["count"]
        assert stats["unique_users"] == expected_aggregates["unique_users"]
def test_build_incident_attachment(self):
    """Verify the PagerDuty payload shape for a critical incident (legacy
    signature without a trigger action)."""
    from sentry.integrations.pagerduty.utils import build_incident_attachment

    rule = self.create_alert_rule()
    incident = self.create_incident(alert_rule=rule)
    update_incident_status(
        incident,
        IncidentStatus.CRITICAL,
        status_method=IncidentStatusMethod.RULE_TRIGGERED,
    )

    integration_key = "pfc73e8cb4s44d519f3d63d45b5q77g9"
    attachment = build_incident_attachment(incident, integration_key, 1000)

    # Routing and dedup metadata.
    assert attachment["routing_key"] == "pfc73e8cb4s44d519f3d63d45b5q77g9"
    assert attachment["event_action"] == "trigger"
    assert attachment["dedup_key"] == "incident_{}_{}".format(
        incident.organization_id, incident.identifier)

    # Alert payload contents.
    payload = attachment["payload"]
    assert payload["summary"] == rule.name
    assert payload["severity"] == "critical"
    assert payload["source"] == six.text_type(incident.identifier)
    assert payload["custom_details"] == {
        "details": "1000 events in the last 10 minutes\nFilter: level:error"
    }

    # Link back to the incident in Sentry.
    link = attachment["links"][0]
    assert link["text"] == "Critical: {}".format(rule.name)
    assert link["href"] == "http://testserver/organizations/baz/alerts/1/"
def trigger_resolve_threshold(self, trigger, metric_value):
    """
    Called when a subscription update exceeds the value defined in
    `trigger.resolve_threshold` and the trigger is currently ACTIVE.

    Increments the per-trigger resolve counter; once it reaches the rule's
    `threshold_period` the trigger is marked RESOLVED, its actions run, and —
    if every trigger on the incident is resolved — the incident itself is
    closed.
    :return:
    """
    self.trigger_resolve_counts[trigger.id] += 1
    if self.trigger_resolve_counts[
            trigger.id] >= self.alert_rule.threshold_period:
        metrics.incr("incidents.alert_rules.trigger",
                     tags={"type": "resolve"})
        # Persist the RESOLVED state before firing actions so handlers see it.
        incident_trigger = self.incident_triggers[trigger.id]
        incident_trigger.status = TriggerStatus.RESOLVED.value
        incident_trigger.save()
        self.handle_trigger_actions(incident_trigger, metric_value)
        # Resolving one trigger may downgrade the incident's severity
        # (e.g. critical -> warning) even if the incident stays open.
        self.handle_incident_severity_update()
        if self.check_triggers_resolved():
            update_incident_status(
                self.active_incident,
                IncidentStatus.CLOSED,
                status_method=IncidentStatusMethod.RULE_TRIGGERED,
                # Close at the event's own timestamp, not processing time.
                date_closed=self.calculate_event_date_from_update_date(
                    self.last_update),
            )
            self.active_incident = None
            self.incident_triggers.clear()
        # Counter resets whether or not the incident closed; the threshold
        # period must be met again for the next resolution.
        self.trigger_resolve_counts[trigger.id] = 0
def test_filter_start_end_times(self):
    """start/end query params should partition incidents by their close time."""
    self.create_team(organization=self.organization, members=[self.user])

    # One incident closed ~25h ago, one closed ~1h ago.
    stale_incident = self.create_incident(date_started=timezone.now() -
                                          timedelta(hours=26))
    update_incident_status(
        incident=stale_incident,
        status=IncidentStatus.CLOSED,
        date_closed=timezone.now() - timedelta(hours=25),
    )
    recent_incident = self.create_incident(date_started=timezone.now() -
                                           timedelta(hours=2))
    update_incident_status(
        incident=recent_incident,
        status=IncidentStatus.CLOSED,
        date_closed=timezone.now() - timedelta(hours=1),
    )

    self.login_as(self.user)
    with self.feature(
        ["organizations:incidents", "organizations:performance-view"]):
        resp_all = self.get_valid_response(self.organization.slug)
        # Window covering only the last 12 hours.
        resp_new = self.get_valid_response(
            self.organization.slug,
            start=(timezone.now() - timedelta(hours=12)).isoformat(),
            end=timezone.now().isoformat(),
        )
        # Window covering 36h-24h ago.
        resp_old = self.get_valid_response(
            self.organization.slug,
            start=(timezone.now() - timedelta(hours=36)).isoformat(),
            end=(timezone.now() - timedelta(hours=24)).isoformat(),
        )

    assert resp_all.data == serialize([recent_incident, stale_incident])
    assert resp_new.data == serialize([recent_incident])
    assert resp_old.data == serialize([stale_incident])
def test_incidents_list(self):
    """Acceptance test: the incidents list renders, and clicking through to an
    incident shows its detail page.

    Fix: the original `details_url` selector was an unterminated CSS attribute
    selector (`[href="...` with no closing `"]`), which is invalid CSS and
    cannot match anything in `wait_until`/`click`. It now uses a closed
    prefix-match selector (`^=`), which also tolerates extra query params.
    """
    alert_rule = self.create_alert_rule(name="Alert Rule #1")
    incident = self.create_incident(
        self.organization,
        title="Incident #1",
        date_started=timezone.now(),
        date_detected=timezone.now(),
        projects=[self.project],
        alert_rule=alert_rule,
    )
    update_incident_status(
        incident,
        IncidentStatus.CRITICAL,
        status_method=IncidentStatusMethod.RULE_TRIGGERED)

    features = {feature: True for feature in FEATURE_NAME}
    with self.feature(features):
        self.browser.get(self.path)
        self.browser.wait_until_not('[data-test-id="loading-indicator"]')
        self.browser.wait_until_not('[data-test-id="loading-placeholder"]')
        self.browser.snapshot("incidents - list")

        # Valid, closed attribute selector (prefix match on the href).
        details_url = f'[href^="/organizations/{self.organization.slug}/alerts/rules/details/{alert_rule.id}/?alert={incident.id}"]'
        self.browser.wait_until(details_url)
        self.browser.click(details_url)
        self.browser.wait_until_not('[data-test-id="loading-indicator"]')
        self.browser.wait_until_test_id("incident-rule-title")
        self.browser.wait_until_not('[data-test-id="loading-placeholder"]')

        self.browser.blur()
        self.browser.snapshot("incidents - details")
def run_fire_test(self, method="fire"):
    """Set up a closed incident and exercise either the fire or resolve path."""
    rule = self.create_alert_rule()
    incident = self.create_incident(alert_rule=rule,
                                    status=IncidentStatus.CLOSED.value)
    if method == "resolve":
        # The resolve path additionally requires a manual close transition.
        update_incident_status(
            incident,
            IncidentStatus.CLOSED,
            status_method=IncidentStatusMethod.MANUAL,
        )
    self.run_test(incident, method)
def trigger_resolve_threshold(self):
    """
    Called when a subscription update exceeds the value defined in
    `alert_rule.resolve_threshold` and there's a current active incident.

    Closes the active incident once the resolve condition has held for
    `threshold_period` consecutive updates.
    :return:
    """
    self.resolve_triggers += 1
    # Not enough consecutive resolving updates yet.
    if self.resolve_triggers < self.alert_rule.threshold_period:
        return
    update_incident_status(self.active_incident, IncidentStatus.CLOSED)
    self.resolve_triggers = 0
    self.active_incident = None
def test_reopened(self):
    """Reopening a closed incident should delete its snapshot."""
    incident = create_incident(
        self.organization,
        IncidentType.CREATED,
        "Test",
        "",
        timezone.now(),
        projects=[self.project],
    )
    update_incident_status(incident, IncidentStatus.CLOSED)

    snapshot_exists = lambda: IncidentSnapshot.objects.filter(
        incident=incident).exists()
    # Closing created a snapshot; reopening must remove it.
    with self.assertChanges(snapshot_exists, before=True, after=False):
        self.run_test(incident, IncidentStatus.OPEN, None)
def handle_incident_severity_update(self):
    """Recompute the active incident's severity from its ACTIVE triggers.

    CRITICAL wins over WARNING; if no trigger is active, the status is left
    unchanged.
    """
    if not self.active_incident:
        return
    active_triggers = IncidentTrigger.objects.filter(
        incident=self.active_incident, status=TriggerStatus.ACTIVE.value)
    severity = None
    for incident_trigger in active_triggers:
        label = incident_trigger.alert_rule_trigger.label
        if label == CRITICAL_TRIGGER_LABEL:
            # Critical dominates; no need to look further.
            severity = IncidentStatus.CRITICAL
            break
        if label == WARNING_TRIGGER_LABEL:
            severity = IncidentStatus.WARNING
    if severity:
        update_incident_status(self.active_incident, severity)
def trigger_resolve_threshold(self, trigger):
    """
    Called when a subscription update exceeds the value defined in
    `trigger.resolve_threshold` and the trigger is currently ACTIVE.

    After `threshold_period` consecutive resolving updates the trigger is
    marked RESOLVED; if all triggers are resolved the incident is closed.
    :return:
    """
    self.trigger_resolve_counts[trigger.id] += 1
    # Condition must hold for the full threshold period before acting.
    if self.trigger_resolve_counts[trigger.id] < self.alert_rule.threshold_period:
        return
    incident_trigger = self.incident_triggers[trigger.id]
    incident_trigger.status = TriggerStatus.RESOLVED.value
    incident_trigger.save()
    if self.check_triggers_resolved():
        update_incident_status(self.active_incident, IncidentStatus.CLOSED)
        self.active_incident = None
        self.incident_triggers.clear()
    # Reset so the next resolution must satisfy the period again.
    self.trigger_resolve_counts[trigger.id] = 0
def run_test(
    self,
    incident,
    status,
    expected_date_closed,
    user=None,
    comment=None,
):
    """Apply a status update and verify incident state plus the resulting
    activity entry (without reloading the incident from the DB)."""
    old_status = incident.status
    update_incident_status(incident, status, user=user, comment=comment)

    assert incident.status == status.value
    assert incident.date_closed == expected_date_closed

    # The update must have produced a STATUS_CHANGE activity record.
    activity = self.get_most_recent_incident_activity(incident)
    assert activity.type == IncidentActivityType.STATUS_CHANGE.value
    assert activity.user == user
    assert activity.value == six.text_type(status.value)
    assert activity.previous_value == six.text_type(old_status)
    assert activity.comment == comment
    assert activity.event_stats_snapshot is None
def test(self):
    """Bulk incident stats must match the single-incident helpers; only the
    first incident's window is widened by the prewindow."""
    closed_incident = create_incident(
        self.organization,
        IncidentType.ALERT_TRIGGERED,
        "Closed",
        "",
        QueryAggregations.TOTAL,
        groups=[self.group],
        date_started=timezone.now() - timedelta(days=30),
    )
    update_incident_status(closed_incident, IncidentStatus.CLOSED)
    open_incident = create_incident(
        self.organization,
        IncidentType.ALERT_TRIGGERED,
        "Open",
        "",
        QueryAggregations.TOTAL,
        groups=[self.group],
        date_started=timezone.now() - timedelta(days=30),
    )

    incidents = [closed_incident, open_incident]
    for index, (incident, stats) in enumerate(
            zip(incidents, bulk_get_incident_stats(incidents))):
        event_stats = get_incident_event_stats(incident)
        bulk_event_stats = stats["event_stats"]
        assert bulk_event_stats.data["data"] == event_stats.data["data"]

        expected_start = bulk_event_stats.start
        expected_end = bulk_event_stats.end
        if index == 0:
            # Only the first incident gets the prewindow adjustment.
            expected_start = expected_start - calculate_incident_prewindow(
                expected_start, expected_end, incident)
        assert event_stats.start == expected_start
        assert event_stats.end == expected_end
        assert bulk_event_stats.rollup == event_stats.rollup

        aggregates = get_incident_aggregates(incident)
        assert stats["total_events"] == aggregates["count"]
        assert stats["unique_users"] == aggregates["unique_users"]
def run_test(
    self,
    incident,
    status,
    expected_date_closed,
    user=None,
    comment=None,
):
    """Apply a status update, reload the incident, and verify persisted state,
    subscription side effects, and the activity record."""
    old_status = incident.status
    update_incident_status(incident, status, user=user, comment=comment)

    # Reload to observe what was actually persisted.
    incident = Incident.objects.get(id=incident.id)
    assert incident.status == status.value
    assert incident.date_closed == expected_date_closed

    activity = self.get_most_recent_incident_activity(incident)
    assert activity.type == IncidentActivityType.STATUS_CHANGE.value
    assert activity.user == user
    if user:
        # Acting on an incident subscribes the user to it.
        assert IncidentSubscription.objects.filter(incident=incident,
                                                   user=user).exists()
    assert activity.value == six.text_type(status.value)
    assert activity.previous_value == six.text_type(old_status)
    assert activity.comment == comment
    assert activity.event_stats_snapshot is None
def put(self, request, organization, incident):
    """Update an incident's status.

    Only the CLOSED transition is permitted through this endpoint; any other
    requested status is rejected with a 400.
    """
    serializer = IncidentSerializer(data=request.data)
    if not serializer.is_valid():
        return Response(serializer.errors, status=400)

    result = serializer.validated_data
    if result["status"] != IncidentStatus.CLOSED:
        return Response("Status cannot be changed.", status=400)

    incident = update_incident_status(
        incident=incident,
        status=result["status"],
        user=request.user,
        comment=result.get("comment"),
    )
    return Response(
        serialize(incident, request.user, DetailedIncidentSerializer()),
        status=200,
    )
def put(self, request, organization, incident):
    """Update an incident's status (legacy DRF 2.x-style endpoint).

    Returns the serialized incident on success, or a 400 when validation
    fails or the incident is already in the requested status.

    Fix: the original did ``raise Response(...)`` in the
    ``StatusAlreadyChangedError`` branch. DRF's ``Response`` is not an
    exception class, so that line raised ``TypeError`` (500) instead of
    returning the intended 400. Changed to ``return``.
    """
    serializer = IncidentSerializer(data=request.DATA)
    if serializer.is_valid():
        result = serializer.object
        try:
            incident = update_incident_status(
                incident=incident,
                status=result['status'],
                user=request.user,
                comment=result.get('comment'),
            )
        except StatusAlreadyChangedError:
            # No-op transition: report it to the client as a bad request.
            return Response(
                'Status is already set to {}'.format(result['status']),
                status=400,
            )
        return Response(serialize(incident, request.user), status=200)
    return Response(serializer.errors, status=400)
def put(self, request, organization, incident):
    """Update an incident's status, returning 400 if the status is already set."""
    serializer = IncidentSerializer(data=request.data)
    if not serializer.is_valid():
        return Response(serializer.errors, status=400)

    result = serializer.validated_data
    try:
        incident = update_incident_status(
            incident=incident,
            status=result["status"],
            user=request.user,
            comment=result.get("comment"),
        )
    except StatusAlreadyChangedError:
        # No-op transition is reported to the client as a bad request.
        return Response(
            "Status is already set to {}".format(result["status"]), status=400)
    return Response(
        serialize(incident, request.user, DetailedIncidentSerializer()),
        status=200,
    )
def process_update(self, subscription_update):
    """Process one Snuba subscription update against this alert rule.

    Skips updates when the org's feature flag has been removed, when the rule
    no longer exists, or when the update is older than the last one seen.
    Otherwise compares the aggregated value against each trigger's alert
    threshold and the rule-level resolve threshold, firing or resolving as
    the per-trigger/rule counters cross `threshold_period`.
    """
    dataset = self.subscription.snuba_query.dataset
    if dataset == "events" and not features.has(
            "organizations:incidents",
            self.subscription.project.organization):
        # They have downgraded since these subscriptions have been created. So we just ignore updates for now.
        metrics.incr(
            "incidents.alert_rules.ignore_update_missing_incidents")
        return
    elif dataset == "transactions" and not features.has(
            "organizations:performance-view",
            self.subscription.project.organization):
        # They have downgraded since these subscriptions have been created. So we just ignore updates for now.
        metrics.incr(
            "incidents.alert_rules.ignore_update_missing_incidents_performance"
        )
        return

    if not hasattr(self, "alert_rule"):
        # If the alert rule has been removed then just skip
        metrics.incr(
            "incidents.alert_rules.no_alert_rule_for_subscription")
        logger.error(
            "Received an update for a subscription, but no associated alert rule exists"
        )
        # TODO: Delete subscription here.
        return

    # Guard against out-of-order or duplicate delivery of updates.
    if subscription_update["timestamp"] <= self.last_update:
        metrics.incr(
            "incidents.alert_rules.skipping_already_processed_update")
        return

    self.last_update = subscription_update["timestamp"]

    if len(subscription_update["values"]["data"]) > 1:
        logger.warning(
            "Subscription returned more than 1 row of data",
            extra={
                "subscription_id": self.subscription.id,
                "dataset": self.subscription.snuba_query.dataset,
                "snuba_subscription_id": self.subscription.subscription_id,
                "result": subscription_update,
            },
        )
    # Single aggregate value from the first (only expected) result row.
    # NOTE: `.values()[0]` indexes dict values directly — Python 2 semantics;
    # presumably this file predates the py3 migration (confirm).
    aggregation_value = subscription_update["values"]["data"][0].values(
    )[0]

    alert_operator, resolve_operator = self.THRESHOLD_TYPE_OPERATORS[
        AlertRuleThresholdType(self.alert_rule.threshold_type)]

    for trigger in self.triggers:
        if alert_operator(
                aggregation_value,
                trigger.alert_threshold) and not self.check_trigger_status(
                    trigger, TriggerStatus.ACTIVE):
            metrics.incr("incidents.alert_rules.threshold",
                         tags={"type": "alert"})
            with transaction.atomic():
                self.trigger_alert_threshold(trigger, aggregation_value)
        else:
            # Condition broke; the consecutive-update count starts over.
            self.trigger_alert_counts[trigger.id] = 0

    if (resolve_operator(aggregation_value,
                         self.calculate_resolve_threshold())
            and self.active_incident):
        self.rule_resolve_counts += 1
        if self.rule_resolve_counts >= self.alert_rule.threshold_period:
            # TODO: Make sure we iterate over critical then warning in order.
            # Potentially also de-dupe actions that are identical. Maybe just
            # collect all actions, de-dupe and resolve all at once
            metrics.incr("incidents.alert_rules.threshold",
                         tags={"type": "resolve"})
            for trigger in self.triggers:
                if self.check_trigger_status(trigger,
                                             TriggerStatus.ACTIVE):
                    with transaction.atomic():
                        self.trigger_resolve_threshold(
                            trigger, aggregation_value)
            update_incident_status(
                self.active_incident,
                IncidentStatus.CLOSED,
                status_method=IncidentStatusMethod.RULE_TRIGGERED,
                # Close at the event's timestamp rather than processing time.
                date_closed=self.calculate_event_date_from_update_date(
                    self.last_update),
            )
            self.active_incident = None
            self.incident_triggers.clear()
            self.rule_resolve_counts = 0
    else:
        self.rule_resolve_counts = 0
    # We update the rule stats here after we commit the transaction. This guarantees
    # that we'll never miss an update, since we'll never roll back if the process
    # is killed here. The trade-off is that we might process an update twice. Mostly
    # this will have no effect, but if someone manages to close a triggered incident
    # before the next one then we might alert twice.
    self.update_alert_rule_stats()
def process_update(self, subscription_update):
    """Process one Snuba subscription update against this alert rule.

    Skips updates for deleted projects, downgraded orgs, missing rules, and
    stale timestamps. Otherwise evaluates trigger alert thresholds and the
    rule-level resolve threshold inside a single transaction, collecting the
    incident triggers that changed so their actions fire exactly once at the
    end.
    """
    dataset = self.subscription.snuba_query.dataset
    try:
        # Check that the project exists
        self.subscription.project
    except Project.DoesNotExist:
        metrics.incr("incidents.alert_rules.ignore_deleted_project")
        return
    if dataset == "events" and not features.has(
            "organizations:incidents",
            self.subscription.project.organization):
        # They have downgraded since these subscriptions have been created. So we just ignore updates for now.
        metrics.incr(
            "incidents.alert_rules.ignore_update_missing_incidents")
        return
    elif dataset == "transactions" and not features.has(
            "organizations:performance-view",
            self.subscription.project.organization):
        # They have downgraded since these subscriptions have been created. So we just ignore updates for now.
        metrics.incr(
            "incidents.alert_rules.ignore_update_missing_incidents_performance"
        )
        return

    if not hasattr(self, "alert_rule"):
        # If the alert rule has been removed then just skip
        metrics.incr(
            "incidents.alert_rules.no_alert_rule_for_subscription")
        logger.error(
            "Received an update for a subscription, but no associated alert rule exists"
        )
        # TODO: Delete subscription here.
        return

    # Guard against out-of-order or duplicate delivery of updates.
    if subscription_update["timestamp"] <= self.last_update:
        metrics.incr(
            "incidents.alert_rules.skipping_already_processed_update")
        return

    self.last_update = subscription_update["timestamp"]

    if len(subscription_update["values"]["data"]) > 1:
        logger.warning(
            "Subscription returned more than 1 row of data",
            extra={
                "subscription_id": self.subscription.id,
                "dataset": self.subscription.snuba_query.dataset,
                "snuba_subscription_id": self.subscription.subscription_id,
                "result": subscription_update,
            },
        )
    aggregation_value = list(
        subscription_update["values"]["data"][0].values())[0]
    # In some cases Snuba can return a None value for an aggregation. This means
    # there were no rows present when we made the query, for certain types of
    # aggregations like avg. Defaulting this to 0 for now. It might turn out that
    # we'd prefer to skip the update in the future.
    if aggregation_value is None:
        aggregation_value = 0
    alert_operator, resolve_operator = self.THRESHOLD_TYPE_OPERATORS[
        AlertRuleThresholdType(self.alert_rule.threshold_type)]

    # Triggers whose state changed this update; their actions run once, below.
    fired_incident_triggers = []
    with transaction.atomic():
        for trigger in self.triggers:
            if alert_operator(aggregation_value, trigger.alert_threshold
                              ) and not self.check_trigger_status(
                                  trigger, TriggerStatus.ACTIVE):
                metrics.incr("incidents.alert_rules.threshold",
                             tags={"type": "alert"})
                incident_trigger = self.trigger_alert_threshold(
                    trigger, aggregation_value)
                if incident_trigger is not None:
                    fired_incident_triggers.append(incident_trigger)
            else:
                # Condition broke; the consecutive-update count starts over.
                self.trigger_alert_counts[trigger.id] = 0

        if (resolve_operator(aggregation_value,
                             self.calculate_resolve_threshold())
                and self.active_incident):
            self.rule_resolve_counts += 1
            if self.rule_resolve_counts >= self.alert_rule.threshold_period:
                # TODO: Make sure we iterate over critical then warning in order.
                metrics.incr("incidents.alert_rules.threshold",
                             tags={"type": "resolve"})
                for trigger in self.triggers:
                    if self.check_trigger_status(trigger,
                                                 TriggerStatus.ACTIVE):
                        incident_trigger = self.trigger_resolve_threshold(
                            trigger, aggregation_value)
                        if incident_trigger is not None:
                            fired_incident_triggers.append(
                                incident_trigger)
                update_incident_status(
                    self.active_incident,
                    IncidentStatus.CLOSED,
                    status_method=IncidentStatusMethod.RULE_TRIGGERED,
                    # Close at the event's timestamp rather than processing time.
                    date_closed=self.calculate_event_date_from_update_date(
                        self.last_update),
                )
                self.active_incident = None
                self.incident_triggers.clear()
                self.rule_resolve_counts = 0
        else:
            self.rule_resolve_counts = 0

        if fired_incident_triggers:
            self.handle_trigger_actions(fired_incident_triggers,
                                        aggregation_value)

    # We update the rule stats here after we commit the transaction. This guarantees
    # that we'll never miss an update, since we'll never roll back if the process
    # is killed here. The trade-off is that we might process an update twice. Mostly
    # this will have no effect, but if someone manages to close a triggered incident
    # before the next one then we might alert twice.
    self.update_alert_rule_stats()
def test_status_already_set(self):
    """Re-applying the current status leaves the incident unchanged."""
    incident = self.create_incident(status=IncidentStatus.WARNING.value)
    # Same status again: must be a no-op rather than an error.
    update_incident_status(incident, IncidentStatus.WARNING)
    assert incident.status == IncidentStatus.WARNING.value
def test(self):
    """Closing a rule-attached incident should fire the "resolve" action path."""
    rule = self.create_alert_rule()
    incident = self.create_incident(alert_rule=rule)
    update_incident_status(incident, IncidentStatus.CLOSED)
    self.run_test(incident, "resolve")
def test(self):
    """Closing a plain incident should fire the "resolve" action path."""
    closed = self.create_incident()
    update_incident_status(closed, IncidentStatus.CLOSED)
    self.run_test(closed, "resolve")
def test_status_already_set(self):
    """Setting the status an incident already has raises StatusAlreadyChangedError."""
    incident = self.create_incident(status=IncidentStatus.OPEN.value)
    with self.assertRaises(StatusAlreadyChangedError):
        # OPEN -> OPEN is rejected rather than treated as a no-op here.
        update_incident_status(incident, IncidentStatus.OPEN)