def delete_old_primary_hash(event):
    """
    In case the primary hash changed during reprocessing, we need to tell
    Snuba before reinserting the event. Snuba may then insert a tombstone row
    depending on whether the primary_hash is part of the PK/sortkey or not.

    Only when the primary_hash changed and is part of the sortkey do we need
    to explicitly tombstone the old row. If the primary_hash is not part of
    the PK/sortkey, or if the primary_hash did not change, nothing needs to
    be done as ClickHouse's table merge will merge the two rows together.
    """
    old_primary_hash = get_path(event.data, "contexts", "reprocessing", "original_primary_hash")

    if old_primary_hash is not None and old_primary_hash != event.get_primary_hash():
        from sentry import eventstream

        eventstream.tombstone_events_unsafe(
            event.project_id,
            [event.event_id],
            old_primary_hash=old_primary_hash,
        )

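# For reference, `get_path` (from sentry.utils.safe) performs a None-safe
# nested lookup into the event payload. Below is a minimal sketch of the
# behavior relied on above, assuming plain dict traversal only; the real
# helper also handles lists and a `filter` keyword.
def _get_path_sketch(data, *path):
    for key in path:
        if not isinstance(data, dict):
            return None
        data = data.get(key)
        if data is None:
            return None
    return data


# e.g. _get_path_sketch(event.data, "contexts", "reprocessing", "original_primary_hash")
# returns None instead of raising if any intermediate key is missing.
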
def handle_remaining_events(
    project_id, new_group_id, event_ids, remaining_events, from_timestamp, to_timestamp
):
    """
    Delete or merge/move associated per-event data: nodestore, event
    attachments, user reports. Mark the event as "tombstoned" in Snuba.

    This is not full event deletion. Snuba can still only delete entire
    groups, however we must only run this task for event IDs that we don't
    intend to reuse for reprocessed events. An event ID that has been
    tombstoned once cannot be inserted over in eventstream.

    See doc comment in sentry.reprocessing2.
    """
    from sentry import buffer
    from sentry.models.group import Group
    from sentry.reprocessing2 import EVENT_MODELS_TO_MIGRATE

    assert remaining_events in ("delete", "keep")

    if remaining_events == "delete":
        for cls in EVENT_MODELS_TO_MIGRATE:
            cls.objects.filter(project_id=project_id, event_id__in=event_ids).delete()

        # Remove from nodestore
        node_ids = [Event.generate_node_id(project_id, event_id) for event_id in event_ids]
        nodestore.delete_multi(node_ids)

        # Tell Snuba to delete the event data.
        eventstream.tombstone_events_unsafe(
            project_id, event_ids, from_timestamp=from_timestamp, to_timestamp=to_timestamp
        )
    elif remaining_events == "keep":
        for cls in EVENT_MODELS_TO_MIGRATE:
            cls.objects.filter(project_id=project_id, event_id__in=event_ids).update(
                group_id=new_group_id
            )

        eventstream.replace_group_unsafe(
            project_id,
            event_ids,
            new_group_id=new_group_id,
            from_timestamp=from_timestamp,
            to_timestamp=to_timestamp,
        )

        buffer.incr(Group, {"times_seen": len(event_ids)}, {"id": new_group_id})
    else:
        raise ValueError(f"Invalid value for remaining_events: {remaining_events}")

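# The loop above iterates EVENT_MODELS_TO_MIGRATE from sentry.reprocessing2.
# Judging from the variant below, which deletes the same per-event rows
# explicitly, the constant is assumed to be roughly equivalent to:
#
#     EVENT_MODELS_TO_MIGRATE = (models.EventAttachment, models.UserReport)
#
# i.e. the per-event models (attachments, user reports) that must follow an
# event when it is deleted or moved to a new group. This is illustrative,
# not the authoritative definition.
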
def handle_remaining_events(
    project_id, new_group_id, event_ids, remaining_events, from_timestamp, to_timestamp
):
    """
    Delete or merge/move associated per-event data: nodestore, event
    attachments, user reports. Mark the event as "tombstoned" in Snuba.

    This is not full event deletion. Snuba can still only delete entire
    groups, however we must only run this task for event IDs that we don't
    intend to reuse for reprocessed events. An event ID that has been
    tombstoned once cannot be inserted over in eventstream.

    See doc comment in sentry.reprocessing2.
    """
    assert remaining_events in ("delete", "keep")

    if remaining_events == "delete":
        models.EventAttachment.objects.filter(
            project_id=project_id, event_id__in=event_ids
        ).delete()
        models.UserReport.objects.filter(project_id=project_id, event_id__in=event_ids).delete()

        # Remove from nodestore
        node_ids = [Event.generate_node_id(project_id, event_id) for event_id in event_ids]
        nodestore.delete_multi(node_ids)

        # Tell Snuba to delete the event data.
        eventstream.tombstone_events_unsafe(
            project_id, event_ids, from_timestamp=from_timestamp, to_timestamp=to_timestamp
        )
    elif remaining_events == "keep":
        eventstream.replace_group_unsafe(
            project_id,
            event_ids,
            new_group_id=new_group_id,
            from_timestamp=from_timestamp,
            to_timestamp=to_timestamp,
        )
    else:
        raise ValueError(f"Invalid value for remaining_events: {remaining_events}")

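# For context: nodestore keys are derived deterministically from
# (project_id, event_id), which is why per-event payloads can be deleted
# without any lookup. A sketch of the assumed derivation behind
# Event.generate_node_id (illustrative; the real method lives on the Event
# model):
import hashlib


def _generate_node_id_sketch(project_id, event_id):
    # "project_id:event_id" is globally unique, so its digest is a stable key.
    return hashlib.md5(f"{project_id}:{event_id}".encode("utf-8")).hexdigest()
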
def handle_remaining_events(
    project_id,
    new_group_id,
    remaining_events,
    # TODO(markus): Should be mandatory arguments.
    event_ids_redis_key=None,
    old_group_id=None,
    # TODO(markus): Deprecated arguments, can remove in next version.
    event_ids=None,
    from_timestamp=None,
    to_timestamp=None,
):
    """
    Delete or merge/move associated per-event data: nodestore, event
    attachments, user reports. Mark the event as "tombstoned" in Snuba.

    This is not full event deletion. Snuba can still only delete entire
    groups, however we must only run this task for event IDs that we don't
    intend to reuse for reprocessed events. An event ID that has been
    tombstoned once cannot be inserted over in eventstream.

    See doc comment in sentry.reprocessing2.
    """
    from sentry import buffer
    from sentry.models.group import Group
    from sentry.reprocessing2 import EVENT_MODELS_TO_MIGRATE, pop_batched_events_from_redis

    if event_ids_redis_key is not None:
        event_ids, from_timestamp, to_timestamp = pop_batched_events_from_redis(
            event_ids_redis_key
        )

    metrics.timing(
        "events.reprocessing.handle_remaining_events.batch_size",
        len(event_ids),
        sample_rate=1.0,
    )

    assert remaining_events in ("delete", "keep")

    if remaining_events == "delete":
        for cls in EVENT_MODELS_TO_MIGRATE:
            cls.objects.filter(project_id=project_id, event_id__in=event_ids).delete()

        # Remove from nodestore
        node_ids = [Event.generate_node_id(project_id, event_id) for event_id in event_ids]
        nodestore.delete_multi(node_ids)

        # Tell Snuba to delete the event data.
        eventstream.tombstone_events_unsafe(
            project_id, event_ids, from_timestamp=from_timestamp, to_timestamp=to_timestamp
        )
    elif remaining_events == "keep":
        for cls in EVENT_MODELS_TO_MIGRATE:
            cls.objects.filter(project_id=project_id, event_id__in=event_ids).update(
                group_id=new_group_id
            )

        eventstream.replace_group_unsafe(
            project_id,
            event_ids,
            new_group_id=new_group_id,
            from_timestamp=from_timestamp,
            to_timestamp=to_timestamp,
        )

        buffer.incr(Group, {"times_seen": len(event_ids)}, {"id": new_group_id})
    else:
        raise ValueError(f"Invalid value for remaining_events: {remaining_events}")

    if old_group_id is not None:
        from sentry.reprocessing2 import mark_event_reprocessed

        mark_event_reprocessed(
            group_id=old_group_id, project_id=project_id, num_events=len(event_ids)
        )

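# A minimal sketch of what pop_batched_events_from_redis is assumed to do,
# inferred from the entry format pushed in buffered_delete_old_primary_hash
# below (each list item is "<unix timestamp>;<event id>"). The real
# implementation lives in sentry.reprocessing2; this is illustrative only.
from datetime import datetime as _datetime, timezone as _timezone


def _pop_batched_events_sketch(client, key):
    # Drain the list and delete the key in one round trip.
    pipeline = client.pipeline()
    pipeline.lrange(key, 0, -1)
    pipeline.delete(key)
    entries, _ = pipeline.execute()

    event_ids = []
    from_date = to_date = None
    for entry in entries:
        # redis-py returns bytes unless decode_responses=True is set.
        if isinstance(entry, bytes):
            entry = entry.decode("utf-8")
        ts_str, _sep, event_id = entry.partition(";")
        ts = _datetime.fromtimestamp(float(ts_str), tz=_timezone.utc)
        event_ids.append(event_id)
        from_date = ts if from_date is None else min(from_date, ts)
        to_date = ts if to_date is None else max(to_date, ts)

    return event_ids, from_date, to_date
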
def buffered_delete_old_primary_hash(
    project_id,
    group_id,
    event_id=None,
    datetime=None,
    old_primary_hash=None,
    current_primary_hash=None,
    force_flush_batch: bool = False,
):
    """
    In case the primary hash changed during reprocessing, we need to tell
    Snuba before reinserting the event. Snuba may then insert a tombstone row
    depending on whether the primary_hash is part of the PK/sortkey or not.

    Only when the primary_hash changed and is part of the sortkey do we need
    to explicitly tombstone the old row. If the primary_hash is not part of
    the PK/sortkey, or if the primary_hash did not change, nothing needs to
    be done as ClickHouse's table merge will merge the two rows together.

    Like `buffered_handle_remaining_events`, this is a quick-and-dirty way to
    batch event IDs so that tombstone requests are not sent to Snuba one at a
    time. It also shares the same constraint for optimal performance: events
    fed into it should have datetimes as close to each other as possible.
    Unfortunately, this function is invoked by tasks that run asynchronously,
    so the guarantee from `buffered_handle_remaining_events` that events are
    sorted by timestamp does not apply here.

    This function also does not batch events with different old primary
    hashes into one operation. This means that if the incoming data tends to
    have a 1:1 ratio of events to old primary hashes, the buffering here
    effectively does nothing.
    """
    from sentry import killswitches

    if killswitches.killswitch_matches_context(
        "reprocessing2.drop-delete-old-primary-hash", {"project_id": project_id}
    ):
        return

    client = _get_sync_redis_client()

    # This is a meta key that contains old primary hashes. These hashes are
    # then combined with other values to construct a key that points to a
    # list of tombstonable events.
    primary_hash_set_key = f"re2:tombstone-primary-hashes:{project_id}:{group_id}"
    old_primary_hashes = client.smembers(primary_hash_set_key)

    def build_event_key(primary_hash):
        return f"re2:tombstones:{{{project_id}:{group_id}:{primary_hash}}}"

    if old_primary_hash is not None and old_primary_hash != current_primary_hash:
        event_key = build_event_key(old_primary_hash)
        client.lpush(event_key, f"{to_timestamp(datetime)};{event_id}")
        client.expire(event_key, settings.SENTRY_REPROCESSING_SYNC_TTL)

        if old_primary_hash not in old_primary_hashes:
            old_primary_hashes.add(old_primary_hash)
            client.sadd(primary_hash_set_key, old_primary_hash)
            client.expire(primary_hash_set_key, settings.SENTRY_REPROCESSING_SYNC_TTL)

    # Events for a group are split and bucketed by their primary hashes. If
    # flushing is to be performed on a per-group basis, the event count needs
    # to be summed up across all buckets belonging to a single group.
    event_count = 0
    for primary_hash in old_primary_hashes:
        key = build_event_key(primary_hash)
        event_count += client.llen(key)

    if force_flush_batch or event_count > settings.SENTRY_REPROCESSING_REMAINING_EVENTS_BUF_SIZE:
        with sentry_sdk.start_span(
            op="sentry.reprocessing2.buffered_delete_old_primary_hash.flush_events"
        ):
            for primary_hash in old_primary_hashes:
                event_key = build_event_key(primary_hash)
                event_ids, from_date, to_date = pop_batched_events_from_redis(event_key)

                # Racing might be happening between two different tasks. Give
                # up on the task that's lagging behind by prematurely
                # terminating flushing.
                if len(event_ids) == 0:
                    with sentry_sdk.configure_scope() as scope:
                        scope.set_tag("project_id", project_id)
                        scope.set_tag("old_group_id", group_id)
                        scope.set_tag("old_primary_hash", old_primary_hash)

                    logger.error("reprocessing2.buffered_delete_old_primary_hash.empty_batch")
                    return

                from sentry import eventstream

                # In the worst-case scenario, a group will have a 1:1 mapping
                # of primary hashes to events, which means one insert per
                # event. The overall performance of this will be marginally
                # better than the unbatched version if a group has a lot of
                # old primary hashes.
                eventstream.tombstone_events_unsafe(
                    project_id,
                    event_ids,
                    old_primary_hash=old_primary_hash,
                    from_timestamp=from_date,
                    to_timestamp=to_date,
                )

        # Track counts so that if tombstoned events trend towards a ratio of
        # one event per hash, a different solution can be considered.
        ratio = 0 if len(old_primary_hashes) == 0 else event_count / len(old_primary_hashes)
        metrics.timing(
            key="reprocessing2.buffered_delete_old_primary_hash.event_count",
            value=event_count,
        )
        metrics.timing(
            key="reprocessing2.buffered_delete_old_primary_hash.primary_hash_count",
            value=len(old_primary_hashes),
        )
        metrics.timing(
            key="reprocessing2.buffered_delete_old_primary_hash.primary_hash_to_event_ratio",
            value=ratio,
        )

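# Hypothetical usage: callers buffer one tombstone per reprocessed event and
# force a flush once the last event of the old group has been handled. The
# helper below and its argument values are illustrative, not part of the
# real task code.
def _example_buffered_tombstoning(project_id, group_id, events):
    for event in events:
        buffered_delete_old_primary_hash(
            project_id=project_id,
            group_id=group_id,
            event_id=event.event_id,
            datetime=event.datetime,
            old_primary_hash=get_path(
                event.data, "contexts", "reprocessing", "original_primary_hash"
            ),
            current_primary_hash=event.get_primary_hash(),
        )

    # Drain all per-hash buckets even if the buffer threshold was never hit.
    buffered_delete_old_primary_hash(
        project_id=project_id, group_id=group_id, force_flush_batch=True
    )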