def should_demote_symbolication(project_id: int) -> bool:
    """
    Decide whether a project's symbolication events belong on the low priority queue (LPQ).

    The checks are applied in this order and the first one that matches wins:

    1. The ``store.symbolicate-event-lpq-never`` killswitch matches the project
       -> normal queue.
    2. The ``store.symbolicate-event-lpq-always`` killswitch matches the project
       -> low priority queue.
    3. ``realtime_metrics`` has selected the project for the LPQ
       -> low priority queue.

    Step 3 is only consulted when the ``SENTRY_ENABLE_AUTO_LOW_PRIORITY_QUEUE`` setting
    is truthy.
    """

    def _killswitch_matches(option_name):
        # Every lookup gets its own context dict, keyed on the project only.
        return killswitch_matches_context(option_name, {"project_id": project_id})

    # Both killswitches are always evaluated, "always" before "never".
    forced_low_priority = _killswitch_matches("store.symbolicate-event-lpq-always")
    forced_normal_priority = _killswitch_matches("store.symbolicate-event-lpq-never")

    # "never" takes precedence over "always".
    if forced_normal_priority:
        return False
    if forced_low_priority:
        return True

    return settings.SENTRY_ENABLE_AUTO_LOW_PRIORITY_QUEUE and realtime_metrics.is_lpq_project(
        project_id
    )
def should_demote_symbolication(project_id):
    """
    Decide whether a project's symbolication events belong on the low priority queue.

    Only the two per-project killswitches are consulted: a match on
    ``store.symbolicate-event-lpq-never`` always yields ``False``; otherwise the
    result of the ``store.symbolicate-event-lpq-always`` lookup is returned.
    """

    def _killswitch_matches(option_name):
        # Every lookup gets its own context dict, keyed on the project only.
        return killswitch_matches_context(option_name, {"project_id": project_id})

    # Both killswitches are always evaluated, "always" before "never".
    forced_low_priority = _killswitch_matches("store.symbolicate-event-lpq-always")
    forced_normal_priority = _killswitch_matches("store.symbolicate-event-lpq-never")

    if forced_normal_priority:
        return False
    return forced_low_priority
def _do_process_event(
    cache_key,
    start_time,
    event_id,
    process_task,
    data=None,
    data_has_changed=None,
    from_symbolicate=False,
):
    """
    Run the processing stage for one event and hand it over to saving.

    The payload is loaded from ``event_processing_store`` when ``data`` is not
    passed in. It then goes through stacktrace processing, an optional data
    scrubbing pass, and the preprocessors of all version-2 plugins. If anything
    changed, the payload is re-normalized and written back to the store. The
    function ends by calling ``submit_save_event``, unless it returned early.

    :param cache_key: key of the payload in ``event_processing_store``; replaced
        by the key returned from ``store()`` when the payload changed.
    :param start_time: forwarded unchanged to ``create_failed_event`` and
        ``submit_save_event``.
    :param event_id: overwritten with ``data["event_id"]`` once the payload is
        available.
    :param process_task: the task this call runs on behalf of; compared by
        identity against ``process_event_from_reprocessing``.
    :param data: event payload, or ``None`` to fetch it via ``cache_key``.
    :param data_has_changed: initial value of the "payload changed" flag
        (coerced with ``bool``).
    :param from_symbolicate: only used as a tag on metrics and spans.
    """
    from sentry.plugins.base import plugins

    if data is None:
        data = event_processing_store.get(cache_key)

    # Still nothing after the store lookup: record the failure and give up.
    if data is None:
        metrics.incr("events.failed", tags={
            "reason": "cache",
            "stage": "process"
        }, skip_internal=False)
        error_logger.error("process.failed.empty", extra={"cache_key": cache_key})
        return

    data = CanonicalKeyDict(data)

    project_id = data["project"]
    set_current_event_project(project_id)

    event_id = data["event_id"]

    # Load shedding: silently drop the event when the killswitch matches.
    if killswitch_matches_context(
        "store.load-shed-process-event-projects",
        {
            "project_id": project_id,
            "event_id": event_id,
            "platform": data.get("platform") or "null",
        },
    ):
        return

    with sentry_sdk.start_span(
            op="tasks.store.process_event.get_project_from_cache"):
        project = Project.objects.get_from_cache(id=project_id)

    with metrics.timer(
            "tasks.store.process_event.organization.get_from_cache"):
        project._organization_cache = Organization.objects.get_from_cache(
            id=project.organization_id)

    has_changed = bool(data_has_changed)

    with sentry_sdk.start_span(
            op="tasks.store.process_event.get_reprocessing_revision"):
        # Fetch the reprocessing revision
        reprocessing_rev = reprocessing.get_reprocessing_revision(project_id)

    # Stacktrace based event processors.
    with sentry_sdk.start_span(op="task.store.process_event.stacktraces"):
        with metrics.timer("tasks.store.process_event.stacktraces",
                           tags={"from_symbolicate": from_symbolicate}):
            new_data = process_stacktraces(data)

    # A non-None result replaces the payload.
    if new_data is not None:
        has_changed = True
        data = new_data

    # Second round of datascrubbing after stacktrace and language-specific
    # processing. First round happened as part of ingest.
    #
    # *Right now* the only sensitive data that is added in stacktrace
    # processing are usernames in filepaths, so we run directly after
    # stacktrace processors.
    #
    # We do not yet want to deal with context data produced by plugins like
    # sessionstack or fullstory (which are in `get_event_preprocessors`), as
    # this data is very unlikely to be sensitive data. This is why scrubbing
    # happens somewhere in the middle of the pipeline.
    #
    # On the other hand, Javascript event error translation is happening after
    # this block because it uses `get_event_preprocessors` instead of
    # `get_event_enhancers`.
    #
    # We are fairly confident, however, that this should run *before*
    # re-normalization as it is hard to find sensitive data in partially
    # trimmed strings.
    if has_changed and options.get("processing.can-use-scrubbers"):
        with sentry_sdk.start_span(op="task.store.datascrubbers.scrub"):
            with metrics.timer("tasks.store.datascrubbers.scrub",
                               tags={"from_symbolicate": from_symbolicate}):
                new_data = safe_execute(scrub_data, project=project, event=data.data)

        # XXX(markus): When datascrubbing is finally "totally stable", we might want
        # to drop the event if it crashes to avoid saving PII
        if new_data is not None:
            data.data = new_data

    # TODO(dcramer): ideally we would know if data changed by default
    # Default event processors.
    for plugin in plugins.all(version=2):
        with sentry_sdk.start_span(
                op="task.store.process_event.preprocessors") as span:
            span.set_data("plugin", plugin.slug)
            span.set_data("from_symbolicate", from_symbolicate)
            with metrics.timer(
                "tasks.store.process_event.preprocessors",
                tags={
                    "plugin": plugin.slug,
                    "from_symbolicate": from_symbolicate
                },
            ):
                processors = safe_execute(plugin.get_event_preprocessors,
                                          data=data, _with_transaction=False)
                # `processors` may be falsy; fall back to an empty tuple.
                for processor in processors or ():
                    try:
                        result = processor(data)
                    except Exception:
                        # A failing preprocessor does not abort the pipeline;
                        # the event is only flagged.
                        error_logger.exception(
                            "tasks.store.preprocessors.error")
                        data.setdefault("_metrics", {})["flag.processing.error"] = True
                        has_changed = True
                    else:
                        # A truthy return value replaces the payload.
                        if result:
                            data = result
                            has_changed = True

    assert data[
        "project"] == project_id, "Project cannot be mutated by plugins"

    # We cannot persist canonical types in the cache, so we need to
    # downgrade this.
    if isinstance(data, CANONICAL_TYPES):
        data = dict(data.items())

    if has_changed:
        # Run some of normalization again such that we don't:
        # - persist e.g. incredibly large stacktraces from minidumps
        # - store event timestamps that are older than our retention window
        #   (also happening with minidumps)
        normalizer = StoreNormalizer(remove_other=False, is_renormalize=True,
                                     **DEFAULT_STORE_NORMALIZER_ARGS)
        data = normalizer.normalize_event(dict(data))

        issues = data.get("processing_issues")

        try:
            # A truthy result from `create_failed_event` ends processing here.
            if issues and create_failed_event(
                cache_key,
                data,
                project_id,
                list(issues.values()),
                event_id=event_id,
                start_time=start_time,
                reprocessing_rev=reprocessing_rev,
            ):
                return
        except RetryProcessing:
            # If `create_failed_event` indicates that we need to retry we
            # restart from the preprocessing step. This happens when the
            # reprocessing revision changed while we were processing.
            _do_preprocess_event(cache_key, data, start_time, event_id, process_task, project)
            return

        # Persist the changed payload; the store hands back the key to use from now on.
        cache_key = event_processing_store.store(data)

    from_reprocessing = process_task is process_event_from_reprocessing
    submit_save_event(project, from_reprocessing, cache_key, event_id, start_time, data)
def _do_symbolicate_event(cache_key, start_time, event_id, symbolicate_task, data=None):
    """
    Symbolicate one event and pass it on to ``_do_process_event``.

    The payload is loaded from ``event_processing_store`` when ``data`` is not
    passed in. The symbolication function chosen by
    ``get_symbolication_function`` is retried for as long as it raises
    ``RetrySymbolication`` and the hard timeout has not elapsed. On timeout or
    any other exception the event is not dropped: it is flagged with
    ``flag.processing.error`` / ``flag.processing.fatal`` and continues
    unsymbolicated.

    :param cache_key: key of the payload in ``event_processing_store``; replaced
        by the key returned from ``store()`` when the payload changed.
    :param start_time: forwarded unchanged to ``_do_process_event``.
    :param event_id: overwritten with ``data["event_id"]`` once the payload is
        available.
    :param symbolicate_task: the task this call runs on behalf of; compared by
        identity against ``symbolicate_event_from_reprocessing``.
    :param data: event payload, or ``None`` to fetch it via ``cache_key``.
    """
    from sentry.lang.native.processing import get_symbolication_function

    if data is None:
        data = event_processing_store.get(cache_key)

    # Still nothing after the store lookup: record the failure and give up.
    if data is None:
        metrics.incr(
            "events.failed",
            tags={"reason": "cache", "stage": "symbolicate"},
            skip_internal=False,
        )
        error_logger.error("symbolicate.failed.empty", extra={"cache_key": cache_key})
        return

    data = CanonicalKeyDict(data)

    project_id = data["project"]
    set_current_event_project(project_id)

    event_id = data["event_id"]

    # Load shedding: silently drop the event when the killswitch matches.
    if killswitch_matches_context(
        "store.load-shed-symbolicate-event-projects",
        {
            "project_id": project_id,
            "event_id": event_id,
            "platform": data.get("platform") or "null",
        },
    ):
        return

    symbolication_function = get_symbolication_function(data)
    # Resolve the name once and tolerate a missing function: calling it below
    # then fails inside the `try`, where the event is flagged instead of the
    # whole task crashing on an AttributeError.
    symbolication_function_name = getattr(symbolication_function, "__name__", "none")

    has_changed = False

    from_reprocessing = symbolicate_task is symbolicate_event_from_reprocessing

    symbolication_start_time = time()

    with sentry_sdk.start_span(op="tasks.store.symbolicate_event.symbolication") as span:
        # The key was previously misspelled as "symbolicaton_function".
        span.set_data("symbolication_function", symbolication_function_name)
        with metrics.timer(
            "tasks.store.symbolicate_event.symbolication",
            tags={"symbolication_function": symbolication_function_name},
        ):
            while True:
                try:
                    with sentry_sdk.start_span(
                        op="tasks.store.symbolicate_event.%s" % symbolication_function_name
                    ) as span:
                        symbolicated_data = symbolication_function(data)
                        span.set_data("symbolicated_data", bool(symbolicated_data))

                    # A falsy result leaves the payload untouched.
                    if symbolicated_data:
                        data = symbolicated_data
                        has_changed = True

                    break
                except RetrySymbolication as e:
                    if (
                        time() - symbolication_start_time
                    ) > settings.SYMBOLICATOR_PROCESS_EVENT_WARN_TIMEOUT:
                        error_logger.warning(
                            "symbolicate.slow",
                            extra={"project_id": project_id, "event_id": event_id},
                        )
                    if (
                        time() - symbolication_start_time
                    ) > settings.SYMBOLICATOR_PROCESS_EVENT_HARD_TIMEOUT:
                        # Do not drop event but actually continue with rest of pipeline
                        # (persisting unsymbolicated event)
                        metrics.incr(
                            "tasks.store.symbolicate_event.fatal",
                            tags={
                                "reason": "timeout",
                                "symbolication_function": symbolication_function_name,
                            },
                        )
                        error_logger.exception(
                            "symbolicate.failed.infinite_retry",
                            extra={"project_id": project_id, "event_id": event_id},
                        )
                        data.setdefault("_metrics", {})["flag.processing.error"] = True
                        data.setdefault("_metrics", {})["flag.processing.fatal"] = True
                        has_changed = True
                        break
                    else:
                        # sleep for `retry_after` but at most
                        # SYMBOLICATOR_MAX_RETRY_AFTER and try again
                        metrics.incr(
                            "tasks.store.symbolicate_event.retry",
                            tags={"symbolication_function": symbolication_function_name},
                        )
                        sleep(min(e.retry_after, SYMBOLICATOR_MAX_RETRY_AFTER))
                        continue
                except Exception:
                    # Any other failure: flag the event and continue unsymbolicated.
                    metrics.incr(
                        "tasks.store.symbolicate_event.fatal",
                        tags={
                            "reason": "error",
                            "symbolication_function": symbolication_function_name,
                        },
                    )
                    error_logger.exception("tasks.store.symbolicate_event.symbolication")
                    data.setdefault("_metrics", {})["flag.processing.error"] = True
                    data.setdefault("_metrics", {})["flag.processing.fatal"] = True
                    has_changed = True
                    break

    # We cannot persist canonical types in the cache, so we need to
    # downgrade this.
    if isinstance(data, CANONICAL_TYPES):
        data = dict(data.items())

    if has_changed:
        cache_key = event_processing_store.store(data)

    process_task = process_event_from_reprocessing if from_reprocessing else process_event
    _do_process_event(
        cache_key=cache_key,
        start_time=start_time,
        event_id=event_id,
        process_task=process_task,
        data=data,
        data_has_changed=has_changed,
        from_symbolicate=True,
    )
def _do_save_event(
    cache_key: Optional[str] = None,
    data: Optional[Event] = None,
    start_time: Optional[int] = None,
    event_id: Optional[str] = None,
    project_id: Optional[int] = None,
    **kwargs: Any,
) -> None:
    """
    Saves an event to the database.

    The payload is fetched from the processing store when only ``cache_key`` is
    given. The event is saved through ``EventManager`` and the saved payload is
    written back to the processing store. Cleanup in the ``finally`` clause runs
    whether or not saving succeeded.

    :param cache_key: key of the payload in the processing store; also used to
        delete the cached payload (on ``HashDiscarded``) and the attachment cache.
    :param data: event payload; fetched via ``cache_key`` when ``None``.
    :param start_time: when truthy, used to emit the ``events.time-to-process``
        timing as ``time() - start_time``.
    :param event_id: taken from ``data["event_id"]`` when ``None`` and data is
        available.
    :param project_id: popped from ``data["project"]`` when ``None``.
    :param kwargs: accepted but not used in this function.
    """
    set_current_event_project(project_id)

    from sentry.event_manager import EventManager, HashDiscarded

    event_type = "none"

    if cache_key and data is None:
        with metrics.timer(
                "tasks.store.do_save_event.get_cache") as metric_tags:
            data = processing.event_processing_store.get(cache_key)
            if data is not None:
                # Tag the timer and remember the type for the global tags below.
                metric_tags["event_type"] = event_type = data.get(
                    "type") or "none"

    with metrics.global_tags(event_type=event_type):
        if data is not None:
            data = CanonicalKeyDict(data)

        if event_id is None and data is not None:
            event_id = data["event_id"]

        # only when we come from reprocessing we get a project_id sent into
        # the task.
        # NOTE(review): this dereferences `data` without a None check, so a
        # call with `project_id=None` whose payload could not be loaded would
        # fail here — confirm callers always pass `project_id` in that case.
        if project_id is None:
            project_id = data.pop("project")
            set_current_event_project(project_id)

        # We only need to delete raw events for events that support
        # reprocessing.  If the data cannot be found we want to assume
        # that we need to delete the raw event.
        if not data or reprocessing.event_supports_reprocessing(data):
            with metrics.timer("tasks.store.do_save_event.delete_raw_event"):
                delete_raw_event(project_id, event_id, allow_hint_clear=True)

        # This covers two cases: where data is None because we did not manage
        # to fetch it from the default cache or the empty dictionary was
        # stored in the default cache.  The former happens if the event
        # expired while being on the queue, the second happens on reprocessing
        # if the raw event was deleted concurrently while we held on to
        # it.  This causes the node store to delete the data and we end up
        # fetching an empty dict.  We could in theory not invoke `save_event`
        # in those cases but it's important that we always clean up the
        # reprocessing reports correctly or they will screw up the UI.  So
        # to future proof this correctly we just handle this case here.
        if not data:
            metrics.incr("events.failed", tags={
                "reason": "cache",
                "stage": "post"
            }, skip_internal=False)
            return

        try:
            # Load shedding reuses the HashDiscarded path so that the cached
            # payload is deleted and the `finally` cleanup still runs.
            if killswitch_matches_context(
                "store.load-shed-save-event-projects",
                {
                    "project_id": project_id,
                    "event_type": event_type,
                    "platform": data.get("platform") or "none",
                },
            ):
                raise HashDiscarded("Load shedding save_event")

            with metrics.timer("tasks.store.do_save_event.event_manager.save"):
                manager = EventManager(data)
                # event.project.organization is populated after this statement.
                manager.save(project_id, assume_normalized=True,
                             start_time=start_time, cache_key=cache_key)
                # Put the updated event back into the cache so that post_process
                # has the most recent data.
                data = manager.get_data()
                if isinstance(data, CANONICAL_TYPES):
                    data = dict(data.items())
                with metrics.timer(
                        "tasks.store.do_save_event.write_processing_cache"):
                    processing.event_processing_store.store(data)
        except HashDiscarded:
            # Delete the event payload from cache since it won't show up in post-processing.
            if cache_key:
                with metrics.timer("tasks.store.do_save_event.delete_cache"):
                    processing.event_processing_store.delete_by_key(cache_key)

        finally:
            # Runs on success, on HashDiscarded and on any other exception.
            reprocessing2.mark_event_reprocessed(data)
            if cache_key:
                with metrics.timer(
                        "tasks.store.do_save_event.delete_attachment_cache"):
                    attachment_cache.delete(cache_key)

            if start_time:
                metrics.timing(
                    "events.time-to-process",
                    time() - start_time,
                    instance=data["platform"],
                    tags={
                        "is_reprocessing2": "true"
                        if reprocessing2.is_reprocessed_event(data) else "false",
                    },
                )

            time_synthetic_monitoring_event(data, project_id, start_time)
def _load_event(
    message: Message, projects: Mapping[int, Project]
) -> Optional[Tuple[Any, Callable[[str], None]]]:
    """
    Filter and deserialize one ingest message.

    Returns ``None`` when the event is rejected (duplicate, load-shed by a
    killswitch, or unknown project). Otherwise returns the deserialized payload
    together with a callback. Calling the callback with the event's storage key
    resumes processing once the event has been persisted and can be read by
    other processing components.
    """
    raw_payload = message["payload"]
    started_at = float(message["start_time"])
    event_id = message["event_id"]
    project_id = int(message["project_id"])
    client_ip = message.get("remote_addr")
    attachments = message.get("attachments") or ()

    sentry_sdk.set_extra("event_id", event_id)
    sentry_sdk.set_extra("len_attachments", len(attachments))

    if project_id == settings.SENTRY_PROJECT:
        metrics.incr("internal.captured.ingest_consumer.unparsed")

    # check that we haven't already processed this event (a previous instance of the forwarder
    # died before it could commit the event queue offset)
    #
    # XXX(markus): I believe this code is extremely broken:
    #
    # * it practically uses memcached in prod which has no consistency
    #   guarantees (no idea how we don't run into issues there)
    #
    # * a TTL of 1h basically doesn't guarantee any deduplication at all. It
    #   just guarantees a good error message... for one hour.
    #
    # This code has been ripped from the old python store endpoint. We're
    # keeping it around because it does provide some protection against
    # reprocessing good events if a single consumer is in a restart loop.
    dedup_key = f"ev:{project_id}:{event_id}"
    already_seen = cache.get(dedup_key) is not None
    if already_seen:
        logger.warning(
            "pre-process-forwarder detected a duplicated event" " with id:%s for project:%s.",
            event_id,
            project_id,
        )
        return  # message already processed do not reprocess

    pipeline_context = {
        "project_id": project_id,
        "event_id": event_id,
        "has_attachments": bool(attachments),
    }
    if killswitch_matches_context("store.load-shed-pipeline-projects", pipeline_context):
        # This killswitch is for the worst of scenarios and should probably not
        # cause additional load on our logging infrastructure
        return

    try:
        project = projects[project_id]
    except KeyError:
        logger.error("Project for ingested event does not exist: %s", project_id)
        return

    # Parse the JSON payload. This is required to compute the cache key and
    # call process_event. The payload will be put into Kafka raw, to avoid
    # serializing it again.
    # XXX: Do not use CanonicalKeyDict here. This may break preprocess_event
    # which assumes that data passed in is a raw dictionary.
    data = json.loads(raw_payload)

    if project_id == settings.SENTRY_PROJECT:
        parsed_tags = {"event_type": data.get("type") or "null"}
        metrics.incr("internal.captured.ingest_consumer.parsed", tags=parsed_tags)

    parsed_context = {
        "organization_id": project.organization_id,
        "project_id": project.id,
        "event_type": data.get("type") or "null",
        "has_attachments": bool(attachments),
        "event_id": event_id,
    }
    if killswitch_matches_context("store.load-shed-parsed-pipeline-projects", parsed_context):
        return

    def dispatch_task(cache_key: str) -> None:
        if attachments:
            with sentry_sdk.start_span(op="ingest_consumer.set_attachment_cache"):
                cached = []
                for attachment in attachments:
                    # "attachment_type" is popped off the attachment dict and
                    # passed on as `type`.
                    attachment_type = attachment.pop("attachment_type")
                    cached.append(CachedAttachment(type=attachment_type, **attachment))

                attachment_cache.set(cache_key, attachments=cached, timeout=CACHE_TIMEOUT)

        # Preprocess this event, which spawns either process_event or
        # save_event. Pass data explicitly to avoid fetching it again from the
        # cache.
        with sentry_sdk.start_span(op="ingest_consumer.process_event.preprocess_event"):
            preprocess_event(
                cache_key=cache_key,
                data=data,
                start_time=started_at,
                event_id=event_id,
                project=project,
            )

        # remember for CACHE_TIMEOUT that we saved this event (deduplication protection)
        cache.set(dedup_key, "", CACHE_TIMEOUT)

        # emit event_accepted once everything is done
        event_accepted.send_robust(
            ip=client_ip, data=data, project=project, sender=process_event
        )

    return data, dispatch_task
def _do_symbolicate_event(
    cache_key, start_time, event_id, symbolicate_task, data=None, queue_switches=0
):
    """
    Symbolicate one event and pass it on to ``_do_process_event``.

    Steps, in order:

    1. Load the payload from ``event_processing_store`` when ``data`` is not
       passed in; give up if it cannot be found.
    2. Unless ``queue_switches`` has reached ``SYMBOLICATOR_MAX_QUEUE_SWITCHES``,
       compare the queue this task runs on with the verdict of
       ``should_demote_symbolication`` and resubmit the event to the other queue
       on a mismatch.
    3. If the load-shed killswitch matches, skip symbolication and go straight
       to processing.
    4. Otherwise run the symbolication function, retrying while it raises
       ``RetrySymbolication`` and the hard timeout has not elapsed. On timeout
       or any other exception the event is flagged with
       ``flag.processing.error`` / ``flag.processing.fatal`` and continues
       unsymbolicated.

    :param cache_key: key of the payload in ``event_processing_store``; replaced
        by the key returned from ``store()`` when the payload changed.
    :param start_time: forwarded unchanged to ``submit_symbolicate`` and
        ``_do_process_event``.
    :param event_id: overwritten with ``data["event_id"]`` once the payload is
        available.
    :param symbolicate_task: the task this call runs on behalf of; used to
        detect reprocessing and the low priority queue.
    :param data: event payload, or ``None`` to fetch it via ``cache_key``.
    :param queue_switches: how often this event has already been moved between
        queues.
    """
    from sentry.lang.native.processing import get_symbolication_function

    if data is None:
        data = event_processing_store.get(cache_key)

    # Still nothing after the store lookup: record the failure and give up.
    if data is None:
        metrics.incr(
            "events.failed", tags={"reason": "cache", "stage": "symbolicate"}, skip_internal=False
        )
        error_logger.error("symbolicate.failed.empty", extra={"cache_key": cache_key})
        return

    data = CanonicalKeyDict(data)

    project_id = data["project"]
    set_current_event_project(project_id)

    event_id = data["event_id"]

    from_reprocessing = (
        symbolicate_task is symbolicate_event_from_reprocessing
        or symbolicate_task is symbolicate_event_from_reprocessing_low_priority
    )

    # check whether the event is in the wrong queue and if so, move it to the other one.
    # we do this at most SYMBOLICATOR_MAX_QUEUE_SWITCHES times.
    if queue_switches >= SYMBOLICATOR_MAX_QUEUE_SWITCHES:
        metrics.gauge("tasks.store.symbolicate_event.low_priority.max_queue_switches", 1)
    else:
        is_low_priority = symbolicate_task in [
            symbolicate_event_low_priority,
            symbolicate_event_from_reprocessing_low_priority,
        ]
        should_be_low_priority = should_demote_symbolication(project_id)

        if is_low_priority != should_be_low_priority:
            metrics.gauge("tasks.store.symbolicate_event.low_priority.wrong_queue", 1)
            submit_symbolicate(
                should_be_low_priority,
                from_reprocessing,
                cache_key,
                event_id,
                start_time,
                data,
                queue_switches + 1,
            )
            return

    # Must be bound before `_continue_to_process_event` can be called: the
    # closure reads it, and the killswitch branch below invokes the closure
    # before symbolication has had a chance to set it.
    has_changed = False

    def _continue_to_process_event():
        # Reads `cache_key`, `data` and `has_changed` from the enclosing scope
        # at call time, so it sees whatever symbolication left behind.
        process_task = process_event_from_reprocessing if from_reprocessing else process_event
        _do_process_event(
            cache_key=cache_key,
            start_time=start_time,
            event_id=event_id,
            process_task=process_task,
            data=data,
            data_has_changed=has_changed,
            from_symbolicate=True,
        )

    symbolication_function = get_symbolication_function(data)
    symbolication_function_name = getattr(symbolication_function, "__name__", "none")

    # Load shedding: skip symbolication but still process the event.
    if killswitch_matches_context(
        "store.load-shed-symbolicate-event-projects",
        {
            "project_id": project_id,
            "event_id": event_id,
            "platform": data.get("platform") or "null",
            "symbolication_function": symbolication_function_name,
        },
    ):
        return _continue_to_process_event()

    symbolication_start_time = time()

    # Only a sampled fraction of non-reprocessing events reports realtime metrics.
    submission_ratio = options.get("symbolicate-event.low-priority.metrics.submission-rate")
    submit_realtime_metrics = not from_reprocessing and random.random() < submission_ratio

    if submit_realtime_metrics:
        with sentry_sdk.start_span(op="tasks.store.symbolicate_event.low_priority.metrics.counter"):
            timestamp = int(symbolication_start_time)
            try:
                realtime_metrics.increment_project_event_counter(project_id, timestamp)
            except Exception as e:
                # Metrics are best effort; report the failure and carry on.
                sentry_sdk.capture_exception(e)

    with sentry_sdk.start_span(op="tasks.store.symbolicate_event.symbolication") as span:
        span.set_data("symbolication_function", symbolication_function_name)
        with metrics.timer(
            "tasks.store.symbolicate_event.symbolication",
            tags={"symbolication_function": symbolication_function_name},
        ):
            while True:
                try:
                    with sentry_sdk.start_span(
                        op="tasks.store.symbolicate_event.%s" % symbolication_function_name
                    ) as span:
                        symbolicated_data = symbolication_function(data)
                        span.set_data("symbolicated_data", bool(symbolicated_data))

                    # A falsy result leaves the payload untouched.
                    if symbolicated_data:
                        data = symbolicated_data
                        has_changed = True

                    break
                except RetrySymbolication as e:
                    if (
                        time() - symbolication_start_time
                    ) > settings.SYMBOLICATOR_PROCESS_EVENT_WARN_TIMEOUT:
                        error_logger.warning(
                            "symbolicate.slow",
                            extra={"project_id": project_id, "event_id": event_id},
                        )
                    if (
                        time() - symbolication_start_time
                    ) > settings.SYMBOLICATOR_PROCESS_EVENT_HARD_TIMEOUT:
                        # Do not drop event but actually continue with rest of pipeline
                        # (persisting unsymbolicated event)
                        metrics.incr(
                            "tasks.store.symbolicate_event.fatal",
                            tags={
                                "reason": "timeout",
                                "symbolication_function": symbolication_function_name,
                            },
                        )
                        error_logger.exception(
                            "symbolicate.failed.infinite_retry",
                            extra={"project_id": project_id, "event_id": event_id},
                        )
                        data.setdefault("_metrics", {})["flag.processing.error"] = True
                        data.setdefault("_metrics", {})["flag.processing.fatal"] = True
                        has_changed = True
                        break
                    else:
                        # sleep for `retry_after` but at most
                        # SYMBOLICATOR_MAX_RETRY_AFTER and try again
                        metrics.incr(
                            "tasks.store.symbolicate_event.retry",
                            tags={"symbolication_function": symbolication_function_name},
                        )
                        sleep(min(e.retry_after, SYMBOLICATOR_MAX_RETRY_AFTER))
                        continue
                except Exception:
                    # Any other failure: flag the event and continue unsymbolicated.
                    metrics.incr(
                        "tasks.store.symbolicate_event.fatal",
                        tags={
                            "reason": "error",
                            "symbolication_function": symbolication_function_name,
                        },
                    )
                    error_logger.exception("tasks.store.symbolicate_event.symbolication")
                    data.setdefault("_metrics", {})["flag.processing.error"] = True
                    data.setdefault("_metrics", {})["flag.processing.fatal"] = True
                    has_changed = True
                    break

    if submit_realtime_metrics:
        with sentry_sdk.start_span(
            op="tasks.store.symbolicate_event.low_priority.metrics.histogram"
        ):
            symbolication_duration = int(time() - symbolication_start_time)
            try:
                # `timestamp` was bound in the counter block above, which is
                # guarded by the same `submit_realtime_metrics` flag.
                realtime_metrics.increment_project_duration_counter(
                    project_id, timestamp, symbolication_duration
                )
            except Exception as e:
                sentry_sdk.capture_exception(e)

    # We cannot persist canonical types in the cache, so we need to
    # downgrade this.
    if isinstance(data, CANONICAL_TYPES):
        data = dict(data.items())

    if has_changed:
        cache_key = event_processing_store.store(data)

    return _continue_to_process_event()
def handle_owner_assignment(project, group, event):
    """
    Auto-assign a group and store its owners where that has not happened yet.

    Whether the group already has owners / an assignee is looked up through the
    cache first. Nothing else happens when both exist. Otherwise the
    auto-assign owners are resolved (or treated as empty when the
    ``post_process.get-autoassign-owners`` killswitch matches), the first owner
    is assigned if the group has no assignee, and the owners are stored if the
    group has none.
    """
    from sentry.models import GroupAssignee, ProjectOwnership

    def _exists_with_cache(key, query_exists):
        # Only a cache miss (None) triggers the query.
        exists = cache.get(key)
        if exists is None:
            exists = query_exists()
            # Cache for an hour if it's assigned. We don't need to move that fast.
            ttl = 3600 if exists else 60
            cache.set(key, exists, ttl)
        return exists

    with metrics.timer("post_process.handle_owner_assignment"):
        with sentry_sdk.start_span(op="post_process.handle_owner_assignment.cache_set_owner"):
            has_owners = _exists_with_cache(
                "owner_exists:1:%s" % group.id,
                lambda: group.groupowner_set.exists(),
            )

        with sentry_sdk.start_span(op="post_process.handle_owner_assignment.cache_set_assignee"):
            # Is the issue already assigned to a team or user?
            has_assignee = _exists_with_cache(
                "assignee_exists:1:%s" % group.id,
                lambda: group.assignee_set.exists(),
            )

        if has_owners and has_assignee:
            return

        with sentry_sdk.start_span(op="post_process.handle_owner_assignment.get_autoassign_owners"):
            lookup_disabled = killswitch_matches_context(
                "post_process.get-autoassign-owners",
                {
                    "project_id": project.id,
                },
            )
            if lookup_disabled:
                # see ProjectOwnership.get_autoassign_owners
                auto_assignment, owners, assigned_by_codeowners = False, [], False
            else:
                autoassign_result = ProjectOwnership.get_autoassign_owners(
                    group.project_id, event.data
                )
                auto_assignment, owners, assigned_by_codeowners = autoassign_result

        with sentry_sdk.start_span(op="post_process.handle_owner_assignment.analytics_record"):
            if auto_assignment and owners and not has_assignee:
                outcome = GroupAssignee.objects.assign(group, owners[0], create_only=True)
                if outcome["new_assignment"] or outcome["updated_assignment"]:
                    if assigned_by_codeowners:
                        analytics_event = "codeowners.assignment"
                    else:
                        analytics_event = "issueowners.assignment"
                    analytics.record(
                        analytics_event,
                        organization_id=project.organization_id,
                        project_id=project.id,
                        group_id=group.id,
                    )

        with sentry_sdk.start_span(op="post_process.handle_owner_assignment.handle_group_owners"):
            if owners and not has_owners:
                try:
                    handle_group_owners(project, group, owners)
                except Exception:
                    # Deliberately best effort: log and move on.
                    logger.exception("Failed to store group owners")
def buffered_delete_old_primary_hash(
    project_id,
    group_id,
    event_id=None,
    datetime=None,
    old_primary_hash=None,
    current_primary_hash=None,
    force_flush_batch: bool = False,
):
    """
    In case the primary hash changed during reprocessing, we need to tell
    Snuba before reinserting the event. Snuba may then insert a tombstone row
    depending on whether the primary_hash is part of the PK/sortkey or not.

    Only when the primary_hash changed and is part of the sortkey, we need to
    explicitly tombstone the old row.

    If the primary_hash is not part of the PK/sortkey, or if the primary_hash
    did not change, nothing needs to be done as ClickHouse's table merge will
    merge the two rows together.

    Like `buffered_handle_remaining_events`, this is a quick and dirty way to
    batch event IDs so requests to tombstone rows are not being individually
    sent over to Snuba.

    This also includes the same constraints for optimal performance as
    `buffered_handle_remaining_events` in that events being fed to this should
    have datetimes as close to each other as possible. Unfortunately, this
    function is invoked by tasks that are run asynchronously and therefore the
    guarantee from `buffered_handle_remaining_events` regarding events being
    sorted by timestamps is not applicable here.

    This function also does not batch events which have different old primary
    hashes together into one operation. This means that if the data being fed
    in tends to have a 1:1 ratio of event:old primary hashes, then the buffering
    in this effectively does nothing.

    :param project_id: project the events belong to.
    :param group_id: group the events belonged to before reprocessing.
    :param event_id: event to buffer; only used when the primary hash changed.
    :param datetime: timestamp of the event; only used when the primary hash
        changed.
    :param old_primary_hash: primary hash of the event before reprocessing, or
        ``None`` when there is nothing to buffer.
    :param current_primary_hash: primary hash of the event after reprocessing.
    :param force_flush_batch: flush all buffered events regardless of how many
        have accumulated.
    """
    from sentry import killswitches

    if killswitches.killswitch_matches_context(
        "reprocessing2.drop-delete-old-primary-hash", {"project_id": project_id}
    ):
        return

    client = _get_sync_redis_client()

    # This is a meta key that contains old primary hashes. These hashes are then
    # combined with other values to construct a key that points to a list of
    # tombstonable events.
    primary_hash_set_key = f"re2:tombstone-primary-hashes:{project_id}:{group_id}"
    old_primary_hashes = client.smembers(primary_hash_set_key)

    def build_event_key(primary_hash):
        return f"re2:tombstones:{{{project_id}:{group_id}:{primary_hash}}}"

    if old_primary_hash is not None and old_primary_hash != current_primary_hash:
        event_key = build_event_key(old_primary_hash)
        client.lpush(event_key, f"{to_timestamp(datetime)};{event_id}")
        client.expire(event_key, settings.SENTRY_REPROCESSING_SYNC_TTL)

        # Keep the local copy of the set in sync so the counting below sees
        # the bucket that was just written to.
        if old_primary_hash not in old_primary_hashes:
            old_primary_hashes.add(old_primary_hash)
            client.sadd(primary_hash_set_key, old_primary_hash)
            client.expire(primary_hash_set_key, settings.SENTRY_REPROCESSING_SYNC_TTL)

    # Events for a group are split and bucketed by their primary hashes. If flushing is to be
    # performed on a per-group basis, the event count needs to be summed up across all buckets
    # belonging to a single group.
    event_count = 0
    for primary_hash in old_primary_hashes:
        key = build_event_key(primary_hash)
        event_count += client.llen(key)

    if force_flush_batch or event_count > settings.SENTRY_REPROCESSING_REMAINING_EVENTS_BUF_SIZE:
        from sentry import eventstream

        with sentry_sdk.start_span(
            op="sentry.reprocessing2.buffered_delete_old_primary_hash.flush_events"
        ):
            for primary_hash in old_primary_hashes:
                event_key = build_event_key(primary_hash)
                event_ids, from_date, to_date = pop_batched_events_from_redis(event_key)

                # Racing might be happening between two different tasks. Give up on the
                # task that's lagging behind by prematurely terminating flushing.
                if len(event_ids) == 0:
                    with sentry_sdk.configure_scope() as scope:
                        scope.set_tag("project_id", project_id)
                        scope.set_tag("old_group_id", group_id)
                        # Tag the bucket that turned out to be empty, not the
                        # hash of the event that triggered this call.
                        scope.set_tag("old_primary_hash", primary_hash)

                    logger.error("reprocessing2.buffered_delete_old_primary_hash.empty_batch")
                    return

                # In the worst case scenario, a group will have a 1:1 mapping of primary hashes to
                # events, which means 1 insert per event.
                # The overall performance of this will be marginally better than the unbatched
                # version if a group has a lot of old primary hashes.
                #
                # The batch was popped from the bucket of `primary_hash`, so that is the hash
                # to tombstone with. The `old_primary_hash` argument only describes the event
                # that triggered this call and is None on a forced flush.
                eventstream.tombstone_events_unsafe(
                    project_id,
                    event_ids,
                    old_primary_hash=primary_hash,
                    from_timestamp=from_date,
                    to_timestamp=to_date,
                )

        # Try to track counts so if it turns out that tombstoned events trend towards a ratio of 1
        # event per hash, a different solution may need to be considered.
        ratio = 0 if len(old_primary_hashes) == 0 else event_count / len(old_primary_hashes)
        metrics.timing(
            key="reprocessing2.buffered_delete_old_primary_hash.event_count",
            value=event_count,
        )
        metrics.timing(
            key="reprocessing2.buffered_delete_old_primary_hash.primary_hash_count",
            value=len(old_primary_hashes),
        )
        metrics.timing(
            key="reprocessing2.buffered_delete_old_primary_hash.primary_hash_to_event_ratio",
            value=ratio,
        )