def reprocess_group(
    project_id,
    group_id,
    new_group_id=None,
    query_state=None,
    start_time=None,
    max_events=None,
    acting_user_id=None,
):
    from sentry.reprocessing2 import start_group_reprocessing

    if start_time is None:
        assert new_group_id is None
        start_time = time.time()
        new_group_id = start_group_reprocessing(
            project_id, group_id, max_events=max_events, acting_user_id=acting_user_id
        )

    assert new_group_id is not None

    query_state, events = celery_run_batch_query(
        filter=eventstore.Filter(project_ids=[project_id], group_ids=[group_id]),
        batch_size=GROUP_REPROCESSING_CHUNK_SIZE,
        state=query_state,
        referrer="reprocessing2.reprocess_group",
    )

    if not events:
        return

    tombstoned_event_ids = []

    for event in events:
        if max_events is None or max_events > 0:
            reprocess_event.delay(
                project_id=project_id,
                event_id=event.event_id,
                start_time=start_time,
            )
            if max_events is not None:
                max_events -= 1
        else:
            tombstoned_event_ids.append(event.event_id)

    # len(tombstoned_event_ids) is upper-bounded by GROUP_REPROCESSING_CHUNK_SIZE
    if tombstoned_event_ids:
        tombstone_events.delay(
            project_id=project_id, group_id=group_id, event_ids=tombstoned_event_ids
        )

    reprocess_group.delay(
        project_id=project_id,
        group_id=group_id,
        new_group_id=new_group_id,
        query_state=query_state,
        start_time=start_time,
        max_events=max_events,
    )
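# A minimal sketch (not Sentry code) of the paginate-and-requeue pattern the
# task above uses: fetch one page of events plus an opaque cursor ("query
# state"), process the page, and re-enqueue yourself with the cursor until an
# empty page signals completion. The names below (process_group, fetch logic,
# the recursive call standing in for .delay()) are hypothetical stand-ins for
# celery_run_batch_query, reprocess_event.delay and reprocess_group.delay.
def process_group(group_events, state=None, page_size=2):
    # The cursor here is just an offset into an in-memory list; the real task
    # gets an opaque state object back from celery_run_batch_query.
    offset = state or 0
    page = group_events[offset : offset + page_size]
    next_state = offset + page_size

    if not page:
        # Empty page: the whole group has been walked, stop requeueing.
        return

    for event in page:
        print("processing", event)

    # In the real task this would be a .delay() call back onto the same Celery
    # task; here we recurse to show the control flow.
    process_group(group_events, state=next_state, page_size=page_size)


process_group(["e1", "e2", "e3", "e4", "e5"])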
def unmerge(
    project_id,
    source_id,
    destination_id,
    fingerprints,
    actor_id,
    last_event=None,
    batch_size=500,
    source_fields_reset=False,
    eventstream_state=None,
):
    source = Group.objects.get(project_id=project_id, id=source_id)

    caches = get_caches()

    project = caches["Project"](project_id)

    # On the first iteration of this loop, we clear out all of the
    # denormalizations from the source group so that we can have a clean slate
    # for the new, repaired data.
    if last_event is None:
        fingerprints = lock_hashes(project_id, source_id, fingerprints)
        truncate_denormalizations(project, source)

    last_event, events = celery_run_batch_query(
        filter=eventstore.Filter(project_ids=[project_id], group_ids=[source.id]),
        batch_size=batch_size,
        state=last_event,
        referrer="unmerge",
    )

    # If there are no more events to process, we're done with the migration.
    if not events:
        unlock_hashes(project_id, fingerprints)
        logger.warning("Unmerge complete (eventstream state: %s)", eventstream_state)
        if eventstream_state:
            eventstream.end_unmerge(eventstream_state)
        return destination_id

    source_events = []
    destination_events = []

    for event in events:
        (destination_events if get_fingerprint(event) in fingerprints else source_events).append(
            event
        )

    if source_events:
        if not source_fields_reset:
            source.update(**get_group_creation_attributes(caches, source_events))
            source_fields_reset = True
        else:
            source.update(**get_group_backfill_attributes(caches, source, source_events))

    (destination_id, eventstream_state) = migrate_events(
        caches,
        project,
        source_id,
        destination_id,
        fingerprints,
        destination_events,
        actor_id,
        eventstream_state,
    )

    repair_denormalizations(caches, project, events)

    unmerge.delay(
        project_id,
        source_id,
        destination_id,
        fingerprints,
        actor_id,
        last_event=last_event,
        batch_size=batch_size,
        source_fields_reset=source_fields_reset,
        eventstream_state=eventstream_state,
    )
def reprocess_group(
    project_id,
    group_id,
    remaining_events="delete",
    new_group_id=None,
    query_state=None,
    start_time=None,
    max_events=None,
    acting_user_id=None,
):
    sentry_sdk.set_tag("project", project_id)

    from sentry.reprocessing2 import (
        CannotReprocess,
        logger,
        mark_event_reprocessed,
        reprocess_event,
        start_group_reprocessing,
    )

    if start_time is None:
        assert new_group_id is None
        start_time = time.time()
        new_group_id = start_group_reprocessing(
            project_id,
            group_id,
            max_events=max_events,
            acting_user_id=acting_user_id,
            remaining_events=remaining_events,
        )

    assert new_group_id is not None

    query_state, events = celery_run_batch_query(
        filter=eventstore.Filter(project_ids=[project_id], group_ids=[group_id]),
        batch_size=GROUP_REPROCESSING_CHUNK_SIZE,
        state=query_state,
        referrer="reprocessing2.reprocess_group",
    )

    if not events:
        # Need to delay this until we have enqueued all events and stopped
        # iterating over the batch query, if we take care of this in
        # finish_reprocessing it won't work, as for small max_events
        # finish_reprocessing may execute sooner than the last reprocess_group
        # iteration.
        eventstream.exclude_groups(project_id, [group_id])
        return

    remaining_event_ids = []
    remaining_events_min_datetime = None
    remaining_events_max_datetime = None

    for event in events:
        if max_events is None or max_events > 0:
            with sentry_sdk.start_span(op="reprocess_event"):
                try:
                    reprocess_event(
                        project_id=project_id,
                        event_id=event.event_id,
                        start_time=start_time,
                    )
                except CannotReprocess as e:
                    logger.error(f"reprocessing2.{e}")
                except Exception:
                    sentry_sdk.capture_exception()
                else:
                    if max_events is not None:
                        max_events -= 1

                    continue

            # In case of errors while kicking off reprocessing, mark the event
            # as reprocessed such that progressbar advances and the
            # finish_reprocessing task is still correctly spawned.
            mark_event_reprocessed(group_id=group_id, project_id=project_id)

        # In case of errors while kicking off reprocessing or if max_events has
        # been exceeded, do the default action.

        if remaining_events_min_datetime is None or remaining_events_min_datetime > event.datetime:
            remaining_events_min_datetime = event.datetime
        if remaining_events_max_datetime is None or remaining_events_max_datetime < event.datetime:
            remaining_events_max_datetime = event.datetime

        remaining_event_ids.append(event.event_id)

    # len(remaining_event_ids) is upper-bounded by GROUP_REPROCESSING_CHUNK_SIZE
    if remaining_event_ids:
        handle_remaining_events.delay(
            project_id=project_id,
            new_group_id=new_group_id,
            event_ids=remaining_event_ids,
            remaining_events=remaining_events,
            from_timestamp=remaining_events_min_datetime,
            to_timestamp=remaining_events_max_datetime,
        )

    reprocess_group.delay(
        project_id=project_id,
        group_id=group_id,
        new_group_id=new_group_id,
        query_state=query_state,
        start_time=start_time,
        max_events=max_events,
        remaining_events=remaining_events,
    )
def reprocess_group(
    project_id,
    group_id,
    remaining_events="delete",
    new_group_id=None,
    query_state=None,
    start_time=None,
    max_events=None,
    acting_user_id=None,
):
    sentry_sdk.set_tag("project", project_id)
    sentry_sdk.set_tag("group_id", group_id)

    from sentry.reprocessing2 import (
        CannotReprocess,
        buffered_handle_remaining_events,
        logger,
        reprocess_event,
        start_group_reprocessing,
    )

    sentry_sdk.set_tag("is_start", "false")

    # Only executed once during reprocessing
    if start_time is None:
        assert new_group_id is None
        start_time = time.time()
        metrics.incr("events.reprocessing.start_group_reprocessing", sample_rate=1.0)
        sentry_sdk.set_tag("is_start", "true")
        new_group_id = start_group_reprocessing(
            project_id,
            group_id,
            max_events=max_events,
            acting_user_id=acting_user_id,
            remaining_events=remaining_events,
        )

    assert new_group_id is not None

    query_state, events = celery_run_batch_query(
        filter=eventstore.Filter(project_ids=[project_id], group_ids=[group_id]),
        batch_size=settings.SENTRY_REPROCESSING_PAGE_SIZE,
        state=query_state,
        referrer="reprocessing2.reprocess_group",
    )

    if not events:
        # Migrate events that belong to new group generated after reprocessing
        buffered_handle_remaining_events(
            project_id=project_id,
            old_group_id=group_id,
            new_group_id=new_group_id,
            datetime_to_event=[],
            remaining_events=remaining_events,
            force_flush_batch=True,
        )

        return

    remaining_event_ids = []

    for event in events:
        if max_events is None or max_events > 0:
            with sentry_sdk.start_span(op="reprocess_event"):
                try:
                    reprocess_event(
                        project_id=project_id,
                        event_id=event.event_id,
                        start_time=start_time,
                    )
                except CannotReprocess as e:
                    logger.error(f"reprocessing2.{e}")
                except Exception:
                    sentry_sdk.capture_exception()
                else:
                    if max_events is not None:
                        max_events -= 1

                    continue

        # In case of errors while kicking off reprocessing or if max_events has
        # been exceeded, do the default action.
        remaining_event_ids.append((event.datetime, event.event_id))

    # len(remaining_event_ids) is upper-bounded by settings.SENTRY_REPROCESSING_PAGE_SIZE
    if remaining_event_ids:
        buffered_handle_remaining_events(
            project_id=project_id,
            old_group_id=group_id,
            new_group_id=new_group_id,
            datetime_to_event=remaining_event_ids,
            remaining_events=remaining_events,
        )

    reprocess_group.delay(
        project_id=project_id,
        group_id=group_id,
        new_group_id=new_group_id,
        query_state=query_state,
        start_time=start_time,
        max_events=max_events,
        remaining_events=remaining_events,
    )
def unmerge(*posargs, **kwargs):
    args = UnmergeArgsBase.parse_arguments(*posargs, **kwargs)

    source = Group.objects.get(project_id=args.project_id, id=args.source_id)

    caches = get_caches()

    project = caches["Project"](args.project_id)

    # On the first iteration of this loop, we clear out all of the
    # denormalizations from the source group so that we can have a clean slate
    # for the new, repaired data.
    if isinstance(args, InitialUnmergeArgs):
        locked_primary_hashes = lock_hashes(
            args.project_id, args.source_id, args.replacement.primary_hashes_to_lock
        )
        truncate_denormalizations(project, source)
        last_event = None
    else:
        last_event = args.last_event
        locked_primary_hashes = args.locked_primary_hashes

    last_event, events = celery_run_batch_query(
        filter=eventstore.Filter(project_ids=[args.project_id], group_ids=[source.id]),
        batch_size=args.batch_size,
        state=last_event,
        referrer="unmerge",
    )

    # If there are no more events to process, we're done with the migration.
    if not events:
        unlock_hashes(args.project_id, locked_primary_hashes)
        for unmerge_key, (group_id, eventstream_state) in args.destinations.items():
            logger.warning("Unmerge complete (eventstream state: %s)", eventstream_state)
            if eventstream_state:
                args.replacement.stop_snuba_replacement(eventstream_state)
        return

    source_events = []
    destination_events = {}

    for event in events:
        unmerge_key = args.replacement.get_unmerge_key(event, locked_primary_hashes)
        if unmerge_key is not None:
            destination_events.setdefault(unmerge_key, []).append(event)
        else:
            source_events.append(event)

    source_fields_reset = isinstance(args, SuccessiveUnmergeArgs) and args.source_fields_reset

    if source_events:
        if not source_fields_reset:
            source.update(**get_group_creation_attributes(caches, source_events))
            source_fields_reset = True
        else:
            source.update(**get_group_backfill_attributes(caches, source, source_events))

    destinations = dict(args.destinations)

    # XXX: This is only actually able to create a destination group and migrate
    # the group hashes if there are events that can be migrated. How do we
    # handle this if there aren't any events? We can't create a group (there
    # isn't any data to derive the aggregates from), so we'd have to mark the
    # hash as in limbo somehow...?)
    for unmerge_key, _destination_events in destination_events.items():
        destination_id, eventstream_state = destinations.get(unmerge_key) or (None, None)

        (destination_id, eventstream_state) = migrate_events(
            caches,
            project,
            args,
            _destination_events,
            locked_primary_hashes,
            destination_id,
            eventstream_state,
        )

        destinations[unmerge_key] = destination_id, eventstream_state

    repair_denormalizations(caches, project, events)

    new_args = SuccessiveUnmergeArgs(
        project_id=args.project_id,
        source_id=args.source_id,
        replacement=args.replacement,
        actor_id=args.actor_id,
        batch_size=args.batch_size,
        last_event=last_event,
        destinations=destinations,
        locked_primary_hashes=locked_primary_hashes,
        source_fields_reset=source_fields_reset,
    )

    unmerge.delay(**new_args.dump_arguments())
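# A minimal sketch (assumptions, not Sentry's actual classes) of the typed
# argument pattern the last unmerge version relies on: a frozen dataclass that
# round-trips through plain keyword arguments, so the task can re-enqueue
# itself with dump_arguments()/parse_arguments() instead of a long positional
# list. SketchUnmergeArgs and its fields are hypothetical; only the
# parse/dump round-trip idea is taken from the code above.
from dataclasses import asdict, dataclass
from typing import Any, Dict


@dataclass(frozen=True)
class SketchUnmergeArgs:
    project_id: int
    source_id: int
    batch_size: int = 500

    def dump_arguments(self) -> Dict[str, Any]:
        # Serialize into plain kwargs that survive being sent through the
        # task queue.
        return asdict(self)

    @classmethod
    def parse_arguments(cls, *posargs, **kwargs) -> "SketchUnmergeArgs":
        # Accept either the keyword form produced by dump_arguments() or a
        # legacy positional call, which is why the real task takes *posargs.
        if posargs:
            kwargs.update(zip(("project_id", "source_id", "batch_size"), posargs))
        return cls(**kwargs)


args = SketchUnmergeArgs.parse_arguments(project_id=1, source_id=2)
assert SketchUnmergeArgs.parse_arguments(**args.dump_arguments()) == args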