def test_basic(self):
    total = 10

    for _ in range(total):
        self.create_user()

    qs = User.objects.all()

    assert len(list(RangeQuerySetWrapper(qs, step=2))) == total
    assert len(list(RangeQuerySetWrapper(qs, limit=5))) == 5
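# Context for the tests above: a minimal sketch of the keyset-pagination
# technique that RangeQuerySetWrapper is built around: paging by primary key
# (``pk__gt``) instead of OFFSET, so each page is a cheap indexed range scan
# and rows deleted mid-iteration do not shift later pages. This is an
# illustrative reimplementation, not Sentry's actual code; the name
# ``range_iter`` and the exact keyword semantics are assumptions.
def range_iter(queryset, step=1000, limit=None):
    last_pk = None
    returned = 0
    while True:
        page = queryset.order_by("pk")
        if last_pk is not None:
            # Resume strictly after the last row we saw.
            page = page.filter(pk__gt=last_pk)
        results = list(page[:step])
        if not results:
            return
        for obj in results:
            yield obj
            returned += 1
            if limit is not None and returned >= limit:
                return
        last_pk = results[-1].pk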
def cleanup_event_attachment_files(apps, schema_editor):
    """
    Previously, cleanup task code did a SQL bulk delete on EventAttachment,
    leaving orphaned File and FileBlob objects. These orphaned files now need
    to be purged, as they are still consuming space.
    """
    EventAttachment = apps.get_model("sentry", "EventAttachment")
    File = apps.get_model("sentry", "File")

    # Find the oldest live attachment, as we only want to purge old files.
    # If there are no files, skip everything.
    oldest_attachment = EventAttachment.objects.all().aggregate(Min("date_added"))
    if not oldest_attachment or oldest_attachment["date_added__min"] is None:
        return

    # File types used in event attachments.
    attachment_types = [
        "event.applecrashreport",
        "event.attachment",
        "event.payload",
        "event.minidump",
        "unreal.context",
        "unreal.logs",
    ]
    file_query = File.objects.filter(
        timestamp__lt=oldest_attachment["date_added__min"]
    ).filter(type__in=attachment_types)

    for f in RangeQuerySetWrapper(file_query):
        # Double check that the file is not referenced.
        if not EventAttachment.objects.filter(file=f).exists():
            f.delete()
def prepare_reports(dry_run=False, *args, **kwargs):
    timestamp, duration = _fill_default_parameters(*args, **kwargs)

    logger.info("reports.begin_prepare_report")

    organizations = _get_organization_queryset().values_list("id", flat=True)
    for i, organization_id in enumerate(
        RangeQuerySetWrapper(organizations, step=10000, result_value_getter=lambda item: item)
    ):
        prepare_organization_report.delay(timestamp, duration, organization_id, dry_run=dry_run)
        if i % 10000 == 0:
            logger.info(
                "reports.scheduled_prepare_organization_report",
                extra={"organization_id": organization_id, "total_scheduled": i},
            )

    default_cache.set(prepare_reports_verify_key(), "1", int(timedelta(days=3).total_seconds()))

    logger.info("reports.finish_prepare_report")
def backfill_group_ids(model):
    query = model.objects.filter(group_id__isnull=True)
    for attachment in RangeQuerySetWrapper(query, step=1000):
        event = eventstore.get_event_by_id(attachment.project_id, attachment.event_id)
        if event:
            model.objects.filter(id=attachment.id).update(group_id=event.group_id)
def cleanup_unused_files(quiet=False):
    """
    Remove FileBlobs (and thus the actual files) if they are no longer
    referenced by any File.

    We set a minimum age on the query to ensure that we don't try to remove
    any blobs which are brand new and potentially in the process of being
    referenced.
    """
    from sentry.models import File, FileBlob, FileBlobIndex

    if quiet:
        from sentry.utils.query import RangeQuerySetWrapper
    else:
        from sentry.utils.query import RangeQuerySetWrapperWithProgressBar as RangeQuerySetWrapper

    cutoff = timezone.now() - timedelta(days=1)
    queryset = FileBlob.objects.filter(timestamp__lte=cutoff)

    for blob in RangeQuerySetWrapper(queryset):
        if FileBlobIndex.objects.filter(blob=blob).exists():
            continue
        if File.objects.filter(blob=blob).exists():
            continue
        blob.delete()
def repair_callsigns():
    from sentry.utils.query import RangeQuerySetWrapperWithProgressBar, \
        RangeQuerySetWrapper
    from sentry.models.counter import increment_project_counter
    from sentry.models import Organization, Group, Project

    click.echo('Repairing callsigns')

    queryset = Organization.objects.all()

    for org in RangeQuerySetWrapperWithProgressBar(queryset):
        projects = list(org.project_set.all())
        callsigns = get_callsigns(projects)
        for project in projects:
            if project.callsign is None:
                Project.objects.filter(
                    pk=project.id,
                    callsign=None,
                ).update(callsign=callsigns[project.id])
            q = Group.objects.filter(
                project=project,
                short_id=None,
            )
            for group in RangeQuerySetWrapper(q):
                with catchable_atomic():
                    pending_short_id = increment_project_counter(project)
                    updated = Group.objects.filter(
                        pk=group.id,
                        short_id=None,
                    ).update(short_id=pending_short_id)
                    if updated == 0:
                        raise RollbackLocally()
def handle(self, **options):
    def _attach_fks(_events):
        project_ids = set([event.project_id for event in _events])
        projects = {p.id: p for p in Project.objects.filter(id__in=project_ids)}

        group_ids = set([event.group_id for event in _events])
        groups = {g.id: g for g in Group.objects.filter(id__in=group_ids)}

        for event in _events:
            event.project = projects[event.project_id]
            event.group = groups[event.group_id]

    from sentry import eventstream
    from sentry.utils.query import RangeQuerySetWrapper

    from_ts = options['from_ts']
    to_ts = options['to_ts']
    from_id = options['from_id']
    to_id = options['to_id']

    if (from_ts or to_ts) and (from_id or to_id):
        raise CommandError('You can either limit by primary key, or by timestamp.')
    elif from_ts and to_ts:
        events = self.get_events_by_timestamp(from_ts, to_ts)
    elif from_id and to_id:
        events = self.get_events_by_id(from_id, to_id)
    else:
        raise CommandError('Invalid arguments: either use --from/--to-id, or --from/--to-ts.')

    count = events.count()
    self.stdout.write('Events to process: {}\n'.format(count))

    if count == 0:
        self.stdout.write('Nothing to do.\n')
        sys.exit(0)

    if not options['no_input']:
        proceed = raw_input('Do you want to continue? [y/N] ')
        if proceed.lower() not in ['yes', 'y']:
            raise CommandError('Aborted.')

    for event in RangeQuerySetWrapper(events, callbacks=(_attach_fks,)):
        primary_hash = event.get_primary_hash()
        eventstream.insert(
            group=event.group,
            event=event,
            is_new=False,
            is_sample=False,
            is_regression=False,
            is_new_group_environment=False,
            primary_hash=primary_hash,
            skip_consume=True,
        )

    self.stdout.write('Done.\n')
def get_group_tag_value_iter(self, project_id, group_id, environment_id, key, callbacks=()):
    from sentry.utils.query import RangeQuerySetWrapper

    qs = self.get_group_tag_value_qs(project_id, group_id, environment_id, key)
    return RangeQuerySetWrapper(queryset=qs, callbacks=callbacks)
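# A minimal usage sketch for the iterator above: stream every value of a tag
# for a group without loading the whole queryset into memory. The ``tagstore``
# handle, the argument values, and the fields printed are assumptions for
# illustration; only the iterator's signature comes from the snippet itself.
def dump_tag_values(tagstore, project_id, group_id, environment_id, key):
    for tag_value in tagstore.get_group_tag_value_iter(
        project_id, group_id, environment_id, key
    ):
        # Each item is one tag-value row, fetched a page at a time.
        print(tag_value.value, tag_value.times_seen)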
def delete_alert_rules_incidents(apps, schema_editor):
    from sentry.utils.query import RangeQuerySetWrapper

    Incident = apps.get_model("sentry", "Incident")
    AlertRule = apps.get_model("sentry", "AlertRule")
    TimeSeriesSnapshot = apps.get_model("sentry", "TimeSeriesSnapshot")
    QuerySubscription = apps.get_model("sentry", "QuerySubscription")

    for incident in RangeQuerySetWrapper(Incident.objects.all()):
        incident.delete()
    for alert_rule in RangeQuerySetWrapper(AlertRule.objects.all()):
        alert_rule.delete()
    for snapshot in RangeQuerySetWrapper(TimeSeriesSnapshot.objects.all()):
        snapshot.delete()
    for sub in RangeQuerySetWrapper(QuerySubscription.objects.all()):
        sub.delete()
def test_loop_and_delete(self):
    total = 10
    for _ in range(total):
        self.create_user()

    qs = User.objects.all()

    # Deleting rows while iterating is safe because pagination is keyed on
    # the primary key rather than OFFSET.
    for user in RangeQuerySetWrapper(qs, step=2):
        user.delete()

    assert User.objects.all().count() == 0
def prepare_reports(dry_run=False, *args, **kwargs):
    timestamp, duration = _fill_default_parameters(*args, **kwargs)

    logger.info("reports.begin_prepare_report")

    organizations = _get_organization_queryset()
    for organization in RangeQuerySetWrapper(organizations, step=10000):
        prepare_organization_report.delay(timestamp, duration, organization.id, dry_run=dry_run)
def backfill_file_type(apps, schema_editor):
    """
    Fill the new EventAttachment.type column with values from
    EventAttachment.file.type.
    """
    EventAttachment = apps.get_model("sentry", "EventAttachment")

    all_event_attachments = EventAttachment.objects.select_related("file").all()
    for event_attachment in RangeQuerySetWrapper(all_event_attachments, step=1000):
        if event_attachment.type is None:
            event_attachment.type = event_attachment.file.type
            event_attachment.save(update_fields=["type"])
def iterator_generic(self, chunk_size):
    from sentry.utils.query import RangeQuerySetWrapper

    qs = self.get_generic_queryset()

    chunk = []
    for item in RangeQuerySetWrapper(qs):
        chunk.append(item.id)
        if len(chunk) == chunk_size:
            yield tuple(chunk)
            chunk = []
    if chunk:
        yield tuple(chunk)
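# A usage sketch for the chunking generator above: consume fixed-size tuples
# of ids and act on each batch. ``deleter`` and the bulk delete it performs
# are hypothetical stand-ins for whatever object exposes ``iterator_generic``;
# only the chunking contract comes from the snippet.
def delete_in_batches(deleter, chunk_size=100):
    for id_chunk in deleter.iterator_generic(chunk_size):
        # Each chunk is a tuple of at most ``chunk_size`` primary keys,
        # so every bulk operation stays bounded in size.
        deleter.get_generic_queryset().filter(id__in=id_chunk).delete()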
def forwards(self, orm):
    from sentry.utils.query import RangeQuerySetWrapper

    queryset = orm['sentry.File'].objects.all()
    for file in RangeQuerySetWrapper(queryset):
        if file.size:
            continue
        orm['sentry.File'].objects.filter(id=file.id).update(
            size=sum([
                fbi.blob.size
                for fbi in orm['sentry.FileBlobIndex'].objects.filter(
                    file=file,
                ).select_related('blob')
            ]),
        )
def get_scim_teams_members(
    team_list: Sequence[Team],
) -> MutableMapping[Team, MutableSequence[MutableMapping[str, Any]]]:
    members = RangeQuerySetWrapper(
        OrganizationMember.objects.filter(teams__in=team_list)
        .select_related("user")
        .prefetch_related("teams")
        .distinct("id"),
        limit=10000,
    )
    member_map: MutableMapping[Team, MutableSequence[MutableMapping[str, Any]]] = defaultdict(list)
    for member in members:
        for team in member.teams.all():
            member_map[team].append({"value": str(member.id), "display": member.get_email()})
    return member_map
def backfill_snuba_query_event_type(apps, schema_editor):
    """
    This backfills all SnubaQuery rows that don't have a `SnubaQueryEventType`.
    """
    SnubaQuery = apps.get_model("sentry", "SnubaQuery")
    SnubaQueryEventType = apps.get_model("sentry", "SnubaQueryEventType")

    for snuba_query in RangeQuerySetWrapper(SnubaQuery.objects.all()):
        if not SnubaQueryEventType.objects.filter(snuba_query=snuba_query).exists():
            # 0 is SnubaQueryEventType.EventTypes.ERROR,
            # 2 is SnubaQueryEventType.EventTypes.TRANSACTION.
            SnubaQueryEventType.objects.create(
                snuba_query=snuba_query,
                type=(0 if snuba_query.dataset == "events" else 2),
            )
def get(self, request, organization, project, group_id, key):
    try:
        # TODO(tkaemming): This should *actually* redirect, see similar
        # comment in ``GroupEndpoint.convert_args``.
        group, _ = get_group_with_redirect(
            group_id,
            queryset=Group.objects.filter(project=project),
        )
    except Group.DoesNotExist:
        raise Http404

    if tagstore.is_reserved_key(key):
        lookup_key = 'sentry:{0}'.format(key)
    else:
        lookup_key = key

    try:
        environment_id = self._get_environment_id_from_request(
            request, project.organization_id)
    except Environment.DoesNotExist:
        # if the environment doesn't exist then the tag can't possibly exist
        raise Http404

    # validate existence, as the key may have been deleted
    try:
        tagstore.get_tag_key(project.id, environment_id, lookup_key)
    except tagstore.TagKeyNotFound:
        raise Http404

    if key == 'user':
        callbacks = [attach_eventuser(project.id)]
    else:
        callbacks = []

    queryset = RangeQuerySetWrapper(
        tagstore.get_group_tag_value_qs(
            group.project_id, group.id, environment_id, lookup_key),
        callbacks=callbacks,
    )

    filename = '{}-{}'.format(
        group.qualified_short_id or group.id,
        key,
    )

    return self.to_csv_response(queryset, filename, key=key)
def get(self, request, organization, project, team, group_id, key):
    try:
        # TODO(tkaemming): This should *actually* redirect, see similar
        # comment in ``GroupEndpoint.convert_args``.
        group, _ = get_group_with_redirect(
            group_id,
            queryset=Group.objects.filter(project=project),
        )
    except Group.DoesNotExist:
        raise Http404

    if TagKey.is_reserved_key(key):
        lookup_key = 'sentry:{0}'.format(key)
    else:
        lookup_key = key

    # validate existence, as the key may have been deleted
    try:
        TagKey.objects.get(
            project_id=group.project_id,
            key=lookup_key,
            status=TagKeyStatus.VISIBLE,
        )
    except TagKey.DoesNotExist:
        raise Http404

    if key == 'user':
        callbacks = [attach_eventuser(project.id)]
    else:
        callbacks = []

    queryset = RangeQuerySetWrapper(
        GroupTagValue.objects.filter(
            group_id=group.id,
            key=lookup_key,
        ),
        callbacks=callbacks,
    )

    filename = '{}-{}'.format(
        group.qualified_short_id or group.id,
        key,
    )

    return self.to_csv_response(queryset, filename, key=key)
def backfill_user_reports(apps, schema_editor):
    """
    Processes user reports that are missing event data, and adds the
    appropriate data if the event exists in Clickhouse.
    """
    UserReport = apps.get_model("sentry", "UserReport")

    user_reports = UserReport.objects.filter(group_id__isnull=True, environment_id__isnull=True)
    for report in RangeQuerySetWrapper(user_reports, step=1000):
        try:
            event = eventstore.get_event_by_id(report.project_id, report.event_id)
        except (SnubaError, QueryOutsideGroupActivityError, QueryOutsideRetentionError) as se:
            logger.warn(
                "failed to fetch event %s for project %d: %s"
                % (report.event_id, report.project_id, se)
            )
            continue

        if event:
            report.update(group_id=event.group_id, environment_id=event.get_environment().id)
def cleanup(days=30, project=None, **kwargs):
    """
    Deletes a portion of the trailing data in Sentry based on
    their creation dates. For example, if ``days`` is 30, this
    would attempt to clean up all data that's older than 30 days.

    :param project: limit all deletion scopes to messages that are part
                    of the given project
    """
    import datetime

    from django.utils import timezone

    from sentry.models import (Group, Event, MessageCountByMinute,
                               MessageFilterValue, FilterKey, FilterValue,
                               ProjectCountByMinute, SearchDocument, Activity,
                               AffectedUserByGroup, LostPasswordHash)
    from sentry.utils.query import RangeQuerySetWrapper

    GENERIC_DELETES = (
        (SearchDocument, 'date_changed'),
        (MessageCountByMinute, 'date'),
        (ProjectCountByMinute, 'date'),
        (MessageFilterValue, 'last_seen'),
        (Event, 'datetime'),
        (Activity, 'datetime'),
        (AffectedUserByGroup, 'last_seen'),
        # Group should probably be last
        (Group, 'last_seen'),
    )

    log = cleanup.get_logger()

    ts = timezone.now() - datetime.timedelta(days=days)

    # Remove types which can easily be bound to project + date
    for model, date_col in GENERIC_DELETES:
        log.info("Removing %r for days=%s project=%r", model, days, project or '*')
        qs = model.objects.filter(**{'%s__lte' % (date_col,): ts})
        if project:
            qs = qs.filter(project=project)
        # XXX: we step through because the deletion collector will pull
        # all relations into memory
        for obj in RangeQuerySetWrapper(qs):
            log.info("Removing %r", obj)
            obj.delete()

    log.info("Removing expired values for %r", LostPasswordHash)
    LostPasswordHash.objects.filter(
        date_added__lte=timezone.now() - datetime.timedelta(days=1)
    ).delete()

    # We'll need this to confirm deletion of FilterKey and FilterValue objects.
    mqs = MessageFilterValue.objects.all()
    if project:
        mqs = mqs.filter(project=project)

    # FilterKey
    log.info("Removing %r for days=%s project=%r", FilterKey, days, project or '*')
    qs = FilterKey.objects.all()
    if project:
        qs = qs.filter(project=project)
    for obj in RangeQuerySetWrapper(qs):
        if not mqs.filter(key=obj.key).exists():
            log.info("Removing unused filter %s=*", obj.key)
            qs.filter(key=obj.key).delete()
            obj.delete()

    # FilterValue
    log.info("Removing %r for days=%s project=%r", FilterValue, days, project or '*')
    qs = FilterValue.objects.all()
    if project:
        qs = qs.filter(project=project)
    for obj in RangeQuerySetWrapper(qs):
        if not mqs.filter(key=obj.key, value=obj.value).exists():
            log.info("Removing unused filter %s=%s", obj.key, obj.value)
            qs.filter(key=obj.key, value=obj.value).delete()
            obj.delete()
def cleanup(days=30, logger=None, site=None, server=None, level=None,
            project=None):
    """
    Deletes a portion of the trailing data in Sentry based on
    their creation dates. For example, if ``days`` is 30, this
    would attempt to clean up all data that's older than 30 days.

    :param logger: limit all deletion scopes to messages from the
                   specified logger.
    :param site: limit the message deletion scope to the specified site.
    :param server: limit the message deletion scope to the specified server.
    :param level: limit all deletion scopes to messages that are greater
                  than or equal to level.
    """
    import datetime

    from sentry.models import Group, Event, MessageCountByMinute, \
        MessageFilterValue, FilterValue
    from sentry.utils import timezone
    from sentry.utils.query import RangeQuerySetWrapper, SkinnyQuerySet

    # TODO: we should collect which messages above were deleted
    # and potentially just send out post_delete signals where
    # GroupedMessage can update itself accordingly
    ts = timezone.now() - datetime.timedelta(days=days)

    # Message
    qs = SkinnyQuerySet(Event).filter(datetime__lte=ts)
    if logger:
        qs = qs.filter(logger=logger)
    if site:
        qs = qs.filter(site=site)
    if server:
        qs = qs.filter(server_name=server)
    if level:
        qs = qs.filter(level__gte=level)
    if project:
        qs = qs.filter(project=project)

    groups_to_check = set()
    for obj in RangeQuerySetWrapper(qs):
        print ">>> Removing <%s: id=%s>" % (obj.__class__.__name__, obj.pk)
        obj.delete()
        groups_to_check.add(obj.group_id)

    if not (server or site):
        # MessageCountByMinute
        qs = SkinnyQuerySet(MessageCountByMinute).filter(date__lte=ts)
        if logger:
            qs = qs.filter(group__logger=logger)
        if level:
            qs = qs.filter(group__level__gte=level)
        if project:
            qs = qs.filter(project=project)

        for obj in RangeQuerySetWrapper(qs):
            print ">>> Removing <%s: id=%s>" % (obj.__class__.__name__, obj.pk)
            obj.delete()

        # GroupedMessage
        qs = SkinnyQuerySet(Group).filter(last_seen__lte=ts)
        if logger:
            qs = qs.filter(logger=logger)
        if level:
            qs = qs.filter(level__gte=level)
        if project:
            qs = qs.filter(project=project)

        for obj in RangeQuerySetWrapper(qs):
            for key, value in SkinnyQuerySet(MessageFilterValue).filter(
                    group=obj).values_list('key', 'value'):
                if not MessageFilterValue.objects.filter(
                        key=key, value=value).exclude(group=obj).exists():
                    print ">>> Removing <FilterValue: key=%s, value=%s>" % (
                        key, value)
                    FilterValue.objects.filter(key=key, value=value).delete()
            print ">>> Removing <%s: id=%s>" % (obj.__class__.__name__, obj.pk)
            obj.delete()

    # attempt to cleanup any groups that may now be empty
    groups_to_delete = []
    for group_id in groups_to_check:
        if not Event.objects.filter(group=group_id).exists():
            groups_to_delete.append(group_id)

    if groups_to_delete:
        for obj in SkinnyQuerySet(Group).filter(pk__in=groups_to_delete):
            for key, value in SkinnyQuerySet(MessageFilterValue).filter(
                    group=obj).values_list('key', 'value'):
                if not MessageFilterValue.objects.filter(
                        key=key, value=value).exclude(group=obj).exists():
                    print ">>> Removing <FilterValue: key=%s, value=%s>" % (
                        key, value)
                    FilterValue.objects.filter(key=key, value=value).delete()
            print ">>> Removing <%s: id=%s>" % (obj.__class__.__name__, obj.pk)
            obj.delete()
def forwards(self, orm):
    from sentry.utils.query import (RangeQuerySetWrapper,
                                    RangeQuerySetWrapperWithProgressBar,
                                    WithProgressBar)

    Organization = orm['sentry.Organization']
    OrganizationMember = orm['sentry.OrganizationMember']
    PendingTeamMember = orm['sentry.PendingTeamMember']
    TeamMember = orm['sentry.TeamMember']
    Team = orm['sentry.Team']

    teams_by_org = defaultdict(list)
    for org in RangeQuerySetWrapper(Organization.objects.all()):
        for team in Team.objects.filter(organization=org):
            teams_by_org[org].append(team)

    for org, team_list in WithProgressBar(list(teams_by_org.items()),
                                          caption='Organizations'):
        team_member_qs = TeamMember.objects.filter(
            team__organization=org).select_related('team')

        members_by_user = defaultdict(list)
        for member in team_member_qs.iterator():
            if member.user_id == member.team.owner_id:
                continue  # team owners are already present
            members_by_user[member.user_id].append(member)

        total_teams = len(team_list)

        for user_id, member_list in six.iteritems(members_by_user):
            # if they were a member of all teams, give them global access
            has_global_access = len(member_list) == total_teams
            # give them the highest level access they had
            access = min(m.type for m in member_list)

            sid = transaction.savepoint()
            try:
                om = OrganizationMember.objects.create(
                    organization=org,
                    user_id=user_id,
                    type=access,
                    has_global_access=has_global_access,
                )
            except IntegrityError:
                transaction.savepoint_rollback(sid)
                continue
            else:
                transaction.savepoint_commit(sid)

            if not has_global_access:
                for member in member_list:
                    om.teams.add(member.team)

        for pm in PendingTeamMember.objects.filter(team=team):
            om, _ = OrganizationMember.objects.get_or_create(
                organization=org,
                email=pm.email,
                has_global_access=False,
                defaults={'type': pm.type},
            )
            om.teams.add(team)

    transaction.commit()
def cleanup(days=30, logger=None, site=None, server=None, level=None,
            project=None, resolved=None, **kwargs):
    """
    Deletes a portion of the trailing data in Sentry based on
    their creation dates. For example, if ``days`` is 30, this
    would attempt to clean up all data that's older than 30 days.

    :param logger: limit all deletion scopes to messages from the
                   specified logger.
    :param site: limit the message deletion scope to the specified site.
    :param server: limit the message deletion scope to the specified server.
    :param level: limit all deletion scopes to messages that are greater
                  than or equal to level.
    :param project: limit all deletion scopes to messages that are part
                    of the given project
    :param resolved: limit all deletion scopes to messages that are resolved.
    """
    import datetime

    from django.utils import timezone

    from sentry.models import Group, Event, MessageCountByMinute, \
        MessageFilterValue, FilterKey, FilterValue, SearchDocument, \
        ProjectCountByMinute
    from sentry.utils.query import RangeQuerySetWrapper, SkinnyQuerySet

    log = cleanup.get_logger()

    def cleanup_groups(iterable):
        for obj in iterable:
            log.info("Removing all matching <SearchDocument: group=%s>", obj.pk)
            SearchDocument.objects.filter(group=obj).delete()
            log.info("Removing <%s: id=%s>", obj.__class__.__name__, obj.pk)
            obj.delete()

    # TODO: we should collect which messages above were deleted
    # and potentially just send out post_delete signals where
    # GroupedMessage can update itself accordingly
    ts = timezone.now() - datetime.timedelta(days=days)

    # Message
    qs = SkinnyQuerySet(Event).filter(datetime__lte=ts)
    if logger:
        qs = qs.filter(logger=logger)
    if site:
        qs = qs.filter(site=site)
    if server:
        qs = qs.filter(server_name=server)
    if level:
        qs = qs.filter(level__gte=level)
    if project:
        qs = qs.filter(project=project)
    if resolved is True:
        qs = qs.filter(group__status=1)
    elif resolved is False:
        qs = qs.filter(group__status=0)

    groups_to_check = set()
    if resolved is None:
        for obj in RangeQuerySetWrapper(qs):
            log.info("Removing <%s: id=%s>", obj.__class__.__name__, obj.pk)
            obj.delete()
            groups_to_check.add(obj.group_id)

    if not (server or site):
        # MessageCountByMinute
        qs = SkinnyQuerySet(MessageCountByMinute).filter(date__lte=ts)
        if logger:
            qs = qs.filter(group__logger=logger)
        if level:
            qs = qs.filter(group__level__gte=level)
        if project:
            qs = qs.filter(project=project)
        if resolved is True:
            qs = qs.filter(group__status=1)
        elif resolved is False:
            qs = qs.filter(group__status=0)

        for obj in RangeQuerySetWrapper(qs):
            log.info("Removing <%s: id=%s>", obj.__class__.__name__, obj.pk)
            obj.delete()

        # Group
        qs = SkinnyQuerySet(Group).filter(last_seen__lte=ts)
        if logger:
            qs = qs.filter(logger=logger)
        if level:
            qs = qs.filter(level__gte=level)
        if project:
            qs = qs.filter(project=project)
        if resolved is True:
            qs = qs.filter(status=1)
        elif resolved is False:
            qs = qs.filter(status=0)
        cleanup_groups(RangeQuerySetWrapper(qs))

    # Project counts
    # TODO: these don't handle filters
    qs = SkinnyQuerySet(ProjectCountByMinute).filter(date__lte=ts)
    if project:
        qs = qs.filter(project=project)
    for obj in RangeQuerySetWrapper(qs):
        log.info("Removing <%s: id=%s>", obj.__class__.__name__, obj.pk)
        obj.delete()

    # Filters
    qs = FilterKey.objects.all()
    if project:
        qs = qs.filter(project=project)

    mqs = MessageFilterValue.objects.all()
    if project:
        mqs = mqs.filter(project=project)

    for obj in RangeQuerySetWrapper(qs):
        if not mqs.filter(key=obj.key).exists():
            log.info("Removing filters for unused filter %s=*", obj.key)
            qs.filter(key=obj.key).delete()
            obj.delete()

    qs = FilterValue.objects.all()
    if project:
        qs = qs.filter(project=project)
    for obj in RangeQuerySetWrapper(qs):
        if not mqs.filter(key=obj.key, value=obj.value).exists():
            log.info("Removing filters for unused filter %s=%s", obj.key, obj.value)
            qs.filter(key=obj.key, value=obj.value).delete()
            obj.delete()

    # attempt to cleanup any groups that may now be empty
    groups_to_delete = []
    for group_id in groups_to_check:
        if not Event.objects.filter(group=group_id).exists():
            groups_to_delete.append(group_id)

    if groups_to_delete:
        cleanup_groups(SkinnyQuerySet(Group).filter(pk__in=groups_to_delete))
def backfill_eventstream(apps, schema_editor):
    """
    Inserts Postgres events into the eventstream if there are recent events in Postgres.

    This is for open source users migrating from 9.x who want to keep their events.
    If there are no recent events in Postgres, skip the backfill.
    """
    from sentry import eventstore, eventstream
    from sentry.utils.query import RangeQuerySetWrapper

    Event = apps.get_model("sentry", "Event")
    Group = apps.get_model("sentry", "Group")
    Project = apps.get_model("sentry", "Project")

    # Kill switch to skip this migration
    skip_backfill = os.environ.get("SENTRY_SKIP_EVENTS_BACKFILL_FOR_10", False)

    # Use 90 day retention if the option has not been set or is set to 0
    DEFAULT_RETENTION = 90
    retention_days = options.get("system.event-retention-days") or DEFAULT_RETENTION

    def get_events(last_days):
        to_date = timezone.now()
        from_date = to_date - timedelta(days=last_days)
        return Event.objects.filter(
            datetime__gte=from_date, datetime__lte=to_date, group_id__isnull=False
        )

    def _attach_related(_events):
        project_ids = set()
        group_ids = set()
        for event in _events:
            project_ids.add(event.project_id)
            group_ids.add(event.group_id)
        projects = {p.id: p for p in Project.objects.filter(id__in=project_ids)}
        groups = {g.id: g for g in Group.objects.filter(id__in=group_ids)}

        for event in _events:
            event.project = projects.get(event.project_id)
            event.group = groups.get(event.group_id)
        eventstore.bind_nodes(_events, "data")

    if skip_backfill:
        print("Skipping backfill.\n")
        return

    events = get_events(retention_days)
    count = events.count()

    if count == 0:
        print("Nothing to do, skipping migration.\n")
        return

    print("Events to process: {}\n".format(count))

    processed = 0
    for e in RangeQuerySetWrapper(events, step=100, callbacks=(_attach_related,)):
        event = NewEvent(
            project_id=e.project_id, event_id=e.event_id, group_id=e.group_id, data=e.data.data
        )
        primary_hash = event.get_primary_hash()
        if event.project is None or event.group is None or len(event.data) == 0:
            print(
                "Skipped {} as group, project or node data information is invalid.\n".format(event)
            )
            continue

        try:
            eventstream.insert(
                group=event.group,
                event=event,
                is_new=False,
                is_regression=False,
                is_new_group_environment=False,
                primary_hash=primary_hash,
                received_timestamp=event.data.get("received")
                or float(event.datetime.strftime("%s")),
                skip_consume=True,
            )
            processed += 1
        except Exception as error:
            print(
                "An error occurred while trying to insert the following event: {}\n.----\n{}".format(
                    event, error
                )
            )

    if processed == 0:
        raise Exception(
            "Cannot migrate any event. If this is okay, re-run migrations with the "
            "SENTRY_SKIP_EVENTS_BACKFILL_FOR_10 environment variable set to skip this step."
        )

    print("Event migration done. Migrated {} of {} events.\n".format(processed, count))
def handle(self, from_ts=None, to_ts=None, last_days=None, from_id=None,
           to_id=None, no_input=False, **options):
    def _attach_related(_events):
        project_ids = set([event.project_id for event in _events])
        projects = {p.id: p for p in Project.objects.filter(id__in=project_ids)}

        group_ids = set([event.group_id for event in _events])
        groups = {g.id: g for g in Group.objects.filter(id__in=group_ids)}

        for event in _events:
            event.project = projects[event.project_id]
            event.group = groups[event.group_id]
        eventstore.bind_nodes(_events, "data")

    from sentry import eventstream
    from sentry.utils.query import RangeQuerySetWrapper

    filter_methods = bool(last_days) + bool(from_ts or to_ts) + bool(from_id or to_id)
    if filter_methods > 1:
        raise CommandError(
            "You can either limit by primary key, or by timestamp, or last X days."
        )
    elif from_ts and to_ts:
        events = self.get_events_by_timestamp(from_ts, to_ts)
    elif last_days:
        events = self.get_events_by_last_days(last_days)
    elif from_id and to_id:
        events = self.get_events_by_id(from_id, to_id)
    else:
        raise CommandError(
            "Invalid arguments: either use --from/--to-id, or --from/--to-ts, or --last-days."
        )

    count = events.count()
    self.stdout.write("Events to process: {}\n".format(count))

    if count == 0:
        self.stdout.write("Nothing to do.\n")
        sys.exit(0)

    if not no_input:
        proceed = six.moves.input("Do you want to continue? [y/N] ")
        if proceed.strip().lower() not in ["yes", "y"]:
            raise CommandError("Aborted.")

    for event in RangeQuerySetWrapper(events, step=100, callbacks=(_attach_related,)):
        primary_hash = event.get_primary_hash()
        eventstream.insert(
            group=event.group,
            event=event,
            is_new=False,
            is_regression=False,
            is_new_group_environment=False,
            primary_hash=primary_hash,
            skip_consume=True,
        )

    self.stdout.write("Done.\n")
def forwards(self, orm):
    from django.db.models import F
    from collections import defaultdict
    from sentry.utils.models import create_or_update
    from sentry.utils.query import RangeQuerySetWrapper

    # We don't fully merge results because it's simply not worth it
    for group in RangeQuerySetWrapper(orm['sentry.Group'].objects.all()):
        # could be already migrated
        if not orm['sentry.Group'].objects.filter(id=group.id).exists():
            continue

        matches = list(orm['sentry.Group'].objects.exclude(id=group.id).filter(
            checksum=group.checksum, project=group.project))
        if not matches:
            continue

        print "Merging duplicate events for %r" % (group,)

        updates = defaultdict(int)
        updates.update({
            'first_seen': group.first_seen,
            'last_seen': group.last_seen,
            'active_at': group.active_at,
        })

        tag_updates = defaultdict(lambda: defaultdict(int))
        counts = defaultdict(lambda: defaultdict(int))

        for other in matches:
            # migrate events first
            orm['sentry.Event'].objects.filter(group=other).update(group=group)

            updates['times_seen'] += other.times_seen
            updates['users_seen'] += other.users_seen
            updates['time_spent_total'] += other.time_spent_total
            updates['time_spent_count'] += other.time_spent_count

            for datecol in ('active_at', 'last_seen', 'first_seen'):
                val = getattr(other, datecol)
                if val and updates[datecol]:
                    updates[datecol] = max(val, updates[datecol])
                elif val:
                    updates[datecol] = val

            # determine missing tags
            for tag in RangeQuerySetWrapper(
                    orm['sentry.MessageFilterValue'].objects.filter(group=other)):
                key = tag_updates[(tag.key, tag.value)]
                key['times_seen'] += other.times_seen

                for datecol in ('last_seen', 'first_seen'):
                    val = getattr(other, datecol)
                    if val and updates[datecol]:
                        updates[datecol] = max(val, updates[datecol])
                    elif val:
                        updates[datecol] = val

            # determine counts
            for count in RangeQuerySetWrapper(
                    orm['sentry.MessageCountByMinute'].objects.filter(group=other)):
                key = counts[count.date]
                key['times_seen'] += count.times_seen
                key['time_spent_total'] += count.time_spent_total
                key['time_spent_count'] += count.time_spent_count

        # migrate tags
        for (key, value), data in tag_updates.iteritems():
            defaults = {
                'times_seen': F('times_seen') + data['times_seen'],
            }
            if 'last_seen' in data:
                defaults['last_seen'] = data['last_seen']
            if 'first_seen' in data:
                defaults['first_seen'] = data['first_seen']

            create_or_update(
                orm['sentry.MessageFilterValue'],
                project=group.project,
                group=group,
                key=key,
                value=value,
                defaults=defaults,
            )
        orm['sentry.MessageFilterValue'].objects.filter(group__in=matches).delete()

        # migrate counts
        for date, data in counts.iteritems():
            create_or_update(
                orm['sentry.MessageCountByMinute'],
                project=group.project,
                group=group,
                date=date,
                defaults={
                    'times_seen': F('times_seen') + data['times_seen'],
                    'time_spent_total': F('time_spent_total') + data['time_spent_total'],
                    'time_spent_count': F('time_spent_count') + data['time_spent_count'],
                },
            )
        orm['sentry.MessageCountByMinute'].objects.filter(group__in=matches).delete()

        orm['sentry.Group'].objects.filter(id=group.id).update(
            times_seen=F('times_seen') + updates['times_seen'],
            # NOTE: this previously read updates['user_seen'] (a typo that
            # always resolved to 0 via the defaultdict); fixed to match the
            # 'users_seen' key accumulated above.
            users_seen=F('users_seen') + updates['users_seen'],
            time_spent_total=F('time_spent_total') + updates['time_spent_total'],
            time_spent_count=F('time_spent_count') + updates['time_spent_count'],
            last_seen=updates['last_seen'],
            first_seen=updates['first_seen'],
            active_at=updates['active_at'],
        )

        for other in matches:
            other.delete()
def backfill_eventstream(apps, schema_editor):
    """
    Inserts Postgres events into the eventstream if there are recent events in Postgres.

    This is for open source users migrating from 9.x who want to keep their events.
    If there are no recent events in Postgres, skip the backfill.
    """
    from sentry import eventstore, eventstream
    from sentry.utils.query import RangeQuerySetWrapper

    Event = apps.get_model("sentry", "Event")
    Group = apps.get_model("sentry", "Group")
    Project = apps.get_model("sentry", "Project")

    # Kill switch to skip this migration
    skip_backfill = os.environ.get("SENTRY_SKIP_EVENTS_BACKFILL_FOR_10", False)

    # Use 90 day retention if the option has not been set or is set to 0
    DEFAULT_RETENTION = 90
    retention_days = options.get("system.event-retention-days") or DEFAULT_RETENTION

    def get_events(last_days):
        to_date = datetime.now()
        from_date = to_date - timedelta(days=last_days)
        return Event.objects.filter(
            datetime__gte=from_date, datetime__lte=to_date, group_id__isnull=False
        )

    def _attach_related(_events):
        project_ids = set()
        group_ids = set()
        for event in _events:
            project_ids.add(event.project_id)
            group_ids.add(event.group_id)
        projects = {p.id: p for p in Project.objects.filter(id__in=project_ids)}
        groups = {g.id: g for g in Group.objects.filter(id__in=group_ids)}

        for event in _events:
            event.project = projects.get(event.project_id)
            event.group = groups.get(event.group_id)
        eventstore.bind_nodes(_events, "data")

    if skip_backfill:
        print("Skipping backfill.\n")
        return

    events = get_events(retention_days)
    count = events.count()

    if count == 0:
        print("Nothing to do, skipping migration.\n")
        return

    print("Events to process: {}\n".format(count))

    processed = 0
    for event in RangeQuerySetWrapper(events, step=100, callbacks=(_attach_related,)):
        primary_hash = event.get_primary_hash()
        if event.project is None or event.group is None:
            print("Skipped {} as group or project information is invalid.\n".format(event))
            continue

        eventstream.insert(
            group=event.group,
            event=event,
            is_new=False,
            is_regression=False,
            is_new_group_environment=False,
            primary_hash=primary_hash,
            skip_consume=True,
        )
        processed += 1

    print("Event migration done. Processed {} of {} events.\n".format(processed, count))
def backfill_eventstream(apps, schema_editor):
    """
    Inserts Postgres events into the eventstream if there are recent events in Postgres.

    This is for open source users migrating from 9.x who want to keep their events.
    If there are no recent events in Postgres, skip the backfill.
    """
    from sentry import eventstore, eventstream
    from sentry.utils.query import RangeQuerySetWrapper

    Event = apps.get_model("sentry", "Event")
    Group = apps.get_model("sentry", "Group")
    Project = apps.get_model("sentry", "Project")

    # Kill switch to skip this migration
    skip_backfill = os.environ.get("SENTRY_SKIP_EVENTS_BACKFILL_FOR_10", False)

    # Use 90 day retention if the option has not been set or is set to 0
    DEFAULT_RETENTION = 90
    retention_days = options.get("system.event-retention-days") or DEFAULT_RETENTION

    def get_events(last_days):
        to_date = timezone.now()
        from_date = to_date - timedelta(days=last_days)
        return Event.objects.filter(
            datetime__gte=from_date, datetime__lte=to_date, group_id__isnull=False
        )

    def _attach_related(_events):
        project_ids = set()
        group_ids = set()
        for event in _events:
            project_ids.add(event.project_id)
            group_ids.add(event.group_id)
        projects = {p.id: p for p in Project.objects.filter(id__in=project_ids)}
        groups = {g.id: g for g in Group.objects.filter(id__in=group_ids)}

        for event in _events:
            event.project = projects.get(event.project_id)
            event.group = groups.get(event.group_id)
            # When migrating old data from Sentry 9.0.0 to 9.1.2 to 10 in
            # rapid succession, the event timestamp may be missing. This
            # adds it back.
            if "timestamp" not in event.data.data:
                event.data.data["timestamp"] = to_timestamp(event.datetime)
        eventstore.bind_nodes(_events, "data")

    if skip_backfill:
        print("Skipping backfill.\n")  # noqa: B314
        return

    events = get_events(retention_days)
    count = events.count()

    if count == 0:
        print("Nothing to do, skipping migration.\n")  # noqa: B314
        return

    print("Events to process: {}\n".format(count))  # noqa: B314

    processed = 0
    for e in RangeQuerySetWrapper(events, step=100, callbacks=(_attach_related,)):
        event = NewEvent(
            project_id=e.project_id, event_id=e.event_id, group_id=e.group_id, data=e.data.data
        )
        try:
            group = event.group
        except Group.DoesNotExist:
            group = None

        if event.project is None or group is None or len(event.data) == 0:
            print(  # noqa: B314
                "Skipped {} as group, project or node data information is invalid.\n".format(event)
            )
            continue

        try:
            eventstream.insert(
                group=event.group,
                event=event,
                is_new=False,
                is_regression=False,
                is_new_group_environment=False,
                primary_hash=event.get_primary_hash(),
                received_timestamp=event.data.get("received")
                or float(event.datetime.strftime("%s")),
                skip_consume=True,
            )

            # The node ID format was changed in Sentry 9.1.0
            # (https://github.com/getsentry/sentry/commit/f73a4039d16a5c4f88bde37f6464cac21deb50e1)
            # If we are migrating from older versions of Sentry (i.e. 9.0.0
            # and earlier) we need to resave the node using the new node ID
            # scheme and delete the old node.
            old_node_id = e.data.id
            new_node_id = event.data.id
            if old_node_id != new_node_id:
                event.data.save()
                nodestore.delete(old_node_id)

            processed += 1
        except Exception as error:
            print(  # noqa: B314
                "An error occurred while trying to migrate the following event: {}\n.----\n{}".format(
                    event, error
                )
            )

    if processed == 0:
        raise Exception(
            "Cannot migrate any event. If this is okay, re-run migrations with the "
            "SENTRY_SKIP_EVENTS_BACKFILL_FOR_10 environment variable set to skip this step."
        )

    print(  # noqa: B314
        "Event migration done. Migrated {} of {} events.\n".format(processed, count)
    )