def handle(self, **options): def _attach_fks(_events): project_ids = set([event.project_id for event in _events]) projects = { p.id: p for p in Project.objects.filter(id__in=project_ids) } group_ids = set([event.group_id for event in _events]) groups = {g.id: g for g in Group.objects.filter(id__in=group_ids)} for event in _events: event.project = projects[event.project_id] event.group = groups[event.group_id] from sentry import eventstream from sentry.utils.query import RangeQuerySetWrapper from_ts = options['from_ts'] to_ts = options['to_ts'] from_id = options['from_id'] to_id = options['to_id'] if (from_ts or to_ts) and (from_id or to_id): raise CommandError( 'You can either limit by primary key, or by timestamp.') elif from_ts and to_ts: events = self.get_events_by_timestamp(from_ts, to_ts) elif from_id and to_id: events = self.get_events_by_id(from_id, to_id) else: raise CommandError( 'Invalid arguments: either use --from/--to-id, or --from/--to-ts.' ) count = events.count() self.stdout.write('Events to process: {}\n'.format(count)) if count == 0: self.stdout.write('Nothing to do.\n') sys.exit(0) if not options['no_input']: proceed = raw_input('Do you want to continue? [y/N] ') if proceed.lower() not in ['yes', 'y']: raise CommandError('Aborted.') for event in RangeQuerySetWrapper(events, callbacks=(_attach_fks, )): primary_hash = event.get_primary_hash() eventstream.insert( group=event.group, event=event, is_new=False, is_sample=False, is_regression=False, is_new_group_environment=False, primary_hash=primary_hash, skip_consume=True, ) self.stdout.write('Done.\n')
def handle(self, **options): def _attach_related(_events): project_ids = set([event.project_id for event in _events]) projects = {p.id: p for p in Project.objects.filter(id__in=project_ids)} group_ids = set([event.group_id for event in _events]) groups = {g.id: g for g in Group.objects.filter(id__in=group_ids)} for event in _events: event.project = projects[event.project_id] event.group = groups[event.group_id] Event.objects.bind_nodes(_events, 'data') from sentry import eventstream from sentry.utils.query import RangeQuerySetWrapper from_ts = options['from_ts'] to_ts = options['to_ts'] from_id = options['from_id'] to_id = options['to_id'] if (from_ts or to_ts) and (from_id or to_id): raise CommandError('You can either limit by primary key, or by timestamp.') elif from_ts and to_ts: events = self.get_events_by_timestamp(from_ts, to_ts) elif from_id and to_id: events = self.get_events_by_id(from_id, to_id) else: raise CommandError('Invalid arguments: either use --from/--to-id, or --from/--to-ts.') count = events.count() self.stdout.write('Events to process: {}\n'.format(count)) if count == 0: self.stdout.write('Nothing to do.\n') sys.exit(0) if not options['no_input']: proceed = six.moves.input('Do you want to continue? [y/N] ') if proceed.strip().lower() not in ['yes', 'y']: raise CommandError('Aborted.') for event in RangeQuerySetWrapper(events, step=100, callbacks=(_attach_related,)): primary_hash = event.get_primary_hash() eventstream.insert( group=event.group, event=event, is_new=False, is_sample=False, is_regression=False, is_new_group_environment=False, primary_hash=primary_hash, skip_consume=True, ) self.stdout.write('Done.\n')
def _eventstream_insert_many(jobs): for job in jobs: eventstream.insert( group=job["group"], event=job["event"], is_new=job["is_new"], is_regression=job["is_regression"], is_new_group_environment=job["is_new_group_environment"], primary_hash=job["data"]["hashes"][0], received_timestamp=job["received_timestamp"], # We are choosing to skip consuming the event back # in the eventstream if it's flagged as raw. # This means that we want to publish the event # through the event stream, but we don't care # about post processing and handling the commit. skip_consume=job.get("raw", False), )
def save(self, project_id, raw=False, assume_normalized=False, cache_key=None): """ We re-insert events with duplicate IDs into Snuba, which is responsible for deduplicating events. Since deduplication in Snuba is on the primary key (based on event ID, project ID and day), events with same IDs are only deduplicated if their timestamps fall on the same day. The latest event always wins and overwrites the value of events received earlier in that day. Since we increment counters and frequencies here before events get inserted to eventstream these numbers may be larger than the total number of events if we receive duplicate event IDs that fall on the same day (that do not hit cache first). """ # Normalize if needed if not self._normalized: if not assume_normalized: self.normalize() self._normalized = True data = self._data project = Project.objects.get_from_cache(id=project_id) project._organization_cache = Organization.objects.get_from_cache( id=project.organization_id) # Pull out the culprit culprit = self.get_culprit() # Pull the toplevel data we're interested in level = data.get("level") # TODO(mitsuhiko): this code path should be gone by July 2018. # This is going to be fine because no code actually still depends # on integers here. When we need an integer it will be converted # into one later. Old workers used to send integers here. if level is not None and isinstance(level, six.integer_types): level = LOG_LEVELS[level] transaction_name = data.get("transaction") logger_name = data.get("logger") release = data.get("release") dist = data.get("dist") environment = data.get("environment") recorded_timestamp = data.get("timestamp") # We need to swap out the data with the one internal to the newly # created event object event = self._get_event_instance(project_id=project_id) self._data = data = event.data.data event._project_cache = project date = event.datetime platform = event.platform event_id = event.event_id if transaction_name: transaction_name = force_text(transaction_name) # Right now the event type is the signal to skip the group. This # is going to change a lot. if event.get_event_type() == "transaction": issueless_event = True else: issueless_event = False # Some of the data that are toplevel attributes are duplicated # into tags (logger, level, environment, transaction). These are # different from legacy attributes which are normalized into tags # ahead of time (site, server_name). setdefault_path(data, "tags", value=[]) set_tag(data, "level", level) if logger_name: set_tag(data, "logger", logger_name) if environment: set_tag(data, "environment", environment) if transaction_name: set_tag(data, "transaction", transaction_name) if release: # dont allow a conflicting 'release' tag pop_tag(data, "release") release = Release.get_or_create(project=project, version=release, date_added=date) set_tag(data, "sentry:release", release.version) if dist and release: dist = release.add_dist(dist, date) # dont allow a conflicting 'dist' tag pop_tag(data, "dist") set_tag(data, "sentry:dist", dist.name) else: dist = None event_user = self._get_event_user(project, data) if event_user: # dont allow a conflicting 'user' tag pop_tag(data, "user") set_tag(data, "sentry:user", event_user.tag_value) # At this point we want to normalize the in_app values in case the # clients did not set this appropriately so far. grouping_config = load_grouping_config( get_grouping_config_dict_for_event_data(data, project)) normalize_stacktraces_for_grouping(data, grouping_config) for plugin in plugins.for_project(project, version=None): added_tags = safe_execute(plugin.get_tags, event, _with_transaction=False) if added_tags: # plugins should not override user provided tags for key, value in added_tags: if get_tag(data, key) is None: set_tag(data, key, value) for path, iface in six.iteritems(event.interfaces): for k, v in iface.iter_tags(): set_tag(data, k, v) # Get rid of ephemeral interface data if iface.ephemeral: data.pop(iface.path, None) # The active grouping config was put into the event in the # normalize step before. We now also make sure that the # fingerprint was set to `'{{ default }}' just in case someone # removed it from the payload. The call to get_hashes will then # look at `grouping_config` to pick the right parameters. data["fingerprint"] = data.get("fingerprint") or ["{{ default }}"] apply_server_fingerprinting( data, get_fingerprinting_config_for_project(project)) # Here we try to use the grouping config that was requested in the # event. If that config has since been deleted (because it was an # experimental grouping config) we fall back to the default. try: hashes = event.get_hashes() except GroupingConfigNotFound: data["grouping_config"] = get_grouping_config_dict_for_project( project) hashes = event.get_hashes() data["hashes"] = hashes # we want to freeze not just the metadata and type in but also the # derived attributes. The reason for this is that we push this # data into kafka for snuba processing and our postprocessing # picks up the data right from the snuba topic. For most usage # however the data is dynamically overridden by Event.title and # Event.location (See Event.as_dict) materialized_metadata = self.materialize_metadata() data.update(materialized_metadata) data["culprit"] = culprit received_timestamp = event.data.get("received") or float( event.datetime.strftime("%s")) if not issueless_event: # The group gets the same metadata as the event when it's flushed but # additionally the `last_received` key is set. This key is used by # _save_aggregate. group_metadata = dict(materialized_metadata) group_metadata["last_received"] = received_timestamp kwargs = { "platform": platform, "message": event.search_message, "culprit": culprit, "logger": logger_name, "level": LOG_LEVELS_MAP.get(level), "last_seen": date, "first_seen": date, "active_at": date, "data": group_metadata, } if release: kwargs["first_release"] = release try: group, is_new, is_regression = self._save_aggregate( event=event, hashes=hashes, release=release, **kwargs) except HashDiscarded: event_discarded.send_robust(project=project, sender=EventManager) metrics.incr( "events.discarded", skip_internal=True, tags={ "organization_id": project.organization_id, "platform": platform }, ) raise else: event_saved.send_robust(project=project, event_size=event.size, sender=EventManager) event.group = group else: group = None is_new = False is_regression = False event_saved.send_robust(project=project, event_size=event.size, sender=EventManager) # store a reference to the group id to guarantee validation of isolation event.data.bind_ref(event) environment = Environment.get_or_create(project=project, name=environment) if group: group_environment, is_new_group_environment = GroupEnvironment.get_or_create( group_id=group.id, environment_id=environment.id, defaults={"first_release": release if release else None}, ) else: is_new_group_environment = False if release: ReleaseEnvironment.get_or_create(project=project, release=release, environment=environment, datetime=date) ReleaseProjectEnvironment.get_or_create(project=project, release=release, environment=environment, datetime=date) if group: grouprelease = GroupRelease.get_or_create( group=group, release=release, environment=environment, datetime=date) counters = [(tsdb.models.project, project.id)] if group: counters.append((tsdb.models.group, group.id)) if release: counters.append((tsdb.models.release, release.id)) tsdb.incr_multi(counters, timestamp=event.datetime, environment_id=environment.id) frequencies = [] if group: frequencies.append((tsdb.models.frequent_environments_by_group, { group.id: { environment.id: 1 } })) if release: frequencies.append((tsdb.models.frequent_releases_by_group, { group.id: { grouprelease.id: 1 } })) if frequencies: tsdb.record_frequency_multi(frequencies, timestamp=event.datetime) if group: UserReport.objects.filter(project=project, event_id=event_id).update( group=group, environment=environment) # Enusre the _metrics key exists. This is usually created during # and prefilled with ingestion sizes. event_metrics = event.data.get("_metrics") or {} event.data["_metrics"] = event_metrics # Capture the actual size that goes into node store. event_metrics["bytes.stored.event"] = len( json.dumps(dict(event.data.items()))) # Load attachments first, but persist them at the very last after # posting to eventstream to make sure all counters and eventstream are # incremented for sure. attachments = self.get_attachments(cache_key, event) for attachment in attachments: key = "bytes.stored.%s" % (attachment.type, ) event_metrics[key] = (event_metrics.get(key) or 0) + len( attachment.data) # Write the event to Nodestore event.data.save() if event_user: counters = [(tsdb.models.users_affected_by_project, project.id, (event_user.tag_value, ))] if group: counters.append((tsdb.models.users_affected_by_group, group.id, (event_user.tag_value, ))) tsdb.record_multi(counters, timestamp=event.datetime, environment_id=environment.id) if release: if is_new: buffer.incr( ReleaseProject, {"new_groups": 1}, { "release_id": release.id, "project_id": project.id }, ) if is_new_group_environment: buffer.incr( ReleaseProjectEnvironment, {"new_issues_count": 1}, { "project_id": project.id, "release_id": release.id, "environment_id": environment.id, }, ) if not raw: if not project.first_event: project.update(first_event=date) first_event_received.send_robust(project=project, event=event, sender=Project) eventstream.insert( group=group, event=event, is_new=is_new, is_regression=is_regression, is_new_group_environment=is_new_group_environment, primary_hash=hashes[0], # We are choosing to skip consuming the event back # in the eventstream if it's flagged as raw. # This means that we want to publish the event # through the event stream, but we don't care # about post processing and handling the commit. skip_consume=raw, ) # Do this last to ensure signals get emitted even if connection to the # file store breaks temporarily. self.save_attachments(attachments, event) metric_tags = {"from_relay": "_relay_processed" in self._data} metrics.timing("events.latency", received_timestamp - recorded_timestamp, tags=metric_tags) metrics.timing("events.size.data.post_save", event.size, tags=metric_tags) metrics.incr( "events.post_save.normalize.errors", amount=len(self._data.get("errors") or ()), tags=metric_tags, ) return event
def save(self, project, raw=False): from sentry.tasks.post_process import index_event_tags data = self.data project = Project.objects.get_from_cache(id=project) # Check to make sure we're not about to do a bunch of work that's # already been done if we've processed an event with this ID. (This # isn't a perfect solution -- this doesn't handle ``EventMapping`` and # there's a race condition between here and when the event is actually # saved, but it's an improvement. See GH-7677.) try: event = Event.objects.get( project_id=project.id, event_id=data['event_id'], ) except Event.DoesNotExist: pass else: self.logger.info( 'duplicate.found', exc_info=True, extra={ 'event_uuid': data['event_id'], 'project_id': project.id, 'model': Event.__name__, } ) return event # First we pull out our top-level (non-data attr) kwargs event_id = data.pop('event_id') level = data.pop('level') transaction_name = data.pop('transaction', None) culprit = data.pop('culprit', None) logger_name = data.pop('logger', None) server_name = data.pop('server_name', None) site = data.pop('site', None) checksum = data.pop('checksum', None) fingerprint = data.pop('fingerprint', None) platform = data.pop('platform', None) release = data.pop('release', None) dist = data.pop('dist', None) environment = data.pop('environment', None) # unused time_spent = data.pop('time_spent', None) message = data.pop('message', '') if not culprit: if transaction_name: culprit = transaction_name else: culprit = generate_culprit(data, platform=platform) culprit = force_text(culprit) if transaction_name: transaction_name = force_text(transaction_name) recorded_timestamp = data.pop('timestamp') date = datetime.fromtimestamp(recorded_timestamp) date = date.replace(tzinfo=timezone.utc) kwargs = { 'platform': platform, } event = Event( project_id=project.id, event_id=event_id, data=data, time_spent=time_spent, datetime=date, **kwargs ) event._project_cache = project data = event.data.data # convert this to a dict to ensure we're only storing one value per key # as most parts of Sentry dont currently play well with multiple values tags = dict(data.get('tags') or []) tags['level'] = LOG_LEVELS[level] if logger_name: tags['logger'] = logger_name if server_name: tags['server_name'] = server_name if site: tags['site'] = site if environment: tags['environment'] = environment if transaction_name: tags['transaction'] = transaction_name if release: # dont allow a conflicting 'release' tag if 'release' in tags: del tags['release'] release = Release.get_or_create( project=project, version=release, date_added=date, ) tags['sentry:release'] = release.version if dist and release: dist = release.add_dist(dist, date) tags['sentry:dist'] = dist.name else: dist = None event_user = self._get_event_user(project, data) if event_user: # dont allow a conflicting 'user' tag if 'user' in tags: del tags['user'] tags['sentry:user'] = event_user.tag_value # At this point we want to normalize the in_app values in case the # clients did not set this appropriately so far. normalize_in_app(data) for plugin in plugins.for_project(project, version=None): added_tags = safe_execute(plugin.get_tags, event, _with_transaction=False) if added_tags: # plugins should not override user provided tags for key, value in added_tags: tags.setdefault(key, value) for path, iface in six.iteritems(event.interfaces): for k, v in iface.iter_tags(): tags[k] = v # Get rid of ephemeral interface data if iface.ephemeral: data.pop(iface.get_path(), None) # tags are stored as a tuple tags = tags.items() data['tags'] = tags data['fingerprint'] = fingerprint or ['{{ default }}'] # prioritize fingerprint over checksum as its likely the client defaulted # a checksum whereas the fingerprint was explicit if fingerprint: hashes = [md5_from_hash(h) for h in get_hashes_from_fingerprint(event, fingerprint)] elif checksum: if HASH_RE.match(checksum): hashes = [checksum] else: hashes = [md5_from_hash([checksum]), checksum] data['checksum'] = checksum else: hashes = [md5_from_hash(h) for h in get_hashes_for_event(event)] # TODO(dcramer): temp workaround for complexity data['message'] = message event_type = eventtypes.get(data.get('type', 'default'))(data) event_metadata = event_type.get_metadata() # TODO(dcramer): temp workaround for complexity del data['message'] data['type'] = event_type.key data['metadata'] = event_metadata # index components into ``Event.message`` # See GH-3248 if event_type.key != 'default': if 'sentry.interfaces.Message' in data and \ data['sentry.interfaces.Message']['message'] != message: message = u'{} {}'.format( message, data['sentry.interfaces.Message']['message'], ) if not message: message = '' elif not isinstance(message, six.string_types): message = force_text(message) for value in six.itervalues(event_metadata): value_u = force_text(value, errors='replace') if value_u not in message: message = u'{} {}'.format(message, value_u) if culprit and culprit not in message: culprit_u = force_text(culprit, errors='replace') message = u'{} {}'.format(message, culprit_u) message = trim(message.strip(), settings.SENTRY_MAX_MESSAGE_LENGTH) event.message = message kwargs['message'] = message received_timestamp = event.data.get('received') or float(event.datetime.strftime('%s')) group_kwargs = kwargs.copy() group_kwargs.update( { 'culprit': culprit, 'logger': logger_name, 'level': level, 'last_seen': date, 'first_seen': date, 'active_at': date, 'data': { 'last_received': received_timestamp, 'type': event_type.key, # we cache the events metadata on the group to ensure its # accessible in the stream 'metadata': event_metadata, }, } ) if release: group_kwargs['first_release'] = release try: group, is_new, is_regression, is_sample = self._save_aggregate( event=event, hashes=hashes, release=release, **group_kwargs ) except HashDiscarded: event_discarded.send_robust( project=project, sender=EventManager, ) metrics.incr( 'events.discarded', skip_internal=True, tags={ 'organization_id': project.organization_id, 'platform': platform, }, ) raise else: event_saved.send_robust( project=project, event_size=event.size, sender=EventManager, ) event.group = group # store a reference to the group id to guarantee validation of isolation event.data.bind_ref(event) # When an event was sampled, the canonical source of truth # is the EventMapping table since we aren't going to be writing out an actual # Event row. Otherwise, if the Event isn't being sampled, we can safely # rely on the Event table itself as the source of truth and ignore # EventMapping since it's redundant information. if is_sample: try: with transaction.atomic(using=router.db_for_write(EventMapping)): EventMapping.objects.create(project=project, group=group, event_id=event_id) except IntegrityError: self.logger.info( 'duplicate.found', exc_info=True, extra={ 'event_uuid': event_id, 'project_id': project.id, 'group_id': group.id, 'model': EventMapping.__name__, } ) return event environment = Environment.get_or_create( project=project, name=environment, ) group_environment, is_new_group_environment = GroupEnvironment.get_or_create( group_id=group.id, environment_id=environment.id, defaults={ 'first_release_id': release.id if release else None, }, ) if release: ReleaseEnvironment.get_or_create( project=project, release=release, environment=environment, datetime=date, ) ReleaseProjectEnvironment.get_or_create( project=project, release=release, environment=environment, datetime=date, ) grouprelease = GroupRelease.get_or_create( group=group, release=release, environment=environment, datetime=date, ) counters = [ (tsdb.models.group, group.id), (tsdb.models.project, project.id), ] if release: counters.append((tsdb.models.release, release.id)) tsdb.incr_multi(counters, timestamp=event.datetime, environment_id=environment.id) frequencies = [ # (tsdb.models.frequent_projects_by_organization, { # project.organization_id: { # project.id: 1, # }, # }), # (tsdb.models.frequent_issues_by_project, { # project.id: { # group.id: 1, # }, # }) (tsdb.models.frequent_environments_by_group, { group.id: { environment.id: 1, }, }) ] if release: frequencies.append( (tsdb.models.frequent_releases_by_group, { group.id: { grouprelease.id: 1, }, }) ) tsdb.record_frequency_multi(frequencies, timestamp=event.datetime) UserReport.objects.filter( project=project, event_id=event_id, ).update( group=group, environment=environment, ) # save the event unless its been sampled if not is_sample: try: with transaction.atomic(using=router.db_for_write(Event)): event.save() except IntegrityError: self.logger.info( 'duplicate.found', exc_info=True, extra={ 'event_uuid': event_id, 'project_id': project.id, 'group_id': group.id, 'model': Event.__name__, } ) return event index_event_tags.delay( organization_id=project.organization_id, project_id=project.id, group_id=group.id, environment_id=environment.id, event_id=event.id, tags=tags, date_added=event.datetime, ) if event_user: tsdb.record_multi( ( (tsdb.models.users_affected_by_group, group.id, (event_user.tag_value, )), (tsdb.models.users_affected_by_project, project.id, (event_user.tag_value, )), ), timestamp=event.datetime, environment_id=environment.id, ) if release: if is_new: buffer.incr( ReleaseProject, {'new_groups': 1}, { 'release_id': release.id, 'project_id': project.id, } ) if is_new_group_environment: buffer.incr( ReleaseProjectEnvironment, {'new_issues_count': 1}, { 'project_id': project.id, 'release_id': release.id, 'environment_id': environment.id, } ) safe_execute(Group.objects.add_tags, group, environment, tags, _with_transaction=False) if not raw: if not project.first_event: project.update(first_event=date) first_event_received.send_robust(project=project, group=group, sender=Project) eventstream.insert( group=group, event=event, is_new=is_new, is_sample=is_sample, is_regression=is_regression, is_new_group_environment=is_new_group_environment, primary_hash=hashes[0], # We are choosing to skip consuming the event back # in the eventstream if it's flagged as raw. # This means that we want to publish the event # through the event stream, but we don't care # about post processing and handling the commit. skip_consume=raw, ) metrics.timing( 'events.latency', received_timestamp - recorded_timestamp, tags={ 'project_id': project.id, }, ) return event
def save(self, project_id, raw=False, assume_normalized=False): # Normalize if needed if not self._normalized: if not assume_normalized: self.normalize() self._normalized = True data = self._data project = Project.objects.get_from_cache(id=project_id) project._organization_cache = Organization.objects.get_from_cache( id=project.organization_id) # Check to make sure we're not about to do a bunch of work that's # already been done if we've processed an event with this ID. (This # isn't a perfect solution -- this doesn't handle ``EventMapping`` and # there's a race condition between here and when the event is actually # saved, but it's an improvement. See GH-7677.) try: event = Event.objects.get( project_id=project.id, event_id=data['event_id'], ) except Event.DoesNotExist: pass else: # Make sure we cache on the project before returning event._project_cache = project logger.info( 'duplicate.found', exc_info=True, extra={ 'event_uuid': data['event_id'], 'project_id': project.id, 'model': Event.__name__, } ) return event # Pull out the culprit culprit = self.get_culprit() # Pull the toplevel data we're interested in level = data.get('level') # TODO(mitsuhiko): this code path should be gone by July 2018. # This is going to be fine because no code actually still depends # on integers here. When we need an integer it will be converted # into one later. Old workers used to send integers here. if level is not None and isinstance(level, six.integer_types): level = LOG_LEVELS[level] transaction_name = data.get('transaction') logger_name = data.get('logger') release = data.get('release') dist = data.get('dist') environment = data.get('environment') recorded_timestamp = data.get('timestamp') # We need to swap out the data with the one internal to the newly # created event object event = self._get_event_instance(project_id=project_id) self._data = data = event.data.data event._project_cache = project date = event.datetime platform = event.platform event_id = event.event_id if transaction_name: transaction_name = force_text(transaction_name) # Some of the data that are toplevel attributes are duplicated # into tags (logger, level, environment, transaction). These are # different from legacy attributes which are normalized into tags # ahead of time (site, server_name). setdefault_path(data, 'tags', value=[]) set_tag(data, 'level', level) if logger_name: set_tag(data, 'logger', logger_name) if environment: set_tag(data, 'environment', environment) if transaction_name: set_tag(data, 'transaction', transaction_name) if release: # dont allow a conflicting 'release' tag pop_tag(data, 'release') release = Release.get_or_create( project=project, version=release, date_added=date, ) set_tag(data, 'sentry:release', release.version) if dist and release: dist = release.add_dist(dist, date) # dont allow a conflicting 'dist' tag pop_tag(data, 'dist') set_tag(data, 'sentry:dist', dist.name) else: dist = None event_user = self._get_event_user(project, data) if event_user: # dont allow a conflicting 'user' tag pop_tag(data, 'user') set_tag(data, 'sentry:user', event_user.tag_value) # At this point we want to normalize the in_app values in case the # clients did not set this appropriately so far. grouping_config = load_grouping_config( get_grouping_config_dict_for_event_data(data, project)) normalize_stacktraces_for_grouping(data, grouping_config) for plugin in plugins.for_project(project, version=None): added_tags = safe_execute(plugin.get_tags, event, _with_transaction=False) if added_tags: # plugins should not override user provided tags for key, value in added_tags: if get_tag(data, key) is None: set_tag(data, key, value) for path, iface in six.iteritems(event.interfaces): for k, v in iface.iter_tags(): set_tag(data, k, v) # Get rid of ephemeral interface data if iface.ephemeral: data.pop(iface.path, None) # The active grouping config was put into the event in the # normalize step before. We now also make sure that the # fingerprint was set to `'{{ default }}' just in case someone # removed it from the payload. The call to get_hashes will then # look at `grouping_config` to pick the right paramters. data['fingerprint'] = data.get('fingerprint') or ['{{ default }}'] apply_server_fingerprinting(data, get_fingerprinting_config_for_project(project)) # Here we try to use the grouping config that was requested in the # event. If that config has since been deleted (because it was an # experimental grouping config) we fall back to the default. try: hashes = event.get_hashes() except GroupingConfigNotFound: data['grouping_config'] = get_grouping_config_dict_for_project(project) hashes = event.get_hashes() data['hashes'] = hashes # we want to freeze not just the metadata and type in but also the # derived attributes. The reason for this is that we push this # data into kafka for snuba processing and our postprocessing # picks up the data right from the snuba topic. For most usage # however the data is dynamically overriden by Event.title and # Event.location (See Event.as_dict) materialized_metadata = self.materialize_metadata() event_metadata = materialized_metadata['metadata'] data.update(materialized_metadata) data['culprit'] = culprit # index components into ``Event.message`` # See GH-3248 event.message = self.get_search_message(event_metadata, culprit) received_timestamp = event.data.get('received') or float(event.datetime.strftime('%s')) # The group gets the same metadata as the event when it's flushed but # additionally the `last_received` key is set. This key is used by # _save_aggregate. group_metadata = dict(materialized_metadata) group_metadata['last_received'] = received_timestamp kwargs = { 'platform': platform, 'message': event.message, 'culprit': culprit, 'logger': logger_name, 'level': LOG_LEVELS_MAP.get(level), 'last_seen': date, 'first_seen': date, 'active_at': date, 'data': group_metadata, } if release: kwargs['first_release'] = release try: group, is_new, is_regression, is_sample = self._save_aggregate( event=event, hashes=hashes, release=release, **kwargs ) except HashDiscarded: event_discarded.send_robust( project=project, sender=EventManager, ) metrics.incr( 'events.discarded', skip_internal=True, tags={ 'organization_id': project.organization_id, 'platform': platform, }, ) raise else: event_saved.send_robust( project=project, event_size=event.size, sender=EventManager, ) event.group = group # store a reference to the group id to guarantee validation of isolation event.data.bind_ref(event) # When an event was sampled, the canonical source of truth # is the EventMapping table since we aren't going to be writing out an actual # Event row. Otherwise, if the Event isn't being sampled, we can safely # rely on the Event table itself as the source of truth and ignore # EventMapping since it's redundant information. if is_sample: try: with transaction.atomic(using=router.db_for_write(EventMapping)): EventMapping.objects.create(project=project, group=group, event_id=event_id) except IntegrityError: logger.info( 'duplicate.found', exc_info=True, extra={ 'event_uuid': event_id, 'project_id': project.id, 'group_id': group.id, 'model': EventMapping.__name__, } ) return event environment = Environment.get_or_create( project=project, name=environment, ) group_environment, is_new_group_environment = GroupEnvironment.get_or_create( group_id=group.id, environment_id=environment.id, defaults={ 'first_release': release if release else None, }, ) if release: ReleaseEnvironment.get_or_create( project=project, release=release, environment=environment, datetime=date, ) ReleaseProjectEnvironment.get_or_create( project=project, release=release, environment=environment, datetime=date, ) grouprelease = GroupRelease.get_or_create( group=group, release=release, environment=environment, datetime=date, ) counters = [ (tsdb.models.group, group.id), (tsdb.models.project, project.id), ] if release: counters.append((tsdb.models.release, release.id)) tsdb.incr_multi(counters, timestamp=event.datetime, environment_id=environment.id) frequencies = [ # (tsdb.models.frequent_projects_by_organization, { # project.organization_id: { # project.id: 1, # }, # }), # (tsdb.models.frequent_issues_by_project, { # project.id: { # group.id: 1, # }, # }) (tsdb.models.frequent_environments_by_group, { group.id: { environment.id: 1, }, }) ] if release: frequencies.append( (tsdb.models.frequent_releases_by_group, { group.id: { grouprelease.id: 1, }, }) ) tsdb.record_frequency_multi(frequencies, timestamp=event.datetime) UserReport.objects.filter( project=project, event_id=event_id, ).update( group=group, environment=environment, ) # Update any event attachment that arrived before the event group was defined. EventAttachment.objects.filter( project_id=project.id, event_id=event_id, ).update( group_id=group.id, ) # save the event unless its been sampled if not is_sample: try: with transaction.atomic(using=router.db_for_write(Event)): event.save() except IntegrityError: logger.info( 'duplicate.found', exc_info=True, extra={ 'event_uuid': event_id, 'project_id': project.id, 'group_id': group.id, 'model': Event.__name__, } ) return event tagstore.delay_index_event_tags( organization_id=project.organization_id, project_id=project.id, group_id=group.id, environment_id=environment.id, event_id=event.id, tags=event.tags, date_added=event.datetime, ) if event_user: tsdb.record_multi( ( (tsdb.models.users_affected_by_group, group.id, (event_user.tag_value, )), (tsdb.models.users_affected_by_project, project.id, (event_user.tag_value, )), ), timestamp=event.datetime, environment_id=environment.id, ) if release: if is_new: buffer.incr( ReleaseProject, {'new_groups': 1}, { 'release_id': release.id, 'project_id': project.id, } ) if is_new_group_environment: buffer.incr( ReleaseProjectEnvironment, {'new_issues_count': 1}, { 'project_id': project.id, 'release_id': release.id, 'environment_id': environment.id, } ) safe_execute( Group.objects.add_tags, group, environment, event.get_tags(), _with_transaction=False) if not raw: if not project.first_event: project.update(first_event=date) first_event_received.send_robust(project=project, group=group, sender=Project) eventstream.insert( group=group, event=event, is_new=is_new, is_sample=is_sample, is_regression=is_regression, is_new_group_environment=is_new_group_environment, primary_hash=hashes[0], # We are choosing to skip consuming the event back # in the eventstream if it's flagged as raw. # This means that we want to publish the event # through the event stream, but we don't care # about post processing and handling the commit. skip_consume=raw, ) metrics.timing( 'events.latency', received_timestamp - recorded_timestamp, tags={ 'project_id': project.id, }, ) metrics.timing( 'events.size.data.post_save', event.size, tags={'project_id': project.id} ) return event
def backfill_eventstream(apps, schema_editor): """ Inserts Postgres events into the eventstream if there are recent events in Postgres. This is for open source users migrating from 9.x who want to keep their events. If there are no recent events in Postgres, skip the backfill. """ from sentry import eventstore, eventstream from sentry.utils.query import RangeQuerySetWrapper Event = apps.get_model("sentry", "Event") Group = apps.get_model("sentry", "Group") Project = apps.get_model("sentry", "Project") # Kill switch to skip this migration skip_backfill = os.environ.get("SENTRY_SKIP_EVENTS_BACKFILL_FOR_10", False) # Use 90 day retention if the option has not been set or set to 0 DEFAULT_RETENTION = 90 retention_days = options.get( "system.event-retention-days") or DEFAULT_RETENTION def get_events(last_days): to_date = timezone.now() from_date = to_date - timedelta(days=last_days) return Event.objects.filter(datetime__gte=from_date, datetime__lte=to_date, group_id__isnull=False) def _attach_related(_events): project_ids = set() group_ids = set() for event in _events: project_ids.add(event.project_id) group_ids.add(event.group_id) projects = { p.id: p for p in Project.objects.filter(id__in=project_ids) } groups = {g.id: g for g in Group.objects.filter(id__in=group_ids)} for event in _events: event.project = projects.get(event.project_id) event.group = groups.get(event.group_id) eventstore.bind_nodes(_events, "data") if skip_backfill: print("Skipping backfill.\n") return events = get_events(retention_days) count = events.count() if count == 0: print("Nothing to do, skipping migration.\n") return print("Events to process: {}\n".format(count)) processed = 0 for e in RangeQuerySetWrapper(events, step=100, callbacks=(_attach_related, )): event = NewEvent(project_id=e.project_id, event_id=e.event_id, group_id=e.group_id, data=e.data.data) primary_hash = event.get_primary_hash() if event.project is None or event.group is None or len( event.data) == 0: print( "Skipped {} as group, project or node data information is invalid.\n" .format(event)) continue try: eventstream.insert( group=event.group, event=event, is_new=False, is_regression=False, is_new_group_environment=False, primary_hash=primary_hash, received_timestamp=event.data.get("received") or float(event.datetime.strftime("%s")), skip_consume=True, ) processed += 1 except Exception as error: print( "An error occured while trying to instert the following event: {}\n.----\n{}" .format(event, error)) if processed == 0: raise Exception( "Cannot migrate any event. If this is okay, re-run migrations with SENTRY_SKIP_EVENTS_BACKFILL_FOR_10 environment variable set to skip this step." ) print("Event migration done. Migrated {} of {} events.\n".format( processed, count))
def handle(self, from_ts=None, to_ts=None, last_days=None, from_id=None, to_id=None, no_input=False, **options): def _attach_related(_events): project_ids = set([event.project_id for event in _events]) projects = { p.id: p for p in Project.objects.filter(id__in=project_ids) } group_ids = set([event.group_id for event in _events]) groups = {g.id: g for g in Group.objects.filter(id__in=group_ids)} for event in _events: event.project = projects[event.project_id] event.group = groups[event.group_id] eventstore.bind_nodes(_events, "data") from sentry import eventstream from sentry.utils.query import RangeQuerySetWrapper filter_methods = bool(last_days) + bool(from_ts or to_ts) + bool( from_id or to_id) if filter_methods > 1: raise CommandError( "You can either limit by primary key, or by timestamp, or last X days." ) elif from_ts and to_ts: events = self.get_events_by_timestamp(from_ts, to_ts) elif last_days: events = self.get_events_by_last_days(last_days) elif from_id and to_id: events = self.get_events_by_id(from_id, to_id) else: raise CommandError( "Invalid arguments: either use --from/--to-id, or --from/--to-ts, or --last-days." ) count = events.count() self.stdout.write("Events to process: {}\n".format(count)) if count == 0: self.stdout.write("Nothing to do.\n") sys.exit(0) if not no_input: proceed = six.moves.input("Do you want to continue? [y/N] ") if proceed.strip().lower() not in ["yes", "y"]: raise CommandError("Aborted.") for event in RangeQuerySetWrapper(events, step=100, callbacks=(_attach_related, )): primary_hash = event.get_primary_hash() eventstream.insert( group=event.group, event=event, is_new=False, is_regression=False, is_new_group_environment=False, primary_hash=primary_hash, skip_consume=True, ) self.stdout.write("Done.\n")
def backfill_eventstream(apps, schema_editor): """ Inserts Postgres events into the eventstream if there are recent events in Postgres. This is for open source users migrating from 9.x who want to keep their events. If there are no recent events in Postgres, skip the backfill. """ from sentry import eventstore, eventstream from sentry.utils.query import RangeQuerySetWrapper Event = apps.get_model("sentry", "Event") Group = apps.get_model("sentry", "Group") Project = apps.get_model("sentry", "Project") # Kill switch to skip this migration skip_backfill = os.environ.get("SENTRY_SKIP_EVENTS_BACKFILL_FOR_10", False) # Use 90 day retention if the option has not been set or set to 0 DEFAULT_RETENTION = 90 retention_days = options.get( "system.event-retention-days") or DEFAULT_RETENTION def get_events(last_days): to_date = datetime.now() from_date = to_date - timedelta(days=last_days) return Event.objects.filter(datetime__gte=from_date, datetime__lte=to_date, group_id__isnull=False) def _attach_related(_events): project_ids = set() group_ids = set() for event in _events: project_ids.add(event.project_id) group_ids.add(event.group_id) projects = { p.id: p for p in Project.objects.filter(id__in=project_ids) } groups = {g.id: g for g in Group.objects.filter(id__in=group_ids)} for event in _events: event.project = projects.get(event.project_id) event.group = groups.get(event.group_id) eventstore.bind_nodes(_events, "data") if skip_backfill: print("Skipping backfill.\n") return events = get_events(retention_days) count = events.count() if count == 0: print("Nothing to do, skipping migration.\n") return print("Events to process: {}\n".format(count)) processed = 0 for event in RangeQuerySetWrapper(events, step=100, callbacks=(_attach_related, )): primary_hash = event.get_primary_hash() if event.project is None or event.group is None: print("Skipped {} as group or project information is invalid.\n". format(event)) continue eventstream.insert( group=event.group, event=event, is_new=False, is_regression=False, is_new_group_environment=False, primary_hash=primary_hash, skip_consume=True, ) processed += 1 print("Event migration done. Processed {} of {} events.\n".format( processed, count))
def backfill_eventstream(apps, schema_editor): """ Inserts Postgres events into the eventstream if there are recent events in Postgres. This is for open source users migrating from 9.x who want to keep their events. If there are no recent events in Postgres, skip the backfill. """ from sentry import eventstore, eventstream from sentry.utils.query import RangeQuerySetWrapper Event = apps.get_model("sentry", "Event") Group = apps.get_model("sentry", "Group") Project = apps.get_model("sentry", "Project") # Kill switch to skip this migration skip_backfill = os.environ.get("SENTRY_SKIP_EVENTS_BACKFILL_FOR_10", False) # Use 90 day retention if the option has not been set or set to 0 DEFAULT_RETENTION = 90 retention_days = options.get( "system.event-retention-days") or DEFAULT_RETENTION def get_events(last_days): to_date = timezone.now() from_date = to_date - timedelta(days=last_days) return Event.objects.filter(datetime__gte=from_date, datetime__lte=to_date, group_id__isnull=False) def _attach_related(_events): project_ids = set() group_ids = set() for event in _events: project_ids.add(event.project_id) group_ids.add(event.group_id) projects = { p.id: p for p in Project.objects.filter(id__in=project_ids) } groups = {g.id: g for g in Group.objects.filter(id__in=group_ids)} for event in _events: event.project = projects.get(event.project_id) event.group = groups.get(event.group_id) # When migrating old data from Sentry 9.0.0 to 9.1.2 to 10 in rapid succession, the event timestamp may be # missing. This adds it back if "timestamp" not in event.data.data: event.data.data["timestamp"] = to_timestamp(event.datetime) eventstore.bind_nodes(_events, "data") if skip_backfill: print("Skipping backfill.\n") # noqa: B314 return events = get_events(retention_days) count = events.count() if count == 0: print("Nothing to do, skipping migration.\n") # noqa: B314 return print("Events to process: {}\n".format(count)) # noqa: B314 processed = 0 for e in RangeQuerySetWrapper(events, step=100, callbacks=(_attach_related, )): event = NewEvent(project_id=e.project_id, event_id=e.event_id, group_id=e.group_id, data=e.data.data) try: group = event.group except Group.DoesNotExist: group = None if event.project is None or group is None or len(event.data) == 0: print( # noqa: B314 "Skipped {} as group, project or node data information is invalid.\n" .format(event)) continue try: eventstream.insert( group=event.group, event=event, is_new=False, is_regression=False, is_new_group_environment=False, primary_hash=event.get_primary_hash(), received_timestamp=event.data.get("received") or float(event.datetime.strftime("%s")), skip_consume=True, ) # The node ID format was changed in Sentry 9.1.0 # (https://github.com/getsentry/sentry/commit/f73a4039d16a5c4f88bde37f6464cac21deb50e1) # If we are migrating from older versions of Sentry (i.e. 9.0.0 and earlier) # we need to resave the node using the new node ID scheme and delete the old # node. old_node_id = e.data.id new_node_id = event.data.id if old_node_id != new_node_id: event.data.save() nodestore.delete(old_node_id) processed += 1 except Exception as error: print( # noqa: B314 "An error occured while trying to migrate the following event: {}\n.----\n{}" .format(event, error)) if processed == 0: raise Exception( "Cannot migrate any event. If this is okay, re-run migrations with SENTRY_SKIP_EVENTS_BACKFILL_FOR_10 environment variable set to skip this step." ) print( # noqa: B314 "Event migration done. Migrated {} of {} events.\n".format( processed, count))
def save(self, project, raw=False): from sentry.tasks.post_process import index_event_tags data = self.data project = Project.objects.get_from_cache(id=project) # Check to make sure we're not about to do a bunch of work that's # already been done if we've processed an event with this ID. (This # isn't a perfect solution -- this doesn't handle ``EventMapping`` and # there's a race condition between here and when the event is actually # saved, but it's an improvement. See GH-7677.) try: event = Event.objects.get( project_id=project.id, event_id=data['event_id'], ) except Event.DoesNotExist: pass else: self.logger.info('duplicate.found', exc_info=True, extra={ 'event_uuid': data['event_id'], 'project_id': project.id, 'model': Event.__name__, }) return event # First we pull out our top-level (non-data attr) kwargs event_id = data.pop('event_id') level = data.pop('level') transaction_name = data.pop('transaction', None) culprit = data.pop('culprit', None) logger_name = data.pop('logger', None) server_name = data.pop('server_name', None) site = data.pop('site', None) checksum = data.pop('checksum', None) fingerprint = data.pop('fingerprint', None) platform = data.pop('platform', None) release = data.pop('release', None) dist = data.pop('dist', None) environment = data.pop('environment', None) # unused time_spent = data.pop('time_spent', None) message = data.pop('message', '') if not culprit: if transaction_name: culprit = transaction_name else: culprit = generate_culprit(data, platform=platform) culprit = force_text(culprit) if transaction_name: transaction_name = force_text(transaction_name) recorded_timestamp = data.pop('timestamp') date = datetime.fromtimestamp(recorded_timestamp) date = date.replace(tzinfo=timezone.utc) kwargs = { 'platform': platform, } event = Event(project_id=project.id, event_id=event_id, data=data, time_spent=time_spent, datetime=date, **kwargs) event._project_cache = project data = event.data.data # convert this to a dict to ensure we're only storing one value per key # as most parts of Sentry dont currently play well with multiple values tags = dict(data.get('tags') or []) tags['level'] = LOG_LEVELS[level] if logger_name: tags['logger'] = logger_name if server_name: tags['server_name'] = server_name if site: tags['site'] = site if environment: tags['environment'] = environment if transaction_name: tags['transaction'] = transaction_name if release: # dont allow a conflicting 'release' tag if 'release' in tags: del tags['release'] release = Release.get_or_create( project=project, version=release, date_added=date, ) tags['sentry:release'] = release.version if dist and release: dist = release.add_dist(dist, date) tags['sentry:dist'] = dist.name else: dist = None event_user = self._get_event_user(project, data) if event_user: # dont allow a conflicting 'user' tag if 'user' in tags: del tags['user'] tags['sentry:user'] = event_user.tag_value # At this point we want to normalize the in_app values in case the # clients did not set this appropriately so far. normalize_in_app(data) for plugin in plugins.for_project(project, version=None): added_tags = safe_execute(plugin.get_tags, event, _with_transaction=False) if added_tags: # plugins should not override user provided tags for key, value in added_tags: tags.setdefault(key, value) for path, iface in six.iteritems(event.interfaces): for k, v in iface.iter_tags(): tags[k] = v # Get rid of ephemeral interface data if iface.ephemeral: data.pop(iface.get_path(), None) # tags are stored as a tuple tags = tags.items() data['tags'] = tags data['fingerprint'] = fingerprint or ['{{ default }}'] # prioritize fingerprint over checksum as its likely the client defaulted # a checksum whereas the fingerprint was explicit if fingerprint: hashes = [ md5_from_hash(h) for h in get_hashes_from_fingerprint(event, fingerprint) ] elif checksum: if HASH_RE.match(checksum): hashes = [checksum] else: hashes = [md5_from_hash([checksum]), checksum] data['checksum'] = checksum else: hashes = [md5_from_hash(h) for h in get_hashes_for_event(event)] # TODO(dcramer): temp workaround for complexity data['message'] = message event_type = eventtypes.get(data.get('type', 'default'))(data) event_metadata = event_type.get_metadata() # TODO(dcramer): temp workaround for complexity del data['message'] data['type'] = event_type.key data['metadata'] = event_metadata # index components into ``Event.message`` # See GH-3248 if event_type.key != 'default': if 'sentry.interfaces.Message' in data and \ data['sentry.interfaces.Message']['message'] != message: message = u'{} {}'.format( message, data['sentry.interfaces.Message']['message'], ) if not message: message = '' elif not isinstance(message, six.string_types): message = force_text(message) for value in six.itervalues(event_metadata): value_u = force_text(value, errors='replace') if value_u not in message: message = u'{} {}'.format(message, value_u) if culprit and culprit not in message: culprit_u = force_text(culprit, errors='replace') message = u'{} {}'.format(message, culprit_u) message = trim(message.strip(), settings.SENTRY_MAX_MESSAGE_LENGTH) event.message = message kwargs['message'] = message received_timestamp = event.data.get('received') or float( event.datetime.strftime('%s')) group_kwargs = kwargs.copy() group_kwargs.update({ 'culprit': culprit, 'logger': logger_name, 'level': level, 'last_seen': date, 'first_seen': date, 'active_at': date, 'data': { 'last_received': received_timestamp, 'type': event_type.key, # we cache the events metadata on the group to ensure its # accessible in the stream 'metadata': event_metadata, }, }) if release: group_kwargs['first_release'] = release try: group, is_new, is_regression, is_sample = self._save_aggregate( event=event, hashes=hashes, release=release, **group_kwargs) except HashDiscarded: event_discarded.send_robust( project=project, sender=EventManager, ) metrics.incr( 'events.discarded', skip_internal=True, tags={ 'organization_id': project.organization_id, 'platform': platform, }, ) raise else: event_saved.send_robust( project=project, event_size=event.size, sender=EventManager, ) event.group = group # store a reference to the group id to guarantee validation of isolation event.data.bind_ref(event) # When an event was sampled, the canonical source of truth # is the EventMapping table since we aren't going to be writing out an actual # Event row. Otherwise, if the Event isn't being sampled, we can safely # rely on the Event table itself as the source of truth and ignore # EventMapping since it's redundant information. if is_sample: try: with transaction.atomic( using=router.db_for_write(EventMapping)): EventMapping.objects.create(project=project, group=group, event_id=event_id) except IntegrityError: self.logger.info('duplicate.found', exc_info=True, extra={ 'event_uuid': event_id, 'project_id': project.id, 'group_id': group.id, 'model': EventMapping.__name__, }) return event environment = Environment.get_or_create( project=project, name=environment, ) group_environment, is_new_group_environment = GroupEnvironment.get_or_create( group_id=group.id, environment_id=environment.id, defaults={ 'first_release_id': release.id if release else None, }, ) if release: ReleaseEnvironment.get_or_create( project=project, release=release, environment=environment, datetime=date, ) ReleaseProjectEnvironment.get_or_create( project=project, release=release, environment=environment, datetime=date, ) grouprelease = GroupRelease.get_or_create( group=group, release=release, environment=environment, datetime=date, ) counters = [ (tsdb.models.group, group.id), (tsdb.models.project, project.id), ] if release: counters.append((tsdb.models.release, release.id)) tsdb.incr_multi(counters, timestamp=event.datetime, environment_id=environment.id) frequencies = [ # (tsdb.models.frequent_projects_by_organization, { # project.organization_id: { # project.id: 1, # }, # }), # (tsdb.models.frequent_issues_by_project, { # project.id: { # group.id: 1, # }, # }) (tsdb.models.frequent_environments_by_group, { group.id: { environment.id: 1, }, }) ] if release: frequencies.append((tsdb.models.frequent_releases_by_group, { group.id: { grouprelease.id: 1, }, })) tsdb.record_frequency_multi(frequencies, timestamp=event.datetime) UserReport.objects.filter( project=project, event_id=event_id, ).update( group=group, environment=environment, ) # save the event unless its been sampled if not is_sample: try: with transaction.atomic(using=router.db_for_write(Event)): event.save() except IntegrityError: self.logger.info('duplicate.found', exc_info=True, extra={ 'event_uuid': event_id, 'project_id': project.id, 'group_id': group.id, 'model': Event.__name__, }) return event index_event_tags.delay( organization_id=project.organization_id, project_id=project.id, group_id=group.id, environment_id=environment.id, event_id=event.id, tags=tags, date_added=event.datetime, ) if event_user: tsdb.record_multi( ( (tsdb.models.users_affected_by_group, group.id, (event_user.tag_value, )), (tsdb.models.users_affected_by_project, project.id, (event_user.tag_value, )), ), timestamp=event.datetime, environment_id=environment.id, ) if release: if is_new: buffer.incr(ReleaseProject, {'new_groups': 1}, { 'release_id': release.id, 'project_id': project.id, }) if is_new_group_environment: buffer.incr(ReleaseProjectEnvironment, {'new_issues_count': 1}, { 'project_id': project.id, 'release_id': release.id, 'environment_id': environment.id, }) safe_execute(Group.objects.add_tags, group, environment, tags, _with_transaction=False) if not raw: if not project.first_event: project.update(first_event=date) first_event_received.send_robust(project=project, group=group, sender=Project) eventstream.insert( group=group, event=event, is_new=is_new, is_sample=is_sample, is_regression=is_regression, is_new_group_environment=is_new_group_environment, primary_hash=hashes[0], # We are choosing to skip consuming the event back # in the eventstream if it's flagged as raw. # This means that we want to publish the event # through the event stream, but we don't care # about post processing and handling the commit. skip_consume=raw, ) metrics.timing( 'events.latency', received_timestamp - recorded_timestamp, tags={ 'project_id': project.id, }, ) return event
def save(self, project_id, raw=False, assume_normalized=False): # Normalize if needed if not self._normalized: if not assume_normalized: self.normalize() self._normalized = True data = self._data project = Project.objects.get_from_cache(id=project_id) project._organization_cache = Organization.objects.get_from_cache( id=project.organization_id) # Check to make sure we're not about to do a bunch of work that's # already been done if we've processed an event with this ID. (This # isn't a perfect solution -- this doesn't handle ``EventMapping`` and # there's a race condition between here and when the event is actually # saved, but it's an improvement. See GH-7677.) try: event = Event.objects.get( project_id=project.id, event_id=data['event_id'], ) except Event.DoesNotExist: pass else: # Make sure we cache on the project before returning event._project_cache = project logger.info( 'duplicate.found', exc_info=True, extra={ 'event_uuid': data['event_id'], 'project_id': project.id, 'model': Event.__name__, } ) return event # Pull out the culprit culprit = self.get_culprit() # Pull the toplevel data we're interested in level = data.get('level') # TODO(mitsuhiko): this code path should be gone by July 2018. # This is going to be fine because no code actually still depends # on integers here. When we need an integer it will be converted # into one later. Old workers used to send integers here. if level is not None and isinstance(level, six.integer_types): level = LOG_LEVELS[level] transaction_name = data.get('transaction') logger_name = data.get('logger') release = data.get('release') dist = data.get('dist') environment = data.get('environment') recorded_timestamp = data.get('timestamp') # We need to swap out the data with the one internal to the newly # created event object event = self._get_event_instance(project_id=project_id) self._data = data = event.data.data event._project_cache = project date = event.datetime platform = event.platform event_id = event.event_id if transaction_name: transaction_name = force_text(transaction_name) # Some of the data that are toplevel attributes are duplicated # into tags (logger, level, environment, transaction). These are # different from legacy attributes which are normalized into tags # ahead of time (site, server_name). setdefault_path(data, 'tags', value=[]) set_tag(data, 'level', level) if logger_name: set_tag(data, 'logger', logger_name) if environment: set_tag(data, 'environment', environment) if transaction_name: set_tag(data, 'transaction', transaction_name) if release: # dont allow a conflicting 'release' tag pop_tag(data, 'release') release = Release.get_or_create( project=project, version=release, date_added=date, ) set_tag(data, 'sentry:release', release.version) if dist and release: dist = release.add_dist(dist, date) # dont allow a conflicting 'dist' tag pop_tag(data, 'dist') set_tag(data, 'sentry:dist', dist.name) else: dist = None event_user = self._get_event_user(project, data) if event_user: # dont allow a conflicting 'user' tag pop_tag(data, 'user') set_tag(data, 'sentry:user', event_user.tag_value) # At this point we want to normalize the in_app values in case the # clients did not set this appropriately so far. grouping_config = load_grouping_config( get_grouping_config_dict_for_event_data(data, project)) normalize_stacktraces_for_grouping(data, grouping_config) for plugin in plugins.for_project(project, version=None): added_tags = safe_execute(plugin.get_tags, event, _with_transaction=False) if added_tags: # plugins should not override user provided tags for key, value in added_tags: if get_tag(data, key) is None: set_tag(data, key, value) for path, iface in six.iteritems(event.interfaces): for k, v in iface.iter_tags(): set_tag(data, k, v) # Get rid of ephemeral interface data if iface.ephemeral: data.pop(iface.path, None) # The active grouping config was put into the event in the # normalize step before. We now also make sure that the # fingerprint was set to `'{{ default }}' just in case someone # removed it from the payload. The call to get_hashes will then # look at `grouping_config` to pick the right paramters. data['fingerprint'] = data.get('fingerprint') or ['{{ default }}'] apply_server_fingerprinting(data, get_fingerprinting_config_for_project(project)) hashes = event.get_hashes() data['hashes'] = hashes # we want to freeze not just the metadata and type in but also the # derived attributes. The reason for this is that we push this # data into kafka for snuba processing and our postprocessing # picks up the data right from the snuba topic. For most usage # however the data is dynamically overriden by Event.title and # Event.location (See Event.as_dict) materialized_metadata = self.materialize_metadata() event_metadata = materialized_metadata['metadata'] data.update(materialized_metadata) data['culprit'] = culprit # index components into ``Event.message`` # See GH-3248 event.message = self.get_search_message(event_metadata, culprit) received_timestamp = event.data.get('received') or float(event.datetime.strftime('%s')) # The group gets the same metadata as the event when it's flushed but # additionally the `last_received` key is set. This key is used by # _save_aggregate. group_metadata = dict(materialized_metadata) group_metadata['last_received'] = received_timestamp kwargs = { 'platform': platform, 'message': event.message, 'culprit': culprit, 'logger': logger_name, 'level': LOG_LEVELS_MAP.get(level), 'last_seen': date, 'first_seen': date, 'active_at': date, 'data': group_metadata, } if release: kwargs['first_release'] = release try: group, is_new, is_regression, is_sample = self._save_aggregate( event=event, hashes=hashes, release=release, **kwargs ) except HashDiscarded: event_discarded.send_robust( project=project, sender=EventManager, ) metrics.incr( 'events.discarded', skip_internal=True, tags={ 'organization_id': project.organization_id, 'platform': platform, }, ) raise else: event_saved.send_robust( project=project, event_size=event.size, sender=EventManager, ) event.group = group # store a reference to the group id to guarantee validation of isolation event.data.bind_ref(event) # When an event was sampled, the canonical source of truth # is the EventMapping table since we aren't going to be writing out an actual # Event row. Otherwise, if the Event isn't being sampled, we can safely # rely on the Event table itself as the source of truth and ignore # EventMapping since it's redundant information. if is_sample: try: with transaction.atomic(using=router.db_for_write(EventMapping)): EventMapping.objects.create(project=project, group=group, event_id=event_id) except IntegrityError: logger.info( 'duplicate.found', exc_info=True, extra={ 'event_uuid': event_id, 'project_id': project.id, 'group_id': group.id, 'model': EventMapping.__name__, } ) return event environment = Environment.get_or_create( project=project, name=environment, ) group_environment, is_new_group_environment = GroupEnvironment.get_or_create( group_id=group.id, environment_id=environment.id, defaults={ 'first_release': release if release else None, }, ) if release: ReleaseEnvironment.get_or_create( project=project, release=release, environment=environment, datetime=date, ) ReleaseProjectEnvironment.get_or_create( project=project, release=release, environment=environment, datetime=date, ) grouprelease = GroupRelease.get_or_create( group=group, release=release, environment=environment, datetime=date, ) counters = [ (tsdb.models.group, group.id), (tsdb.models.project, project.id), ] if release: counters.append((tsdb.models.release, release.id)) tsdb.incr_multi(counters, timestamp=event.datetime, environment_id=environment.id) frequencies = [ # (tsdb.models.frequent_projects_by_organization, { # project.organization_id: { # project.id: 1, # }, # }), # (tsdb.models.frequent_issues_by_project, { # project.id: { # group.id: 1, # }, # }) (tsdb.models.frequent_environments_by_group, { group.id: { environment.id: 1, }, }) ] if release: frequencies.append( (tsdb.models.frequent_releases_by_group, { group.id: { grouprelease.id: 1, }, }) ) tsdb.record_frequency_multi(frequencies, timestamp=event.datetime) UserReport.objects.filter( project=project, event_id=event_id, ).update( group=group, environment=environment, ) # save the event unless its been sampled if not is_sample: try: with transaction.atomic(using=router.db_for_write(Event)): event.save() except IntegrityError: logger.info( 'duplicate.found', exc_info=True, extra={ 'event_uuid': event_id, 'project_id': project.id, 'group_id': group.id, 'model': Event.__name__, } ) return event tagstore.delay_index_event_tags( organization_id=project.organization_id, project_id=project.id, group_id=group.id, environment_id=environment.id, event_id=event.id, tags=event.tags, date_added=event.datetime, ) if event_user: tsdb.record_multi( ( (tsdb.models.users_affected_by_group, group.id, (event_user.tag_value, )), (tsdb.models.users_affected_by_project, project.id, (event_user.tag_value, )), ), timestamp=event.datetime, environment_id=environment.id, ) if release: if is_new: buffer.incr( ReleaseProject, {'new_groups': 1}, { 'release_id': release.id, 'project_id': project.id, } ) if is_new_group_environment: buffer.incr( ReleaseProjectEnvironment, {'new_issues_count': 1}, { 'project_id': project.id, 'release_id': release.id, 'environment_id': environment.id, } ) safe_execute( Group.objects.add_tags, group, environment, event.get_tags(), _with_transaction=False) if not raw: if not project.first_event: project.update(first_event=date) first_event_received.send_robust(project=project, group=group, sender=Project) eventstream.insert( group=group, event=event, is_new=is_new, is_sample=is_sample, is_regression=is_regression, is_new_group_environment=is_new_group_environment, primary_hash=hashes[0], # We are choosing to skip consuming the event back # in the eventstream if it's flagged as raw. # This means that we want to publish the event # through the event stream, but we don't care # about post processing and handling the commit. skip_consume=raw, ) metrics.timing( 'events.latency', received_timestamp - recorded_timestamp, tags={ 'project_id': project.id, }, ) metrics.timing( 'events.size.data.post_save', event.size, tags={'project_id': project.id} ) return event