def _normalize_impl(self, project_id=None):
    if self._project and project_id and project_id != self._project.id:
        raise RuntimeError(
            "Initialized EventManager with one project ID and called save() with another one"
        )

    if self._normalized:
        raise RuntimeError("Already normalized")

    self._normalized = True

    from sentry_relay.processing import StoreNormalizer

    rust_normalizer = StoreNormalizer(
        project_id=self._project.id if self._project else project_id,
        client_ip=self._client_ip,
        client=self._auth.client if self._auth else None,
        key_id=six.text_type(self._key.id) if self._key else None,
        grouping_config=self._grouping_config,
        protocol_version=six.text_type(self.version) if self.version is not None else None,
        is_renormalize=self._is_renormalize,
        remove_other=self._remove_other,
        normalize_user_agent=True,
        sent_at=self.sent_at.isoformat() if self.sent_at is not None else None,
        **DEFAULT_STORE_NORMALIZER_ARGS
    )

    self._data = CanonicalKeyDict(rust_normalizer.normalize_event(dict(self._data)))
def __init__(self, data, skip_renormalization=False, **kwargs):
    is_renormalized = isinstance(data, EventDict) or (
        isinstance(data, NodeData) and isinstance(data.data, EventDict)
    )

    if not skip_renormalization and not is_renormalized:
        normalizer = StoreNormalizer(is_renormalize=True, enable_trimming=False)
        data = normalizer.normalize_event(dict(data))

    CanonicalKeyDict.__init__(self, data, **kwargs)
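# Standalone sketch (not part of the original snippets): it isolates the
# renormalization round-trip that the __init__ above performs when it receives a
# plain dict rather than an EventDict/NodeData. Assumes sentry_relay is
# importable; the sample payload is invented for illustration only.
from sentry_relay.processing import StoreNormalizer

def renormalize_plain_dict(raw_event):
    # Mirrors the guard above: renormalize without trimming, as for already-stored data.
    normalizer = StoreNormalizer(is_renormalize=True, enable_trimming=False)
    return normalizer.normalize_event(dict(raw_event))

# Illustrative call; real payloads carry many more fields.
renormalized = renormalize_plain_dict({"event_id": "0" * 32, "message": "boom", "platform": "python"})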
def _normalize_impl(self):
    if self._normalized:
        raise RuntimeError("Already normalized")

    self._normalized = True

    from sentry_relay.processing import StoreNormalizer

    rust_normalizer = StoreNormalizer(
        project_id=self._project.id if self._project else None,
        client_ip=self._client_ip,
        client=self._auth.client if self._auth else None,
        key_id=six.text_type(self._key.id) if self._key else None,
        grouping_config=self._grouping_config,
        protocol_version=six.text_type(self.version) if self.version is not None else None,
        is_renormalize=self._is_renormalize,
        remove_other=self._remove_other,
        normalize_user_agent=True,
        **DEFAULT_STORE_NORMALIZER_ARGS
    )

    self._data = CanonicalKeyDict(rust_normalizer.normalize_event(dict(self._data)))
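# Hedged sketch only: the normalizer returns a plain dict, which _normalize_impl
# immediately wraps back into CanonicalKeyDict so both legacy and canonical key
# spellings keep resolving. The import path for CanonicalKeyDict is an assumption
# based on the Sentry codebase these snippets come from.
from sentry.utils.canonical import CanonicalKeyDict
from sentry_relay.processing import StoreNormalizer

def normalize_for_project(data, project_id=None):
    # Only a subset of the keyword arguments used above is passed here.
    normalizer = StoreNormalizer(project_id=project_id, normalize_user_agent=True)
    return CanonicalKeyDict(normalizer.normalize_event(dict(data)))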
def _do_process_event(
    cache_key,
    start_time,
    event_id,
    process_task,
    data=None,
    data_has_changed=None,
    from_symbolicate=False,
):
    from sentry.plugins.base import plugins

    if data is None:
        data = event_processing_store.get(cache_key)

    if data is None:
        metrics.incr(
            "events.failed", tags={"reason": "cache", "stage": "process"}, skip_internal=False
        )
        error_logger.error("process.failed.empty", extra={"cache_key": cache_key})
        return

    data = CanonicalKeyDict(data)

    project_id = data["project"]
    set_current_event_project(project_id)
    event_id = data["event_id"]

    if killswitch_matches_context(
        "store.load-shed-process-event-projects",
        {
            "project_id": project_id,
            "event_id": event_id,
            "platform": data.get("platform") or "null",
        },
    ):
        return

    with sentry_sdk.start_span(op="tasks.store.process_event.get_project_from_cache"):
        project = Project.objects.get_from_cache(id=project_id)

    with metrics.timer("tasks.store.process_event.organization.get_from_cache"):
        project._organization_cache = Organization.objects.get_from_cache(
            id=project.organization_id
        )

    has_changed = bool(data_has_changed)

    with sentry_sdk.start_span(op="tasks.store.process_event.get_reprocessing_revision"):
        # Fetch the reprocessing revision
        reprocessing_rev = reprocessing.get_reprocessing_revision(project_id)

    # Stacktrace based event processors.
    with sentry_sdk.start_span(op="task.store.process_event.stacktraces"):
        with metrics.timer(
            "tasks.store.process_event.stacktraces", tags={"from_symbolicate": from_symbolicate}
        ):
            new_data = process_stacktraces(data)

    if new_data is not None:
        has_changed = True
        data = new_data

    # Second round of datascrubbing after stacktrace and language-specific
    # processing. First round happened as part of ingest.
    #
    # *Right now* the only sensitive data that is added in stacktrace
    # processing are usernames in filepaths, so we run directly after
    # stacktrace processors.
    #
    # We do not yet want to deal with context data produced by plugins like
    # sessionstack or fullstory (which are in `get_event_preprocessors`), as
    # this data is very unlikely to be sensitive data. This is why scrubbing
    # happens somewhere in the middle of the pipeline.
    #
    # On the other hand, Javascript event error translation is happening after
    # this block because it uses `get_event_preprocessors` instead of
    # `get_event_enhancers`.
    #
    # We are fairly confident, however, that this should run *before*
    # re-normalization as it is hard to find sensitive data in partially
    # trimmed strings.
    if has_changed and options.get("processing.can-use-scrubbers"):
        with sentry_sdk.start_span(op="task.store.datascrubbers.scrub"):
            with metrics.timer(
                "tasks.store.datascrubbers.scrub", tags={"from_symbolicate": from_symbolicate}
            ):
                new_data = safe_execute(scrub_data, project=project, event=data.data)

                # XXX(markus): When datascrubbing is finally "totally stable", we might want
                # to drop the event if it crashes to avoid saving PII
                if new_data is not None:
                    data.data = new_data

    # TODO(dcramer): ideally we would know if data changed by default
    # Default event processors.
    for plugin in plugins.all(version=2):
        with sentry_sdk.start_span(op="task.store.process_event.preprocessors") as span:
            span.set_data("plugin", plugin.slug)
            span.set_data("from_symbolicate", from_symbolicate)
            with metrics.timer(
                "tasks.store.process_event.preprocessors",
                tags={"plugin": plugin.slug, "from_symbolicate": from_symbolicate},
            ):
                processors = safe_execute(
                    plugin.get_event_preprocessors, data=data, _with_transaction=False
                )
                for processor in processors or ():
                    try:
                        result = processor(data)
                    except Exception:
                        error_logger.exception("tasks.store.preprocessors.error")
                        data.setdefault("_metrics", {})["flag.processing.error"] = True
                        has_changed = True
                    else:
                        if result:
                            data = result
                            has_changed = True

    assert data["project"] == project_id, "Project cannot be mutated by plugins"

    # We cannot persist canonical types in the cache, so we need to
    # downgrade this.
    if isinstance(data, CANONICAL_TYPES):
        data = dict(data.items())

    if has_changed:
        # Run some of normalization again such that we don't:
        # - persist e.g. incredibly large stacktraces from minidumps
        # - store event timestamps that are older than our retention window
        #   (also happening with minidumps)
        normalizer = StoreNormalizer(
            remove_other=False, is_renormalize=True, **DEFAULT_STORE_NORMALIZER_ARGS
        )
        data = normalizer.normalize_event(dict(data))

    issues = data.get("processing_issues")

    try:
        if issues and create_failed_event(
            cache_key,
            data,
            project_id,
            list(issues.values()),
            event_id=event_id,
            start_time=start_time,
            reprocessing_rev=reprocessing_rev,
        ):
            return
    except RetryProcessing:
        # If `create_failed_event` indicates that we need to retry we
        # invoke ourselves again. This happens when the reprocessing
        # revision changed while we were processing.
        _do_preprocess_event(cache_key, data, start_time, event_id, process_task, project)
        return

    cache_key = event_processing_store.store(data)

    from_reprocessing = process_task is process_event_from_reprocessing
    submit_save_event(project, from_reprocessing, cache_key, event_id, start_time, data)
def _do_process_event(cache_key, start_time, event_id, process_task, data=None):
    from sentry.plugins.base import plugins

    if data is None:
        data = default_cache.get(cache_key)

    if data is None:
        metrics.incr(
            "events.failed", tags={"reason": "cache", "stage": "process"}, skip_internal=False
        )
        error_logger.error("process.failed.empty", extra={"cache_key": cache_key})
        return

    data = CanonicalKeyDict(data)
    project_id = data["project"]
    event_id = data["event_id"]

    with configure_scope() as scope:
        scope.set_tag("project", project_id)

    has_changed = False

    # Fetch the reprocessing revision
    reprocessing_rev = reprocessing.get_reprocessing_revision(project_id)

    try:
        # Event enhancers. These run before anything else.
        for plugin in plugins.all(version=2):
            enhancers = safe_execute(plugin.get_event_enhancers, data=data)
            for enhancer in enhancers or ():
                enhanced = safe_execute(enhancer, data, _passthrough_errors=(RetrySymbolication,))
                if enhanced:
                    data = enhanced
                    has_changed = True

        # Stacktrace based event processors.
        new_data = process_stacktraces(data)
        if new_data is not None:
            has_changed = True
            data = new_data
    except RetrySymbolication as e:
        if start_time and (time() - start_time) > 120:
            error_logger.warning(
                "process.slow", extra={"project_id": project_id, "event_id": event_id}
            )

        if start_time and (time() - start_time) > 3600:
            # Do not drop event but actually continue with rest of pipeline
            # (persisting unsymbolicated event)
            error_logger.exception(
                "process.failed.infinite_retry",
                extra={"project_id": project_id, "event_id": event_id},
            )
        else:
            retry_process_event.apply_async(
                args=(),
                kwargs={
                    "process_task_name": process_task.__name__,
                    "task_kwargs": {
                        "cache_key": cache_key,
                        "event_id": event_id,
                        "start_time": start_time,
                    },
                },
                countdown=e.retry_after,
            )
            return

    # TODO(dcramer): ideally we would know if data changed by default
    # Default event processors.
    for plugin in plugins.all(version=2):
        processors = safe_execute(
            plugin.get_event_preprocessors, data=data, _with_transaction=False
        )
        for processor in processors or ():
            result = safe_execute(processor, data)
            if result:
                data = result
                has_changed = True

    assert data["project"] == project_id, "Project cannot be mutated by preprocessor"

    project = Project.objects.get_from_cache(id=project_id)

    # We cannot persist canonical types in the cache, so we need to
    # downgrade this.
    if isinstance(data, CANONICAL_TYPES):
        data = dict(data.items())

    if has_changed:
        # Run some of normalization again such that we don't:
        # - persist e.g. incredibly large stacktraces from minidumps
        # - store event timestamps that are older than our retention window
        #   (also happening with minidumps)
        normalizer = StoreNormalizer(
            remove_other=False, is_renormalize=True, **DEFAULT_STORE_NORMALIZER_ARGS
        )
        data = normalizer.normalize_event(dict(data))

    issues = data.get("processing_issues")
    try:
        if issues and create_failed_event(
            cache_key,
            data,
            project_id,
            list(issues.values()),
            event_id=event_id,
            start_time=start_time,
            reprocessing_rev=reprocessing_rev,
        ):
            return
    except RetryProcessing:
        # If `create_failed_event` indicates that we need to retry we
        # invoke ourselves again. This happens when the reprocessing
        # revision changed while we were processing.
        from_reprocessing = process_task is process_event_from_reprocessing
        submit_process(project, from_reprocessing, cache_key, event_id, start_time, data)
        process_task.delay(cache_key, start_time=start_time, event_id=event_id)
        return

    default_cache.set(cache_key, data, 3600)

    submit_save_event(project, cache_key, event_id, start_time, data)
def normalize_event(event, project_id):
    normalizer = StoreNormalizer(project_id=project_id)
    return normalizer.normalize_event(event)
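# Illustrative use of the thin wrapper above; the event payload and project_id are
# made up for the example. Relay's normalizer fills in defaults and derived fields
# (e.g. timestamps and trimmed values) on the returned dict.
event = {"event_id": "1" * 32, "message": "it broke", "platform": "python"}
normalized = normalize_event(event, project_id=42)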
def _do_process_event(cache_key, start_time, event_id, process_task, data=None):
    from sentry.plugins.base import plugins

    if data is None:
        data = default_cache.get(cache_key)

    if data is None:
        metrics.incr(
            "events.failed", tags={"reason": "cache", "stage": "process"}, skip_internal=False
        )
        error_logger.error("process.failed.empty", extra={"cache_key": cache_key})
        return

    data = CanonicalKeyDict(data)
    project_id = data["project"]
    event_id = data["event_id"]

    project = Project.objects.get_from_cache(id=project_id)

    with configure_scope() as scope:
        scope.set_tag("project", project_id)

    has_changed = False

    # Fetch the reprocessing revision
    reprocessing_rev = reprocessing.get_reprocessing_revision(project_id)

    try:
        # Event enhancers. These run before anything else.
        for plugin in plugins.all(version=2):
            enhancers = safe_execute(plugin.get_event_enhancers, data=data)
            for enhancer in enhancers or ():
                enhanced = safe_execute(enhancer, data, _passthrough_errors=(RetrySymbolication,))
                if enhanced:
                    data = enhanced
                    has_changed = True

        # Stacktrace based event processors.
        new_data = process_stacktraces(data)
        if new_data is not None:
            has_changed = True
            data = new_data
    except RetrySymbolication as e:
        if start_time and (time() - start_time) > settings.SYMBOLICATOR_PROCESS_EVENT_WARN_TIMEOUT:
            error_logger.warning(
                "process.slow", extra={"project_id": project_id, "event_id": event_id}
            )

        if start_time and (time() - start_time) > settings.SYMBOLICATOR_PROCESS_EVENT_HARD_TIMEOUT:
            # Do not drop event but actually continue with rest of pipeline
            # (persisting unsymbolicated event)
            error_logger.exception(
                "process.failed.infinite_retry",
                extra={"project_id": project_id, "event_id": event_id},
            )
        else:
            retry_process_event.apply_async(
                args=(),
                kwargs={
                    "process_task_name": process_task.__name__,
                    "task_kwargs": {
                        "cache_key": cache_key,
                        "event_id": event_id,
                        "start_time": start_time,
                    },
                },
                countdown=e.retry_after,
            )
            return

    # Second round of datascrubbing after stacktrace and language-specific
    # processing. First round happened as part of ingest.
    #
    # We assume that all potential PII is produced as part of stacktrace
    # processors and event enhancers.
    #
    # We assume that plugins for eg sessionstack (running via
    # `plugin.get_event_preprocessors`) are not producing data that should be
    # PII-stripped, ever.
    #
    # XXX(markus): Javascript event error translation is happening after this block
    # because it uses `get_event_preprocessors` instead of
    # `get_event_enhancers`, possibly move?
    if has_changed and features.has(
        "organizations:datascrubbers-v2", project.organization, actor=None
    ):
        with metrics.timer("tasks.store.datascrubbers.scrub"):
            project_config = get_project_config(project)

            new_data = safe_execute(scrub_data, project_config=project_config, event=data.data)

            # XXX(markus): When datascrubbing is finally "totally stable", we might want
            # to drop the event if it crashes to avoid saving PII
            if new_data is not None:
                data.data = new_data

    # TODO(dcramer): ideally we would know if data changed by default
    # Default event processors.
    for plugin in plugins.all(version=2):
        processors = safe_execute(
            plugin.get_event_preprocessors, data=data, _with_transaction=False
        )
        for processor in processors or ():
            result = safe_execute(processor, data)
            if result:
                data = result
                has_changed = True

    assert data["project"] == project_id, "Project cannot be mutated by plugins"

    # We cannot persist canonical types in the cache, so we need to
    # downgrade this.
    if isinstance(data, CANONICAL_TYPES):
        data = dict(data.items())

    if has_changed:
        # Run some of normalization again such that we don't:
        # - persist e.g. incredibly large stacktraces from minidumps
        # - store event timestamps that are older than our retention window
        #   (also happening with minidumps)
        normalizer = StoreNormalizer(
            remove_other=False, is_renormalize=True, **DEFAULT_STORE_NORMALIZER_ARGS
        )
        data = normalizer.normalize_event(dict(data))

    issues = data.get("processing_issues")
    try:
        if issues and create_failed_event(
            cache_key,
            data,
            project_id,
            list(issues.values()),
            event_id=event_id,
            start_time=start_time,
            reprocessing_rev=reprocessing_rev,
        ):
            return
    except RetryProcessing:
        # If `create_failed_event` indicates that we need to retry we
        # invoke ourselves again. This happens when the reprocessing
        # revision changed while we were processing.
        from_reprocessing = process_task is process_event_from_reprocessing
        submit_process(project, from_reprocessing, cache_key, event_id, start_time, data)
        process_task.delay(cache_key, start_time=start_time, event_id=event_id)
        return

    default_cache.set(cache_key, data, 3600)

    submit_save_event(project, cache_key, event_id, start_time, data)
def _do_process_event(
    cache_key,
    start_time,
    event_id,
    process_task,
    data=None,
    data_has_changed=None,
    new_process_behavior=None,
):
    from sentry.plugins.base import plugins

    if data is None:
        data = default_cache.get(cache_key)

    if data is None:
        metrics.incr(
            "events.failed", tags={"reason": "cache", "stage": "process"}, skip_internal=False
        )
        error_logger.error("process.failed.empty", extra={"cache_key": cache_key})
        return

    data = CanonicalKeyDict(data)

    project_id = data["project"]
    set_current_project(project_id)

    event_id = data["event_id"]

    project = Project.objects.get_from_cache(id=project_id)

    has_changed = bool(data_has_changed)
    new_process_behavior = bool(new_process_behavior)

    metrics.incr(
        "tasks.store.process_event.new_process_behavior", tags={"value": new_process_behavior}
    )

    # Fetch the reprocessing revision
    reprocessing_rev = reprocessing.get_reprocessing_revision(project_id)

    try:
        if not new_process_behavior:
            # Event enhancers. These run before anything else.
            for plugin in plugins.all(version=2):
                with metrics.timer(
                    "tasks.store.process_event.enhancers", tags={"plugin": plugin.slug}
                ):
                    enhancers = safe_execute(plugin.get_event_enhancers, data=data)
                    for enhancer in enhancers or ():
                        enhanced = safe_execute(
                            enhancer, data, _passthrough_errors=(RetrySymbolication,)
                        )
                        if enhanced:
                            data = enhanced
                            has_changed = True

        # Stacktrace based event processors.
        with metrics.timer("tasks.store.process_event.stacktraces"):
            new_data = process_stacktraces(data)

        if new_data is not None:
            has_changed = True
            data = new_data
    except RetrySymbolication as e:
        if start_time and (time() - start_time) > settings.SYMBOLICATOR_PROCESS_EVENT_WARN_TIMEOUT:
            error_logger.warning(
                "process.slow", extra={"project_id": project_id, "event_id": event_id}
            )

        if start_time and (time() - start_time) > settings.SYMBOLICATOR_PROCESS_EVENT_HARD_TIMEOUT:
            # Do not drop event but actually continue with rest of pipeline
            # (persisting unsymbolicated event)
            error_logger.exception(
                "process.failed.infinite_retry",
                extra={"project_id": project_id, "event_id": event_id},
            )
        else:
            retry_process_event.apply_async(
                args=(),
                kwargs={
                    "process_task_name": process_task.__name__,
                    "task_kwargs": {
                        "cache_key": cache_key,
                        "event_id": event_id,
                        "start_time": start_time,
                    },
                },
                countdown=e.retry_after,
            )
            return

    # Second round of datascrubbing after stacktrace and language-specific
    # processing. First round happened as part of ingest.
    #
    # *Right now* the only sensitive data that is added in stacktrace
    # processing are usernames in filepaths, so we run directly after
    # stacktrace processors and `get_event_enhancers`.
    #
    # We do not yet want to deal with context data produced by plugins like
    # sessionstack or fullstory (which are in `get_event_preprocessors`), as
    # this data is very unlikely to be sensitive data. This is why scrubbing
    # happens somewhere in the middle of the pipeline.
    #
    # On the other hand, Javascript event error translation is happening after
    # this block because it uses `get_event_preprocessors` instead of
    # `get_event_enhancers`.
    #
    # We are fairly confident, however, that this should run *before*
    # re-normalization as it is hard to find sensitive data in partially
    # trimmed strings.
    if (
        has_changed
        and options.get("processing.can-use-scrubbers")
        and features.has("organizations:datascrubbers-v2", project.organization, actor=None)
    ):
        with metrics.timer("tasks.store.datascrubbers.scrub"):
            project_config = get_project_config(project)

            new_data = safe_execute(scrub_data, project_config=project_config, event=data.data)

            # XXX(markus): When datascrubbing is finally "totally stable", we might want
            # to drop the event if it crashes to avoid saving PII
            if new_data is not None:
                data.data = new_data

    # TODO(dcramer): ideally we would know if data changed by default
    # Default event processors.
    for plugin in plugins.all(version=2):
        with metrics.timer(
            "tasks.store.process_event.preprocessors", tags={"plugin": plugin.slug}
        ):
            processors = safe_execute(
                plugin.get_event_preprocessors, data=data, _with_transaction=False
            )
            for processor in processors or ():
                result = safe_execute(processor, data)
                if result:
                    data = result
                    has_changed = True

    assert data["project"] == project_id, "Project cannot be mutated by plugins"

    # We cannot persist canonical types in the cache, so we need to
    # downgrade this.
    if isinstance(data, CANONICAL_TYPES):
        data = dict(data.items())

    if has_changed:
        # Run some of normalization again such that we don't:
        # - persist e.g. incredibly large stacktraces from minidumps
        # - store event timestamps that are older than our retention window
        #   (also happening with minidumps)
        normalizer = StoreNormalizer(
            remove_other=False, is_renormalize=True, **DEFAULT_STORE_NORMALIZER_ARGS
        )
        data = normalizer.normalize_event(dict(data))

    issues = data.get("processing_issues")
    try:
        if issues and create_failed_event(
            cache_key,
            data,
            project_id,
            list(issues.values()),
            event_id=event_id,
            start_time=start_time,
            reprocessing_rev=reprocessing_rev,
        ):
            return
    except RetryProcessing:
        # If `create_failed_event` indicates that we need to retry we
        # invoke ourselves again. This happens when the reprocessing
        # revision changed while we were processing.
        _do_preprocess_event(cache_key, data, start_time, event_id, process_task, project)
        return

    default_cache.set(cache_key, data, 3600)

    submit_save_event(project, cache_key, event_id, start_time, data)