def test_attachment_outcomes(self):
    manager = EventManager(make_event(message="foo"), project=self.project)
    manager.normalize()

    a1 = CachedAttachment(name="a1", data=b"hello")
    a2 = CachedAttachment(name="a2", data=b"limited", rate_limited=True)
    a3 = CachedAttachment(name="a3", data=b"world")

    cache_key = cache_key_for_event(manager.get_data())
    attachment_cache.set(cache_key, attachments=[a1, a2, a3])

    mock_track_outcome = mock.Mock()
    with mock.patch("sentry.event_manager.track_outcome", mock_track_outcome):
        with self.feature("organizations:event-attachments"):
            manager.save(1, cache_key=cache_key)

    assert mock_track_outcome.call_count == 3

    for o in mock_track_outcome.mock_calls:
        assert o.kwargs["outcome"] == Outcome.ACCEPTED

    for o in mock_track_outcome.mock_calls[:2]:
        assert o.kwargs["category"] == DataCategory.ATTACHMENT
        assert o.kwargs["quantity"] == 5

    final = mock_track_outcome.mock_calls[2]
    assert final.kwargs["category"] == DataCategory.DEFAULT
def save_unprocessed_event(project, event_id):
    """
    Move event from event_processing_store into nodestore. Only call if event
    has outcome=accepted.
    """
    if not features.has("projects:reprocessing-v2", project, actor=None):
        return

    with sentry_sdk.start_span(
        op="sentry.reprocessing2.save_unprocessed_event.get_unprocessed_event"
    ):
        data = event_processing_store.get(
            cache_key_for_event({"project": project.id, "event_id": event_id}), unprocessed=True
        )
        if data is None:
            return

    with sentry_sdk.start_span(op="sentry.reprocessing2.save_unprocessed_event.set_nodestore"):
        node_id = _generate_unprocessed_event_node_id(project_id=project.id, event_id=event_id)
        nodestore.set(node_id, data)
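# For context, a sketch of the producer side that `save_unprocessed_event` reads
# back: the processing store keeps a pristine "unprocessed" copy of the event
# alongside the processed one (see the `store` methods further down). The helper
# name and exact call sequence below are illustrative assumptions, not code from
# the snippets in this file.
def stash_unprocessed_copy(project, data):
    # Write the raw payload under the derived unprocessed key so that
    # reprocessing-v2 can later move it into nodestore.
    event_processing_store.store(dict(data), unprocessed=True)
    # ... normal processing continues on the processed copy ...
    save_unprocessed_event(project, data["event_id"])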
def store(self, event: Event, unprocessed: bool = False) -> str:
    with sentry_sdk.start_span(op="eventstore.processing.store"):
        key = cache_key_for_event(event)
        if unprocessed:
            key = self.__get_unprocessed_key(key)
        self.inner.set(key, event, self.timeout)
        return key
def process_individual_attachment(message, projects):
    event_id = message["event_id"]
    project_id = int(message["project_id"])
    cache_key = cache_key_for_event({"event_id": event_id, "project": project_id})

    try:
        project = projects[project_id]
    except KeyError:
        logger.error("Project for ingested event does not exist: %s", project_id)
        return

    if not features.has("organizations:event-attachments", project.organization, actor=None):
        logger.info("Organization has no event attachments: %s", project_id)
        return

    # Attachments may be uploaded for events that already exist. Fetch the
    # existing group_id, so that the attachment can be fetched by group-level
    # APIs. This is inherently racy.
    events = eventstore.get_unfetched_events(
        filter=eventstore.Filter(event_ids=[event_id], project_ids=[project.id]), limit=1
    )

    group_id = None
    if events:
        group_id = events[0].group_id

    attachment = message["attachment"]
    attachment = attachment_cache.get_from_chunks(
        key=cache_key, type=attachment.pop("attachment_type"), **attachment
    )
    if attachment.type != "event.attachment":
        logger.exception("invalid individual attachment type: %s", attachment.type)
        return

    file = File.objects.create(
        name=attachment.name,
        type=attachment.type,
        headers={"Content-Type": attachment.content_type},
    )

    try:
        data = attachment.data
    except MissingAttachmentChunks:
        logger.exception("Missing chunks for cache_key=%s", cache_key)
        return

    file.putfile(BytesIO(data))
    EventAttachment.objects.create(
        project_id=project.id,
        group_id=group_id,
        event_id=event_id,
        name=attachment.name,
        file=file,
    )

    attachment.delete()
def insert_data_to_database_legacy(
    data, start_time=None, from_reprocessing=False, attachments=None
):
    """
    Yet another "fast path" to ingest an event without making it go through
    Relay. Please consider using functions from the ingest consumer instead,
    or, if you're within tests, to use `TestCase.store_event`.
    """

    # XXX(markus): Delete this function and merge with ingest consumer logic.

    if start_time is None:
        start_time = time()

    # we might be passed some subclasses of dict that fail dumping
    if isinstance(data, CANONICAL_TYPES):
        data = dict(data.items())

    cache_timeout = 3600
    cache_key = cache_key_for_event(data)
    default_cache.set(cache_key, data, cache_timeout)

    # Attachments will be empty or None if the "event-attachments" feature
    # is turned off. For native crash reports it will still contain the
    # crash dump (e.g. minidump) so we can load it during processing.
    if attachments is not None:
        attachment_cache.set(cache_key, attachments, cache_timeout)

    task = from_reprocessing and preprocess_event_from_reprocessing or preprocess_event
    task.delay(cache_key=cache_key, start_time=start_time, event_id=data["event_id"])
def _dispatch_post_process_group_task(
    self,
    event,
    is_new,
    is_regression,
    is_new_group_environment,
    primary_hash,
    skip_consume=False,
):
    if skip_consume:
        logger.info("post_process.skip.raw_event", extra={"event_id": event.event_id})
    else:
        cache_key = cache_key_for_event(
            {"project": event.project_id, "event_id": event.event_id}
        )

        post_process_group.delay(
            is_new=is_new,
            is_regression=is_regression,
            is_new_group_environment=is_new_group_environment,
            primary_hash=primary_hash,
            cache_key=cache_key,
            group_id=event.group_id,
        )
def _dispatch_post_process_group_task(
    self,
    event_id: str,
    project_id: int,
    group_id: Optional[int],
    is_new: bool,
    is_regression: bool,
    is_new_group_environment: bool,
    primary_hash: Optional[str],
    skip_consume: bool = False,
) -> None:
    if skip_consume:
        logger.info("post_process.skip.raw_event", extra={"event_id": event_id})
    else:
        cache_key = cache_key_for_event({"project": project_id, "event_id": event_id})

        post_process_group.delay(
            is_new=is_new,
            is_regression=is_regression,
            is_new_group_environment=is_new_group_environment,
            primary_hash=primary_hash,
            cache_key=cache_key,
            group_id=group_id,
        )
def insert_data_to_database(
    self, data, start_time=None, from_reprocessing=False, attachments=None
):
    if start_time is None:
        start_time = time()

    # we might be passed some subclasses of dict that fail dumping
    if isinstance(data, CANONICAL_TYPES):
        data = dict(data.items())

    cache_timeout = 3600
    cache_key = cache_key_for_event(data)
    default_cache.set(cache_key, data, cache_timeout)

    # Attachments will be empty or None if the "event-attachments" feature
    # is turned off. For native crash reports it will still contain the
    # crash dump (e.g. minidump) so we can load it during processing.
    if attachments is not None:
        attachment_cache.set(cache_key, attachments, cache_timeout)

    task = from_reprocessing and preprocess_event_from_reprocessing or preprocess_event
    task.delay(cache_key=cache_key, start_time=start_time, event_id=data["event_id"])
def process_individual_attachment(message):
    event_id = message["event_id"]
    project_id = message["project_id"]
    cache_key = cache_key_for_event({"event_id": event_id, "project": project_id})

    try:
        project = Project.objects.get_from_cache(id=project_id)
    except Project.DoesNotExist:
        logger.error("Project for ingested event does not exist: %s", project_id)
        return

    attachment = message["attachment"]
    attachment = attachment_cache.get_from_chunks(
        key=cache_key, type=attachment.pop("attachment_type"), **attachment
    )
    assert attachment.type == "event.attachment", attachment.type

    file = File.objects.create(
        name=attachment.name,
        type=attachment.type,
        headers={"Content-Type": attachment.content_type},
    )
    file.putfile(BytesIO(attachment.data))

    EventAttachment.objects.create(
        project_id=project.id, event_id=event_id, name=attachment.name, file=file
    )
    attachment.delete()
def process_message(self, message):
    message = msgpack.unpackb(message.value(), use_list=False)
    payload = message["payload"]
    start_time = float(message["start_time"])
    event_id = message["event_id"]
    project_id = message["project_id"]
    remote_addr = message.get("remote_addr")

    # check that we haven't already processed this event (a previous instance of the forwarder
    # died before it could commit the event queue offset)
    deduplication_key = "ev:{}:{}".format(project_id, event_id)
    if cache.get(deduplication_key) is not None:
        logger.warning(
            "pre-process-forwarder detected a duplicated event with id:%s for project:%s.",
            event_id,
            project_id,
        )
        return True  # message already processed do not reprocess

    try:
        project = Project.objects.get_from_cache(id=project_id)
    except Project.DoesNotExist:
        logger.error("Project for ingested event does not exist: %s", project_id)
        return True

    # Parse the JSON payload. This is required to compute the cache key and
    # call process_event. The payload will be put into Kafka raw, to avoid
    # serializing it again.
    # XXX: Do not use CanonicalKeyDict here. This may break preprocess_event
    # which assumes that data passed in is a raw dictionary.
    data = json.loads(payload)

    cache_timeout = 3600
    cache_key = cache_key_for_event(data)
    default_cache.set(cache_key, data, cache_timeout)

    # Preprocess this event, which spawns either process_event or
    # save_event. Pass data explicitly to avoid fetching it again from the
    # cache.
    preprocess_event(
        cache_key=cache_key,
        data=data,
        start_time=start_time,
        event_id=event_id,
        project=project,
    )

    # remember for 1 hour that we saved this event (deduplication protection)
    cache.set(deduplication_key, "", 3600)

    # emit event_accepted once everything is done
    event_accepted.send_robust(
        ip=remote_addr, data=data, project=project, sender=self.process_message
    )

    # Return *something* so that it counts against batch size
    return True
def process_attachment_chunk(message, projects):
    payload = message["payload"]
    event_id = message["event_id"]
    project_id = message["project_id"]
    id = message["id"]
    chunk_index = message["chunk_index"]
    cache_key = cache_key_for_event({"event_id": event_id, "project": project_id})
    attachment_cache.set_chunk(
        key=cache_key, id=id, chunk_index=chunk_index, chunk_data=payload, timeout=CACHE_TIMEOUT
    )
def event_preprocessor(data):
    extra = data.setdefault("extra", {})
    extra.setdefault("processing_counter", 0)
    extra["processing_counter"] += 1

    cache_key = cache_key_for_event(data)
    attachments = attachment_cache.get(cache_key)
    extra.setdefault("attachments", []).append([attachment.type for attachment in attachments])

    return data
def process_individual_attachment(message, projects):
    event_id = message["event_id"]
    project_id = int(message["project_id"])
    cache_key = cache_key_for_event({"event_id": event_id, "project": project_id})

    try:
        project = projects[project_id]
    except KeyError:
        logger.error("Project for ingested event does not exist: %s", project_id)
        return

    if not features.has("organizations:event-attachments", project.organization, actor=None):
        logger.info("Organization has no event attachments: %s", project_id)
        return

    # Attachments may be uploaded for events that already exist. Fetch the
    # existing group_id, so that the attachment can be fetched by group-level
    # APIs. This is inherently racy.
    events = eventstore.get_unfetched_events(
        filter=eventstore.Filter(event_ids=[event_id], project_ids=[project.id]), limit=1
    )

    group_id = None
    if events:
        group_id = events[0].group_id

    attachment = message["attachment"]
    attachment = attachment_cache.get_from_chunks(
        key=cache_key, type=attachment.pop("attachment_type"), **attachment
    )
    if attachment.type != "event.attachment":
        logger.exception("invalid individual attachment type: %s", attachment.type)
        return

    save_attachment(
        cache_key,
        attachment,
        project,
        event_id,
        key_id=None,  # TODO: Inject this from Relay
        group_id=group_id,
        start_time=None,  # TODO: Inject this from Relay
    )

    attachment.delete()
def process_individual_attachment(message, projects):
    event_id = message["event_id"]
    project_id = int(message["project_id"])
    cache_key = cache_key_for_event({"event_id": event_id, "project": project_id})

    try:
        project = projects[project_id]
    except KeyError:
        logger.error("Project for ingested event does not exist: %s", project_id)
        return

    if not features.has("organizations:event-attachments", project.organization, actor=None):
        logger.info("Organization has no event attachments: %s", project_id)
        return

    attachment = message["attachment"]
    attachment = attachment_cache.get_from_chunks(
        key=cache_key, type=attachment.pop("attachment_type"), **attachment
    )
    assert attachment.type == "event.attachment", attachment.type

    file = File.objects.create(
        name=attachment.name,
        type=attachment.type,
        headers={"Content-Type": attachment.content_type},
    )
    file.putfile(BytesIO(attachment.data))

    EventAttachment.objects.create(
        project_id=project.id, event_id=event_id, name=attachment.name, file=file
    )
    attachment.delete()
def _dispatch_post_process_group_task(
    self,
    event,
    is_new,
    is_regression,
    is_new_group_environment,
    primary_hash,
    skip_consume=False,
):
    if skip_consume:
        logger.info("post_process.skip.raw_event", extra={"event_id": event.event_id})
    else:
        random_val = random.random()
        cache_key = cache_key_for_event(
            {"project": event.project_id, "event_id": event.event_id}
        )
        if options.get("postprocess.use-cache-key") > random_val:
            post_process_group.delay(
                event=None,
                is_new=is_new,
                is_regression=is_regression,
                is_new_group_environment=is_new_group_environment,
                primary_hash=primary_hash,
                cache_key=cache_key,
                group_id=event.group_id,
            )
        else:
            # Pass the cache key here to ensure that the processing cache is removed.
            post_process_group.delay(
                event=event,
                is_new=is_new,
                is_regression=is_regression,
                is_new_group_environment=is_new_group_environment,
                primary_hash=primary_hash,
                cache_key=cache_key,
            )
def _do_process_event(message, projects):
    payload = message["payload"]
    start_time = float(message["start_time"])
    event_id = message["event_id"]
    project_id = int(message["project_id"])
    remote_addr = message.get("remote_addr")
    attachments = message.get("attachments") or ()

    # check that we haven't already processed this event (a previous instance of the forwarder
    # died before it could commit the event queue offset)
    #
    # XXX(markus): I believe this code is extremely broken:
    #
    # * it practically uses memcached in prod which has no consistency
    #   guarantees (no idea how we don't run into issues there)
    #
    # * a TTL of 1h basically doesn't guarantee any deduplication at all. It
    #   just guarantees a good error message... for one hour.
    #
    # This code has been ripped from the old python store endpoint. We're
    # keeping it around because it does provide some protection against
    # reprocessing good events if a single consumer is in a restart loop.
    deduplication_key = "ev:{}:{}".format(project_id, event_id)
    if cache.get(deduplication_key) is not None:
        logger.warning(
            "pre-process-forwarder detected a duplicated event with id:%s for project:%s.",
            event_id,
            project_id,
        )
        return  # message already processed do not reprocess

    try:
        project = projects[project_id]
    except KeyError:
        logger.error("Project for ingested event does not exist: %s", project_id)
        return

    # Parse the JSON payload. This is required to compute the cache key and
    # call process_event. The payload will be put into Kafka raw, to avoid
    # serializing it again.
    # XXX: Do not use CanonicalKeyDict here. This may break preprocess_event
    # which assumes that data passed in is a raw dictionary.
    data = json.loads(payload)

    cache_key = cache_key_for_event(data)
    default_cache.set(cache_key, data, CACHE_TIMEOUT)

    if attachments:
        attachment_objects = [
            CachedAttachment(type=attachment.pop("attachment_type"), **attachment)
            for attachment in attachments
        ]

        attachment_cache.set(cache_key, attachments=attachment_objects, timeout=CACHE_TIMEOUT)

    # Preprocess this event, which spawns either process_event or
    # save_event. Pass data explicitly to avoid fetching it again from the
    # cache.
    with sentry_sdk.start_span(op="ingest_consumer.process_event.preprocess_event"):
        preprocess_event(
            cache_key=cache_key,
            data=data,
            start_time=start_time,
            event_id=event_id,
            project=project,
        )

    # remember for 1 hour that we saved this event (deduplication protection)
    cache.set(deduplication_key, "", CACHE_TIMEOUT)

    # emit event_accepted once everything is done
    event_accepted.send_robust(ip=remote_addr, data=data, project=project, sender=process_event)
def process_event(message, projects):
    payload = message["payload"]
    start_time = float(message["start_time"])
    event_id = message["event_id"]
    project_id = int(message["project_id"])
    remote_addr = message.get("remote_addr")
    attachments = message.get("attachments") or ()

    # check that we haven't already processed this event (a previous instance of the forwarder
    # died before it could commit the event queue offset)
    deduplication_key = "ev:{}:{}".format(project_id, event_id)
    if cache.get(deduplication_key) is not None:
        logger.warning(
            "pre-process-forwarder detected a duplicated event with id:%s for project:%s.",
            event_id,
            project_id,
        )
        return  # message already processed do not reprocess

    try:
        project = projects[project_id]
    except KeyError:
        logger.error("Project for ingested event does not exist: %s", project_id)
        return

    # Parse the JSON payload. This is required to compute the cache key and
    # call process_event. The payload will be put into Kafka raw, to avoid
    # serializing it again.
    # XXX: Do not use CanonicalKeyDict here. This may break preprocess_event
    # which assumes that data passed in is a raw dictionary.
    data = json.loads(payload)

    cache_key = cache_key_for_event(data)
    default_cache.set(cache_key, data, CACHE_TIMEOUT)

    if attachments:
        attachment_objects = [
            CachedAttachment(type=attachment.pop("attachment_type"), **attachment)
            for attachment in attachments
        ]

        attachment_cache.set(cache_key, attachments=attachment_objects, timeout=CACHE_TIMEOUT)

    # Preprocess this event, which spawns either process_event or
    # save_event. Pass data explicitly to avoid fetching it again from the
    # cache.
    preprocess_event(
        cache_key=cache_key, data=data, start_time=start_time, event_id=event_id, project=project
    )

    # remember for 1 hour that we saved this event (deduplication protection)
    cache.set(deduplication_key, "", CACHE_TIMEOUT)

    # emit event_accepted once everything is done
    event_accepted.send_robust(ip=remote_addr, data=data, project=project, sender=process_event)
def store(self, event: Event, unprocessed: bool = False) -> str:
    key = cache_key_for_event(event)
    if unprocessed:
        key = self.__get_unprocessed_key(key)
    self.inner.set(key, event, self.timeout)
    return key
def _key_for_event(self, event):
    return cache_key_for_event(event)
def delete(self, event: Event) -> None:
    key = cache_key_for_event(event)
    self.delete_by_key(key)
def store(self, event, unprocessed=False):
    key = cache_key_for_event(event)
    if unprocessed:
        key = _get_unprocessed_key(key)
    self.inner.set(key, event, self.timeout)
    return key
def delete(self, event):
    key = cache_key_for_event(event)
    self.delete_by_key(key)
def get_event_attachment(data, attachment_type):
    cache_key = cache_key_for_event(data)
    attachments = attachment_cache.get(cache_key) or []
    return next((a for a in attachments if a.type == attachment_type), None)
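# A minimal usage sketch of `get_event_attachment`, assuming a native processing
# step that looks up a crash dump by attachment type; the "event.minidump" type
# string and the helper name are illustrative assumptions, not taken from the
# snippets above.
def find_minidump(data):
    attachment = get_event_attachment(data, "event.minidump")
    if attachment is None:
        return None
    # CachedAttachment.data reads the payload back from the attachment cache
    # (it may raise MissingAttachmentChunks if the chunks were evicted).
    return attachment.data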
def delete(self, event, unprocessed=False):
    key = cache_key_for_event(event)
    if unprocessed:
        key = _get_unprocessed_key(key)
    self.delete_by_key(key)