def test_simple(self): event_id = "a" * 32 project = self.create_project() node_id = Event.generate_node_id(project.id, event_id) group = self.create_group(project=project) event = self.create_event(group=group, event_id=event_id) EventAttachment.objects.create( event_id=event.event_id, project_id=event.project_id, file=File.objects.create(name="hello.png", type="image/png"), name="hello.png", ) UserReport.objects.create( event_id=event.event_id, project_id=event.project_id, name="Jane Bloggs" ) assert nodestore.get(node_id) is not None deletion = ScheduledDeletion.schedule(event, days=0) deletion.update(in_progress=True) with self.tasks(): run_deletion(deletion.id) assert not Event.objects.filter(id=event.id).exists() assert not EventAttachment.objects.filter( event_id=event.event_id, project_id=project.id ).exists() assert not UserReport.objects.filter( event_id=event.event_id, project_id=project.id ).exists() assert nodestore.get(node_id) is None
def test_event_node_id(self):
    # Create an event without specifying node_id. A node_id should be generated.
    e1 = Event(project_id=1, event_id="abc", data={"foo": "bar"})
    assert e1.data.id is not None, "We should have generated a node_id for this event"
    e1_node_id = e1.data.id

    e1.data.save()
    e1_body = nodestore.get(e1_node_id)
    assert e1_body == {"foo": "bar"}, "The event body should be in nodestore"

    e1 = Event(project_id=1, event_id="abc")
    assert e1.data.data == {"foo": "bar"}, "The event body should be loaded from nodestore"
    assert e1.data.id == e1_node_id, "The event's node_id should be the same after load"

    # Event with no data should not be saved to nodestore.
    e2 = Event(project_id=1, event_id="mno", data=None)
    e2_node_id = e2.data.id
    assert e2.data.data == {}  # NodeData returns {} by default
    eventstore.bind_nodes([e2], "data")
    assert e2.data.data == {}

    e2_body = nodestore.get(e2_node_id)
    assert e2_body is None
def setUp(self): super(SnubaEventTest, self).setUp() self.event_id = "f" * 32 self.now = datetime.utcnow().replace(microsecond=0) - timedelta( seconds=10) self.proj1 = self.create_project() self.proj1env1 = self.create_environment(project=self.proj1, name="test") self.proj1group1 = self.create_group(self.proj1, first_seen=self.now, last_seen=self.now + timedelta(seconds=14400)) # Raw event data self.data = { "event_id": self.event_id, "primary_hash": "1" * 32, "project_id": self.proj1.id, "message": "message 1", "platform": "python", "timestamp": calendar.timegm(self.now.timetuple()), "received": calendar.timegm(self.now.timetuple()), "tags": { "foo": "bar", "baz": "quux", "environment": "prod", "sentry:user": u"id:user1", "sentry:release": "release1", }, "user": { "id": u"user1", "email": u"*****@*****.**" }, } # Create a regular django Event from the data, which will save the. # data in nodestore too. Once Postgres events are deprecated, we can # turn this off and just put the payload in nodestore. make_django_event = True if make_django_event: self.create_event( event_id=self.data["event_id"], datetime=self.now, project=self.proj1, group=self.proj1group1, data=self.data, ) nodestore_data = nodestore.get( SnubaEvent.generate_node_id(self.proj1.id, self.event_id)) assert self.data["event_id"] == nodestore_data["event_id"] else: node_id = SnubaEvent.generate_node_id(self.proj1.id, self.event_id) nodestore.set(node_id, self.data) assert nodestore.get(node_id) == self.data
def setUp(self):
    super(SnubaEventTest, self).setUp()

    self.event_id = 'f' * 32
    self.now = datetime.utcnow().replace(microsecond=0) - timedelta(seconds=10)
    self.proj1 = self.create_project()
    self.proj1env1 = self.create_environment(project=self.proj1, name='test')
    self.proj1group1 = self.create_group(
        self.proj1, first_seen=self.now, last_seen=self.now + timedelta(seconds=14400)
    )

    # Raw event data
    data = {
        'event_id': self.event_id,
        'primary_hash': '1' * 32,
        'project_id': self.proj1.id,
        'message': 'message 1',
        'platform': 'python',
        'timestamp': calendar.timegm(self.now.timetuple()),
        'received': calendar.timegm(self.now.timetuple()),
        'tags': {
            'foo': 'bar',
            'baz': 'quux',
            'environment': 'prod',
            'sentry:user': u'id:user1',
            'sentry:release': 'release1',
        },
        'user': {
            'id': u'user1',
            'email': u'*****@*****.**',
        },
    }

    # Create a regular Django Event from the data, which will save the data
    # in nodestore too. Once Postgres events are deprecated, we can turn this
    # off and just put the payload in nodestore.
    make_django_event = True
    if make_django_event:
        self.create_event(
            event_id=data['event_id'],
            datetime=self.now,
            project=self.proj1,
            group=self.proj1group1,
            data=data,
        )
        nodestore_data = nodestore.get(
            SnubaEvent.generate_node_id(self.proj1.id, self.event_id)
        )
        assert data['event_id'] == nodestore_data['event_id']
    else:
        node_id = SnubaEvent.generate_node_id(self.proj1.id, self.event_id)
        nodestore.set(node_id, data)
        assert nodestore.get(node_id) == data
def test_event_node_id(self):
    # Create an event without specifying node_id. A node_id should be generated.
    e1 = Event(project_id=1, event_id='abc', data={'foo': 'bar'})
    e1.save()
    e1_node_id = e1.data.id
    assert e1.data.id is not None, "We should have generated a node_id for this event"
    e1_body = nodestore.get(e1_node_id)
    assert e1_body == {'foo': 'bar'}, "The event body should be in nodestore"

    e1 = Event.objects.get(project_id=1, event_id='abc')
    assert e1.data.data == {'foo': 'bar'}, "The event body should be loaded from nodestore"
    assert e1.data.id == e1_node_id, "The event's node_id should be the same after load"

    # Create another event that references the same nodestore object as the first event.
    e2 = Event(project_id=1, event_id='def', data={'node_id': e1_node_id})
    assert e2.data.id == e1_node_id, "The event should use the provided node_id"
    e2_body = nodestore.get(e1_node_id)
    assert e2_body == {'foo': 'bar'}, "The event body should be in nodestore already"

    e2.save()
    e2_body = nodestore.get(e1_node_id)
    assert e2_body == {'foo': 'bar'}, "The event body should not be overwritten by save"

    e2 = Event.objects.get(project_id=1, event_id='def')
    assert e2.data.data == {'foo': 'bar'}, "The event body should be loaded from nodestore"
    assert e2.data.id == e1_node_id, "The event's node_id should be the same after load"

    # Create an event with a new event body that specifies the node_id to use.
    e3 = Event(project_id=1, event_id='ghi', data={'baz': 'quux', 'node_id': '1:ghi'})
    assert e3.data.id == '1:ghi', "Event should have the specified node_id"
    assert e3.data.data == {'baz': 'quux'}, "Event body should be the one provided (sans node_id)"

    e3.save()
    e3_body = nodestore.get('1:ghi')
    assert e3_body == {'baz': 'quux'}, "Event body should be saved to nodestore"

    e3 = Event.objects.get(project_id=1, event_id='ghi')
    assert e3.data.data == {'baz': 'quux'}, "Event body should be loaded from nodestore"
    assert e3.data.id == '1:ghi', "Loaded event should have the correct node_id"

    # Try loading it again, but using the pickled/compressed string we would
    # expect to find in the column.
    e3_pickled_id = compress(pickle.dumps({'node_id': '1:ghi'}))
    e3 = Event(project_id=1, event_id='jkl', data=e3_pickled_id)
    assert e3.data.data == {'baz': 'quux'}, "Event body should be loaded from nodestore"

    # Event with no data should not be saved (or loaded) from nodestore.
    e4 = Event(project_id=1, event_id='mno', data=None)
    e4.save()
    assert nodestore.get('1:mno') is None, "We should not have saved anything to nodestore"

    e4 = Event.objects.get(project_id=1, event_id='mno')
    assert e4.data.id is None
    assert e4.data.data == {}  # NodeData returns {} by default

    Event.objects.bind_nodes([e4], 'data')
    assert e4.data.id is None
    assert e4.data.data == {}
def test_simple(self): event_id = "a" * 32 event_id_2 = "b" * 32 project = self.create_project() node_id = Event.generate_node_id(project.id, event_id) node_id_2 = Event.generate_node_id(project.id, event_id_2) event = self.store_event( data={ "event_id": event_id, "timestamp": iso_format(before_now(minutes=1)), "fingerprint": ["group1"], }, project_id=project.id, ) self.store_event( data={ "event_id": event_id_2, "timestamp": iso_format(before_now(minutes=1)), "fingerprint": ["group1"], }, project_id=project.id, ) group = event.group group.update(status=GroupStatus.PENDING_DELETION) GroupAssignee.objects.create(group=group, project=project, user=self.user) GroupHash.objects.create(project=project, group=group, hash=uuid4().hex) GroupMeta.objects.create(group=group, key="foo", value="bar") GroupRedirect.objects.create(group_id=group.id, previous_group_id=1) assert nodestore.get(node_id) assert nodestore.get(node_id_2) with self.tasks(): delete_groups(object_ids=[group.id]) assert not Event.objects.filter(id=event.id).exists() assert not GroupRedirect.objects.filter(group_id=group.id).exists() assert not GroupHash.objects.filter(group_id=group.id).exists() assert not Group.objects.filter(id=group.id).exists() assert not nodestore.get(node_id) assert not nodestore.get(node_id_2)
def _get_event_from_storage(self, project_id, event_id):
    nodestore_sample_rate = options.get("store.nodestore-sample-rate")
    use_nodestore = random.random() < nodestore_sample_rate

    if use_nodestore:
        start = time.time()
        node_data = nodestore.get(Event.generate_node_id(project_id, event_id))
        metrics.timing(
            "events.store.nodestore.duration",
            int((time.time() - start) * 1000),
            tags={"duplicate_found": bool(node_data)},
        )
        if node_data:
            return Event(node_data)
    else:
        try:
            event = Event.objects.get(project_id=project_id, event_id=event_id)
            return event
        except Event.DoesNotExist:
            pass

    return None
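# The helper above gates nodestore reads behind the "store.nodestore-sample-rate"
# option so the new read path can be rolled out to a fraction of traffic while
# its latency is measured. A minimal sketch of that rollout pattern, assuming
# hypothetical zero-argument callables `read_new` and `read_old` standing in
# for the nodestore and Postgres lookups (not Sentry's actual code):
import random
import time


def sampled_read(rate, read_new, read_old):
    if random.random() < rate:
        # New path: time the call so the rollout can be monitored.
        start = time.time()
        result = read_new()
        duration_ms = int((time.time() - start) * 1000)  # same timing math as above
        return result, duration_ms
    return read_old(), None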
def test_event_with_no_body(self):
    # Remove the event from nodestore to simulate an event with no body.
    node_id = SnubaEvent.generate_node_id(self.proj1.id, self.event_id)
    nodestore.delete(node_id)
    assert nodestore.get(node_id) is None

    # Check that we can still serialize it
    event = SnubaEvent.get_event(self.proj1.id, self.event_id)
    serialized = serialize(event)
    assert event.data == {}

    # Check that the regular serializer still gives us back tags
    assert serialized['tags'] == [
        {'_meta': None, 'key': 'baz', 'value': 'quux'},
        {'_meta': None, 'key': 'foo', 'value': 'bar'},
        {'_meta': None, 'key': 'release', 'value': 'release1'},
        {'_meta': None, 'key': 'user', 'query': 'user.id:user1', 'value': 'id:user1'},
    ]
def test_dupe_message_id(self, eventstream_insert):
    # Saves the latest event to nodestore and eventstream
    project_id = 1
    event_id = "a" * 32
    node_id = Event.generate_node_id(project_id, event_id)

    manager = EventManager(make_event(event_id=event_id, message="first"))
    manager.normalize()
    manager.save(project_id)
    assert nodestore.get(node_id)["logentry"]["formatted"] == "first"

    manager = EventManager(make_event(event_id=event_id, message="second"))
    manager.normalize()
    manager.save(project_id)
    assert nodestore.get(node_id)["logentry"]["formatted"] == "second"

    assert eventstream_insert.call_count == 2
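# The test above relies on nodestore writes for the same node id overwriting
# the previous payload (last write wins, no versioning). A minimal sketch of
# that property, assuming a configured nodestore backend:
node_id = Event.generate_node_id(1, "a" * 32)  # deterministic per (project, event)
nodestore.set(node_id, {"message": "first"})
nodestore.set(node_id, {"message": "second"})  # same key: overwrites the first write
assert nodestore.get(node_id) == {"message": "second"}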
def test_simple(self): event_id = "a" * 32 project = self.create_project() node_id = Event.generate_node_id(project.id, event_id) group = self.create_group(project=project) event = self.create_event(group=group, event_id=event_id) EventAttachment.objects.create( event_id=event.event_id, project_id=event.project_id, file=File.objects.create(name="hello.png", type="image/png"), name="hello.png", ) UserReport.objects.create(event_id=event.event_id, project_id=event.project_id, name="Jane Doe") key = "key" value = "value" tk = tagstore.create_tag_key(project_id=project.id, environment_id=self.environment.id, key=key) tv = tagstore.create_tag_value(project_id=project.id, environment_id=self.environment.id, key=key, value=value) tagstore.create_event_tags( event_id=event.id, group_id=group.id, project_id=project.id, environment_id=self.environment.id, tags=[(tk.key, tv.value)], ) assert nodestore.get(node_id) is not None deletion = ScheduledDeletion.schedule(event, days=0) deletion.update(in_progress=True) with self.tasks(): run_deletion(deletion.id) assert not Event.objects.filter(id=event.id).exists() assert not EventAttachment.objects.filter( event_id=event.event_id, project_id=project.id).exists() assert not UserReport.objects.filter(event_id=event.event_id, project_id=project.id).exists() assert not EventTag.objects.filter(event_id=event.id).exists() assert nodestore.get(node_id) is None
def bind_node_data(self):
    # Do not rebind if node_data is already loaded
    if self.data._node_data:
        return

    node_id = Event.generate_node_id(self.project_id, self.event_id)
    node_data = nodestore.get(node_id) or {}
    ref = self.data.get_ref(self)
    self.data.bind_data(node_data, ref=ref)
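# bind_node_data never persists the node id on the event: it can recompute it
# because Event.generate_node_id is a pure function of (project_id, event_id).
# A plausible sketch of such a derivation, assuming an md5-based scheme (the
# actual implementation may differ):
import hashlib


def generate_node_id(project_id, event_id):
    # The same (project_id, event_id) pair always maps to the same nodestore
    # key, so no id needs to be stored alongside the event row.
    return hashlib.md5(u"{}:{}".format(project_id, event_id).encode("utf-8")).hexdigest()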
def capture_nodestore_stats(project_id, event_id):
    set_current_project(project_id)

    from sentry import nodestore
    from sentry.eventstore.compressor import deduplicate
    from sentry.eventstore.models import Event

    event = Event(project_id=project_id, event_id=event_id)
    old_event_size = _json_size(dict(event.data))

    if not event.data:
        metrics.incr("eventstore.compressor.error", tags={"reason": "no_data"})
        return

    platform = event.platform

    for key, value in six.iteritems(event.interfaces):
        len_value = _json_size(value.to_json())
        metrics.timing(
            "events.size.interface", len_value, tags={"interface": key, "platform": platform}
        )

    new_data, extra_keys = deduplicate(dict(event.data))

    total_size = event_size = _json_size(new_data)

    for key, value in six.iteritems(extra_keys):
        if nodestore.get(key) is not None:
            metrics.incr("eventstore.compressor.hits")
            # do not continue, nodestore.set() should bump TTL
        else:
            metrics.incr("eventstore.compressor.misses")

        total_size += _json_size(value)
        # key is md5sum of content
        # do not store actual value to keep prod impact to a minimum
        nodestore.set(key, {})

    metrics.timing("events.size.deduplicated", event_size)
    metrics.timing("events.size.deduplicated.total_written", total_size)
    metrics.timing("events.size.deduplicated.ratio", event_size / old_event_size)
    metrics.timing("events.size.deduplicated.total_written.ratio", total_size / old_event_size)

    if total_size > old_event_size:
        nodestore_stats_logger.info(
            "events.size.deduplicated.details",
            extra={
                "project_id": project_id,
                "event_id": event_id,
                "total_size": total_size,
                "old_event_size": old_event_size,
            },
        )
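# deduplicate() splits the payload into a slimmed event plus content-addressed
# chunks keyed by a checksum, so identical blobs shared by many events would be
# stored once and their TTL refreshed on each hit. A rough sketch of the idea,
# assuming a simple top-level split (the real compressor's heuristics are more
# involved, and `__ref` is a hypothetical marker):
import hashlib
import json


def deduplicate_sketch(data, min_size=1024):
    slim, extra_keys = {}, {}
    for key, value in data.items():
        encoded = json.dumps(value, sort_keys=True)
        if len(encoded) >= min_size:
            checksum = hashlib.md5(encoded.encode("utf-8")).hexdigest()
            slim[key] = {"__ref": checksum}  # replace the blob with a reference
            extra_keys[checksum] = value  # identical content -> identical key
        else:
            slim[key] = value
    return slim, extra_keys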
def test_simple(self):
    configure_sdk()
    Hub.current.bind_client(Hub.main.client)

    with self.tasks():
        event_id = raven.captureMessage("internal client test")

    event = nodestore.get(Event.generate_node_id(settings.SENTRY_PROJECT, event_id))

    assert event["project"] == settings.SENTRY_PROJECT
    assert event["event_id"] == event_id
    assert event["logentry"]["formatted"] == "internal client test"
def fetch_and_store(line):
    project_id, event_id = line.strip().split("\t")
    node_id = Event.generate_node_id(project_id, event_id)
    node = nodestore.get(node_id)  # pylint: disable=no-member
    if node is None:
        print(
            "WARNING: Got None from nodestore for project / event",
            project_id,
            event_id,
            file=sys.stderr,
        )
    else:
        store(project_id, event_id, node, global_output_dir)
def data(self):
    if self._node_data is not None:
        return self._node_data
    elif self.id:
        if settings.DEBUG:
            raise NodeUnpopulated('You should populate node data before accessing it.')
        else:
            warnings.warn('You should populate node data before accessing it.')
        self.bind_data(nodestore.get(self.id) or {})
        return self._node_data
    return {}
def pull_event_data(project_id, event_id) -> ReprocessableEvent: from sentry.lang.native.processing import get_required_attachment_types with sentry_sdk.start_span(op="reprocess_events.eventstore.get"): event = eventstore.get_event_by_id(project_id, event_id) if event is None: raise CannotReprocess("event.not_found") with sentry_sdk.start_span(op="reprocess_events.nodestore.get"): node_id = Event.generate_node_id(project_id, event_id) data = nodestore.get(node_id, subkey="unprocessed") if data is None: node_id = _generate_unprocessed_event_node_id( project_id=project_id, event_id=event_id) data = nodestore.get(node_id) # Check data after checking presence of event to avoid too many instances. if data is None: raise CannotReprocess("unprocessed_event.not_found") required_attachment_types = get_required_attachment_types(data) attachments = list( models.EventAttachment.objects.filter( project_id=project_id, event_id=event_id, type__in=list(required_attachment_types))) missing_attachment_types = required_attachment_types - { ea.type for ea in attachments } if missing_attachment_types: raise CannotReprocess("attachment.not_found") return ReprocessableEvent(event=event, data=data, attachments=attachments)
def data(self): """ Get the current data object, fetching from nodestore if necessary. """ if self._node_data is not None: return self._node_data elif self.id: self.bind_data(nodestore.get(self.id) or {}) return self._node_data rv = {} if self.wrapper is not None: rv = self.wrapper(rv) return rv
def data(self): """ Get the current data object, fetching from nodestore if necessary. """ if self._node_data is not None: return self._node_data elif self.id: warnings.warn('You should populate node data before accessing it.') self.bind_data(nodestore.get(self.id) or {}) return self._node_data rv = {} if self.field is not None and self.field.wrapper is not None: rv = self.field.wrapper(rv) return rv
def test_encoding(self):
    configure_sdk()
    Hub.current.bind_client(Hub.main.client)

    class NotJSONSerializable:
        pass

    with self.tasks():
        event_id = raven.captureMessage(
            "check the req", extra={"request": NotJSONSerializable()}
        )

    event = nodestore.get(Event.generate_node_id(settings.SENTRY_PROJECT, event_id))

    assert event["project"] == settings.SENTRY_PROJECT
    assert event["logentry"]["formatted"] == "check the req"
    assert "NotJSONSerializable" in event["extra"]["request"]
def test_recursion_breaker(self):
    configure_sdk()
    Hub.current.bind_client(Hub.main.client)

    # If this test terminates at all then we avoided recursion.
    with self.tasks():
        with mock.patch(
            "sentry.event_manager.EventManager.save", side_effect=ValueError("oh no!")
        ) as save:
            event_id = raven.captureMessage("internal client test")

    event = nodestore.get(Event.generate_node_id(settings.SENTRY_PROJECT, event_id))
    assert event is None

    assert_mock_called_once_with_partial(
        save, settings.SENTRY_PROJECT, cache_key=u"e:{}:1".format(event_id)
    )
def test_event_with_no_body(self):
    # Remove the event from nodestore to simulate an event with no body.
    node_id = SnubaEvent.generate_node_id(self.proj1.id, self.event_id)
    nodestore.delete(node_id)
    assert nodestore.get(node_id) is None

    # Check that we can still serialize it
    event = eventstore.get_event_by_id(
        self.proj1.id, self.event_id, additional_columns=eventstore.full_columns
    )
    serialized = serialize(event)
    assert event.data == {}

    # Check that the regular serializer still gives us back tags
    assert serialized["tags"] == [
        {"_meta": None, "key": "baz", "value": "quux"},
        {"_meta": None, "key": "environment", "value": "prod"},
        {"_meta": None, "key": "foo", "value": "bar"},
        {"_meta": None, "key": "level", "value": "error"},
        {"_meta": None, "key": "release", "value": "release1"},
        {"_meta": None, "key": "user", "query": 'user.id:"user1"', "value": "id:user1"},
    ]
def test_simple(self):
    EventDataDeletionTask.DEFAULT_CHUNK_SIZE = 1  # test chunking logic
    group = self.event.group

    assert nodestore.get(self.node_id)
    assert nodestore.get(self.node_id2)
    assert nodestore.get(self.node_id3)

    with self.tasks():
        delete_groups(object_ids=[group.id])

    assert not UserReport.objects.filter(group_id=group.id).exists()
    assert not UserReport.objects.filter(event_id=self.event.event_id).exists()
    assert not EventAttachment.objects.filter(event_id=self.event.event_id).exists()

    assert not GroupRedirect.objects.filter(group_id=group.id).exists()
    assert not GroupHash.objects.filter(group_id=group.id).exists()
    assert not Group.objects.filter(id=group.id).exists()
    assert not nodestore.get(self.node_id)
    assert not nodestore.get(self.node_id2)
    assert nodestore.get(self.node_id3), "Does not remove from second group"
def bind_node_data(self):
    node_id = Event.generate_node_id(self.project_id, self.event_id)
    node_data = nodestore.get(node_id) or {}
    ref = self.data.get_ref(self)
    self.data.bind_data(node_data, ref=ref)
def reprocess_event(project_id, event_id, start_time):
    node_id = _generate_unprocessed_event_node_id(project_id=project_id, event_id=event_id)

    with sentry_sdk.start_span(op="reprocess_events.nodestore.get"):
        data = nodestore.get(node_id)

    if data is None:
        return

    from sentry.event_manager import set_tag
    from sentry.tasks.store import preprocess_event_from_reprocessing
    from sentry.ingest.ingest_consumer import CACHE_TIMEOUT

    # Take unprocessed data from old event and save it as unprocessed data
    # under a new event ID. The second step happens in pre-process. We could
    # save the "original event ID" instead and get away with writing less to
    # nodestore, but doing it this way makes the logic slightly simpler.

    # Step 1: Fix up the event payload for reprocessing and put it in event
    # cache/event_processing_store
    orig_event_id = data["event_id"]
    set_tag(data, "original_event_id", orig_event_id)

    event = eventstore.get_event_by_id(project_id, orig_event_id)
    if event is None:
        return

    set_tag(data, "original_group_id", event.group_id)

    # XXX: reuse event IDs
    event_id = data["event_id"] = uuid.uuid4().hex

    cache_key = event_processing_store.store(data)

    # Step 2: Copy attachments into attachment cache
    queryset = models.EventAttachment.objects.filter(
        project_id=project_id, event_id=orig_event_id
    ).select_related("file")

    attachment_objects = []

    for attachment_id, attachment in enumerate(queryset):
        with sentry_sdk.start_span(op="reprocess_event._copy_attachment_into_cache") as span:
            span.set_data("attachment_id", attachment.id)
            attachment_objects.append(
                _copy_attachment_into_cache(
                    attachment_id=attachment_id,
                    attachment=attachment,
                    cache_key=cache_key,
                    cache_timeout=CACHE_TIMEOUT,
                )
            )

    if attachment_objects:
        with sentry_sdk.start_span(op="reprocess_event.set_attachment_meta"):
            attachment_cache.set(cache_key, attachments=attachment_objects, timeout=CACHE_TIMEOUT)

    preprocess_event_from_reprocessing(
        cache_key=cache_key, start_time=start_time, event_id=event_id
    )
def test_simple(self): EventDataDeletionTask.DEFAULT_CHUNK_SIZE = 1 # test chunking logic event_id = "a" * 32 event_id2 = "b" * 32 event_id3 = "c" * 32 project = self.create_project() node_id = Event.generate_node_id(project.id, event_id) node_id2 = Event.generate_node_id(project.id, event_id2) node_id3 = Event.generate_node_id(project.id, event_id3) event = self.store_event( data={ "event_id": event_id, "tags": { "foo": "bar" }, "timestamp": iso_format(before_now(minutes=1)), "fingerprint": ["group1"], }, project_id=project.id, ) self.store_event( data={ "event_id": event_id2, "timestamp": iso_format(before_now(minutes=1)), "fingerprint": ["group1"], }, project_id=project.id, ) self.store_event( data={ "event_id": event_id3, "timestamp": iso_format(before_now(minutes=1)), "fingerprint": ["group2"], }, project_id=project.id, ) group = event.group group.update(status=GroupStatus.PENDING_DELETION) project = self.create_project() UserReport.objects.create(group_id=group.id, project_id=event.project_id, name="With group id") UserReport.objects.create(event_id=event.event_id, project_id=event.project_id, name="With event id") EventAttachment.objects.create( event_id=event.event_id, project_id=event.project_id, file=File.objects.create(name="hello.png", type="image/png"), name="hello.png", ) GroupAssignee.objects.create(group=group, project=project, user=self.user) GroupHash.objects.create(project=project, group=group, hash=uuid4().hex) GroupMeta.objects.create(group=group, key="foo", value="bar") GroupRedirect.objects.create(group_id=group.id, previous_group_id=1) deletion = ScheduledDeletion.schedule(group, days=0) deletion.update(in_progress=True) assert nodestore.get(node_id) assert nodestore.get(node_id2) assert nodestore.get(node_id3) with self.tasks(): run_deletion(deletion.id) assert not Event.objects.filter(id=event.id).exists() assert not UserReport.objects.filter(group_id=group.id).exists() assert not UserReport.objects.filter(event_id=event.event_id).exists() assert not EventAttachment.objects.filter( event_id=event.event_id).exists() assert not GroupRedirect.objects.filter(group_id=group.id).exists() assert not GroupHash.objects.filter(group_id=group.id).exists() assert not Group.objects.filter(id=group.id).exists() assert not nodestore.get(node_id) assert not nodestore.get(node_id2) assert nodestore.get(node_id3), "Does not remove from second group"
def reprocess_event(project_id, event_id, start_time):
    from sentry.ingest.ingest_consumer import CACHE_TIMEOUT
    from sentry.lang.native.processing import get_required_attachment_types
    from sentry.tasks.store import preprocess_event_from_reprocessing

    with sentry_sdk.start_span(op="reprocess_events.nodestore.get"):
        node_id = Event.generate_node_id(project_id, event_id)
        data = nodestore.get(node_id, subkey="unprocessed")
        if data is None:
            node_id = _generate_unprocessed_event_node_id(
                project_id=project_id, event_id=event_id
            )
            data = nodestore.get(node_id)

    if data is None:
        raise CannotReprocess("reprocessing_nodestore.not_found")

    with sentry_sdk.start_span(op="reprocess_events.eventstore.get"):
        event = eventstore.get_event_by_id(project_id, event_id)

    if event is None:
        raise CannotReprocess("event.not_found")

    required_attachment_types = get_required_attachment_types(data)
    attachments = list(
        models.EventAttachment.objects.filter(
            project_id=project_id, event_id=event_id, type__in=list(required_attachment_types)
        )
    )
    missing_attachment_types = required_attachment_types - {ea.type for ea in attachments}
    if missing_attachment_types:
        raise CannotReprocess(
            f"attachment.not_found.{'_and_'.join(sorted(missing_attachment_types))}"
        )

    # Step 1: Fix up the event payload for reprocessing and put it in event
    # cache/event_processing_store
    set_path(data, "contexts", "reprocessing", "original_issue_id", value=event.group_id)
    set_path(
        data, "contexts", "reprocessing", "original_primary_hash", value=event.get_primary_hash()
    )
    cache_key = event_processing_store.store(data)

    # Step 2: Copy attachments into attachment cache. Note that we can only
    # consider minidumps because filestore just stays as-is after reprocessing
    # (we simply update group_id on the EventAttachment models in post_process)
    attachment_objects = []

    files = {
        f.id: f
        for f in models.File.objects.filter(id__in=[ea.file_id for ea in attachments])
    }

    for attachment_id, attachment in enumerate(attachments):
        with sentry_sdk.start_span(op="reprocess_event._copy_attachment_into_cache") as span:
            span.set_data("attachment_id", attachment.id)
            attachment_objects.append(
                _copy_attachment_into_cache(
                    attachment_id=attachment_id,
                    attachment=attachment,
                    file=files[attachment.file_id],
                    cache_key=cache_key,
                    cache_timeout=CACHE_TIMEOUT,
                )
            )

    if attachment_objects:
        with sentry_sdk.start_span(op="reprocess_event.set_attachment_meta"):
            attachment_cache.set(cache_key, attachments=attachment_objects, timeout=CACHE_TIMEOUT)

    preprocess_event_from_reprocessing(
        cache_key=cache_key,
        start_time=start_time,
        event_id=event_id,
        data=data,
    )
def capture_nodestore_stats(cache_key, project_id, event_id):
    set_current_project(project_id)

    from sentry.eventstore.compressor import deduplicate
    from sentry.eventstore.models import Event

    node_id = Event.generate_node_id(project_id, event_id)
    data = nodestore.get(node_id)

    if not data:
        metrics.incr("eventstore.compressor.error", tags={"reason": "no_data"})
        return

    old_event_size = _json_size(data)

    unprocessed_data = event_processing_store.get(_get_unprocessed_key(cache_key))
    event_processing_store.delete_by_key(_get_unprocessed_key(cache_key))

    tags = {
        "with_reprocessing": bool(unprocessed_data),
        "platform": data.get("platform") or "none",
        "is_minidump": is_minidump_event(data),
    }

    if unprocessed_data:
        metrics.incr("nodestore_stats.with_reprocessing")

        concatenated_size = _json_size(data, unprocessed_data)
        metrics.timing("events.size.concatenated", concatenated_size, tags=tags)
        metrics.timing(
            "events.size.concatenated.ratio", concatenated_size / old_event_size, tags=tags
        )

        _data = dict(data)
        _data["__nodestore_reprocessing"] = unprocessed_data
        simple_concatenated_size = _json_size(_data)
        metrics.timing("events.size.simple_concatenated", simple_concatenated_size, tags=tags)
        metrics.timing(
            "events.size.simple_concatenated.ratio",
            simple_concatenated_size / old_event_size,
            tags=tags,
        )
    else:
        metrics.incr("nodestore_stats.without_reprocessing")

    new_data, extra_keys = deduplicate(dict(data))

    total_size = event_size = _json_size(new_data)

    for key, value in six.iteritems(extra_keys):
        if nodestore.get(key) is not None:
            metrics.incr("eventstore.compressor.hits", tags=tags)
            # do not continue, nodestore.set() should bump TTL
        else:
            metrics.incr("eventstore.compressor.misses", tags=tags)

        total_size += _json_size(value)
        # key is md5sum of content
        # do not store actual value to keep prod impact to a minimum
        nodestore.set(key, {})

    metrics.timing("events.size.deduplicated", event_size, tags=tags)
    metrics.timing("events.size.deduplicated.total_written", total_size, tags=tags)
    metrics.timing("events.size.deduplicated.ratio", event_size / old_event_size, tags=tags)
    metrics.timing(
        "events.size.deduplicated.total_written.ratio", total_size / old_event_size, tags=tags
    )

    if total_size > old_event_size:
        nodestore_stats_logger.info(
            "events.size.deduplicated.details",
            extra={
                "project_id": project_id,
                "event_id": event_id,
                "total_size": total_size,
                "old_event_size": old_event_size,
            },
        )
def reprocess_event(project_id, event_id, start_time):
    from sentry.event_manager import set_tag
    from sentry.tasks.store import preprocess_event_from_reprocessing
    from sentry.ingest.ingest_consumer import CACHE_TIMEOUT

    # Take unprocessed data from old event and save it as unprocessed data
    # under a new event ID. The second step happens in pre-process. We could
    # save the "original event ID" instead and get away with writing less to
    # nodestore, but doing it this way makes the logic slightly simpler.
    node_id = _generate_unprocessed_event_node_id(project_id=project_id, event_id=event_id)

    with sentry_sdk.start_span(op="reprocess_events.nodestore.get"):
        data = nodestore.get(node_id)

    with sentry_sdk.start_span(op="reprocess_events.eventstore.get"):
        event = eventstore.get_event_by_id(project_id, event_id)

    if event is None:
        logger.error(
            "reprocessing2.event.not_found",
            extra={"project_id": project_id, "event_id": event_id},
        )
        return

    if data is None:
        logger.error(
            "reprocessing2.reprocessing_nodestore.not_found",
            extra={"project_id": project_id, "event_id": event_id},
        )
        # We have no real data for reprocessing. We assume this event goes
        # straight to save_event, and hope that the event data can be
        # reingested like that. It's better than data loss.
        #
        # XXX: Ideally we would run a "save-lite" for this that only updates
        # the group ID in-place. Like a snuba merge message.
        data = dict(event.data)

    # Step 1: Fix up the event payload for reprocessing and put it in event
    # cache/event_processing_store
    set_tag(data, "original_group_id", event.group_id)
    cache_key = event_processing_store.store(data)

    # Step 2: Copy attachments into attachment cache
    queryset = models.EventAttachment.objects.filter(
        project_id=project_id, event_id=event_id
    ).select_related("file")

    attachment_objects = []

    for attachment_id, attachment in enumerate(queryset):
        with sentry_sdk.start_span(op="reprocess_event._copy_attachment_into_cache") as span:
            span.set_data("attachment_id", attachment.id)
            attachment_objects.append(
                _copy_attachment_into_cache(
                    attachment_id=attachment_id,
                    attachment=attachment,
                    cache_key=cache_key,
                    cache_timeout=CACHE_TIMEOUT,
                )
            )

    if attachment_objects:
        with sentry_sdk.start_span(op="reprocess_event.set_attachment_meta"):
            attachment_cache.set(cache_key, attachments=attachment_objects, timeout=CACHE_TIMEOUT)

    preprocess_event_from_reprocessing(
        cache_key=cache_key, start_time=start_time, event_id=event_id
    )
def reprocess_event(project_id, event_id, start_time):
    from sentry.tasks.store import preprocess_event_from_reprocessing
    from sentry.ingest.ingest_consumer import CACHE_TIMEOUT

    with sentry_sdk.start_span(op="reprocess_events.nodestore.get"):
        node_id = Event.generate_node_id(project_id, event_id)
        data = nodestore.get(node_id, subkey="unprocessed")
        if data is None:
            node_id = _generate_unprocessed_event_node_id(
                project_id=project_id, event_id=event_id
            )
            data = nodestore.get(node_id)

    with sentry_sdk.start_span(op="reprocess_events.eventstore.get"):
        event = eventstore.get_event_by_id(project_id, event_id)

    if event is None:
        logger.error(
            "reprocessing2.event.not_found",
            extra={"project_id": project_id, "event_id": event_id},
        )
        return

    if data is None:
        logger.error(
            "reprocessing2.reprocessing_nodestore.not_found",
            extra={"project_id": project_id, "event_id": event_id},
        )
        # We have no real data for reprocessing. We assume this event goes
        # straight to save_event, and hope that the event data can be
        # reingested like that. It's better than data loss.
        #
        # XXX: Ideally we would run a "save-lite" for this that only updates
        # the group ID in-place. Like a snuba merge message.
        data = dict(event.data)

    # Step 1: Fix up the event payload for reprocessing and put it in event
    # cache/event_processing_store
    set_path(data, "contexts", "reprocessing", "original_issue_id", value=event.group_id)
    cache_key = event_processing_store.store(data)

    # Step 2: Copy attachments into attachment cache
    queryset = models.EventAttachment.objects.filter(project_id=project_id, event_id=event_id)
    files = {f.id: f for f in models.File.objects.filter(id__in=[ea.file_id for ea in queryset])}

    attachment_objects = []

    for attachment_id, attachment in enumerate(queryset):
        with sentry_sdk.start_span(op="reprocess_event._copy_attachment_into_cache") as span:
            span.set_data("attachment_id", attachment.id)
            attachment_objects.append(
                _copy_attachment_into_cache(
                    attachment_id=attachment_id,
                    attachment=attachment,
                    file=files[attachment.file_id],
                    cache_key=cache_key,
                    cache_timeout=CACHE_TIMEOUT,
                )
            )

    if attachment_objects:
        with sentry_sdk.start_span(op="reprocess_event.set_attachment_meta"):
            attachment_cache.set(cache_key, attachments=attachment_objects, timeout=CACHE_TIMEOUT)

    preprocess_event_from_reprocessing(
        cache_key=cache_key, start_time=start_time, event_id=event_id
    )
def test_event_node_id(self): # Create an event without specifying node_id. A node_id should be generated e1 = Event(project_id=1, event_id="abc", data={"foo": "bar"}) e1.save() e1_node_id = e1.data.id assert e1.data.id is not None, "We should have generated a node_id for this event" e1_body = nodestore.get(e1_node_id) e1.data.save() e1_body = nodestore.get(e1_node_id) assert e1_body == { "foo": "bar" }, "The event body should be in nodestore" e1 = Event.objects.get(project_id=1, event_id="abc") assert e1.data.data == { "foo": "bar" }, "The event body should be loaded from nodestore" assert e1.data.id == e1_node_id, "The event's node_id should be the same after load" # Create another event that references the same nodestore object as the first event. e2 = Event(project_id=1, event_id="def", data={"node_id": e1_node_id}) assert e2.data.id == e1_node_id, "The event should use the provided node_id" e2_body = nodestore.get(e1_node_id) assert e2_body == { "foo": "bar" }, "The event body should be in nodestore already" e2.save() e2_body = nodestore.get(e1_node_id) assert e2_body == { "foo": "bar" }, "The event body should not be overwritten by save" e2 = Event.objects.get(project_id=1, event_id="def") assert e2.data.data == { "foo": "bar" }, "The event body should be loaded from nodestore" assert e2.data.id == e1_node_id, "The event's node_id should be the same after load" # Create an event with a new event body that specifies the node_id to use. e3 = Event(project_id=1, event_id="ghi", data={ "baz": "quux", "node_id": "1:ghi" }) assert e3.data.id == "1:ghi", "Event should have the specified node_id" assert e3.data.data == { "baz": "quux" }, "Event body should be the one provided (sans node_id)" e3.save() e3_body = nodestore.get("1:ghi") e3.data.save() e3_body = nodestore.get("1:ghi") assert e3_body == { "baz": "quux" }, "Event body should be saved to nodestore" e3 = Event.objects.get(project_id=1, event_id="ghi") assert e3.data.data == { "baz": "quux" }, "Event body should be loaded from nodestore" assert e3.data.id == "1:ghi", "Loaded event should have the correct node_id" # Try load it again, but using the pickled/compressed string we would expect to find # in the column e3_pickled_id = compress(pickle.dumps({"node_id": "1:ghi"})) e3 = Event(project_id=1, event_id="jkl", data=e3_pickled_id) assert e3.data.data == { "baz": "quux" }, "Event body should be loaded from nodestore" # Event with no data should not be saved (or loaded) from nodestore e4 = Event(project_id=1, event_id="mno", data=None) e4.save() e4.data.save() assert nodestore.get( "1:mno") is None, "We should not have saved anything to nodestore" e4 = Event.objects.get(project_id=1, event_id="mno") assert e4.data.id is None assert e4.data.data == {} # NodeData returns {} by default e4.bind_node_data() assert e4.data.id is None assert e4.data.data == {}
def test_event_node_id(self):
    # Create an event without specifying node_id. A node_id should be generated.
    e1 = Event(project_id=1, event_id='abc', data={'foo': 'bar'})
    e1.save()
    e1_node_id = e1.data.id
    assert e1.data.id is not None, "We should have generated a node_id for this event"
    e1_body = nodestore.get(e1_node_id)
    assert e1_body == {'foo': 'bar'}, "The event body should be in nodestore"

    e1 = Event.objects.get(project_id=1, event_id='abc')
    assert e1.data.data == {'foo': 'bar'}, "The event body should be loaded from nodestore"
    assert e1.data.id == e1_node_id, "The event's node_id should be the same after load"

    # Create another event that references the same nodestore object as the first event.
    e2 = Event(project_id=1, event_id='def', data={'node_id': e1_node_id})
    assert e2.data.id == e1_node_id, "The event should use the provided node_id"
    e2_body = nodestore.get(e1_node_id)
    assert e2_body == {'foo': 'bar'}, "The event body should be in nodestore already"

    e2.save()
    e2_body = nodestore.get(e1_node_id)
    assert e2_body == {'foo': 'bar'}, "The event body should not be overwritten by save"

    e2 = Event.objects.get(project_id=1, event_id='def')
    assert e2.data.data == {'foo': 'bar'}, "The event body should be loaded from nodestore"
    assert e2.data.id == e1_node_id, "The event's node_id should be the same after load"

    # Create an event with a new event body that specifies the node_id to use.
    e3 = Event(project_id=1, event_id='ghi', data={'baz': 'quux', 'node_id': '1:ghi'})
    assert e3.data.id == '1:ghi', "Event should have the specified node_id"
    assert e3.data.data == {'baz': 'quux'}, "Event body should be the one provided (sans node_id)"

    e3.save()
    e3_body = nodestore.get('1:ghi')
    assert e3_body == {'baz': 'quux'}, "Event body should be saved to nodestore"

    e3 = Event.objects.get(project_id=1, event_id='ghi')
    assert e3.data.data == {'baz': 'quux'}, "Event body should be loaded from nodestore"
    assert e3.data.id == '1:ghi', "Loaded event should have the correct node_id"

    # Try loading it again, but using the pickled/compressed string we would
    # expect to find in the column.
    e3_pickled_id = compress(pickle.dumps({'node_id': '1:ghi'}))
    e3 = Event(project_id=1, event_id='jkl', data=e3_pickled_id)
    assert e3.data.data == {'baz': 'quux'}, "Event body should be loaded from nodestore"

    # Event with no data should not be saved (or loaded) from nodestore.
    e4 = Event(project_id=1, event_id='mno', data=None)
    e4.save()
    assert nodestore.get('1:mno') is None, "We should not have saved anything to nodestore"

    e4 = Event.objects.get(project_id=1, event_id='mno')
    assert e4.data.id is None
    assert e4.data.data == {}  # NodeData returns {} by default

    Event.objects.bind_nodes([e4], 'data')
    assert e4.data.id is None
    assert e4.data.data == {}