Example #1
def insert_data_to_database_legacy(data,
                                   start_time=None,
                                   from_reprocessing=False,
                                   attachments=None):
    """
    Yet another "fast path" to ingest an event without making it go
    through Relay. Please consider using functions from the ingest consumer
    instead, or, if you're within tests, use `TestCase.store_event`.
    """

    # XXX(markus): Delete this function and merge with ingest consumer logic.

    if start_time is None:
        start_time = time()

    # we might be passed some subclasses of dict that fail dumping
    if isinstance(data, CANONICAL_TYPES):
        data = dict(data.items())

    cache_timeout = 3600
    cache_key = cache_key_for_event(data)
    default_cache.set(cache_key, data, cache_timeout)

    # Attachments will be empty or None if the "event-attachments" feature
    # is turned off. For native crash reports it will still contain the
    # crash dump (e.g. minidump) so we can load it during processing.
    if attachments is not None:
        attachment_cache.set(cache_key, attachments, cache_timeout)

    task = from_reprocessing and preprocess_event_from_reprocessing or preprocess_event
    task.delay(cache_key=cache_key,
               start_time=start_time,
               event_id=data["event_id"])
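
A minimal sketch of a call site for this legacy helper, assuming `insert_data_to_database_legacy` and `CachedAttachment` are importable from the module above; the payload values are purely illustrative:

event = {
    "event_id": "aabbccddeeff00112233445566778899",  # illustrative 32-character hex id
    "project": 1,
    "message": "example event",
}
attachments = [CachedAttachment(name="upload.dmp", data=b"\x00\x01")]

insert_data_to_database_legacy(
    event,
    from_reprocessing=False,   # picks preprocess_event rather than the reprocessing task
    attachments=attachments,   # stored separately under the same cache key
)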
Example #2
    def insert_data_to_database(self,
                                data,
                                start_time=None,
                                from_reprocessing=False,
                                attachments=None):
        if start_time is None:
            start_time = time()

        # we might be passed some subclasses of dict that fail dumping
        if isinstance(data, CANONICAL_TYPES):
            data = dict(data.items())

        cache_timeout = 3600
        cache_key = cache_key_for_event(data)
        default_cache.set(cache_key, data, cache_timeout)

        # Attachments will be empty or None if the "event-attachments" feature
        # is turned off. For native crash reports it will still contain the
        # crash dump (e.g. minidump) so we can load it during processing.
        if attachments is not None:
            attachment_cache.set(cache_key, attachments, cache_timeout)

        task = from_reprocessing and \
            preprocess_event_from_reprocessing or preprocess_event
        task.delay(cache_key=cache_key,
                   start_time=start_time,
                   event_id=data['event_id'])
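
The `task = from_reprocessing and preprocess_event_from_reprocessing or preprocess_event` line uses the old `and`/`or` selection idiom, which only works here because both task functions are truthy; a sketch of the equivalent conditional expression:

task = preprocess_event_from_reprocessing if from_reprocessing else preprocess_event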
Example #3
    def dispatch_task(cache_key: str) -> None:
        if attachments:
            with sentry_sdk.start_span(op="ingest_consumer.set_attachment_cache"):
                attachment_objects = [
                    CachedAttachment(type=attachment.pop("attachment_type"), **attachment)
                    for attachment in attachments
                ]

                attachment_cache.set(
                    cache_key, attachments=attachment_objects, timeout=CACHE_TIMEOUT
                )

        # Preprocess this event, which spawns either process_event or
        # save_event. Pass data explicitly to avoid fetching it again from the
        # cache.
        with sentry_sdk.start_span(op="ingest_consumer.process_event.preprocess_event"):
            preprocess_event(
                cache_key=cache_key,
                data=data,
                start_time=start_time,
                event_id=event_id,
                project=project,
            )

        # remember for 1 hour that we saved this event (deduplication protection)
        cache.set(deduplication_key, "", CACHE_TIMEOUT)

        # emit event_accepted once everything is done
        event_accepted.send_robust(ip=remote_addr, data=data, project=project, sender=process_event)
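
The comprehension above pops `attachment_type` out of each raw attachment dict and forwards the remaining keys as keyword arguments; a small sketch of that expansion with an illustrative payload (assuming `CachedAttachment` accepts `type`, `name` and `data`, as in the examples below):

raw = {"attachment_type": "event.minidump", "name": "upload.dmp", "data": b"..."}
att = CachedAttachment(type=raw.pop("attachment_type"), **raw)
# equivalent to CachedAttachment(type="event.minidump", name="upload.dmp", data=b"...")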
Example #4
    def test_attachment_outcomes(self):
        manager = EventManager(make_event(message="foo"), project=self.project)
        manager.normalize()

        a1 = CachedAttachment(name="a1", data=b"hello")
        a2 = CachedAttachment(name="a2", data=b"limited", rate_limited=True)
        a3 = CachedAttachment(name="a3", data=b"world")

        cache_key = cache_key_for_event(manager.get_data())
        attachment_cache.set(cache_key, attachments=[a1, a2, a3])

        mock_track_outcome = mock.Mock()
        with mock.patch("sentry.event_manager.track_outcome", mock_track_outcome):
            with self.feature("organizations:event-attachments"):
                manager.save(1, cache_key=cache_key)

        assert mock_track_outcome.call_count == 3

        for o in mock_track_outcome.mock_calls:
            assert o.kwargs["outcome"] == Outcome.ACCEPTED

        for o in mock_track_outcome.mock_calls[:2]:
            assert o.kwargs["category"] == DataCategory.ATTACHMENT
            assert o.kwargs["quantity"] == 5

        final = mock_track_outcome.mock_calls[2]
        assert final.kwargs["category"] == DataCategory.DEFAULT
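
The assertions rely on `unittest.mock` recording every call; `call.kwargs` (Python 3.8+) exposes the keyword arguments of each recorded call. A stripped-down, self-contained sketch of the same inspection pattern:

from unittest import mock

tracker = mock.Mock()
tracker(category="attachment", quantity=5)
tracker(category="default", quantity=1)

assert tracker.call_count == 2
assert tracker.mock_calls[0].kwargs["category"] == "attachment"
assert tracker.mock_calls[1].kwargs["quantity"] == 1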
Example #5
    def insert_data_to_database(self,
                                data,
                                start_time=None,
                                from_reprocessing=False,
                                attachments=None):
        if start_time is None:
            start_time = time()

        # we might be passed some subclasses of dict that fail dumping
        if isinstance(data, CANONICAL_TYPES):
            data = dict(data.items())

        cache_timeout = 3600
        cache_key = cache_key_for_event(data)
        default_cache.set(cache_key, data, cache_timeout)

        # Attachments will be empty or None if the "event-attachments" feature
        # is turned off. For native crash reports it will still contain the
        # crash dump (e.g. minidump) so we can load it during processing.
        if attachments is not None:
            attachment_cache.set(cache_key, attachments, cache_timeout)

        # NOTE: Project is bound to the context in most cases in production, which
        # is enough for us to do `projects:kafka-ingest` testing.
        project = self.context and self.context.project

        if project and features.has('projects:kafka-ingest', project=project):
            kafka.produce_sync(
                settings.KAFKA_PREPROCESS,
                value=json.dumps({
                    'cache_key': cache_key,
                    'start_time': start_time,
                    'from_reprocessing': from_reprocessing,
                    'data': data,
                }),
            )
        else:
            task = from_reprocessing and \
                preprocess_event_from_reprocessing or preprocess_event
            task.delay(cache_key=cache_key,
                       start_time=start_time,
                       event_id=data['event_id'])
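
The Kafka branch serializes the whole preprocessing message with `json.dumps` before producing it; a sketch of that round trip with purely illustrative values (topic configuration and consumer wiring are outside this snippet):

import json

message = {
    "cache_key": "e:aabbccddeeff00112233445566778899:1",  # illustrative
    "start_time": 1500000000.0,
    "from_reprocessing": False,
    "data": {"event_id": "aabbccddeeff00112233445566778899", "project": 1},
}
payload = json.dumps(message)          # what gets produced to KAFKA_PREPROCESS
assert json.loads(payload) == message  # the consumer decodes the same structure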
Example #6
    def insert_data_to_database(self, data, start_time=None,
                                from_reprocessing=False, attachments=None):
        if start_time is None:
            start_time = time()

        # we might be passed some subclasses of dict that fail dumping
        if isinstance(data, CANONICAL_TYPES):
            data = dict(data.items())

        cache_timeout = 3600
        cache_key = cache_key_for_event(data)
        default_cache.set(cache_key, data, cache_timeout)

        # Attachments will be empty or None if the "event-attachments" feature
        # is turned off. For native crash reports it will still contain the
        # crash dump (e.g. minidump) so we can load it during processing.
        if attachments is not None:
            attachment_cache.set(cache_key, attachments, cache_timeout)

        # NOTE: Project is bound to the context in most cases in production, which
        # is enough for us to do `projects:kafka-ingest` testing.
        project = self.context and self.context.project

        if project and features.has('projects:kafka-ingest', project=project):
            kafka.produce_sync(
                settings.KAFKA_PREPROCESS,
                value=json.dumps({
                    'cache_key': cache_key,
                    'start_time': start_time,
                    'from_reprocessing': from_reprocessing,
                    'data': data,
                }),
            )
        else:
            task = from_reprocessing and \
                preprocess_event_from_reprocessing or preprocess_event
            task.delay(cache_key=cache_key, start_time=start_time,
                       event_id=data['event_id'])
Example #7
    def insert_data_to_database(self, data, start_time=None,
                                from_reprocessing=False, attachments=None):
        if start_time is None:
            start_time = time()

        # we might be passed some subclasses of dict that fail dumping
        if isinstance(data, DOWNGRADE_DATA_TYPES):
            data = dict(data.items())

        cache_timeout = 3600
        cache_key = u'e:{1}:{0}'.format(data['project'], data['event_id'])
        default_cache.set(cache_key, data, cache_timeout)

        # Attachments will be empty or None if the "event-attachments" feature
        # is turned off. For native crash reports it will still contain the
        # crash dump (e.g. minidump) so we can load it during processing.
        if attachments is not None:
            attachment_cache.set(cache_key, attachments, cache_timeout)

        task = from_reprocessing and \
            preprocess_event_from_reprocessing or preprocess_event
        task.delay(cache_key=cache_key, start_time=start_time,
                   event_id=data['event_id'])
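
Here the cache key is built inline instead of via `cache_key_for_event`; note the swapped positional indices, which place the event id before the project. A quick sketch of the resulting string (values are illustrative):

data = {"project": 42, "event_id": "aabbccddeeff00112233445566778899"}
cache_key = u'e:{1}:{0}'.format(data['project'], data['event_id'])
# -> 'e:aabbccddeeff00112233445566778899:42'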
Example #8
def reprocess_event(project_id, event_id, start_time):

    from sentry.event_manager import set_tag
    from sentry.tasks.store import preprocess_event_from_reprocessing
    from sentry.ingest.ingest_consumer import CACHE_TIMEOUT

    # Take unprocessed data from old event and save it as unprocessed data
    # under a new event ID. The second step happens in pre-process. We could
    # save the "original event ID" instead and get away with writing less to
    # nodestore, but doing it this way makes the logic slightly simpler.
    node_id = _generate_unprocessed_event_node_id(project_id=project_id,
                                                  event_id=event_id)

    with sentry_sdk.start_span(op="reprocess_events.nodestore.get"):
        data = nodestore.get(node_id)

    with sentry_sdk.start_span(op="reprocess_events.eventstore.get"):
        event = eventstore.get_event_by_id(project_id, event_id)

    if event is None:
        logger.error("reprocessing2.event.not_found",
                     extra={
                         "project_id": project_id,
                         "event_id": event_id
                     })
        return

    if data is None:
        logger.error(
            "reprocessing2.reprocessing_nodestore.not_found",
            extra={
                "project_id": project_id,
                "event_id": event_id
            },
        )
        # We have no real data for reprocessing. We assume this event goes
        # straight to save_event, and hope that the event data can be
        # reingested like that. It's better than data loss.
        #
        # XXX: Ideally we would run a "save-lite" for this that only updates
        # the group ID in-place. Like a snuba merge message.
        data = dict(event.data)

    # Step 1: Fix up the event payload for reprocessing and put it in event
    # cache/event_processing_store
    set_tag(data, "original_group_id", event.group_id)
    cache_key = event_processing_store.store(data)

    # Step 2: Copy attachments into attachment cache
    queryset = models.EventAttachment.objects.filter(
        project_id=project_id, event_id=event_id).select_related("file")

    attachment_objects = []

    for attachment_id, attachment in enumerate(queryset):
        with sentry_sdk.start_span(
                op="reprocess_event._copy_attachment_into_cache") as span:
            span.set_data("attachment_id", attachment.id)
            attachment_objects.append(
                _copy_attachment_into_cache(
                    attachment_id=attachment_id,
                    attachment=attachment,
                    cache_key=cache_key,
                    cache_timeout=CACHE_TIMEOUT,
                ))

    if attachment_objects:
        with sentry_sdk.start_span(op="reprocess_event.set_attachment_meta"):
            attachment_cache.set(cache_key,
                                 attachments=attachment_objects,
                                 timeout=CACHE_TIMEOUT)

    preprocess_event_from_reprocessing(cache_key=cache_key,
                                       start_time=start_time,
                                       event_id=event_id)
Example #9
def process_event(message, projects):
    payload = message["payload"]
    start_time = float(message["start_time"])
    event_id = message["event_id"]
    project_id = int(message["project_id"])
    remote_addr = message.get("remote_addr")
    attachments = message.get("attachments") or ()

    # check that we haven't already processed this event (a previous instance of the forwarder
    # died before it could commit the event queue offset)
    deduplication_key = "ev:{}:{}".format(project_id, event_id)
    if cache.get(deduplication_key) is not None:
        logger.warning(
            "pre-process-forwarder detected a duplicated event"
            " with id:%s for project:%s.",
            event_id,
            project_id,
        )
        return  # message already processed, do not reprocess

    try:
        project = projects[project_id]
    except KeyError:
        logger.error("Project for ingested event does not exist: %s",
                     project_id)
        return

    # Parse the JSON payload. This is required to compute the cache key and
    # call process_event. The payload will be put into Kafka raw, to avoid
    # serializing it again.
    # XXX: Do not use CanonicalKeyDict here. This may break preprocess_event
    # which assumes that data passed in is a raw dictionary.
    data = json.loads(payload)

    cache_key = cache_key_for_event(data)
    default_cache.set(cache_key, data, CACHE_TIMEOUT)

    if attachments:
        attachment_objects = [
            CachedAttachment(type=attachment.pop("attachment_type"),
                             **attachment) for attachment in attachments
        ]

        attachment_cache.set(cache_key,
                             attachments=attachment_objects,
                             timeout=CACHE_TIMEOUT)

    # Preprocess this event, which spawns either process_event or
    # save_event. Pass data explicitly to avoid fetching it again from the
    # cache.
    preprocess_event(cache_key=cache_key,
                     data=data,
                     start_time=start_time,
                     event_id=event_id,
                     project=project)

    # remember for 1 hour that we saved this event (deduplication protection)
    cache.set(deduplication_key, "", CACHE_TIMEOUT)

    # emit event_accepted once everything is done
    event_accepted.send_robust(ip=remote_addr,
                               data=data,
                               project=project,
                               sender=process_event)
Example #10
def _do_process_event(message, projects):
    payload = message["payload"]
    start_time = float(message["start_time"])
    event_id = message["event_id"]
    project_id = int(message["project_id"])
    remote_addr = message.get("remote_addr")
    attachments = message.get("attachments") or ()

    # check that we haven't already processed this event (a previous instance of the forwarder
    # died before it could commit the event queue offset)
    #
    # XXX(markus): I believe this code is extremely broken:
    #
    # * it practically uses memcached in prod which has no consistency
    #   guarantees (no idea how we don't run into issues there)
    #
    # * a TTL of 1h basically doesn't guarantee any deduplication at all. It
    #   just guarantees a good error message... for one hour.
    #
    # This code has been ripped from the old python store endpoint. We're
    # keeping it around because it does provide some protection against
    # reprocessing good events if a single consumer is in a restart loop.
    deduplication_key = f"ev:{project_id}:{event_id}"
    if cache.get(deduplication_key) is not None:
        logger.warning(
            "pre-process-forwarder detected a duplicated event" " with id:%s for project:%s.",
            event_id,
            project_id,
        )
        return  # message already processed, do not reprocess

    try:
        project = projects[project_id]
    except KeyError:
        logger.error("Project for ingested event does not exist: %s", project_id)
        return

    # Parse the JSON payload. This is required to compute the cache key and
    # call process_event. The payload will be put into Kafka raw, to avoid
    # serializing it again.
    # XXX: Do not use CanonicalKeyDict here. This may break preprocess_event
    # which assumes that data passed in is a raw dictionary.
    data = json.loads(payload)

    cache_key = event_processing_store.store(data)

    if attachments:
        attachment_objects = [
            CachedAttachment(type=attachment.pop("attachment_type"), **attachment)
            for attachment in attachments
        ]

        attachment_cache.set(cache_key, attachments=attachment_objects, timeout=CACHE_TIMEOUT)

    # Preprocess this event, which spawns either process_event or
    # save_event. Pass data explicitly to avoid fetching it again from the
    # cache.
    with sentry_sdk.start_span(op="ingest_consumer.process_event.preprocess_event"):
        preprocess_event(
            cache_key=cache_key,
            data=data,
            start_time=start_time,
            event_id=event_id,
            project=project,
        )

    # remember for 1 hour that we saved this event (deduplication protection)
    cache.set(deduplication_key, "", CACHE_TIMEOUT)

    # emit event_accepted once everything is done
    event_accepted.send_robust(ip=remote_addr, data=data, project=project, sender=process_event)
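
The deduplication guard is a best-effort cache sentinel: a key is set after a successful save and any message whose key is already present is skipped. A minimal sketch of the pattern, assuming a Django-style cache backend with `get`/`set(key, value, timeout)`; the key format and TTL follow the code above:

from django.core.cache import cache  # assumption: Django-style cache backend

CACHE_TIMEOUT = 3600  # "1 hour", per the comment above

def already_processed(project_id, event_id):
    return cache.get(f"ev:{project_id}:{event_id}") is not None

def mark_processed(project_id, event_id):
    # best effort only: memcached-style backends give no consistency guarantees
    cache.set(f"ev:{project_id}:{event_id}", "", CACHE_TIMEOUT)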
Example #11
def reprocess_event(project_id, event_id, start_time):

    from sentry.tasks.store import preprocess_event_from_reprocessing
    from sentry.ingest.ingest_consumer import CACHE_TIMEOUT

    with sentry_sdk.start_span(op="reprocess_events.nodestore.get"):
        node_id = Event.generate_node_id(project_id, event_id)
        data = nodestore.get(node_id, subkey="unprocessed")
        if data is None:
            node_id = _generate_unprocessed_event_node_id(project_id=project_id, event_id=event_id)
            data = nodestore.get(node_id)

    with sentry_sdk.start_span(op="reprocess_events.eventstore.get"):
        event = eventstore.get_event_by_id(project_id, event_id)

    if event is None:
        logger.error(
            "reprocessing2.event.not_found", extra={"project_id": project_id, "event_id": event_id}
        )
        return

    if data is None:
        logger.error(
            "reprocessing2.reprocessing_nodestore.not_found",
            extra={"project_id": project_id, "event_id": event_id},
        )
        # We have no real data for reprocessing. We assume this event goes
        # straight to save_event, and hope that the event data can be
        # reingested like that. It's better than data loss.
        #
        # XXX: Ideally we would run a "save-lite" for this that only updates
        # the group ID in-place. Like a snuba merge message.
        data = dict(event.data)

    # Step 1: Fix up the event payload for reprocessing and put it in event
    # cache/event_processing_store
    set_path(data, "contexts", "reprocessing", "original_issue_id", value=event.group_id)
    cache_key = event_processing_store.store(data)

    # Step 2: Copy attachments into attachment cache
    queryset = models.EventAttachment.objects.filter(project_id=project_id, event_id=event_id)
    files = {f.id: f for f in models.File.objects.filter(id__in=[ea.file_id for ea in queryset])}

    attachment_objects = []

    for attachment_id, attachment in enumerate(queryset):
        with sentry_sdk.start_span(op="reprocess_event._copy_attachment_into_cache") as span:
            span.set_data("attachment_id", attachment.id)
            attachment_objects.append(
                _copy_attachment_into_cache(
                    attachment_id=attachment_id,
                    attachment=attachment,
                    file=files[attachment.file_id],
                    cache_key=cache_key,
                    cache_timeout=CACHE_TIMEOUT,
                )
            )

    if attachment_objects:
        with sentry_sdk.start_span(op="reprocess_event.set_attachment_meta"):
            attachment_cache.set(cache_key, attachments=attachment_objects, timeout=CACHE_TIMEOUT)

    preprocess_event_from_reprocessing(
        cache_key=cache_key, start_time=start_time, event_id=event_id
    )
Example #12
def reprocess_event(project_id, event_id, start_time):

    from sentry.ingest.ingest_consumer import CACHE_TIMEOUT
    from sentry.lang.native.processing import get_required_attachment_types
    from sentry.tasks.store import preprocess_event_from_reprocessing

    with sentry_sdk.start_span(op="reprocess_events.nodestore.get"):
        node_id = Event.generate_node_id(project_id, event_id)
        data = nodestore.get(node_id, subkey="unprocessed")
        if data is None:
            node_id = _generate_unprocessed_event_node_id(project_id=project_id, event_id=event_id)
            data = nodestore.get(node_id)

    if data is None:
        raise CannotReprocess("reprocessing_nodestore.not_found")

    with sentry_sdk.start_span(op="reprocess_events.eventstore.get"):
        event = eventstore.get_event_by_id(project_id, event_id)

    if event is None:
        raise CannotReprocess("event.not_found")

    required_attachment_types = get_required_attachment_types(data)
    attachments = list(
        models.EventAttachment.objects.filter(
            project_id=project_id, event_id=event_id, type__in=list(required_attachment_types)
        )
    )
    missing_attachment_types = required_attachment_types - {ea.type for ea in attachments}

    if missing_attachment_types:
        raise CannotReprocess(
            f"attachment.not_found.{'_and_'.join(sorted(missing_attachment_types))}"
        )

    # Step 1: Fix up the event payload for reprocessing and put it in event
    # cache/event_processing_store
    set_path(data, "contexts", "reprocessing", "original_issue_id", value=event.group_id)
    set_path(
        data, "contexts", "reprocessing", "original_primary_hash", value=event.get_primary_hash()
    )
    cache_key = event_processing_store.store(data)

    # Step 2: Copy attachments into attachment cache. Note that we can only
    # consider minidumps because filestore just stays as-is after reprocessing
    # (we simply update group_id on the EventAttachment models in post_process)
    attachment_objects = []

    files = {f.id: f for f in models.File.objects.filter(id__in=[ea.file_id for ea in attachments])}

    for attachment_id, attachment in enumerate(attachments):
        with sentry_sdk.start_span(op="reprocess_event._copy_attachment_into_cache") as span:
            span.set_data("attachment_id", attachment.id)
            attachment_objects.append(
                _copy_attachment_into_cache(
                    attachment_id=attachment_id,
                    attachment=attachment,
                    file=files[attachment.file_id],
                    cache_key=cache_key,
                    cache_timeout=CACHE_TIMEOUT,
                )
            )

    if attachment_objects:
        with sentry_sdk.start_span(op="reprocess_event.set_attachment_meta"):
            attachment_cache.set(cache_key, attachments=attachment_objects, timeout=CACHE_TIMEOUT)

    preprocess_event_from_reprocessing(
        cache_key=cache_key,
        start_time=start_time,
        event_id=event_id,
        data=data,
    )
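
The missing-attachment check is a plain set difference between the types native processing requires and the types stored for the event; a standalone sketch with illustrative type names:

required = {"event.minidump", "event.applecrashreport"}  # illustrative
stored = {"event.minidump"}

missing = required - stored
if missing:
    reason = f"attachment.not_found.{'_and_'.join(sorted(missing))}"
    # -> 'attachment.not_found.event.applecrashreport'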
Example #13
def reprocess_event(project_id, event_id, start_time):
    node_id = _generate_unprocessed_event_node_id(project_id=project_id,
                                                  event_id=event_id)

    with sentry_sdk.start_span(op="reprocess_events.nodestore.get"):
        data = nodestore.get(node_id)
    if data is None:
        return

    from sentry.event_manager import set_tag
    from sentry.tasks.store import preprocess_event_from_reprocessing
    from sentry.ingest.ingest_consumer import CACHE_TIMEOUT

    # Take unprocessed data from old event and save it as unprocessed data
    # under a new event ID. The second step happens in pre-process. We could
    # save the "original event ID" instead and get away with writing less to
    # nodestore, but doing it this way makes the logic slightly simpler.

    # Step 1: Fix up the event payload for reprocessing and put it in event
    # cache/event_processing_store
    orig_event_id = data["event_id"]
    set_tag(data, "original_event_id", orig_event_id)

    event = eventstore.get_event_by_id(project_id, orig_event_id)
    if event is None:
        return

    set_tag(data, "original_group_id", event.group_id)

    # XXX: reuse event IDs
    event_id = data["event_id"] = uuid.uuid4().hex

    cache_key = event_processing_store.store(data)

    # Step 2: Copy attachments into attachment cache
    queryset = models.EventAttachment.objects.filter(
        project_id=project_id, event_id=orig_event_id).select_related("file")

    attachment_objects = []

    for attachment_id, attachment in enumerate(queryset):
        with sentry_sdk.start_span(
                op="reprocess_event._copy_attachment_into_cache") as span:
            span.set_data("attachment_id", attachment.id)
            attachment_objects.append(
                _copy_attachment_into_cache(
                    attachment_id=attachment_id,
                    attachment=attachment,
                    cache_key=cache_key,
                    cache_timeout=CACHE_TIMEOUT,
                ))

    if attachment_objects:
        with sentry_sdk.start_span(op="reprocess_event.set_attachment_meta"):
            attachment_cache.set(cache_key,
                                 attachments=attachment_objects,
                                 timeout=CACHE_TIMEOUT)

    preprocess_event_from_reprocessing(cache_key=cache_key,
                                       start_time=start_time,
                                       event_id=event_id)
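
This older variant mints a fresh event id for the reprocessed copy and keeps the original ids around as tags; a tiny sketch of the id bookkeeping (the payload here is illustrative):

import uuid

data = {"event_id": "aabbccddeeff00112233445566778899"}

orig_event_id = data["event_id"]
data["event_id"] = uuid.uuid4().hex  # new 32-character hex id for the reprocessed copy
# the original event id and group id survive as the original_event_id /
# original_group_id tags written via set_tag() above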
Example #14
def reprocess_event(project_id, event_id, start_time):

    from sentry.ingest.ingest_consumer import CACHE_TIMEOUT
    from sentry.tasks.store import preprocess_event_from_reprocessing

    reprocessable_event = pull_event_data(project_id, event_id)

    data = reprocessable_event.data
    event = reprocessable_event.event
    attachments = reprocessable_event.attachments

    # Step 1: Fix up the event payload for reprocessing and put it in event
    # cache/event_processing_store
    set_path(data,
             "contexts",
             "reprocessing",
             "original_issue_id",
             value=event.group_id)
    set_path(data,
             "contexts",
             "reprocessing",
             "original_primary_hash",
             value=event.get_primary_hash())
    cache_key = event_processing_store.store(data)

    # Step 2: Copy attachments into attachment cache. Note that we can only
    # consider minidumps because filestore just stays as-is after reprocessing
    # (we simply update group_id on the EventAttachment models in post_process)
    attachment_objects = []

    files = {
        f.id: f
        for f in models.File.objects.filter(
            id__in=[ea.file_id for ea in attachments])
    }

    for attachment_id, attachment in enumerate(attachments):
        with sentry_sdk.start_span(
                op="reprocess_event._copy_attachment_into_cache") as span:
            span.set_data("attachment_id", attachment.id)
            attachment_objects.append(
                _copy_attachment_into_cache(
                    attachment_id=attachment_id,
                    attachment=attachment,
                    file=files[attachment.file_id],
                    cache_key=cache_key,
                    cache_timeout=CACHE_TIMEOUT,
                ))

    if attachment_objects:
        with sentry_sdk.start_span(op="reprocess_event.set_attachment_meta"):
            attachment_cache.set(cache_key,
                                 attachments=attachment_objects,
                                 timeout=CACHE_TIMEOUT)

    preprocess_event_from_reprocessing(
        cache_key=cache_key,
        start_time=start_time,
        event_id=event_id,
        data=data,
    )
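
`set_path` writes a value into a nested dict, creating intermediate dicts on the way; a hedged sketch of the resulting structure, assuming it behaves like chained `setdefault` calls (which matches how it is used here):

data = {}

# roughly what set_path(data, "contexts", "reprocessing", "original_issue_id", value=123) produces
data.setdefault("contexts", {}).setdefault("reprocessing", {})["original_issue_id"] = 123
# data == {"contexts": {"reprocessing": {"original_issue_id": 123}}}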