    def test_extract_required(self):
        now = datetime.utcnow()
        event = {
            "event_id": "1" * 32,
            "project_id": 100,
            "group_id": 10,
            "datetime": now.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
        }
        output = {}
        extract_base(output, event)
        output["retention_days"] = enforce_retention(
            event,
            datetime.strptime(event["datetime"], settings.PAYLOAD_DATETIME_FORMAT),
        )
        processor = (
            enforce_table_writer(self.dataset).get_stream_loader().get_processor()
        )
        processor.extract_required(output, event)
        assert output == {
            "event_id": "11111111111111111111111111111111",
            "project_id": 100,
            "group_id": 10,
            "timestamp": now,
            "retention_days": settings.DEFAULT_RETENTION_DAYS,
        }
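
The equality assertion on "timestamp" relies on the serialization round-tripping: formatting the naive UTC datetime and re-parsing it with the same pattern must return the original value, which also implies settings.PAYLOAD_DATETIME_FORMAT matches the strftime pattern used above (an assumption in this sketch):

    from datetime import datetime

    FMT = "%Y-%m-%dT%H:%M:%S.%fZ"  # assumed to equal settings.PAYLOAD_DATETIME_FORMAT
    now = datetime.utcnow()
    assert datetime.strptime(now.strftime(FMT), FMT) == now  # lossless round-trip
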
    def __init_span(self, event: Mapping[str, Any]) -> MutableMapping[str, Any]:
        """
        Initializes the fields that are the same for all spans within a transaction.
        """
        data = event["data"]
        transaction_ctx = data["contexts"]["trace"]

        return {
            "deleted": 0,
            "project_id": event["project_id"],
            "transaction_id": str(uuid.UUID(event["event_id"])),
            "retention_days": enforce_retention(
                event, datetime.fromtimestamp(data["timestamp"])
            ),
            "transaction_span_id": int(transaction_ctx["span_id"], 16),
            "trace_id": str(uuid.UUID(transaction_ctx["trace_id"])),
            "transaction_name": _unicodify(data.get("transaction") or ""),
        }
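
The identifier handling above uses two standard-library conversions: the hexadecimal span_id becomes an integer via int(..., 16), and 32-character event/trace IDs are normalized to canonical dashed form through uuid.UUID. For example (illustrative values):

    import uuid

    int("bd429c44b67a3eb1", 16)   # span_id hex string parsed into an integer
    str(uuid.UUID("1" * 32))      # '11111111-1111-1111-1111-111111111111'
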
Example #3
    def process_insert(
        self,
        event: Mapping[str, Any],
        metadata: Optional[KafkaMessageMetadata] = None
    ) -> Optional[Mapping[str, Any]]:
        if not self._should_process(event):
            return None

        processed = {"deleted": 0}
        extract_project_id(processed, event)
        self._extract_event_id(processed, event)
        processed["retention_days"] = enforce_retention(
            event,
            datetime.strptime(event["datetime"],
                              settings.PAYLOAD_DATETIME_FORMAT),
        )

        self.extract_required(processed, event)

        data = event.get("data", {})
        # HACK: https://sentry.io/sentry/snuba/issues/802102397/
        if not data:
            logger.error("No data for event: %s", event, exc_info=True)
            return None
        self.extract_common(processed, event, metadata)
        self.extract_custom(processed, event, metadata)

        sdk = data.get("sdk", None) or {}
        self.extract_sdk(processed, sdk)

        tags = _as_dict_safe(data.get("tags", None))
        self.extract_promoted_tags(processed, tags)
        self.extract_tags_custom(processed, event, tags, metadata)

        contexts = data.get("contexts", None) or {}
        self.extract_promoted_contexts(processed, contexts, tags)
        self.extract_contexts_custom(processed, event, contexts, metadata)

        processed["contexts.key"], processed[
            "contexts.value"] = extract_extra_contexts(contexts)
        processed["tags.key"], processed["tags.value"] = extract_extra_tags(
            tags)
        processed["_tags_flattened"] = flatten_nested_field(
            processed["tags.key"], processed["tags.value"])

        exception = (data.get("exception",
                              data.get("sentry.interfaces.Exception", None))
                     or {})
        stacks = exception.get("values", None) or []
        self.extract_stacktraces(processed, stacks)

        if metadata is not None:
            processed["offset"] = metadata.offset
            processed["partition"] = metadata.partition

        return processed
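
extract_extra_tags, extract_extra_contexts, and flatten_nested_field are not shown here. A minimal sketch of the parallel key/value layout the processed row ends up with, assuming the helpers simply split a mapping into sorted key and value arrays (illustrative tag values, not the real implementation):

    tags = {"environment": "prod", "level": "error"}
    keys = sorted(tags)               # ["environment", "level"] -> processed["tags.key"]
    values = [tags[k] for k in keys]  # ["prod", "error"]        -> processed["tags.value"]
    # "_tags_flattened" packs the same pairs into one searchable string;
    # the exact encoding depends on flatten_nested_field.
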
Example #4
    def process_message(
            self, message: Mapping[str, Any],
            metadata: KafkaMessageMetadata) -> Optional[ProcessedMessage]:
        if not self._should_process(message):
            return None

        timestamp = _ensure_valid_date(
            datetime.utcfromtimestamp(message["timestamp"]))
        assert timestamp is not None

        keys = []
        values = []
        tags = message["tags"]
        assert isinstance(tags, Mapping)
        for key, value in sorted(tags.items()):
            assert key.isdigit()
            keys.append(int(key))
            assert isinstance(value, int)
            values.append(value)

        try:
            retention_days = enforce_retention(message["retention_days"],
                                               timestamp)
        except EventTooOld:
            return None

        processed = [
            {
                "org_id": _literal(message["org_id"]),
                "project_id": _literal(message["project_id"]),
                "metric_id": _literal(message["metric_id"]),
                "timestamp": _call(
                    "toDateTime",
                    (_literal(
                        timestamp_to_bucket(timestamp, granularity).isoformat()
                    ),),
                ),
                "tags.key": _array_literal(keys),
                "tags.value": _array_literal(values),
                **self._process_values(message),
                "retention_days": _literal(retention_days),
                "granularity": _literal(granularity),
            }
            for granularity in self.GRANULARITIES_SECONDS
        ]
        return AggregateInsertBatch(processed, None)
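
The comprehension above emits one aggregate row per configured granularity, with each row's timestamp snapped to the start of its bucket. timestamp_to_bucket is not shown; a sketch of the bucketing arithmetic it presumably performs on epoch seconds:

    def bucket_epoch(epoch_seconds: int, granularity_secs: int) -> int:
        # assumed behavior: floor the timestamp to the bucket boundary
        return epoch_seconds - epoch_seconds % granularity_secs

    bucket_epoch(1614593862, 3600)   # -> 1614592800, the top of the containing hour
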
Example #5
    def process_message(
            self, message: Mapping[str, Any],
            metadata: KafkaMessageMetadata) -> Optional[ProcessedMessage]:
        # TODO: Support messages with multiple buckets

        if not self._should_process(message):
            return None

        timestamp = _ensure_valid_date(
            datetime.utcfromtimestamp(message["timestamp"]))
        assert timestamp is not None, "Invalid timestamp"

        keys = []
        values = []
        tags = message["tags"]
        assert isinstance(tags, Mapping), "Invalid tags type"
        for key, value in sorted(tags.items()):
            assert key.isdigit() and isinstance(value, int), "Tag key/value invalid"
            keys.append(int(key))
            values.append(value)

        mat_version = (DISABLED_MATERIALIZATION_VERSION
                       if settings.WRITE_METRICS_AGG_DIRECTLY else
                       settings.ENABLED_MATERIALIZATION_VERSION)

        try:
            retention_days = enforce_retention(message["retention_days"],
                                               timestamp)
        except EventTooOld:
            return None

        processed = {
            "org_id": message["org_id"],
            "project_id": message["project_id"],
            "metric_id": message["metric_id"],
            "timestamp": timestamp,
            "tags.key": keys,
            "tags.value": values,
            **self._process_values(message),
            "materialization_version": mat_version,
            "retention_days": retention_days,
            "partition": metadata.partition,
            "offset": metadata.offset,
        }
        return InsertBatch([processed], None)
Example #6
    def _structure_and_validate_message(
        self, message: Mapping[Any, Any]
    ) -> Optional[Tuple[EventDict, RetentionDays]]:

        event = message
        data = event["data"]

        try:
            # We are purposely using a naive datetime here to work with the
            # rest of the codebase. We can be confident that clients are only
            # sending UTC dates.
            retention_days = enforce_retention(
                message["retention_days"],
                datetime.utcfromtimestamp(data["timestamp"]))
        except EventTooOld:
            return None

        return event, retention_days
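
The comment above relies on datetime.utcfromtimestamp returning a naive datetime (tzinfo is None) whose wall-clock value is nonetheless UTC, so it can be compared against the naive dates used elsewhere in the codebase. For example:

    from datetime import datetime

    dt = datetime.utcfromtimestamp(1614593862)
    dt           # datetime.datetime(2021, 3, 1, 10, 17, 42)
    dt.tzinfo    # None: naive, but expressed in UTC
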
    def process_message(self, message, metadata) -> Optional[ProcessedMessage]:
        processed = {"deleted": 0}
        if not (isinstance(message, (list, tuple)) and len(message) >= 2):
            return None
        version = message[0]
        if version not in (0, 1, 2):
            return None
        type_, event = message[1:3]
        if type_ != "insert":
            return None

        data = event["data"]
        event_type = data.get("type")
        if event_type != "transaction":
            return None
        extract_base(processed, event)
        processed["retention_days"] = enforce_retention(
            event,
            datetime.fromtimestamp(data["timestamp"]),
        )
        if not data.get("contexts", {}).get("trace"):
            return None

        transaction_ctx = data["contexts"]["trace"]
        trace_id = transaction_ctx["trace_id"]
        try:
            processed["event_id"] = str(uuid.UUID(processed["event_id"]))
            processed["trace_id"] = str(uuid.UUID(trace_id))
            processed["span_id"] = int(transaction_ctx["span_id"], 16)
            processed["transaction_op"] = _unicodify(
                transaction_ctx.get("op") or "")
            processed["transaction_name"] = _unicodify(
                data.get("transaction") or "")
            processed["start_ts"], processed["start_ms"] = self.__extract_timestamp(
                data["start_timestamp"])

            status = transaction_ctx.get("status", None)
            if status:
                int_status = SPAN_STATUS_NAME_TO_CODE.get(
                    status, UNKNOWN_SPAN_STATUS)
            else:
                int_status = UNKNOWN_SPAN_STATUS

            processed["transaction_status"] = int_status

            if data["timestamp"] - data["start_timestamp"] < 0:
                # Seems we have some negative durations in the DB
                metrics.increment("negative_duration")
        except Exception:
            # all these fields are required but we saw some events go through here
            # in the past.  For now bail.
            return
        processed["finish_ts"], processed[
            "finish_ms"] = self.__extract_timestamp(data["timestamp"], )

        duration_secs = (processed["finish_ts"] -
                         processed["start_ts"]).total_seconds()
        processed["duration"] = max(int(duration_secs * 1000), 0)

        processed["platform"] = _unicodify(event["platform"])

        tags = _as_dict_safe(data.get("tags", None))
        processed["tags.key"], processed["tags.value"] = extract_extra_tags(
            tags)
        processed["_tags_flattened"] = flatten_nested_field(
            processed["tags.key"], processed["tags.value"])

        promoted_tags = {
            col: tags[col]
            for col in self.PROMOTED_TAGS if col in tags
        }
        processed["release"] = promoted_tags.get(
            "sentry:release",
            event.get("release"),
        )
        processed["environment"] = promoted_tags.get("environment")

        contexts = _as_dict_safe(data.get("contexts", None))

        user_dict = data.get("user", data.get("sentry.interfaces.User",
                                              None)) or {}
        geo = user_dict.get("geo", None) or {}
        if "geo" not in contexts and isinstance(geo, dict):
            contexts["geo"] = geo

        measurements = data.get("measurements")
        if measurements is not None:
            try:
                (
                    processed["measurements.key"],
                    processed["measurements.value"],
                ) = extract_nested(measurements,
                                   lambda value: float(value["value"]))
            except Exception:
                # Not failing the event in this case just yet, because we are still
                # developing this feature.
                logger.error(
                    "Invalid measurements field.",
                    extra={"measurements": measurements},
                    exc_info=True,
                )
        request = data.get("request", data.get("sentry.interfaces.Http",
                                               None)) or {}
        http_data: MutableMapping[str, Any] = {}
        extract_http(http_data, request)
        processed["http_method"] = http_data["http_method"]
        processed["http_referer"] = http_data["http_referer"]

        processed["contexts.key"], processed[
            "contexts.value"] = extract_extra_contexts(contexts)
        processed["_contexts_flattened"] = flatten_nested_field(
            processed["contexts.key"], processed["contexts.value"])

        processed["dist"] = _unicodify(
            promoted_tags.get("sentry:dist", data.get("dist")), )

        user_data = {}
        extract_user(user_data, user_dict)
        processed["user"] = promoted_tags.get("sentry:user", "")
        processed["user_name"] = user_data["username"]
        processed["user_id"] = user_data["user_id"]
        processed["user_email"] = user_data["email"]
        ip_address = _ensure_valid_ip(user_data["ip_address"])

        if ip_address:
            if ip_address.version == 4:
                processed["ip_address_v4"] = str(ip_address)
            elif ip_address.version == 6:
                processed["ip_address_v6"] = str(ip_address)

        processed["partition"] = metadata.partition
        processed["offset"] = metadata.offset

        sdk = data.get("sdk", None) or {}
        processed["sdk_name"] = _unicodify(sdk.get("name") or "")
        processed["sdk_version"] = _unicodify(sdk.get("version") or "")

        if processed["sdk_name"] == "":
            metrics.increment("missing_sdk_name")
        if processed["sdk_version"] == "":
            metrics.increment("missing_sdk_version")

        return InsertBatch([processed])
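
The duration step converts the start/finish datetimes into integer milliseconds and clamps negative values (which the negative_duration counter shows do occur) to zero. A worked example:

    from datetime import datetime, timedelta

    start = datetime(2021, 3, 1, 10, 0, 0)
    finish = start + timedelta(milliseconds=250)
    max(int((finish - start).total_seconds() * 1000), 0)   # 250
    max(int((start - finish).total_seconds() * 1000), 0)   # 0: negatives clamp to zero
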
    def process_message(
            self, message: Tuple[int, str, Any],
            metadata: KafkaMessageMetadata) -> Optional[ProcessedMessage]:
        processed: MutableMapping[str, Any] = {"deleted": 0}
        if not (isinstance(message, (list, tuple)) and len(message) >= 2):
            return None
        version = message[0]
        if version not in (0, 1, 2):
            return None
        type_, event = message[1:3]
        if type_ != "insert":
            return None

        data = event["data"]
        event_type = data.get("type")
        if event_type != "transaction":
            return None
        extract_base(processed, event)

        try:
            # We are purposely using a naive datetime here to work with the
            # rest of the codebase. We can be confident that clients are only
            # sending UTC dates.
            processed["retention_days"] = enforce_retention(
                event,
                datetime.utcfromtimestamp(data["timestamp"]),
            )
        except EventTooOld:
            return None

        if not data.get("contexts", {}).get("trace"):
            return None

        transaction_ctx = data["contexts"]["trace"]
        trace_id = transaction_ctx["trace_id"]
        processed["event_id"] = str(uuid.UUID(processed["event_id"]))
        processed["trace_id"] = str(uuid.UUID(trace_id))
        processed["span_id"] = int(transaction_ctx["span_id"], 16)
        processed["transaction_op"] = _unicodify(
            transaction_ctx.get("op") or "")
        processed["transaction_name"] = _unicodify(
            data.get("transaction") or "")
        processed["start_ts"], processed[
            "start_ms"] = self.__extract_timestamp(data["start_timestamp"], )
        status = transaction_ctx.get("status", None)
        if status:
            int_status = SPAN_STATUS_NAME_TO_CODE.get(status,
                                                      UNKNOWN_SPAN_STATUS)
        else:
            int_status = UNKNOWN_SPAN_STATUS

        processed["transaction_status"] = int_status
        if data["timestamp"] - data["start_timestamp"] < 0:
            # Seems we have some negative durations in the DB
            metrics.increment("negative_duration")

        processed["finish_ts"], processed[
            "finish_ms"] = self.__extract_timestamp(data["timestamp"], )

        duration_secs = (processed["finish_ts"] -
                         processed["start_ts"]).total_seconds()
        processed["duration"] = max(int(duration_secs * 1000), 0)

        processed["platform"] = _unicodify(event["platform"])

        tags: Mapping[str, Any] = _as_dict_safe(data.get("tags", None))
        processed["tags.key"], processed["tags.value"] = extract_extra_tags(
            tags)

        promoted_tags = {
            col: tags[col]
            for col in self.PROMOTED_TAGS if col in tags
        }
        processed["release"] = promoted_tags.get(
            "sentry:release",
            event.get("release"),
        )
        processed["environment"] = promoted_tags.get("environment")

        contexts: MutableMapping[str, Any] = _as_dict_safe(
            data.get("contexts", None))

        user_dict = data.get("user", data.get("sentry.interfaces.User",
                                              None)) or {}
        geo = user_dict.get("geo", None) or {}
        if "geo" not in contexts and isinstance(geo, dict):
            contexts["geo"] = geo

        measurements = data.get("measurements")
        if measurements is not None:
            try:
                (
                    processed["measurements.key"],
                    processed["measurements.value"],
                ) = extract_nested(
                    measurements,
                    lambda value: float(value["value"])
                    if (value is not None and isinstance(
                        value.get("value"), numbers.Number)) else None,
                )
            except Exception:
                # Not failing the event in this case just yet, because we are still
                # developing this feature.
                logger.error(
                    "Invalid measurements field.",
                    extra={"measurements": measurements},
                    exc_info=True,
                )

        breakdowns = data.get("breakdowns")
        if breakdowns is not None:
            span_op_breakdowns = breakdowns.get("span_ops")
            if span_op_breakdowns is not None:
                try:
                    (
                        processed["span_op_breakdowns.key"],
                        processed["span_op_breakdowns.value"],
                    ) = extract_nested(
                        span_op_breakdowns,
                        lambda value: float(value["value"])
                        if (value is not None and isinstance(
                            value.get("value"), numbers.Number)) else None,
                    )
                except Exception:
                    # Not failing the event in this case just yet, because we are still
                    # developing this feature.
                    logger.error(
                        "Invalid breakdowns.span_ops field.",
                        extra={"span_op_breakdowns": span_op_breakdowns},
                        exc_info=True,
                    )

        request = data.get("request", data.get("sentry.interfaces.Http",
                                               None)) or {}
        http_data: MutableMapping[str, Any] = {}
        extract_http(http_data, request)
        processed["http_method"] = http_data["http_method"]
        processed["http_referer"] = http_data["http_referer"]

        skipped_contexts = settings.TRANSACT_SKIP_CONTEXT_STORE.get(
            processed["project_id"], set())
        for context in skipped_contexts:
            if context in contexts:
                del contexts[context]

        processed["contexts.key"], processed[
            "contexts.value"] = extract_extra_contexts(contexts)

        processed["dist"] = _unicodify(
            promoted_tags.get("sentry:dist", data.get("dist")), )

        user_data: MutableMapping[str, Any] = {}
        extract_user(user_data, user_dict)
        processed["user"] = promoted_tags.get("sentry:user", "")
        processed["user_name"] = user_data["username"]
        processed["user_id"] = user_data["user_id"]
        processed["user_email"] = user_data["email"]
        ip_address = _ensure_valid_ip(user_data["ip_address"])

        if ip_address:
            if ip_address.version == 4:
                processed["ip_address_v4"] = str(ip_address)
            elif ip_address.version == 6:
                processed["ip_address_v6"] = str(ip_address)

        processed["partition"] = metadata.partition
        processed["offset"] = metadata.offset

        sdk = data.get("sdk", None) or {}
        processed["sdk_name"] = _unicodify(sdk.get("name") or "")
        processed["sdk_version"] = _unicodify(sdk.get("version") or "")

        if processed["sdk_name"] == "":
            metrics.increment("missing_sdk_name")
        if processed["sdk_version"] == "":
            metrics.increment("missing_sdk_version")

        return InsertBatch([processed], None)
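
The skipped-context filtering reads a per-project set of context names from settings and drops those keys before the contexts are stored. An illustrative (hypothetical) configuration and its effect:

    TRANSACT_SKIP_CONTEXT_STORE = {100: {"device"}}   # hypothetical setting value

    contexts = {"trace": {}, "device": {}, "geo": {}}
    for context in TRANSACT_SKIP_CONTEXT_STORE.get(100, set()):
        if context in contexts:
            del contexts[context]
    # contexts -> {"trace": {}, "geo": {}}
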
Example #9
    def process_message(
            self, message: Mapping[str, Any],
            metadata: KafkaMessageMetadata) -> Optional[ProcessedMessage]:
        # some old relays accidentally emit rows without release
        if message["release"] is None:
            return None
        if message["duration"] is None:
            duration = None
        else:
            duration = _collapse_uint32(int(message["duration"] * 1000))

        # The duration column is not nullable, so MAX_UINT32 acts as the
        # sentinel for "no duration".
        if duration is None:
            duration = MAX_UINT32

        errors = _collapse_uint16(message["errors"]) or 0
        quantity = _collapse_uint32(message.get("quantity")) or 1

        # If a session ends in crashed or abnormal we want to make sure that
        # it counts as errored too, so we can count healthy and errored
        # sessions correctly.
        if message["status"] in ("crashed", "abnormal"):
            errors = max(errors, 1)

        received = _ensure_valid_date(
            datetime.utcfromtimestamp(message["received"]))
        started = _ensure_valid_date(
            datetime.utcfromtimestamp(message["started"]))

        if started is None:
            metrics.increment("empty_started_date")
        if received is None:
            metrics.increment("empty_received_date")

        processed = {
            "session_id": str(uuid.UUID(message["session_id"])),
            "distinct_id": str(uuid.UUID(message.get("distinct_id") or NIL_UUID)),
            "quantity": quantity,
            "seq": message["seq"],
            "org_id": message["org_id"],
            "project_id": message["project_id"],
            "retention_days": enforce_retention(message["retention_days"], received),
            "duration": duration,
            "status": STATUS_MAPPING[message["status"]],
            "errors": errors,
            "received": received if received is not None else datetime.now(),
            "started": started if started is not None else datetime.now(),
            "release": message["release"],
            "environment": message.get("environment") or "",
        }
        return InsertBatch([processed], None)
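
For reference, a hypothetical input message containing every field the processor reads (the values are illustrative only):

    message = {
        "session_id": "00000000-0000-0000-0000-000000000001",
        "distinct_id": None,          # falls back to NIL_UUID
        "quantity": 1,
        "seq": 0,
        "org_id": 1,
        "project_id": 100,
        "retention_days": 90,
        "duration": 1.5,              # seconds; becomes 1500 ms above
        "status": "crashed",          # bumps errors to at least 1
        "errors": 0,
        "received": 1614593862.0,     # unix timestamps
        "started": 1614593860.5,
        "release": "my-app@1.0.0",
        "environment": "production",
    }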