def process_message(self, message, metadata=None) -> Optional[ProcessedMessage]:
    # some old relays accidentally emit rows without release
    if message["release"] is None:
        return None

    if message["duration"] is None:
        duration = None
    else:
        duration = _collapse_uint32(int(message["duration"] * 1000))

    # since duration is not nullable, the max duration means no duration
    if duration is None:
        duration = MAX_UINT32

    processed = {
        "session_id": str(uuid.UUID(message["session_id"])),
        "distinct_id": str(uuid.UUID(message.get("distinct_id") or NIL_UUID)),
        "seq": message["seq"],
        "org_id": message["org_id"],
        "project_id": message["project_id"],
        "retention_days": message["retention_days"],
        "duration": duration,
        "status": STATUS_MAPPING[message["status"]],
        "errors": _collapse_uint16(message["errors"]) or 0,
        "received": _ensure_valid_date(
            datetime.utcfromtimestamp(message["received"])
        ),
        "started": _ensure_valid_date(
            datetime.utcfromtimestamp(message["started"])
        ),
        "release": message["release"],
        "environment": message.get("environment") or "",
    }
    return ProcessedMessage(action=ProcessorAction.INSERT, data=[processed])

def process_message(
    self, message: Mapping[str, Any], metadata: KafkaMessageMetadata
) -> Optional[ProcessedMessage]:
    # some old relays accidentally emit rows without release
    if message["release"] is None:
        return None

    if message["duration"] is None:
        duration = None
    else:
        duration = _collapse_uint32(int(message["duration"] * 1000))

    # since duration is not nullable, the max duration means no duration
    if duration is None:
        duration = MAX_UINT32

    errors = _collapse_uint16(message["errors"]) or 0
    quantity = _collapse_uint32(message.get("quantity")) or 1

    # If a session ends in crashed or abnormal we want to make sure that
    # it counts as errored too, so we can get the number of healthy and
    # errored sessions correctly.
    if message["status"] in ("crashed", "abnormal"):
        errors = max(errors, 1)

    received = _ensure_valid_date(
        datetime.utcfromtimestamp(message["received"])
    )
    started = _ensure_valid_date(
        datetime.utcfromtimestamp(message["started"])
    )

    if started is None:
        metrics.increment("empty_started_date")
    if received is None:
        metrics.increment("empty_received_date")

    processed = {
        "session_id": str(uuid.UUID(message["session_id"])),
        "distinct_id": str(uuid.UUID(message.get("distinct_id") or NIL_UUID)),
        "quantity": quantity,
        "seq": message["seq"],
        "org_id": message["org_id"],
        "project_id": message["project_id"],
        "retention_days": message["retention_days"],
        "duration": duration,
        "status": STATUS_MAPPING[message["status"]],
        "errors": errors,
        "received": received if received is not None else datetime.now(),
        "started": started if started is not None else datetime.now(),
        "release": message["release"],
        "environment": message.get("environment") or "",
    }
    return InsertBatch([processed], None)

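# The sessions processors above call `_collapse_uint32` / `_collapse_uint16`,
# which are not defined in this section. A minimal sketch of the assumed
# behaviour (values that do not fit the unsigned column are collapsed to
# None); the exact bounds in the real helpers may differ.

from typing import Optional

MAX_UINT16 = 2 ** 16 - 1  # assumed constant
MAX_UINT32 = 2 ** 32 - 1  # assumed constant


def _collapse_uint16(n: Optional[int]) -> Optional[int]:
    # Return None for anything that cannot be stored in a UInt16 column.
    if n is None or n < 0 or n > MAX_UINT16:
        return None
    return n


def _collapse_uint32(n: Optional[int]) -> Optional[int]:
    # Return None for anything that cannot be stored in a UInt32 column.
    if n is None or n < 0 or n > MAX_UINT32:
        return None
    return n
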
def process_message(
    self, value: Mapping[str, Any], metadata: KafkaMessageMetadata
) -> Optional[ProcessedMessage]:
    assert isinstance(value, dict)
    v_uuid = value.get("event_id")
    reason = value.get("reason")

    # Relays let arbitrary outcome reasons through to the topic. We
    # reject undesired values only in the processor so that we can
    # add new ones without having to update relays through the entire
    # chain.
    if value["outcome"] == OUTCOME_CLIENT_DISCARD:
        if reason is not None and reason not in CLIENT_DISCARD_REASONS:
            reason = None

    if (
        value["outcome"] != OUTCOME_ABUSE
    ):  # we don't care about abuse outcomes for these metrics
        if "category" not in value:
            metrics.increment("missing_category")
        if "quantity" not in value:
            metrics.increment("missing_quantity")

    message = None
    try:
        timestamp = _ensure_valid_date(
            datetime.strptime(value["timestamp"], settings.PAYLOAD_DATETIME_FORMAT),
        )
    except Exception:
        metrics.increment("bad_outcome_timestamp")
        timestamp = _ensure_valid_date(datetime.utcnow())

    try:
        message = {
            "org_id": value.get("org_id", 0),
            "project_id": value.get("project_id", 0),
            "key_id": value.get("key_id"),
            "timestamp": timestamp,
            "outcome": value["outcome"],
            "category": value.get("category", DataCategory.ERROR),
            "quantity": value.get("quantity", 1),
            "reason": _unicodify(reason),
            "event_id": str(uuid.UUID(v_uuid)) if v_uuid is not None else None,
        }
    except Exception:
        metrics.increment("bad_outcome")
        return None

    return InsertBatch([message], None)

def enforce_retention(
    retention_days: Optional[int], timestamp: Optional[datetime]
) -> int:
    if not isinstance(retention_days, int):
        retention_days = settings.DEFAULT_RETENTION_DAYS

    if settings.ENFORCE_RETENTION:
        retention_days = (
            settings.LOWER_RETENTION_DAYS
            if retention_days <= settings.LOWER_RETENTION_DAYS
            else settings.DEFAULT_RETENTION_DAYS
        )

    # This is not ideal but it should never happen anyways
    timestamp = _ensure_valid_date(timestamp)
    if timestamp is None:
        timestamp = datetime.utcnow()

    # TODO: We may need to allow for older events in the future when post
    # processing triggers are based off of Snuba. Or this branch could be put
    # behind a "backfill-only" optional switch.
    if settings.DISCARD_OLD_EVENTS and timestamp < (
        datetime.utcnow() - timedelta(days=retention_days)
    ):
        raise EventTooOld

    return retention_days

def process_message(
    self, value: Mapping[str, Any], metadata: KafkaMessageMetadata
) -> Optional[ProcessedMessage]:
    assert isinstance(value, dict)

    # Only record outcomes from traditional error tracking events, which
    # excludes transactions, attachments and sessions. Once TSDB defines
    # models for these, we can start recording again.
    category = value.get("category")
    if category is not None and category not in DataCategory.error_categories():
        return None

    v_uuid = value.get("event_id")
    message = {
        "org_id": value.get("org_id", 0),
        "project_id": value.get("project_id", 0),
        "key_id": value.get("key_id"),
        "timestamp": _ensure_valid_date(
            datetime.strptime(value["timestamp"], settings.PAYLOAD_DATETIME_FORMAT),
        ),
        "outcome": value["outcome"],
        "reason": _unicodify(value.get("reason")),
        "event_id": str(uuid.UUID(v_uuid)) if v_uuid is not None else None,
    }
    return InsertBatch([message], None)

def process_message(self, value, metadata=None) -> Optional[ProcessedMessage]:
    assert isinstance(value, dict)
    v_uuid = value.get("event_id")
    message = {
        "org_id": value.get("org_id", 0),
        "project_id": value.get("project_id", 0),
        "key_id": value.get("key_id"),
        "timestamp": _ensure_valid_date(
            datetime.strptime(value["timestamp"], settings.PAYLOAD_DATETIME_FORMAT),
        ),
        "outcome": value["outcome"],
        "reason": _unicodify(value.get("reason")),
        "event_id": str(uuid.UUID(v_uuid)) if v_uuid is not None else None,
    }
    return ProcessedMessage(
        action=ProcessorAction.INSERT,
        data=[message],
    )

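# `_unicodify` is used by the outcome processors to normalise the `reason`
# field. A simplified sketch under the assumption that it only needs to
# coerce the value to text and pass None through; the real helper may also
# serialise containers or strip invalid characters.

import json
from typing import Any, Optional


def _unicodify(s: Any) -> Optional[str]:
    if s is None:
        return None
    if isinstance(s, (dict, list)):
        # Assumption: containers are JSON-encoded rather than str()-ified.
        return json.dumps(s)
    return str(s)
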
def __extract_timestamp(self, field: int) -> Tuple[datetime, int]:
    # We are purposely using a naive datetime here to work with the rest of
    # the codebase. We can be confident that clients are only sending UTC dates.
    timestamp = _ensure_valid_date(datetime.utcfromtimestamp(field))
    if timestamp is None:
        timestamp = datetime.utcnow()
    milliseconds = int(timestamp.microsecond / 1000)
    return (timestamp, milliseconds)

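# `_ensure_valid_date` appears in nearly every snippet in this section.
# A plausible sketch, assuming "valid" means "representable as a non-negative
# UInt32 Unix timestamp"; the real implementation may apply different bounds.

from datetime import datetime, timezone
from typing import Optional

MAX_UINT32 = 2 ** 32 - 1  # assumed constant


def _ensure_valid_date(dt: Optional[datetime]) -> Optional[datetime]:
    if dt is None:
        return None
    # The snippets work with naive UTC datetimes, so attach UTC explicitly
    # before converting to Unix seconds.
    seconds = int(dt.replace(tzinfo=timezone.utc).timestamp())
    if seconds < 0 or seconds > MAX_UINT32:
        return None
    return dt
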
def extract_required(self, output, message):
    output['group_id'] = message['group_id'] or 0

    # This is not ideal but it should never happen anyways
    timestamp = _ensure_valid_date(
        datetime.strptime(message['datetime'], settings.PAYLOAD_DATETIME_FORMAT)
    )
    if timestamp is None:
        timestamp = datetime.utcnow()

    output['timestamp'] = timestamp

def extract_required(
    self, output: MutableMapping[str, Any], event: Mapping[str, Any]
) -> None:
    output["group_id"] = event["group_id"] or 0

    # This is not ideal but it should never happen anyways
    timestamp = _ensure_valid_date(
        datetime.strptime(event["datetime"], settings.PAYLOAD_DATETIME_FORMAT)
    )
    if timestamp is None:
        timestamp = datetime.utcnow()

    output["timestamp"] = timestamp

def process_message(
    self, message: Mapping[str, Any], metadata: KafkaMessageMetadata
) -> Optional[ProcessedMessage]:
    if not self._should_process(message):
        return None

    timestamp = _ensure_valid_date(datetime.utcfromtimestamp(message["timestamp"]))
    assert timestamp is not None

    keys = []
    values = []
    tags = message["tags"]
    assert isinstance(tags, Mapping)
    for key, value in sorted(tags.items()):
        assert key.isdigit()
        keys.append(int(key))
        assert isinstance(value, int)
        values.append(value)

    try:
        retention_days = enforce_retention(message["retention_days"], timestamp)
    except EventTooOld:
        return None

    processed = [
        {
            "org_id": _literal(message["org_id"]),
            "project_id": _literal(message["project_id"]),
            "metric_id": _literal(message["metric_id"]),
            "timestamp": _call(
                "toDateTime",
                (_literal(timestamp_to_bucket(timestamp, granularity).isoformat()),),
            ),
            "tags.key": _array_literal(keys),
            "tags.value": _array_literal(values),
            **self._process_values(message),
            "retention_days": _literal(retention_days),
            "granularity": _literal(granularity),
        }
        for granularity in self.GRANULARITIES_SECONDS
    ]
    return AggregateInsertBatch(processed, None)

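# The aggregate metrics processor above buckets each timestamp once per
# granularity via `timestamp_to_bucket`. A minimal sketch, assuming the
# helper simply floors the timestamp to the start of its granularity window.

from datetime import datetime, timezone


def timestamp_to_bucket(timestamp: datetime, granularity: int) -> datetime:
    # Floor the (naive UTC) timestamp to a multiple of `granularity` seconds.
    seconds = int(timestamp.replace(tzinfo=timezone.utc).timestamp())
    bucket = seconds - (seconds % granularity)
    return datetime.utcfromtimestamp(bucket)
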
def process_message(
    self, message: Mapping[str, Any], metadata: KafkaMessageMetadata
) -> Optional[ProcessedMessage]:
    # TODO: Support messages with multiple buckets
    if not self._should_process(message):
        return None

    timestamp = _ensure_valid_date(datetime.utcfromtimestamp(message["timestamp"]))
    assert timestamp is not None, "Invalid timestamp"

    keys = []
    values = []
    tags = message["tags"]
    assert isinstance(tags, Mapping), "Invalid tags type"
    for key, value in sorted(tags.items()):
        assert key.isdigit() and isinstance(value, int), "Tag key/value invalid"
        keys.append(int(key))
        values.append(value)

    mat_version = (
        DISABLED_MATERIALIZATION_VERSION
        if settings.WRITE_METRICS_AGG_DIRECTLY
        else settings.ENABLED_MATERIALIZATION_VERSION
    )

    try:
        retention_days = enforce_retention(message["retention_days"], timestamp)
    except EventTooOld:
        return None

    processed = {
        "org_id": message["org_id"],
        "project_id": message["project_id"],
        "metric_id": message["metric_id"],
        "timestamp": timestamp,
        "tags.key": keys,
        "tags.value": values,
        **self._process_values(message),
        "materialization_version": mat_version,
        "retention_days": retention_days,
        "partition": metadata.partition,
        "offset": metadata.offset,
    }
    return InsertBatch([processed], None)

def enforce_retention(message, timestamp):
    project_id = message['project_id']
    retention_days = settings.RETENTION_OVERRIDES.get(project_id)
    if retention_days is None:
        retention_days = int(
            message.get('retention_days') or settings.DEFAULT_RETENTION_DAYS
        )

    # This is not ideal but it should never happen anyways
    timestamp = _ensure_valid_date(timestamp)
    if timestamp is None:
        timestamp = datetime.utcnow()

    # TODO: We may need to allow for older events in the future when post
    # processing triggers are based off of Snuba. Or this branch could be put
    # behind a "backfill-only" optional switch.
    if settings.DISCARD_OLD_EVENTS and timestamp < (
        datetime.utcnow() - timedelta(days=retention_days)
    ):
        raise EventTooOld

    return retention_days

def process_message(self, value, metadata):
    assert isinstance(value, dict)
    v_uuid = value.get('event_id')
    message = {
        'org_id': value.get('org_id', 0),
        'project_id': value.get('project_id', 0),
        'key_id': value.get('key_id'),
        'timestamp': _ensure_valid_date(
            datetime.strptime(value['timestamp'], settings.PAYLOAD_DATETIME_FORMAT),
        ),
        'outcome': value['outcome'],
        'reason': _unicodify(value.get('reason')),
        'event_id': str(uuid.UUID(v_uuid)) if v_uuid is not None else None,
    }

    return (self.INSERT, message)

def process_message(
    self, value: Mapping[str, Any], metadata: KafkaMessageMetadata
) -> Optional[ProcessedMessage]:
    assert isinstance(value, dict)
    v_uuid = value.get("event_id")

    if value["outcome"] != 4:  # we don't care about abuse outcomes for these metrics
        if "category" not in value:
            metrics.increment("missing_category")
        if "quantity" not in value:
            metrics.increment("missing_quantity")

    message = {
        "org_id": value.get("org_id", 0),
        "project_id": value.get("project_id", 0),
        "key_id": value.get("key_id"),
        "timestamp": _ensure_valid_date(
            datetime.strptime(value["timestamp"], settings.PAYLOAD_DATETIME_FORMAT),
        ),
        "outcome": value["outcome"],
        "category": value.get("category", DataCategory.ERROR),
        "quantity": value.get("quantity", 1),
        "reason": _unicodify(value.get("reason")),
        "event_id": str(uuid.UUID(v_uuid)) if v_uuid is not None else None,
    }
    return InsertBatch([message], None)

def process_message(
    self, message: Mapping[str, Any], metadata: KafkaMessageMetadata
) -> Optional[ProcessedMessage]:
    # TODO: Support messages with multiple buckets
    if not self._should_process(message):
        return None

    timestamp = _ensure_valid_date(datetime.utcfromtimestamp(message["timestamp"]))
    assert timestamp is not None

    keys = []
    values = []
    tags = message["tags"]
    assert isinstance(tags, Mapping)
    for key, value in sorted(tags.items()):
        assert key.isdigit()
        keys.append(int(key))
        assert isinstance(value, int)
        values.append(value)

    processed = {
        "org_id": message["org_id"],
        "project_id": message["project_id"],
        "metric_id": message["metric_id"],
        "timestamp": timestamp,
        "tags.key": keys,
        "tags.value": values,
        **self._process_values(message),
        "materialization_version": 0,
        "retention_days": message["retention_days"],
        "partition": metadata.partition,
        "offset": metadata.offset,
    }
    return InsertBatch([processed], None)

def __extract_timestamp(self, field: float) -> Tuple[datetime, int]:
    timestamp = _ensure_valid_date(datetime.fromtimestamp(field))
    if timestamp is None:
        timestamp = datetime.utcnow()
    nanoseconds = int(timestamp.microsecond * 1000)
    return (timestamp, nanoseconds)

def __extract_timestamp(self, field):
    timestamp = _ensure_valid_date(datetime.fromtimestamp(field))
    if timestamp is None:
        timestamp = datetime.utcnow()
    milliseconds = int(timestamp.microsecond / 1000)
    return (timestamp, milliseconds)

def __extract_timestamp(self, field: int) -> datetime:
    timestamp = _ensure_valid_date(datetime.utcfromtimestamp(field))
    if timestamp is None:
        timestamp = datetime.utcnow()
    return timestamp