Example #1
class EventsEntity(Entity):
    """
    Represents the collection of classic sentry "error" type events
    and the particular quirks of storing and querying them.
    """
    def __init__(self) -> None:
        storage = get_writable_storage(StorageKey.EVENTS)
        schema = storage.get_table_writer().get_schema()
        columns = schema.get_columns()
        ro_storage = get_storage(StorageKey.EVENTS_RO)

        self.__time_group_columns = {"time": "timestamp", "rtime": "received"}
        self.__time_parse_columns = ("timestamp", "received")
        super().__init__(
            storages=[storage],
            query_plan_builder=SelectedStorageQueryPlanBuilder(
                selector=EventsQueryStorageSelector(
                    events_table=storage,
                    events_ro_table=ro_storage,
                )),
            abstract_column_set=columns,
            writable_storage=storage,
        )

        self.__tags_processor = TagColumnProcessor(
            columns=columns,
            promoted_columns=get_promoted_columns(),
            column_tag_map=get_column_tag_map(),
        )

    def column_expr(
        self,
        column_name: str,
        query: Query,
        parsing_context: ParsingContext,
        table_alias: str = "",
    ) -> Union[None, Any]:
        processed_column = self.__tags_processor.process_column_expression(
            column_name, query, parsing_context, table_alias)
        if processed_column:
            # A non-None processed_column means this was a tag/context expression.

            # This conversion must not be ported to the errors dataset. We should
            # not support promoting tags/contexts with boolean values: there is
            # no way to consistently convert them back to the value the client
            # provided at ingestion across all the ways tags/contexts can be
            # accessed. Once the errors dataset is in use, we will not have
            # boolean promoted tags/contexts, so this constraint will be easy to enforce.
            boolean_contexts = {
                "contexts[device.simulator]",
                "contexts[device.online]",
                "contexts[device.charging]",
            }
            boolean_context_template = (
                "multiIf(equals(%(processed_column)s, ''), '', "
                "in(%(processed_column)s, tuple('1', 'True')), 'True', 'False')"
            )
            if column_name in boolean_contexts:
                return boolean_context_template % {
                    "processed_column": processed_column
                }
            return processed_column
        elif column_name == "group_id":
            return f"nullIf({qualified_column('group_id', table_alias)}, 0)"
        elif column_name == "message":
            # Because of the rename from message->search_message without backfill,
            # records will have one or the other of these fields.
            # TODO this can be removed once all data has search_message filled in.
            search_message = qualified_column("search_message", table_alias)
            message = qualified_column("message", table_alias)
            return f"coalesce({search_message}, {message})"
        else:
            return super().column_expr(column_name, query, parsing_context,
                                       table_alias)

    def get_extensions(self) -> Mapping[str, QueryExtension]:
        return {
            "project":
            ProjectExtension(project_column="project_id"),
            "timeseries":
            TimeSeriesExtension(
                default_granularity=3600,
                default_window=timedelta(days=5),
                timestamp_column="timestamp",
            ),
        }

    def get_query_processors(self) -> Sequence[QueryProcessor]:
        return [
            TagsExpanderProcessor(),
            BasicFunctionsProcessor(),
            TimeSeriesColumnProcessor(self.__time_group_columns),
            HandledFunctionsProcessor("exception_stacks.mechanism_handled",
                                      self.get_data_model()),
        ]

    # TODO: This needs to be burned with fire, for so many reasons.
    # It's here now to reduce the scope of the initial entity changes
    # but can be moved to a processor if not removed entirely.
    def process_condition(
            self, condition: Tuple[str, str, Any]) -> Tuple[str, str, Any]:
        lhs, op, lit = condition
        if (lhs in self.__time_parse_columns
                and op in (">", "<", ">=", "<=", "=", "!=")
                and isinstance(lit, str)):
            lit = parse_datetime(lit)
        return lhs, op, lit
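
A minimal standalone sketch of the boolean-context branch above: it renders the multiIf template with a hypothetical processed column expression. The indexOf form is an assumption about what TagColumnProcessor would emit for a non-promoted context; the template itself is copied from the method.

# Sketch only: "processed_column" is a hypothetical stand-in for the
# expression TagColumnProcessor would produce for contexts[device.charging].
boolean_context_template = (
    "multiIf(equals(%(processed_column)s, ''), '', "
    "in(%(processed_column)s, tuple('1', 'True')), 'True', 'False')"
)
processed_column = "contexts.value[indexOf(contexts.key, 'device.charging')]"
print(boolean_context_template % {"processed_column": processed_column})
# -> multiIf(equals(contexts.value[indexOf(contexts.key, 'device.charging')], ''), '',
#    in(contexts.value[indexOf(contexts.key, 'device.charging')], tuple('1', 'True')),
#    'True', 'False')
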
Example #2
class ErrorsEntity(Entity):
    """
    Represents the collection of all event types that are not transactions.

    This is meant to replace Events. They will both exist during the migration.
    """

    def __init__(self) -> None:
        storage = get_writable_storage(StorageKey.ERRORS)
        schema = storage.get_table_writer().get_schema()
        columns = schema.get_columns()

        self.__time_group_columns = {"time": "timestamp", "rtime": "received"}
        self.__time_parse_columns = ("timestamp", "received")
        super().__init__(
            storages=[storage],
            query_plan_builder=SingleStorageQueryPlanBuilder(
                storage=storage, mappers=errors_translators
            ),
            abstract_column_set=columns,
            writable_storage=storage,
        )

        self.__tags_processor = TagColumnProcessor(
            columns=columns,
            promoted_columns=self._get_promoted_columns(),
            column_tag_map=self._get_column_tag_map(),
        )

    def column_expr(
        self,
        column_name: str,
        query: Query,
        parsing_context: ParsingContext,
        table_alias: str = "",
    ) -> Union[None, Any]:
        processed_column = self.__tags_processor.process_column_expression(
            column_name, query, parsing_context, table_alias
        )
        return processed_column or super().column_expr(
            column_name, query, parsing_context, table_alias
        )

    def _get_promoted_columns(self) -> Mapping[str, FrozenSet[str]]:
        return {
            "tags": frozenset(promoted_tag_columns.values()),
            "contexts": frozenset(),
        }

    def _get_column_tag_map(self) -> Mapping[str, Mapping[str, str]]:
        return {
            "tags": {col: tag for tag, col in promoted_tag_columns.items()},
            "contexts": {},
        }

    def get_extensions(self) -> Mapping[str, QueryExtension]:
        return {
            "project": ProjectExtension(project_column="project_id"),
            "timeseries": TimeSeriesExtension(
                default_granularity=3600,
                default_window=timedelta(days=5),
                timestamp_column="timestamp",
            ),
        }

    def get_query_processors(self) -> Sequence[QueryProcessor]:
        return [
            TagsExpanderProcessor(),
            BasicFunctionsProcessor(),
            TimeSeriesColumnProcessor(self.__time_group_columns),
            HandledFunctionsProcessor(
                "exception_stacks.mechanism_handled", self.get_data_model()
            ),
        ]

    # TODO: This needs to be burned with fire, for so many reasons.
    # It's here now to reduce the scope of the initial entity changes
    # but can be moved to a processor if not removed entirely.
    def process_condition(
        self, condition: Tuple[str, str, Any]
    ) -> Tuple[str, str, Any]:
        lhs, op, lit = condition
        if (
            lhs in self.__time_parse_columns
            and op in (">", "<", ">=", "<=", "=", "!=")
            and isinstance(lit, str)
        ):
            lit = parse_datetime(lit)
        return lhs, op, lit
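
For illustration, a self-contained sketch of the datetime normalization process_condition performs. datetime.fromisoformat stands in for Snuba's parse_datetime helper, which is an assumption made for the example.

from datetime import datetime
from typing import Any, Tuple

TIME_PARSE_COLUMNS = ("timestamp", "received")
RANGE_OPS = (">", "<", ">=", "<=", "=", "!=")

def process_condition(condition: Tuple[str, str, Any]) -> Tuple[str, str, Any]:
    # String literals on time columns are parsed into datetimes so the
    # storage layer can compare them natively.
    lhs, op, lit = condition
    if lhs in TIME_PARSE_COLUMNS and op in RANGE_OPS and isinstance(lit, str):
        lit = datetime.fromisoformat(lit)
    return lhs, op, lit

print(process_condition(("timestamp", ">=", "2020-03-01T00:00:00")))
# ('timestamp', '>=', datetime.datetime(2020, 3, 1, 0, 0))
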
Example #3
class TransactionsDataset(TimeSeriesDataset):
    def __init__(self) -> None:
        columns = ColumnSet(
            [
                ("project_id", UInt(64)),
                ("event_id", UUID()),
                ("trace_id", UUID()),
                ("span_id", UInt(64)),
                ("transaction_name", LowCardinality(String())),
                (
                    "transaction_hash",
                    Materialized(UInt(64), "cityHash64(transaction_name)",),
                ),
                ("transaction_op", LowCardinality(String())),
                ("transaction_status", WithDefault(UInt(8), UNKNOWN_SPAN_STATUS)),
                ("start_ts", DateTime()),
                ("start_ms", UInt(16)),
                ("_start_date", Materialized(Date(), "toDate(start_ts)"),),
                ("finish_ts", DateTime()),
                ("finish_ms", UInt(16)),
                ("_finish_date", Materialized(Date(), "toDate(finish_ts)"),),
                ("duration", UInt(32)),
                ("platform", LowCardinality(String())),
                ("environment", LowCardinality(Nullable(String()))),
                ("release", LowCardinality(Nullable(String()))),
                ("dist", LowCardinality(Nullable(String()))),
                ("ip_address_v4", Nullable(IPv4())),
                ("ip_address_v6", Nullable(IPv6())),
                ("user", WithDefault(String(), "''",)),
                ("user_hash", Materialized(UInt(64), "cityHash64(user)"),),
                ("user_id", Nullable(String())),
                ("user_name", Nullable(String())),
                ("user_email", Nullable(String())),
                ("sdk_name", WithDefault(LowCardinality(String()), "''")),
                ("sdk_version", WithDefault(LowCardinality(String()), "''")),
                ("tags", Nested([("key", String()), ("value", String())])),
                ("_tags_flattened", String()),
                ("contexts", Nested([("key", String()), ("value", String())])),
                ("_contexts_flattened", String()),
                ("partition", UInt(16)),
                ("offset", UInt(64)),
                ("retention_days", UInt(16)),
                ("deleted", UInt(8)),
            ]
        )

        schema = ReplacingMergeTreeSchema(
            columns=columns,
            local_table_name="transactions_local",
            dist_table_name="transactions_dist",
            mandatory_conditions=[],
            prewhere_candidates=["event_id", "project_id"],
            order_by="(project_id, _finish_date, transaction_name, cityHash64(span_id))",
            partition_by="(retention_days, toMonday(_finish_date))",
            version_column="deleted",
            sample_expr=None,
            migration_function=transactions_migrations,
        )

        dataset_schemas = DatasetSchemas(read_schema=schema, write_schema=schema,)

        self.__tags_processor = TagColumnProcessor(
            columns=columns,
            promoted_columns=self._get_promoted_columns(),
            column_tag_map=self._get_column_tag_map(),
        )

        super().__init__(
            dataset_schemas=dataset_schemas,
            table_writer=TransactionsTableWriter(
                write_schema=schema,
                stream_loader=KafkaStreamLoader(
                    processor=TransactionsMessageProcessor(), default_topic="events",
                ),
            ),
            time_group_columns={
                "bucketed_start": "start_ts",
                "bucketed_end": "finish_ts",
            },
            time_parse_columns=("start_ts", "finish_ts"),
        )

    def _get_promoted_columns(self):
        # TODO: Support promoted tags
        return {
            "tags": frozenset(),
            "contexts": frozenset(),
        }

    def _get_column_tag_map(self):
        # TODO: Support promoted tags
        return {
            "tags": {},
            "contexts": {},
        }

    def get_extensions(self) -> Mapping[str, QueryExtension]:
        return {
            "project": ProjectExtension(
                processor=ProjectExtensionProcessor(project_column="project_id")
            ),
            "timeseries": TimeSeriesExtension(
                default_granularity=3600,
                default_window=timedelta(days=5),
                timestamp_column="start_ts",
            ),
        }

    def column_expr(
        self,
        column_name,
        query: Query,
        parsing_context: ParsingContext,
        table_alias: str = "",
    ):
        # TODO remove these casts when clickhouse-driver is >= 0.0.19
        if column_name == "ip_address_v4":
            return "IPv4NumToString(ip_address_v4)"
        if column_name == "ip_address_v6":
            return "IPv6NumToString(ip_address_v6)"
        if column_name == "ip_address":
            return f"coalesce(IPv4NumToString(ip_address_v4), IPv6NumToString(ip_address_v6))"
        if column_name == "event_id":
            return "replaceAll(toString(event_id), '-', '')"
        processed_column = self.__tags_processor.process_column_expression(
            column_name, query, parsing_context, table_alias
        )
        if processed_column:
            # A non-None processed_column means this was a tag/context expression.
            return processed_column
        return super().column_expr(column_name, query, parsing_context)

    def get_split_query_spec(self) -> Union[None, ColumnSplitSpec]:
        return ColumnSplitSpec(
            id_column="event_id",
            project_column="project_id",
            timestamp_column="start_ts",
        )

    def get_query_processors(self) -> Sequence[QueryProcessor]:
        return [
            BasicFunctionsProcessor(),
            ApdexProcessor(),
            ImpactProcessor(),
            PrewhereProcessor(),
            NestedFieldConditionOptimizer(
                "tags", "_tags_flattened", {"start_ts", "finish_ts"}, BEGINNING_OF_TIME
            ),
            NestedFieldConditionOptimizer(
                "contexts",
                "_contexts_flattened",
                {"start_ts", "finish_ts"},
                BEGINNING_OF_TIME,
            ),
        ]
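
The ip_address and event_id branches in column_expr are literal string rewrites. Below is a small sketch of that dispatch, with the expressions copied from the method; the cast_column helper name is made up for the example.

# Hypothetical helper mirroring the literal casts in column_expr above.
CASTS = {
    "ip_address_v4": "IPv4NumToString(ip_address_v4)",
    "ip_address_v6": "IPv6NumToString(ip_address_v6)",
    "ip_address": "coalesce(IPv4NumToString(ip_address_v4), IPv6NumToString(ip_address_v6))",
    "event_id": "replaceAll(toString(event_id), '-', '')",
}

def cast_column(column_name: str) -> str:
    # Fall through to the raw column name when no cast applies.
    return CASTS.get(column_name, column_name)

print(cast_column("ip_address"))  # coalesce(IPv4NumToString(...), IPv6NumToString(...))
print(cast_column("duration"))    # duration
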
Example #4
class TransactionsDataset(TimeSeriesDataset):
    def __init__(self) -> None:
        storage = get_writable_storage("transactions")
        schema = storage.get_table_writer().get_schema()
        columns = schema.get_columns()

        self.__tags_processor = TagColumnProcessor(
            columns=columns,
            promoted_columns=self._get_promoted_columns(),
            column_tag_map=self._get_column_tag_map(),
        )
        self.__time_group_columns = {
            "bucketed_start": "start_ts",
            "bucketed_end": "finish_ts",
        }
        super().__init__(
            storages=[storage],
            query_plan_builder=SingleStorageQueryPlanBuilder(storage=storage),
            abstract_column_set=schema.get_columns(),
            writable_storage=storage,
            time_group_columns=self.__time_group_columns,
            time_parse_columns=("start_ts", "finish_ts"),
        )

    def _get_promoted_columns(self):
        # TODO: Support promoted tags
        return {
            "tags": frozenset(),
            "contexts": frozenset(),
        }

    def _get_column_tag_map(self):
        # TODO: Support promoted tags
        return {
            "tags": {},
            "contexts": {},
        }

    def get_extensions(self) -> Mapping[str, QueryExtension]:
        return {
            "project":
            ProjectExtension(processor=ProjectExtensionProcessor(
                project_column="project_id")),
            "timeseries":
            TimeSeriesExtension(
                default_granularity=3600,
                default_window=timedelta(days=5),
                timestamp_column="start_ts",
            ),
        }

    def column_expr(
        self,
        column_name,
        query: Query,
        parsing_context: ParsingContext,
        table_alias: str = "",
    ):
        # TODO remove these casts when clickhouse-driver is >= 0.0.19
        if column_name == "ip_address_v4":
            return "IPv4NumToString(ip_address_v4)"
        if column_name == "ip_address_v6":
            return "IPv6NumToString(ip_address_v6)"
        if column_name == "ip_address":
            return f"coalesce(IPv4NumToString(ip_address_v4), IPv6NumToString(ip_address_v6))"
        if column_name == "event_id":
            return "replaceAll(toString(event_id), '-', '')"
        processed_column = self.__tags_processor.process_column_expression(
            column_name, query, parsing_context, table_alias)
        if processed_column:
            # A non-None processed_column means this was a tag/context expression.
            return processed_column
        return super().column_expr(column_name, query, parsing_context)

    def get_split_query_spec(self) -> Union[None, ColumnSplitSpec]:
        return ColumnSplitSpec(
            id_column="event_id",
            project_column="project_id",
            timestamp_column="start_ts",
        )

    def get_query_processors(self) -> Sequence[QueryProcessor]:
        return [
            BasicFunctionsProcessor(),
            ApdexProcessor(),
            ImpactProcessor(),
            TimeSeriesColumnProcessor(self.__time_group_columns),
        ]
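
The time_group_columns mapping above lets queries group by the "bucketed_start" and "bucketed_end" aliases. Here is a rough sketch of the kind of granularity-based bucketing expression the time-series machinery generates; the exact expressions are an assumption, mirroring common ClickHouse idioms, and the actual processor may differ.

def time_bucket_expr(column: str, granularity: int) -> str:
    # Assumed bucketing rules; Snuba's actual implementation may differ.
    if granularity == 3600:
        return f"toStartOfHour({column})"
    if granularity == 60:
        return f"toStartOfMinute({column})"
    if granularity == 86400:
        return f"toDate({column})"
    # Generic case: round the unix timestamp down to a multiple of granularity.
    return f"toDateTime(multiply(intDiv(toUInt32({column}), {granularity}), {granularity}))"

print(time_bucket_expr("start_ts", 3600))  # toStartOfHour(start_ts)
print(time_bucket_expr("start_ts", 300))   # toDateTime(multiply(intDiv(toUInt32(start_ts), 300), 300))
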
Example #5
class TransactionsDataset(TimeSeriesDataset):
    def __init__(self):
        columns = ColumnSet([
            ('project_id', UInt(64)),
            ('event_id', UUID()),
            ('trace_id', UUID()),
            ('span_id', UInt(64)),
            ('transaction_name', String()),
            ('transaction_hash',
             Materialized(
                 UInt(64),
                 'cityHash64(transaction_name)',
             )),
            ('transaction_op', LowCardinality(String())),
            ('start_ts', DateTime()),
            ('start_ms', UInt(16)),
            ('finish_ts', DateTime()),
            ('finish_ms', UInt(16)),
            ('duration', UInt(32)),
            ('platform', LowCardinality(String())),
            ('environment', Nullable(String())),
            ('release', Nullable(String())),
            ('dist', Nullable(String())),
            ('ip_address_v4', Nullable(IPv4())),
            ('ip_address_v6', Nullable(IPv6())),
            ('user', WithDefault(
                String(),
                "''",
            )),
            ('user_id', Nullable(String())),
            ('user_name', Nullable(String())),
            ('user_email', Nullable(String())),
            ('tags', Nested([
                ('key', String()),
                ('value', String()),
            ])),
            ('contexts', Nested([
                ('key', String()),
                ('value', String()),
            ])),
            ('partition', UInt(16)),
            ('offset', UInt(64)),
            ('retention_days', UInt(16)),
            ('deleted', UInt(8)),
        ])

        schema = ReplacingMergeTreeSchema(
            columns=columns,
            local_table_name='transactions_local',
            dist_table_name='transactions_dist',
            mandatory_conditions=[],
            order_by='(project_id, toStartOfDay(start_ts), transaction_hash, '
            'start_ts, start_ms, trace_id, span_id)',
            partition_by='(retention_days, toMonday(start_ts))',
            version_column='deleted',
            sample_expr=None,
            migration_function=transactions_migrations,
        )

        dataset_schemas = DatasetSchemas(
            read_schema=schema,
            write_schema=schema,
        )

        self.__tags_processor = TagColumnProcessor(
            columns=columns,
            promoted_columns=self._get_promoted_columns(),
            column_tag_map=self._get_column_tag_map(),
        )

        super().__init__(dataset_schemas=dataset_schemas,
                         table_writer=TransactionsTableWriter(
                             write_schema=schema,
                             stream_loader=KafkaStreamLoader(
                                 processor=TransactionsMessageProcessor(),
                                 default_topic="events",
                             ),
                         ),
                         time_group_columns={
                             'bucketed_start': 'start_ts',
                             'bucketed_end': 'finish_ts',
                         },
                         time_parse_columns=('start_ts', 'finish_ts'))

    def _get_promoted_columns(self):
        # TODO: Support promoted tags
        return {
            'tags': frozenset(),
            'contexts': frozenset(),
        }

    def _get_column_tag_map(self):
        # TODO: Support promoted tags
        return {
            'tags': {},
            'contexts': {},
        }

    def get_extensions(self) -> Mapping[str, QueryExtension]:
        return {
            'project':
            ProjectExtension(processor=ProjectExtensionProcessor(
                project_column="project_id")),
            'timeseries':
            TimeSeriesExtension(
                default_granularity=3600,
                default_window=timedelta(days=5),
                timestamp_column='start_ts',
            ),
        }

    def column_expr(self,
                    column_name,
                    query: Query,
                    parsing_context: ParsingContext,
                    table_alias: str = ""):
        # TODO remove these casts when clickhouse-driver is >= 0.0.19
        if column_name == 'ip_address_v4':
            return 'IPv4NumToString(ip_address_v4)'
        if column_name == 'ip_address_v6':
            return 'IPv6NumToString(ip_address_v6)'
        if column_name == 'event_id':
            return 'replaceAll(toString(event_id), \'-\', \'\')'
        processed_column = self.__tags_processor.process_column_expression(
            column_name, query, parsing_context, table_alias)
        if processed_column:
            # A non-None processed_column means this was a tag/context expression.
            return processed_column
        return super().column_expr(column_name, query, parsing_context)

    def get_split_query_spec(self) -> Union[None, ColumnSplitSpec]:
        return ColumnSplitSpec(
            id_column="event_id",
            project_column="project_id",
            timestamp_column="start_ts",
        )

    def get_prewhere_keys(self) -> Sequence[str]:
        return ['event_id', 'project_id']
Example #6
class EventsDataset(TimeSeriesDataset):
    """
    Represents the collection of classic sentry "error" type events
    and the particular quirks of storing and querying them.
    """

    def __init__(self):
        metadata_columns = ColumnSet([
            # optional stream related data
            ('offset', Nullable(UInt(64))),
            ('partition', Nullable(UInt(16))),
        ])

        promoted_tag_columns = ColumnSet([
            # These are the classic tags, they are saved in Snuba exactly as they
            # appear in the event body.
            ('level', Nullable(String())),
            ('logger', Nullable(String())),
            ('server_name', Nullable(String())),  # future name: device_id?
            ('transaction', Nullable(String())),
            ('environment', Nullable(String())),
            ('sentry:release', Nullable(String())),
            ('sentry:dist', Nullable(String())),
            ('sentry:user', Nullable(String())),
            ('site', Nullable(String())),
            ('url', Nullable(String())),
        ])

        promoted_context_tag_columns = ColumnSet([
            # These are promoted tags that come in `tags`, but are more closely
            # related to contexts.  To avoid naming confusion with Clickhouse nested
            # columns, they are stored in the database with s/./_/
            # promoted tags
            ('app_device', Nullable(String())),
            ('device', Nullable(String())),
            ('device_family', Nullable(String())),
            ('runtime', Nullable(String())),
            ('runtime_name', Nullable(String())),
            ('browser', Nullable(String())),
            ('browser_name', Nullable(String())),
            ('os', Nullable(String())),
            ('os_name', Nullable(String())),
            ('os_rooted', Nullable(UInt(8))),
        ])

        promoted_context_columns = ColumnSet([
            ('os_build', Nullable(String())),
            ('os_kernel_version', Nullable(String())),
            ('device_name', Nullable(String())),
            ('device_brand', Nullable(String())),
            ('device_locale', Nullable(String())),
            ('device_uuid', Nullable(String())),
            ('device_model_id', Nullable(String())),
            ('device_arch', Nullable(String())),
            ('device_battery_level', Nullable(Float(32))),
            ('device_orientation', Nullable(String())),
            ('device_simulator', Nullable(UInt(8))),
            ('device_online', Nullable(UInt(8))),
            ('device_charging', Nullable(UInt(8))),
        ])

        required_columns = ColumnSet([
            ('event_id', FixedString(32)),
            ('project_id', UInt(64)),
            ('group_id', UInt(64)),
            ('timestamp', DateTime()),
            ('deleted', UInt(8)),
            ('retention_days', UInt(16)),
        ])

        all_columns = required_columns + [
            # required for non-deleted
            ('platform', Nullable(String())),
            ('message', Nullable(String())),
            ('primary_hash', Nullable(FixedString(32))),
            ('received', Nullable(DateTime())),

            ('search_message', Nullable(String())),
            ('title', Nullable(String())),
            ('location', Nullable(String())),

            # optional user
            ('user_id', Nullable(String())),
            ('username', Nullable(String())),
            ('email', Nullable(String())),
            ('ip_address', Nullable(String())),

            # optional geo
            ('geo_country_code', Nullable(String())),
            ('geo_region', Nullable(String())),
            ('geo_city', Nullable(String())),

            ('sdk_name', Nullable(String())),
            ('sdk_version', Nullable(String())),
            ('type', Nullable(String())),
            ('version', Nullable(String())),
        ] + metadata_columns \
            + promoted_context_columns \
            + promoted_tag_columns \
            + promoted_context_tag_columns \
            + [
                # other tags
                ('tags', Nested([
                    ('key', String()),
                    ('value', String()),
                ])),

                # other context
                ('contexts', Nested([
                    ('key', String()),
                    ('value', String()),
                ])),

                # http interface
                ('http_method', Nullable(String())),
                ('http_referer', Nullable(String())),

                # exception interface
                ('exception_stacks', Nested([
                    ('type', Nullable(String())),
                    ('value', Nullable(String())),
                    ('mechanism_type', Nullable(String())),
                    ('mechanism_handled', Nullable(UInt(8))),
                ])),
                ('exception_frames', Nested([
                    ('abs_path', Nullable(String())),
                    ('filename', Nullable(String())),
                    ('package', Nullable(String())),
                    ('module', Nullable(String())),
                    ('function', Nullable(String())),
                    ('in_app', Nullable(UInt(8))),
                    ('colno', Nullable(UInt(32))),
                    ('lineno', Nullable(UInt(32))),
                    ('stack_level', UInt(16)),
                ])),

                # These are columns we added later in the life of the (current) production
                # database. They don't necessarily belong here in a logical/readability sense
                # but they are here to match the order of columns in production because
                # `insert_distributed_sync` is very sensitive to column existence and ordering.
                ('culprit', Nullable(String())),
                ('sdk_integrations', Array(String())),
                ('modules', Nested([
                    ('name', String()),
                    ('version', String()),
                ])),
        ]

        sample_expr = 'cityHash64(toString(event_id))'
        schema = ReplacingMergeTreeSchema(
            columns=all_columns,
            local_table_name='sentry_local',
            dist_table_name='sentry_dist',
            mandatory_conditions=[('deleted', '=', 0)],
            order_by='(project_id, toStartOfDay(timestamp), %s)' % sample_expr,
            partition_by='(toMonday(timestamp), if(equals(retention_days, 30), 30, 90))',
            version_column='deleted',
            sample_expr=sample_expr,
            migration_function=events_migrations)

        dataset_schemas = DatasetSchemas(
            read_schema=schema,
            write_schema=schema,
        )

        table_writer = TableWriter(
            write_schema=schema,
            stream_loader=KafkaStreamLoader(
                processor=EventsProcessor(promoted_tag_columns),
                default_topic="events",
                replacement_topic="event-replacements",
                commit_log_topic="snuba-commit-log",
            )
        )

        super(EventsDataset, self).__init__(
            dataset_schemas=dataset_schemas,
            table_writer=table_writer,
            time_group_columns={
                'time': 'timestamp',
                'rtime': 'received'
            },
            time_parse_columns=('timestamp', 'received')
        )

        self.__metadata_columns = metadata_columns
        self.__promoted_tag_columns = promoted_tag_columns
        self.__promoted_context_tag_columns = promoted_context_tag_columns
        self.__promoted_context_columns = promoted_context_columns
        self.__required_columns = required_columns

        self.__tags_processor = TagColumnProcessor(
            columns=all_columns,
            promoted_columns=self._get_promoted_columns(),
            column_tag_map=self._get_column_tag_map(),
        )

    def get_split_query_spec(self) -> Union[None, ColumnSplitSpec]:
        return ColumnSplitSpec(
            id_column="event_id",
            project_column="project_id",
            timestamp_column="timestamp",
        )

    def column_expr(self, column_name, query: Query, parsing_context: ParsingContext, table_alias: str=""):
        processed_column = self.__tags_processor.process_column_expression(column_name, query, parsing_context, table_alias)
        if processed_column:
            # A non-None processed_column means this was a tag/context expression.
            return processed_column
        elif column_name == 'issue' or column_name == 'group_id':
            return f"nullIf({qualified_column('group_id', table_alias)}, 0)"
        elif column_name == 'message':
            # Because of the rename from message->search_message without backfill,
            # records will have one or the other of these fields.
            # TODO this can be removed once all data has search_message filled in.
            search_message = qualified_column('search_message', table_alias)
            message = qualified_column('message', table_alias)
            return f"coalesce({search_message}, {message})"
        else:
            return super().column_expr(column_name, query, parsing_context, table_alias)

    def get_promoted_tag_columns(self):
        return self.__promoted_tag_columns

    def _get_promoted_context_tag_columns(self):
        return self.__promoted_context_tag_columns

    def _get_promoted_context_columns(self):
        return self.__promoted_context_columns

    def get_required_columns(self):
        return self.__required_columns

    def _get_promoted_columns(self):
        # The set of columns, and associated keys that have been promoted
        # to the top level table namespace.
        return {
            'tags': frozenset(col.flattened for col in (self.get_promoted_tag_columns() + self._get_promoted_context_tag_columns())),
            'contexts': frozenset(col.flattened for col in self._get_promoted_context_columns()),
        }

    def _get_column_tag_map(self):
        # For every applicable promoted column, a map of translations from the
        # column name we save in the database to the tag we receive in the query.
        promoted_context_tag_columns = self._get_promoted_context_tag_columns()

        return {
            'tags': {col.flattened: col.flattened.replace('_', '.') for col in promoted_context_tag_columns},
            'contexts': {},
        }

    def get_tag_column_map(self):
        # And a reverse map from the tags the client expects to the database columns
        return {
            col: dict(map(reversed, trans.items())) for col, trans in self._get_column_tag_map().items()
        }

    def get_promoted_tags(self):
        # The canonical list of foo.bar strings that you can send as a `tags[foo.bar]` query
        # and they can/will use a promoted column.
        return {
            col: [self._get_column_tag_map()[col].get(x, x) for x in self._get_promoted_columns()[col]]
            for col in self._get_promoted_columns()
        }

    def get_extensions(self) -> Mapping[str, QueryExtension]:
        return {
            'project': ProjectExtension(
                processor=ProjectWithGroupsProcessor(project_column="project_id")
            ),
            'timeseries': TimeSeriesExtension(
                default_granularity=3600,
                default_window=timedelta(days=5),
                timestamp_column='timestamp',
            ),
        }

    def get_prewhere_keys(self) -> Sequence[str]:
        return ['event_id', 'issue', 'tags[sentry:release]', 'message', 'environment', 'project_id']
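
A compact sketch of the forward and reverse tag maps built by _get_column_tag_map and get_tag_column_map above, using two of the promoted context tag columns and the s/./_/ storage rule described in the comments.

promoted_context_tag_columns = ["os_name", "device_family"]

# Database column -> tag name seen in queries.
column_tag_map = {
    "tags": {col: col.replace("_", ".") for col in promoted_context_tag_columns},
    "contexts": {},
}
# Reverse: tag name -> database column, as get_tag_column_map computes.
tag_column_map = {
    nested: {tag: col for col, tag in trans.items()}
    for nested, trans in column_tag_map.items()
}
print(column_tag_map["tags"])  # {'os_name': 'os.name', 'device_family': 'device.family'}
print(tag_column_map["tags"])  # {'os.name': 'os_name', 'device.family': 'device_family'}
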
Example #7
class ErrorsDataset(TimeSeriesDataset):
    """
    Represents the collection of all event types that are not transactions.

    This is meant to replace Events. They will both exist during the migration.
    """

    def __init__(self) -> None:
        storage = get_writable_storage("errors")
        schema = storage.get_table_writer().get_schema()
        columns = schema.get_columns()

        self.__time_group_columns = {"time": "timestamp", "rtime": "received"}
        super().__init__(
            storages=[storage],
            query_plan_builder=SingleStorageQueryPlanBuilder(storage=storage),
            abstract_column_set=columns,
            writable_storage=storage,
            time_group_columns=self.__time_group_columns,
            time_parse_columns=("timestamp", "received"),
        )

        self.__tags_processor = TagColumnProcessor(
            columns=columns,
            promoted_columns=self._get_promoted_columns(),
            column_tag_map=self._get_column_tag_map(),
        )

    def get_split_query_spec(self) -> Union[None, ColumnSplitSpec]:
        return ColumnSplitSpec(
            id_column="event_id",
            project_column="project_id",
            timestamp_column="timestamp",
        )

    def column_expr(
        self,
        column_name,
        query: Query,
        parsing_context: ParsingContext,
        table_alias: str = "",
    ):
        processed_column = self.__tags_processor.process_column_expression(
            column_name, query, parsing_context, table_alias
        )
        return processed_column or super().column_expr(
            column_name, query, parsing_context, table_alias
        )

    def _get_promoted_columns(self) -> Mapping[str, FrozenSet[str]]:
        return {
            "tags": frozenset(promoted_tag_columns.values()),
            "contexts": frozenset(),
        }

    def _get_column_tag_map(self) -> Mapping[str, Mapping[str, str]]:
        return {
            "tags": {col: tag for tag, col in promoted_tag_columns.items()},
            "contexts": {},
        }

    def get_extensions(self) -> Mapping[str, QueryExtension]:
        return {
            "project": ProjectExtension(
                processor=ProjectWithGroupsProcessor(
                    project_column="project_id",
                    replacer_state_name=ReplacerState.ERRORS,
                )
            ),
            "timeseries": TimeSeriesExtension(
                default_granularity=3600,
                default_window=timedelta(days=5),
                timestamp_column="timestamp",
            ),
        }

    def get_query_processors(self) -> Sequence[QueryProcessor]:
        return [
            BasicFunctionsProcessor(),
            TimeSeriesColumnProcessor(self.__time_group_columns),
        ]
Example #8
class EventsDataset(TimeSeriesDataset):
    """
    Represents the collection of classic sentry "error" type events
    and the particular quirks of storing and querying them.
    """
    def __init__(self) -> None:
        metadata_columns = ColumnSet([
            # optional stream related data
            ("offset", Nullable(UInt(64))),
            ("partition", Nullable(UInt(16))),
        ])

        promoted_tag_columns = ColumnSet([
            # These are the classic tags, they are saved in Snuba exactly as they
            # appear in the event body.
            ("level", Nullable(String())),
            ("logger", Nullable(String())),
            ("server_name", Nullable(String())),  # future name: device_id?
            ("transaction", Nullable(String())),
            ("environment", Nullable(String())),
            ("sentry:release", Nullable(String())),
            ("sentry:dist", Nullable(String())),
            ("sentry:user", Nullable(String())),
            ("site", Nullable(String())),
            ("url", Nullable(String())),
        ])

        promoted_context_tag_columns = ColumnSet([
            # These are promoted tags that come in `tags`, but are more closely
            # related to contexts.  To avoid naming confusion with Clickhouse nested
            # columns, they are stored in the database with s/./_/
            # promoted tags
            ("app_device", Nullable(String())),
            ("device", Nullable(String())),
            ("device_family", Nullable(String())),
            ("runtime", Nullable(String())),
            ("runtime_name", Nullable(String())),
            ("browser", Nullable(String())),
            ("browser_name", Nullable(String())),
            ("os", Nullable(String())),
            ("os_name", Nullable(String())),
            ("os_rooted", Nullable(UInt(8))),
        ])

        promoted_context_columns = ColumnSet([
            ("os_build", Nullable(String())),
            ("os_kernel_version", Nullable(String())),
            ("device_name", Nullable(String())),
            ("device_brand", Nullable(String())),
            ("device_locale", Nullable(String())),
            ("device_uuid", Nullable(String())),
            ("device_model_id", Nullable(String())),
            ("device_arch", Nullable(String())),
            ("device_battery_level", Nullable(Float(32))),
            ("device_orientation", Nullable(String())),
            ("device_simulator", Nullable(UInt(8))),
            ("device_online", Nullable(UInt(8))),
            ("device_charging", Nullable(UInt(8))),
        ])

        required_columns = ColumnSet([
            ("event_id", FixedString(32)),
            ("project_id", UInt(64)),
            ("group_id", UInt(64)),
            ("timestamp", DateTime()),
            ("deleted", UInt(8)),
            ("retention_days", UInt(16)),
        ])

        all_columns = (
            required_columns + [
                # required for non-deleted
                ("platform", Nullable(String())),
                ("message", Nullable(String())),
                ("primary_hash", Nullable(FixedString(32))),
                ("received", Nullable(DateTime())),
                ("search_message", Nullable(String())),
                ("title", Nullable(String())),
                ("location", Nullable(String())),
                # optional user
                ("user_id", Nullable(String())),
                ("username", Nullable(String())),
                ("email", Nullable(String())),
                ("ip_address", Nullable(String())),
                # optional geo
                ("geo_country_code", Nullable(String())),
                ("geo_region", Nullable(String())),
                ("geo_city", Nullable(String())),
                ("sdk_name", Nullable(String())),
                ("sdk_version", Nullable(String())),
                ("type", Nullable(String())),
                ("version", Nullable(String())),
            ] + metadata_columns + promoted_context_columns +
            promoted_tag_columns + promoted_context_tag_columns + [
                # other tags
                ("tags", Nested([("key", String()), ("value", String())])),
                ("_tags_flattened", String()),
                # other context
                ("contexts", Nested([("key", String()), ("value", String())])),
                # http interface
                ("http_method", Nullable(String())),
                ("http_referer", Nullable(String())),
                # exception interface
                (
                    "exception_stacks",
                    Nested([
                        ("type", Nullable(String())),
                        ("value", Nullable(String())),
                        ("mechanism_type", Nullable(String())),
                        ("mechanism_handled", Nullable(UInt(8))),
                    ]),
                ),
                (
                    "exception_frames",
                    Nested([
                        ("abs_path", Nullable(String())),
                        ("filename", Nullable(String())),
                        ("package", Nullable(String())),
                        ("module", Nullable(String())),
                        ("function", Nullable(String())),
                        ("in_app", Nullable(UInt(8))),
                        ("colno", Nullable(UInt(32))),
                        ("lineno", Nullable(UInt(32))),
                        ("stack_level", UInt(16)),
                    ]),
                ),
                # These are columns we added later in the life of the (current) production
                # database. They don't necessarily belong here in a logical/readability sense
                # but they are here to match the order of columns in production because
                # `insert_distributed_sync` is very sensitive to column existence and ordering.
                ("culprit", Nullable(String())),
                ("sdk_integrations", Array(String())),
                ("modules", Nested([("name", String()),
                                    ("version", String())])),
            ])

        sample_expr = "cityHash64(toString(event_id))"
        schema = ReplacingMergeTreeSchema(
            columns=all_columns,
            local_table_name="sentry_local",
            dist_table_name="sentry_dist",
            mandatory_conditions=[("deleted", "=", 0)],
            prewhere_candidates=[
                "event_id",
                "group_id",
                "tags[sentry:release]",
                "message",
                "environment",
                "project_id",
            ],
            order_by="(project_id, toStartOfDay(timestamp), %s)" % sample_expr,
            partition_by="(toMonday(timestamp), if(equals(retention_days, 30), 30, 90))",
            version_column="deleted",
            sample_expr=sample_expr,
            migration_function=events_migrations,
        )

        dataset_schemas = DatasetSchemas(
            read_schema=schema,
            write_schema=schema,
        )

        table_writer = TableWriter(
            write_schema=schema,
            stream_loader=KafkaStreamLoader(
                processor=EventsProcessor(promoted_tag_columns),
                default_topic="events",
                replacement_topic="event-replacements",
                commit_log_topic="snuba-commit-log",
            ),
        )

        super(EventsDataset, self).__init__(
            dataset_schemas=dataset_schemas,
            table_writer=table_writer,
            time_group_columns={
                "time": "timestamp",
                "rtime": "received"
            },
            time_parse_columns=("timestamp", "received"),
        )

        self.__metadata_columns = metadata_columns
        self.__promoted_tag_columns = promoted_tag_columns
        self.__promoted_context_tag_columns = promoted_context_tag_columns
        self.__promoted_context_columns = promoted_context_columns
        self.__required_columns = required_columns

        self.__tags_processor = TagColumnProcessor(
            columns=all_columns,
            promoted_columns=self._get_promoted_columns(),
            column_tag_map=self._get_column_tag_map(),
        )

    def get_split_query_spec(self) -> Union[None, ColumnSplitSpec]:
        return ColumnSplitSpec(
            id_column="event_id",
            project_column="project_id",
            timestamp_column="timestamp",
        )

    def column_expr(
        self,
        column_name,
        query: Query,
        parsing_context: ParsingContext,
        table_alias: str = "",
    ):
        processed_column = self.__tags_processor.process_column_expression(
            column_name, query, parsing_context, table_alias)
        if processed_column:
            # A non-None processed_column means this was a tag/context expression.
            return processed_column
        elif column_name == "group_id":
            return f"nullIf({qualified_column('group_id', table_alias)}, 0)"
        elif column_name == "message":
            # Because of the rename from message->search_message without backfill,
            # records will have one or the other of these fields.
            # TODO this can be removed once all data has search_message filled in.
            search_message = qualified_column("search_message", table_alias)
            message = qualified_column("message", table_alias)
            return f"coalesce({search_message}, {message})"
        else:
            return super().column_expr(column_name, query, parsing_context,
                                       table_alias)

    def get_promoted_tag_columns(self):
        return self.__promoted_tag_columns

    def _get_promoted_context_tag_columns(self):
        return self.__promoted_context_tag_columns

    def _get_promoted_context_columns(self):
        return self.__promoted_context_columns

    def get_required_columns(self):
        return self.__required_columns

    def _get_promoted_columns(self):
        # The set of columns, and associated keys that have been promoted
        # to the top level table namespace.
        return {
            "tags":
            frozenset(col.flattened
                      for col in (self.get_promoted_tag_columns() +
                                  self._get_promoted_context_tag_columns())),
            "contexts":
            frozenset(col.flattened
                      for col in self._get_promoted_context_columns()),
        }

    def _get_column_tag_map(self):
        # For every applicable promoted column, a map of translations from the
        # column name we save in the database to the tag we receive in the query.
        promoted_context_tag_columns = self._get_promoted_context_tag_columns()

        return {
            "tags": {
                col.flattened: col.flattened.replace("_", ".")
                for col in promoted_context_tag_columns
            },
            "contexts": {},
        }

    def get_tag_column_map(self):
        # And a reverse map from the tags the client expects to the database columns
        return {
            col: dict(map(reversed, trans.items()))
            for col, trans in self._get_column_tag_map().items()
        }

    def get_promoted_tags(self):
        # The canonical list of foo.bar strings that you can send as a `tags[foo.bar]` query
        # and they can/will use a promoted column.
        return {
            col: [
                self._get_column_tag_map()[col].get(x, x)
                for x in self._get_promoted_columns()[col]
            ]
            for col in self._get_promoted_columns()
        }

    def get_extensions(self) -> Mapping[str, QueryExtension]:
        return {
            "project":
            ProjectExtension(processor=ProjectWithGroupsProcessor(
                project_column="project_id")),
            "timeseries":
            TimeSeriesExtension(
                default_granularity=3600,
                default_window=timedelta(days=5),
                timestamp_column="timestamp",
            ),
        }

    def get_query_processors(self) -> Sequence[QueryProcessor]:
        return [BasicFunctionsProcessor(), PrewhereProcessor()]
Example #9
class ErrorsDataset(TimeSeriesDataset):
    """
    Represents the collection of all event types that are not transactions.

    This is meant to replace Events. They will both exist during the migration.
    """
    def __init__(self) -> None:
        all_columns = ColumnSet([
            ("org_id", UInt(64)),
            ("project_id", UInt(64)),
            ("timestamp", DateTime()),
            ("event_id", WithCodecs(UUID(), ["NONE"])),
            (
                "event_hash",
                WithCodecs(
                    Materialized(
                        UInt(64),
                        "cityHash64(toString(event_id))",
                    ),
                    ["NONE"],
                ),
            ),
            ("platform", LowCardinality(String())),
            ("environment", LowCardinality(Nullable(String()))),
            ("release", LowCardinality(Nullable(String()))),
            ("dist", LowCardinality(Nullable(String()))),
            ("ip_address_v4", Nullable(IPv4())),
            ("ip_address_v6", Nullable(IPv6())),
            ("user", WithDefault(String(), "''")),
            (
                "user_hash",
                Materialized(UInt(64), "cityHash64(user)"),
            ),
            ("user_id", Nullable(String())),
            ("user_name", Nullable(String())),
            ("user_email", Nullable(String())),
            ("sdk_name", LowCardinality(Nullable(String()))),
            ("sdk_version", LowCardinality(Nullable(String()))),
            ("tags", Nested([("key", String()), ("value", String())])),
            ("_tags_flattened", String()),
            ("contexts", Nested([("key", String()), ("value", String())])),
            ("_contexts_flattened", String()),
            ("transaction_name", WithDefault(LowCardinality(String()), "''")),
            (
                "transaction_hash",
                Materialized(UInt(64), "cityHash64(transaction_name)"),
            ),
            ("span_id", Nullable(UInt(64))),
            ("trace_id", Nullable(UUID())),
            ("partition", UInt(16)),
            ("offset", WithCodecs(UInt(64), ["DoubleDelta", "LZ4"])),
            ("retention_days", UInt(16)),
            ("deleted", UInt(8)),
            ("group_id", UInt(64)),
            ("primary_hash", FixedString(32)),
            ("primary_hash_hex", Materialized(UInt(64), "hex(primary_hash)")),
            ("event_string", WithCodecs(String(), ["NONE"])),
            ("received", DateTime()),
            ("message", String()),
            ("title", String()),
            ("culprit", String()),
            ("level", LowCardinality(String())),
            ("location", Nullable(String())),
            ("version", LowCardinality(Nullable(String()))),
            ("type", LowCardinality(String())),
            (
                "exception_stacks",
                Nested([
                    ("type", Nullable(String())),
                    ("value", Nullable(String())),
                    ("mechanism_type", Nullable(String())),
                    ("mechanism_handled", Nullable(UInt(8))),
                ]),
            ),
            (
                "exception_frames",
                Nested([
                    ("abs_path", Nullable(String())),
                    ("colno", Nullable(UInt(32))),
                    ("filename", Nullable(String())),
                    ("function", Nullable(String())),
                    ("lineno", Nullable(UInt(32))),
                    ("in_app", Nullable(UInt(8))),
                    ("package", Nullable(String())),
                    ("module", Nullable(String())),
                    ("stack_level", Nullable(UInt(16))),
                ]),
            ),
            ("sdk_integrations", Array(String())),
            ("modules", Nested([("name", String()), ("version", String())])),
        ])

        self.__promoted_tag_columns = {
            "environment": "environment",
            "sentry:release": "release",
            "sentry:dist": "dist",
            "sentry:user": "******",
            "transaction": "transaction_name",
            "level": "level",
        }

        schema = ReplacingMergeTreeSchema(
            columns=all_columns,
            local_table_name="errors_local",
            dist_table_name="errors_dist",
            mandatory_conditions=[("deleted", "=", 0)],
            prewhere_candidates=[
                "event_id",
                "group_id",
                "tags[sentry:release]",
                "message",
                "environment",
                "project_id",
            ],
            order_by="(org_id, project_id, toStartOfDay(timestamp), "
            "primary_hash_hex, event_hash)",
            partition_by="(toMonday(timestamp), if(retention_days = 30, 30, 90))",
            version_column="deleted",
            sample_expr="event_hash",
            ttl_expr="timestamp + toIntervalDay(retention_days)",
            settings={"index_granularity": "8192"},
        )

        dataset_schemas = DatasetSchemas(
            read_schema=schema,
            write_schema=schema,
        )

        table_writer = TableWriter(
            write_schema=schema,
            stream_loader=KafkaStreamLoader(
                processor=ErrorsProcessor(self.__promoted_tag_columns),
                default_topic="events",
            ),
        )

        super().__init__(
            dataset_schemas=dataset_schemas,
            table_writer=table_writer,
            time_group_columns={
                "time": "timestamp",
                "rtime": "received"
            },
            time_parse_columns=("timestamp", "received"),
        )

        self.__tags_processor = TagColumnProcessor(
            columns=all_columns,
            promoted_columns=self._get_promoted_columns(),
            column_tag_map=self._get_column_tag_map(),
        )

    def get_split_query_spec(self) -> Union[None, ColumnSplitSpec]:
        return ColumnSplitSpec(
            id_column="event_id",
            project_column="project_id",
            timestamp_column="timestamp",
        )

    def column_expr(
        self,
        column_name,
        query: Query,
        parsing_context: ParsingContext,
        table_alias: str = "",
    ):
        processed_column = self.__tags_processor.process_column_expression(
            column_name, query, parsing_context, table_alias)
        return processed_column or super().column_expr(
            column_name, query, parsing_context, table_alias)

    def _get_promoted_columns(self) -> Mapping[str, FrozenSet[str]]:
        return {
            "tags": frozenset(self.__promoted_tag_columns.values()),
            "contexts": frozenset(),
        }

    def _get_column_tag_map(self) -> Mapping[str, Mapping[str, str]]:
        return {
            "tags":
            {col: tag
             for tag, col in self.__promoted_tag_columns.items()},
            "contexts": {},
        }

    def get_extensions(self) -> Mapping[str, QueryExtension]:
        return {
            "project":
            ProjectExtension(processor=ProjectWithGroupsProcessor(
                project_column="project_id")),
            "timeseries":
            TimeSeriesExtension(
                default_granularity=3600,
                default_window=timedelta(days=5),
                timestamp_column="timestamp",
            ),
        }

    def get_query_processors(self) -> Sequence[QueryProcessor]:
        return [BasicFunctionsProcessor(), PrewhereProcessor()]
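
Finally, a sketch of how the errors dataset's promoted tag mapping above resolves: the dict maps tag names to database columns, _get_column_tag_map inverts it, and _get_promoted_columns exposes the promoted column set. The sentry:user value is taken to be the "user" column, consistent with the column set above.

promoted_tag_columns = {
    "environment": "environment",
    "sentry:release": "release",
    "sentry:dist": "dist",
    "sentry:user": "user",  # assumed to map to the "user" column defined above
    "transaction": "transaction_name",
    "level": "level",
}
# Column -> tag, as _get_column_tag_map builds it.
column_tag_map = {col: tag for tag, col in promoted_tag_columns.items()}
print(column_tag_map["transaction_name"])     # transaction
print(sorted(promoted_tag_columns.values()))  # members of the promoted "tags" set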