def __init__(self):
    read_columns = ColumnSet([
        ('org_id', UInt(64)),
        ('project_id', UInt(64)),
        ('key_id', Nullable(UInt(64))),
        ('timestamp', DateTime()),
        ('outcome', UInt(8)),
        ('reason', LowCardinality(Nullable(String()))),
        ('event_id', Nullable(UUID())),
    ])

    read_schema = MergeTreeSchema(
        columns=read_columns,
        local_table_name='outcomes_raw_local',
        dist_table_name='outcomes_raw_dist',
        order_by='(org_id, project_id, timestamp)',
        partition_by='(toMonday(timestamp))',
        settings={'index_granularity': 16384},
    )

    dataset_schemas = DatasetSchemas(
        read_schema=read_schema,
        write_schema=None,
        intermediary_schemas=[],
    )

    super().__init__(
        dataset_schemas=dataset_schemas,
        time_group_columns={
            'time': 'timestamp',
        },
        time_parse_columns=('timestamp',),
    )

def __init__(self) -> None:
    read_columns = ColumnSet([
        ("org_id", UInt(64)),
        ("project_id", UInt(64)),
        ("key_id", Nullable(UInt(64))),
        ("timestamp", DateTime()),
        ("outcome", UInt(8)),
        ("reason", LowCardinality(Nullable(String()))),
        ("event_id", Nullable(UUID())),
    ])

    read_schema = MergeTreeSchema(
        columns=read_columns,
        local_table_name="outcomes_raw_local",
        dist_table_name="outcomes_raw_dist",
        order_by="(org_id, project_id, timestamp)",
        partition_by="(toMonday(timestamp))",
        settings={"index_granularity": 16384},
        migration_function=outcomes_raw_migrations,
    )

    dataset_schemas = DatasetSchemas(
        read_schema=read_schema,
        write_schema=None,
        intermediary_schemas=[],
    )

    super().__init__(
        dataset_schemas=dataset_schemas,
        time_group_columns={"time": "timestamp"},
        time_parse_columns=("timestamp",),
    )

def __init__(
    self,
    *args,
    dataset_schemas: DatasetSchemas,
    time_group_columns: Mapping[str, str],
    **kwargs,
):
    super().__init__(*args, dataset_schemas=dataset_schemas, **kwargs)

    # Convenience columns that evaluate to a bucketed time. The bucketing
    # depends on the granularity parameter.
    # The bucketed time column names cannot overlap with existing schema
    # columns.
    read_schema = dataset_schemas.get_read_schema()
    if read_schema:
        for bucketed_column in time_group_columns.keys():
            assert bucketed_column not in read_schema.get_columns(), \
                f"Bucketed column {bucketed_column} is already defined in the schema"
    self.__time_group_columns = time_group_columns

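# The bucketed "time" aliases registered above are resolved at query time into
# a ClickHouse expression over the real column, driven by the request's
# granularity. A standalone sketch of that idea follows; the helper name and the
# exact granularity-to-function mapping are assumptions for illustration, not
# the dataset's actual code.
def bucketed_time_expr(real_column: str, granularity: int) -> str:
    # Common granularities can use dedicated ClickHouse functions; anything
    # else falls back to integer division on the UNIX timestamp.
    templates = {
        86400: "toStartOfDay({col})",
        3600: "toStartOfHour({col})",
        60: "toStartOfMinute({col})",
    }
    template = templates.get(
        granularity, "toDateTime(intDiv(toUInt32({col}), {gran}) * {gran})"
    )
    return template.format(col=real_column, gran=granularity)


# e.g. bucketed_time_expr("timestamp", 3600) -> "toStartOfHour(timestamp)"
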
def __init__(self) -> None:
    columns = ColumnSet([
        # columns to maintain the dataset
        # Kafka topic offset
        ("offset", UInt(64)),
        # GroupStatus in Sentry does not have a 'DELETED' state that reflects
        # the deletion of the record. Having a dedicated clickhouse-only flag
        # to identify this case seems more consistent than adding an additional
        # value to the status field below that does not exist on the Sentry
        # side.
        ("record_deleted", UInt(8)),
        # PG columns
        ("project_id", UInt(64)),
        ("id", UInt(64)),
        ("status", Nullable(UInt(8))),
        ("last_seen", Nullable(DateTime())),
        ("first_seen", Nullable(DateTime())),
        ("active_at", Nullable(DateTime())),
        ("first_release_id", Nullable(UInt(64))),
    ])

    schema = ReplacingMergeTreeSchema(
        columns=columns,
        local_table_name="groupedmessage_local",
        dist_table_name="groupedmessage_dist",
        mandatory_conditions=[("record_deleted", "=", 0)],
        prewhere_candidates=["project_id", "id"],
        order_by="(project_id, id)",
        partition_by=None,
        version_column="offset",
        sample_expr="id",
    )

    dataset_schemas = DatasetSchemas(
        read_schema=schema,
        write_schema=schema,
    )

    super().__init__(
        dataset_schemas=dataset_schemas,
        table_writer=GroupedMessageTableWriter(
            write_schema=schema,
            stream_loader=KafkaStreamLoader(
                processor=GroupedMessageProcessor(self.POSTGRES_TABLE),
                default_topic="cdc",
            ),
            postgres_table=self.POSTGRES_TABLE,
        ),
        default_control_topic="cdc_control",
        postgres_table=self.POSTGRES_TABLE,
    )

def __init__(self):
    columns = ColumnSet([
        # columns to maintain the dataset
        # Kafka topic offset
        ('offset', UInt(64)),
        # GroupStatus in Sentry does not have a 'DELETED' state that reflects
        # the deletion of the record. Having a dedicated clickhouse-only flag
        # to identify this case seems more consistent than adding an additional
        # value to the status field below that does not exist on the Sentry
        # side.
        ('record_deleted', UInt(8)),
        # PG columns
        ('project_id', UInt(64)),
        ('id', UInt(64)),
        ('status', Nullable(UInt(8))),
        ('last_seen', Nullable(DateTime())),
        ('first_seen', Nullable(DateTime())),
        ('active_at', Nullable(DateTime())),
        ('first_release_id', Nullable(UInt(64))),
    ])

    schema = ReplacingMergeTreeSchema(
        columns=columns,
        local_table_name='groupedmessage_local',
        dist_table_name='groupedmessage_dist',
        order_by='(project_id, id)',
        partition_by=None,
        version_column='offset',
        sample_expr='id',
    )

    dataset_schemas = DatasetSchemas(
        read_schema=schema,
        write_schema=schema,
    )

    super(GroupedMessageDataset, self).__init__(
        dataset_schemas=dataset_schemas,
        processor=GroupedMessageProcessor(self.POSTGRES_TABLE),
        default_topic="cdc",
        default_replacement_topic=None,
        default_commit_log_topic=None,
        default_control_topic="cdc_control",
    )

def __init__(self) -> None:
    columns = ColumnSet([
        # columns to maintain the dataset
        # Kafka topic offset
        ("offset", UInt(64)),
        ("record_deleted", UInt(8)),
        # PG columns
        ("project_id", UInt(64)),
        ("group_id", UInt(64)),
        ("date_added", Nullable(DateTime())),
        ("user_id", Nullable(UInt(64))),
        ("team_id", Nullable(UInt(64))),
    ])

    schema = ReplacingMergeTreeSchema(
        columns=columns,
        local_table_name='groupassignee_local',
        dist_table_name='groupassignee_dist',
        order_by='(project_id, group_id)',
        partition_by=None,
        version_column='offset',
    )

    dataset_schemas = DatasetSchemas(
        read_schema=schema,
        write_schema=schema,
    )

    super().__init__(
        dataset_schemas=dataset_schemas,
        table_writer=GroupAssigneeTableWriter(
            write_schema=schema,
            stream_loader=KafkaStreamLoader(
                processor=GroupAssigneeProcessor(self.POSTGRES_TABLE),
                default_topic="cdc",
            ),
            postgres_table=self.POSTGRES_TABLE,
        ),
        default_control_topic="cdc_control",
        postgres_table=self.POSTGRES_TABLE,
    )

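# The CDC datasets above use the Kafka offset as the ReplacingMergeTree version
# column: rows sharing the ORDER BY key are collapsed on merge and the row with
# the highest version wins, so the latest change from the CDC stream replaces
# earlier ones and a deletion only has to flip record_deleted. A standalone
# sketch of those merge semantics follows (illustration only, not ClickHouse or
# dataset code).
def replacing_merge(rows, key=("project_id", "id"), version="offset"):
    latest = {}
    for row in rows:
        k = tuple(row[c] for c in key)
        if k not in latest or row[version] > latest[k][version]:
            latest[k] = row
    return list(latest.values())


rows = [
    {"project_id": 1, "id": 10, "offset": 5, "record_deleted": 0},
    {"project_id": 1, "id": 10, "offset": 9, "record_deleted": 1},
]
# Only the offset=9 row (the tombstone) survives, which is why queries also
# carry the mandatory condition record_deleted = 0.
assert replacing_merge(rows)[0]["record_deleted"] == 1
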
def __init__(self) -> None:
    self.__grouped_message = get_dataset("groupedmessage")
    groupedmessage_source = (
        self.__grouped_message.get_dataset_schemas()
        .get_read_schema()
        .get_data_source()
    )

    self.__events = get_dataset("events")
    events_source = (
        self.__events.get_dataset_schemas()
        .get_read_schema()
        .get_data_source()
    )

    join_structure = JoinClause(
        left_node=TableJoinNode(
            table_name=groupedmessage_source.format_from(),
            columns=groupedmessage_source.get_columns(),
            mandatory_conditions=[
                # TODO: This will be replaced as soon as expressions are no
                # longer strings, so that we can easily add an alias to a
                # column in an expression.
                (qualified_column("record_deleted", self.GROUPS_ALIAS), "=", 0)
            ],
            prewhere_candidates=[
                qualified_column(col, self.GROUPS_ALIAS)
                for col in groupedmessage_source.get_prewhere_candidates()
            ],
            alias=self.GROUPS_ALIAS,
        ),
        right_node=TableJoinNode(
            table_name=events_source.format_from(),
            columns=events_source.get_columns(),
            mandatory_conditions=[
                (qualified_column("deleted", self.EVENTS_ALIAS), "=", 0)
            ],
            prewhere_candidates=[
                qualified_column(col, self.EVENTS_ALIAS)
                for col in events_source.get_prewhere_candidates()
            ],
            alias=self.EVENTS_ALIAS,
        ),
        mapping=[
            JoinCondition(
                left=JoinConditionExpression(
                    table_alias=self.GROUPS_ALIAS, column="project_id"
                ),
                right=JoinConditionExpression(
                    table_alias=self.EVENTS_ALIAS, column="project_id"
                ),
            ),
            JoinCondition(
                left=JoinConditionExpression(
                    table_alias=self.GROUPS_ALIAS, column="id"
                ),
                right=JoinConditionExpression(
                    table_alias=self.EVENTS_ALIAS, column="group_id"
                ),
            ),
        ],
        join_type=JoinType.LEFT,
    )

    schema = JoinedSchema(join_structure)
    dataset_schemas = DatasetSchemas(
        read_schema=schema,
        write_schema=None,
    )

    super().__init__(
        dataset_schemas=dataset_schemas,
        time_group_columns={"events.time": "events.timestamp"},
        time_parse_columns=[
            "events.timestamp",
            "events.received",
            "groups.last_seen",
            "groups.first_seen",
            "groups.active_at",
        ],
    )

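# Roughly the FROM clause that the JoinClause above describes. This is an
# approximation for orientation only; the exact SQL depends on the query
# formatter and on which concrete tables (local or dist) the underlying
# schemas resolve to:
#
#   FROM <groupedmessage table> groups
#   LEFT JOIN <events table> events
#     ON groups.project_id = events.project_id
#    AND groups.id = events.group_id
#
# with the mandatory conditions groups.record_deleted = 0 and
# events.deleted = 0 always appended to the WHERE clause.
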
def __init__(self) -> None:
    columns = ColumnSet([
        ("project_id", UInt(64)),
        ("event_id", UUID()),
        ("trace_id", UUID()),
        ("span_id", UInt(64)),
        ("transaction_name", LowCardinality(String())),
        ("transaction_hash", Materialized(UInt(64), "cityHash64(transaction_name)")),
        ("transaction_op", LowCardinality(String())),
        ("transaction_status", WithDefault(UInt(8), UNKNOWN_SPAN_STATUS)),
        ("start_ts", DateTime()),
        ("start_ms", UInt(16)),
        ("_start_date", Materialized(Date(), "toDate(start_ts)")),
        ("finish_ts", DateTime()),
        ("finish_ms", UInt(16)),
        ("_finish_date", Materialized(Date(), "toDate(finish_ts)")),
        ("duration", UInt(32)),
        ("platform", LowCardinality(String())),
        ("environment", LowCardinality(Nullable(String()))),
        ("release", LowCardinality(Nullable(String()))),
        ("dist", LowCardinality(Nullable(String()))),
        ("ip_address_v4", Nullable(IPv4())),
        ("ip_address_v6", Nullable(IPv6())),
        ("user", WithDefault(String(), "''")),
        ("user_hash", Materialized(UInt(64), "cityHash64(user)")),
        ("user_id", Nullable(String())),
        ("user_name", Nullable(String())),
        ("user_email", Nullable(String())),
        ("sdk_name", WithDefault(LowCardinality(String()), "''")),
        ("sdk_version", WithDefault(LowCardinality(String()), "''")),
        ("tags", Nested([("key", String()), ("value", String())])),
        ("_tags_flattened", String()),
        ("contexts", Nested([("key", String()), ("value", String())])),
        ("_contexts_flattened", String()),
        ("partition", UInt(16)),
        ("offset", UInt(64)),
        ("retention_days", UInt(16)),
        ("deleted", UInt(8)),
    ])

    schema = ReplacingMergeTreeSchema(
        columns=columns,
        local_table_name="transactions_local",
        dist_table_name="transactions_dist",
        mandatory_conditions=[],
        prewhere_candidates=["event_id", "project_id"],
        order_by="(project_id, _finish_date, transaction_name, cityHash64(span_id))",
        partition_by="(retention_days, toMonday(_finish_date))",
        version_column="deleted",
        sample_expr=None,
        migration_function=transactions_migrations,
    )

    dataset_schemas = DatasetSchemas(
        read_schema=schema,
        write_schema=schema,
    )

    self.__tags_processor = TagColumnProcessor(
        columns=columns,
        promoted_columns=self._get_promoted_columns(),
        column_tag_map=self._get_column_tag_map(),
    )

    super().__init__(
        dataset_schemas=dataset_schemas,
        table_writer=TransactionsTableWriter(
            write_schema=schema,
            stream_loader=KafkaStreamLoader(
                processor=TransactionsMessageProcessor(),
                default_topic="events",
            ),
        ),
        time_group_columns={
            "bucketed_start": "start_ts",
            "bucketed_end": "finish_ts",
        },
        time_parse_columns=("start_ts", "finish_ts"),
    )

def __init__(self):
    columns = ColumnSet([
        ('project_id', UInt(64)),
        ('event_id', UUID()),
        ('trace_id', UUID()),
        ('span_id', UInt(64)),
        ('transaction_name', String()),
        ('transaction_hash', Materialized(
            UInt(64),
            'cityHash64(transaction_name)',
        )),
        ('transaction_op', LowCardinality(String())),
        ('start_ts', DateTime()),
        ('start_ms', UInt(16)),
        ('finish_ts', DateTime()),
        ('finish_ms', UInt(16)),
        ('duration', Materialized(
            UInt(32),
            '((finish_ts - start_ts) * 1000) + (finish_ms - start_ms)',
        )),
        ('platform', LowCardinality(String())),
        ('environment', Nullable(String())),
        ('release', Nullable(String())),
        ('dist', Nullable(String())),
        ('ip_address_v4', Nullable(IPv4())),
        ('ip_address_v6', Nullable(IPv6())),
        ('user', WithDefault(
            String(),
            "''",
        )),
        ('user_id', Nullable(String())),
        ('user_name', Nullable(String())),
        ('user_email', Nullable(String())),
        ('tags', Nested([
            ('key', String()),
            ('value', String()),
        ])),
        ('contexts', Nested([
            ('key', String()),
            ('value', String()),
        ])),
        ('partition', UInt(16)),
        ('offset', UInt(64)),
        ('retention_days', UInt(16)),
        ('deleted', UInt(8)),
    ])

    schema = ReplacingMergeTreeSchema(
        columns=columns,
        local_table_name='transactions_local',
        dist_table_name='transactions_dist',
        order_by='(project_id, toStartOfDay(start_ts), transaction_hash, start_ts, start_ms, trace_id, span_id)',
        partition_by='(retention_days, toMonday(start_ts))',
        version_column='deleted',
        sample_expr=None,
    )

    dataset_schemas = DatasetSchemas(
        read_schema=schema,
        write_schema=schema,
    )

    super().__init__(
        dataset_schemas=dataset_schemas,
        processor=TransactionsMessageProcessor(),
        default_topic="events",
        time_group_columns={
            'bucketed_start': 'start_ts',
            'bucketed_end': 'finish_ts',
        },
    )

def __init__(self):
    write_columns = ColumnSet([
        ('org_id', UInt(64)),
        ('project_id', UInt(64)),
        ('key_id', Nullable(UInt(64))),
        ('timestamp', DateTime()),
        ('outcome', UInt(8)),
        ('reason', LowCardinality(Nullable(String()))),
        ('event_id', Nullable(UUID())),
    ])

    write_schema = MergeTreeSchema(
        columns=write_columns,
        # TODO: change to outcomes.raw_local when we add multi DB support
        local_table_name=WRITE_LOCAL_TABLE_NAME,
        dist_table_name=WRITE_DIST_TABLE_NAME,
        order_by='(org_id, project_id, timestamp)',
        partition_by='(toMonday(timestamp))',
        settings={'index_granularity': 16384},
    )

    read_columns = ColumnSet([
        ('org_id', UInt(64)),
        ('project_id', UInt(64)),
        ('key_id', UInt(64)),
        ('timestamp', DateTime()),
        ('outcome', UInt(8)),
        ('reason', LowCardinality(String())),
        ('times_seen', UInt(64)),
    ])

    read_schema = SummingMergeTreeSchema(
        columns=read_columns,
        local_table_name=READ_LOCAL_TABLE_NAME,
        dist_table_name=READ_DIST_TABLE_NAME,
        order_by='(org_id, project_id, key_id, outcome, reason, timestamp)',
        partition_by='(toMonday(timestamp))',
        settings={'index_granularity': 256},
    )

    materialized_view_columns = ColumnSet([
        ('org_id', UInt(64)),
        ('project_id', UInt(64)),
        ('key_id', UInt(64)),
        ('timestamp', DateTime()),
        ('outcome', UInt(8)),
        ('reason', String()),
        ('times_seen', UInt(64)),
    ])

    # TODO: Find a better way to specify a query for a materialized view.
    # The problem right now is that we have a way to define our columns in a
    # ColumnSet abstraction but the query doesn't use it.
    query = """
        SELECT
            org_id,
            project_id,
            ifNull(key_id, 0) AS key_id,
            toStartOfHour(timestamp) AS timestamp,
            outcome,
            ifNull(reason, 'none') AS reason,
            count() AS times_seen
        FROM %(source_table_name)s
        GROUP BY org_id, project_id, key_id, timestamp, outcome, reason
    """

    materialized_view = MaterializedViewSchema(
        local_materialized_view_name='outcomes_mv_hourly_local',
        dist_materialized_view_name='outcomes_mv_hourly_dist',
        columns=materialized_view_columns,
        query=query,
        local_source_table_name=WRITE_LOCAL_TABLE_NAME,
        local_destination_table_name=READ_LOCAL_TABLE_NAME,
        dist_source_table_name=WRITE_DIST_TABLE_NAME,
        dist_destination_table_name=READ_DIST_TABLE_NAME,
    )

    dataset_schemas = DatasetSchemas(
        read_schema=read_schema,
        write_schema=write_schema,
        intermediary_schemas=[materialized_view],
    )

    super(OutcomesDataset, self).__init__(
        dataset_schemas=dataset_schemas,
        processor=OutcomesProcessor(),
        default_topic="outcomes",
    )

def __init__(self) -> None:
    write_columns = ColumnSet([
        ("org_id", UInt(64)),
        ("project_id", UInt(64)),
        ("key_id", Nullable(UInt(64))),
        ("timestamp", DateTime()),
        ("outcome", UInt(8)),
        ("reason", LowCardinality(Nullable(String()))),
        ("event_id", Nullable(UUID())),
    ])

    write_schema = MergeTreeSchema(
        columns=write_columns,
        # TODO: change to outcomes.raw_local when we add multi DB support
        local_table_name=WRITE_LOCAL_TABLE_NAME,
        dist_table_name=WRITE_DIST_TABLE_NAME,
        order_by="(org_id, project_id, timestamp)",
        partition_by="(toMonday(timestamp))",
        settings={"index_granularity": 16384},
    )

    read_columns = ColumnSet([
        ("org_id", UInt(64)),
        ("project_id", UInt(64)),
        ("key_id", UInt(64)),
        ("timestamp", DateTime()),
        ("outcome", UInt(8)),
        ("reason", LowCardinality(String())),
        ("times_seen", UInt(64)),
    ])

    read_schema = SummingMergeTreeSchema(
        columns=read_columns,
        local_table_name=READ_LOCAL_TABLE_NAME,
        dist_table_name=READ_DIST_TABLE_NAME,
        order_by="(org_id, project_id, key_id, outcome, reason, timestamp)",
        partition_by="(toMonday(timestamp))",
        settings={"index_granularity": 256},
    )

    materialized_view_columns = ColumnSet([
        ("org_id", UInt(64)),
        ("project_id", UInt(64)),
        ("key_id", UInt(64)),
        ("timestamp", DateTime()),
        ("outcome", UInt(8)),
        ("reason", String()),
        ("times_seen", UInt(64)),
    ])

    # TODO: Find a better way to specify a query for a materialized view.
    # The problem right now is that we have a way to define our columns in a
    # ColumnSet abstraction but the query doesn't use it.
    query = """
        SELECT
            org_id,
            project_id,
            ifNull(key_id, 0) AS key_id,
            toStartOfHour(timestamp) AS timestamp,
            outcome,
            ifNull(reason, 'none') AS reason,
            count() AS times_seen
        FROM %(source_table_name)s
        GROUP BY org_id, project_id, key_id, timestamp, outcome, reason
    """

    materialized_view = MaterializedViewSchema(
        local_materialized_view_name="outcomes_mv_hourly_local",
        dist_materialized_view_name="outcomes_mv_hourly_dist",
        prewhere_candidates=["project_id", "org_id"],
        columns=materialized_view_columns,
        query=query,
        local_source_table_name=WRITE_LOCAL_TABLE_NAME,
        local_destination_table_name=READ_LOCAL_TABLE_NAME,
        dist_source_table_name=WRITE_DIST_TABLE_NAME,
        dist_destination_table_name=READ_DIST_TABLE_NAME,
    )

    dataset_schemas = DatasetSchemas(
        read_schema=read_schema,
        write_schema=write_schema,
        intermediary_schemas=[materialized_view],
    )

    table_writer = TableWriter(
        write_schema=write_schema,
        stream_loader=KafkaStreamLoader(
            processor=OutcomesProcessor(),
            default_topic="outcomes",
        ),
    )

    super().__init__(
        dataset_schemas=dataset_schemas,
        table_writer=table_writer,
        time_group_columns={"time": "timestamp"},
        time_parse_columns=("timestamp",),
    )

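# The query above is only a template: %(source_table_name)s is substituted with
# the source table of the local or dist materialized view respectively, e.g.
# (illustrative only; the surrounding CREATE MATERIALIZED VIEW DDL is presumably
# generated by MaterializedViewSchema and is not reproduced here):
#
#   local_query = query % {"source_table_name": WRITE_LOCAL_TABLE_NAME}
#
# so the hourly rollup reads from the raw write table and lands pre-aggregated
# times_seen counts in the SummingMergeTree read table, which is the schema the
# dataset exposes for queries.
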
def __init__(self):
    metadata_columns = ColumnSet([
        # optional stream related data
        ('offset', Nullable(UInt(64))),
        ('partition', Nullable(UInt(16))),
    ])

    promoted_tag_columns = ColumnSet([
        # These are the classic tags, they are saved in Snuba exactly as they
        # appear in the event body.
        ('level', Nullable(String())),
        ('logger', Nullable(String())),
        ('server_name', Nullable(String())),  # future name: device_id?
        ('transaction', Nullable(String())),
        ('environment', Nullable(String())),
        ('sentry:release', Nullable(String())),
        ('sentry:dist', Nullable(String())),
        ('sentry:user', Nullable(String())),
        ('site', Nullable(String())),
        ('url', Nullable(String())),
    ])

    promoted_context_tag_columns = ColumnSet([
        # These are promoted tags that come in in `tags`, but are more closely
        # related to contexts. To avoid naming confusion with Clickhouse nested
        # columns, they are stored in the database with s/./_/
        # promoted tags
        ('app_device', Nullable(String())),
        ('device', Nullable(String())),
        ('device_family', Nullable(String())),
        ('runtime', Nullable(String())),
        ('runtime_name', Nullable(String())),
        ('browser', Nullable(String())),
        ('browser_name', Nullable(String())),
        ('os', Nullable(String())),
        ('os_name', Nullable(String())),
        ('os_rooted', Nullable(UInt(8))),
    ])

    promoted_context_columns = ColumnSet([
        ('os_build', Nullable(String())),
        ('os_kernel_version', Nullable(String())),
        ('device_name', Nullable(String())),
        ('device_brand', Nullable(String())),
        ('device_locale', Nullable(String())),
        ('device_uuid', Nullable(String())),
        ('device_model_id', Nullable(String())),
        ('device_arch', Nullable(String())),
        ('device_battery_level', Nullable(Float(32))),
        ('device_orientation', Nullable(String())),
        ('device_simulator', Nullable(UInt(8))),
        ('device_online', Nullable(UInt(8))),
        ('device_charging', Nullable(UInt(8))),
    ])

    required_columns = ColumnSet([
        ('event_id', FixedString(32)),
        ('project_id', UInt(64)),
        ('group_id', UInt(64)),
        ('timestamp', DateTime()),
        ('deleted', UInt(8)),
        ('retention_days', UInt(16)),
    ])

    all_columns = required_columns + [
        # required for non-deleted
        ('platform', Nullable(String())),
        ('message', Nullable(String())),
        ('primary_hash', Nullable(FixedString(32))),
        ('received', Nullable(DateTime())),
        ('search_message', Nullable(String())),
        ('title', Nullable(String())),
        ('location', Nullable(String())),

        # optional user
        ('user_id', Nullable(String())),
        ('username', Nullable(String())),
        ('email', Nullable(String())),
        ('ip_address', Nullable(String())),

        # optional geo
        ('geo_country_code', Nullable(String())),
        ('geo_region', Nullable(String())),
        ('geo_city', Nullable(String())),

        ('sdk_name', Nullable(String())),
        ('sdk_version', Nullable(String())),
        ('type', Nullable(String())),
        ('version', Nullable(String())),
    ] + metadata_columns \
        + promoted_context_columns \
        + promoted_tag_columns \
        + promoted_context_tag_columns \
        + [
            # other tags
            ('tags', Nested([
                ('key', String()),
                ('value', String()),
            ])),

            # other context
            ('contexts', Nested([
                ('key', String()),
                ('value', String()),
            ])),

            # http interface
            ('http_method', Nullable(String())),
            ('http_referer', Nullable(String())),

            # exception interface
            ('exception_stacks', Nested([
                ('type', Nullable(String())),
                ('value', Nullable(String())),
                ('mechanism_type', Nullable(String())),
                ('mechanism_handled', Nullable(UInt(8))),
            ])),
            ('exception_frames', Nested([
                ('abs_path', Nullable(String())),
                ('filename', Nullable(String())),
                ('package', Nullable(String())),
                ('module', Nullable(String())),
                ('function', Nullable(String())),
                ('in_app', Nullable(UInt(8))),
                ('colno', Nullable(UInt(32))),
                ('lineno', Nullable(UInt(32))),
                ('stack_level', UInt(16)),
            ])),

            # These are columns we added later in the life of the (current)
            # production database. They don't necessarily belong here in a
            # logical/readability sense but they are here to match the order of
            # columns in production because `insert_distributed_sync` is very
            # sensitive to column existence and ordering.
            ('culprit', Nullable(String())),
            ('sdk_integrations', Array(String())),
            ('modules', Nested([
                ('name', String()),
                ('version', String()),
            ])),
        ]

    sample_expr = 'cityHash64(toString(event_id))'

    schema = ReplacingMergeTreeSchema(
        columns=all_columns,
        local_table_name='sentry_local',
        dist_table_name='sentry_dist',
        mandatory_conditions=[('deleted', '=', 0)],
        order_by='(project_id, toStartOfDay(timestamp), %s)' % sample_expr,
        partition_by='(toMonday(timestamp), if(equals(retention_days, 30), 30, 90))',
        version_column='deleted',
        sample_expr=sample_expr,
        migration_function=events_migrations,
    )

    dataset_schemas = DatasetSchemas(
        read_schema=schema,
        write_schema=schema,
    )

    table_writer = TableWriter(
        write_schema=schema,
        stream_loader=KafkaStreamLoader(
            processor=EventsProcessor(promoted_tag_columns),
            default_topic="events",
            replacement_topic="event-replacements",
            commit_log_topic="snuba-commit-log",
        ),
    )

    super(EventsDataset, self).__init__(
        dataset_schemas=dataset_schemas,
        table_writer=table_writer,
        time_group_columns={
            'time': 'timestamp',
            'rtime': 'received',
        },
        time_parse_columns=('timestamp', 'received'),
    )

    self.__metadata_columns = metadata_columns
    self.__promoted_tag_columns = promoted_tag_columns
    self.__promoted_context_tag_columns = promoted_context_tag_columns
    self.__promoted_context_columns = promoted_context_columns
    self.__required_columns = required_columns

    self.__tags_processor = TagColumnProcessor(
        columns=all_columns,
        promoted_columns=self._get_promoted_columns(),
        column_tag_map=self._get_column_tag_map(),
    )

def __init__(self) -> None:
    self.__grouped_message = get_dataset("groupedmessage")
    groupedmessage_source = self.__grouped_message \
        .get_dataset_schemas() \
        .get_read_schema() \
        .get_data_source()

    self.__events = get_dataset("events")
    events_source = self.__events \
        .get_dataset_schemas() \
        .get_read_schema() \
        .get_data_source()

    join_structure = JoinClause(
        left_node=TableJoinNode(
            table_name=groupedmessage_source.format_from(),
            columns=groupedmessage_source.get_columns(),
            mandatory_conditions=[
                # TODO: This will be replaced as soon as expressions are no
                # longer strings, so that we can easily add an alias to a
                # column in an expression.
                (qualified_column('record_deleted', self.GROUPS_ALIAS), '=', 0)
            ],
            alias=self.GROUPS_ALIAS,
        ),
        right_node=TableJoinNode(
            table_name=events_source.format_from(),
            columns=events_source.get_columns(),
            mandatory_conditions=[
                (qualified_column('deleted', self.EVENTS_ALIAS), '=', 0)
            ],
            alias=self.EVENTS_ALIAS,
        ),
        mapping=[
            JoinCondition(
                left=JoinConditionExpression(
                    table_alias=self.GROUPS_ALIAS, column="project_id"
                ),
                right=JoinConditionExpression(
                    table_alias=self.EVENTS_ALIAS, column="project_id"
                ),
            ),
            JoinCondition(
                left=JoinConditionExpression(
                    table_alias=self.GROUPS_ALIAS, column="id"
                ),
                right=JoinConditionExpression(
                    table_alias=self.EVENTS_ALIAS, column="group_id"
                ),
            ),
        ],
        join_type=JoinType.LEFT,
    )

    schema = JoinedSchema(join_structure)
    dataset_schemas = DatasetSchemas(
        read_schema=schema,
        write_schema=None,
    )

    super().__init__(
        dataset_schemas=dataset_schemas,
        time_group_columns={
            'events.time': 'events.timestamp',
        },
        time_parse_columns=['events.timestamp'],
    )

def __init__(self) -> None:
    metadata_columns = ColumnSet([
        # optional stream related data
        ("offset", Nullable(UInt(64))),
        ("partition", Nullable(UInt(16))),
    ])

    promoted_tag_columns = ColumnSet([
        # These are the classic tags, they are saved in Snuba exactly as they
        # appear in the event body.
        ("level", Nullable(String())),
        ("logger", Nullable(String())),
        ("server_name", Nullable(String())),  # future name: device_id?
        ("transaction", Nullable(String())),
        ("environment", Nullable(String())),
        ("sentry:release", Nullable(String())),
        ("sentry:dist", Nullable(String())),
        ("sentry:user", Nullable(String())),
        ("site", Nullable(String())),
        ("url", Nullable(String())),
    ])

    promoted_context_tag_columns = ColumnSet([
        # These are promoted tags that come in in `tags`, but are more closely
        # related to contexts. To avoid naming confusion with Clickhouse nested
        # columns, they are stored in the database with s/./_/
        # promoted tags
        ("app_device", Nullable(String())),
        ("device", Nullable(String())),
        ("device_family", Nullable(String())),
        ("runtime", Nullable(String())),
        ("runtime_name", Nullable(String())),
        ("browser", Nullable(String())),
        ("browser_name", Nullable(String())),
        ("os", Nullable(String())),
        ("os_name", Nullable(String())),
        ("os_rooted", Nullable(UInt(8))),
    ])

    promoted_context_columns = ColumnSet([
        ("os_build", Nullable(String())),
        ("os_kernel_version", Nullable(String())),
        ("device_name", Nullable(String())),
        ("device_brand", Nullable(String())),
        ("device_locale", Nullable(String())),
        ("device_uuid", Nullable(String())),
        ("device_model_id", Nullable(String())),
        ("device_arch", Nullable(String())),
        ("device_battery_level", Nullable(Float(32))),
        ("device_orientation", Nullable(String())),
        ("device_simulator", Nullable(UInt(8))),
        ("device_online", Nullable(UInt(8))),
        ("device_charging", Nullable(UInt(8))),
    ])

    required_columns = ColumnSet([
        ("event_id", FixedString(32)),
        ("project_id", UInt(64)),
        ("group_id", UInt(64)),
        ("timestamp", DateTime()),
        ("deleted", UInt(8)),
        ("retention_days", UInt(16)),
    ])

    all_columns = (
        required_columns
        + [
            # required for non-deleted
            ("platform", Nullable(String())),
            ("message", Nullable(String())),
            ("primary_hash", Nullable(FixedString(32))),
            ("received", Nullable(DateTime())),
            ("search_message", Nullable(String())),
            ("title", Nullable(String())),
            ("location", Nullable(String())),

            # optional user
            ("user_id", Nullable(String())),
            ("username", Nullable(String())),
            ("email", Nullable(String())),
            ("ip_address", Nullable(String())),

            # optional geo
            ("geo_country_code", Nullable(String())),
            ("geo_region", Nullable(String())),
            ("geo_city", Nullable(String())),

            ("sdk_name", Nullable(String())),
            ("sdk_version", Nullable(String())),
            ("type", Nullable(String())),
            ("version", Nullable(String())),
        ]
        + metadata_columns
        + promoted_context_columns
        + promoted_tag_columns
        + promoted_context_tag_columns
        + [
            # other tags
            ("tags", Nested([("key", String()), ("value", String())])),
            ("_tags_flattened", String()),

            # other context
            ("contexts", Nested([("key", String()), ("value", String())])),

            # http interface
            ("http_method", Nullable(String())),
            ("http_referer", Nullable(String())),

            # exception interface
            (
                "exception_stacks",
                Nested([
                    ("type", Nullable(String())),
                    ("value", Nullable(String())),
                    ("mechanism_type", Nullable(String())),
                    ("mechanism_handled", Nullable(UInt(8))),
                ]),
            ),
            (
                "exception_frames",
                Nested([
                    ("abs_path", Nullable(String())),
                    ("filename", Nullable(String())),
                    ("package", Nullable(String())),
                    ("module", Nullable(String())),
                    ("function", Nullable(String())),
                    ("in_app", Nullable(UInt(8))),
                    ("colno", Nullable(UInt(32))),
                    ("lineno", Nullable(UInt(32))),
                    ("stack_level", UInt(16)),
                ]),
            ),

            # These are columns we added later in the life of the (current)
            # production database. They don't necessarily belong here in a
            # logical/readability sense but they are here to match the order of
            # columns in production because `insert_distributed_sync` is very
            # sensitive to column existence and ordering.
            ("culprit", Nullable(String())),
            ("sdk_integrations", Array(String())),
            ("modules", Nested([("name", String()), ("version", String())])),
        ]
    )

    sample_expr = "cityHash64(toString(event_id))"

    schema = ReplacingMergeTreeSchema(
        columns=all_columns,
        local_table_name="sentry_local",
        dist_table_name="sentry_dist",
        mandatory_conditions=[("deleted", "=", 0)],
        prewhere_candidates=[
            "event_id",
            "group_id",
            "tags[sentry:release]",
            "message",
            "environment",
            "project_id",
        ],
        order_by="(project_id, toStartOfDay(timestamp), %s)" % sample_expr,
        partition_by="(toMonday(timestamp), if(equals(retention_days, 30), 30, 90))",
        version_column="deleted",
        sample_expr=sample_expr,
        migration_function=events_migrations,
    )

    dataset_schemas = DatasetSchemas(
        read_schema=schema,
        write_schema=schema,
    )

    table_writer = TableWriter(
        write_schema=schema,
        stream_loader=KafkaStreamLoader(
            processor=EventsProcessor(promoted_tag_columns),
            default_topic="events",
            replacement_topic="event-replacements",
            commit_log_topic="snuba-commit-log",
        ),
    )

    super(EventsDataset, self).__init__(
        dataset_schemas=dataset_schemas,
        table_writer=table_writer,
        time_group_columns={
            "time": "timestamp",
            "rtime": "received",
        },
        time_parse_columns=("timestamp", "received"),
    )

    self.__metadata_columns = metadata_columns
    self.__promoted_tag_columns = promoted_tag_columns
    self.__promoted_context_tag_columns = promoted_context_tag_columns
    self.__promoted_context_columns = promoted_context_columns
    self.__required_columns = required_columns

    self.__tags_processor = TagColumnProcessor(
        columns=all_columns,
        promoted_columns=self._get_promoted_columns(),
        column_tag_map=self._get_column_tag_map(),
    )

def __init__(self) -> None:
    all_columns = ColumnSet([
        ("org_id", UInt(64)),
        ("project_id", UInt(64)),
        ("timestamp", DateTime()),
        ("event_id", WithCodecs(UUID(), ["NONE"])),
        (
            "event_hash",
            WithCodecs(
                Materialized(UInt(64), "cityHash64(toString(event_id))"),
                ["NONE"],
            ),
        ),
        ("platform", LowCardinality(String())),
        ("environment", LowCardinality(Nullable(String()))),
        ("release", LowCardinality(Nullable(String()))),
        ("dist", LowCardinality(Nullable(String()))),
        ("ip_address_v4", Nullable(IPv4())),
        ("ip_address_v6", Nullable(IPv6())),
        ("user", WithDefault(String(), "''")),
        ("user_hash", Materialized(UInt(64), "cityHash64(user)")),
        ("user_id", Nullable(String())),
        ("user_name", Nullable(String())),
        ("user_email", Nullable(String())),
        ("sdk_name", LowCardinality(Nullable(String()))),
        ("sdk_version", LowCardinality(Nullable(String()))),
        ("tags", Nested([("key", String()), ("value", String())])),
        ("_tags_flattened", String()),
        ("contexts", Nested([("key", String()), ("value", String())])),
        ("_contexts_flattened", String()),
        ("transaction_name", WithDefault(LowCardinality(String()), "''")),
        ("transaction_hash", Materialized(UInt(64), "cityHash64(transaction_name)")),
        ("span_id", Nullable(UInt(64))),
        ("trace_id", Nullable(UUID())),
        ("partition", UInt(16)),
        ("offset", WithCodecs(UInt(64), ["DoubleDelta", "LZ4"])),
        ("retention_days", UInt(16)),
        ("deleted", UInt(8)),
        ("group_id", UInt(64)),
        ("primary_hash", FixedString(32)),
        ("primary_hash_hex", Materialized(UInt(64), "hex(primary_hash)")),
        ("event_string", WithCodecs(String(), ["NONE"])),
        ("received", DateTime()),
        ("message", String()),
        ("title", String()),
        ("culprit", String()),
        ("level", LowCardinality(String())),
        ("location", Nullable(String())),
        ("version", LowCardinality(Nullable(String()))),
        ("type", LowCardinality(String())),
        (
            "exception_stacks",
            Nested([
                ("type", Nullable(String())),
                ("value", Nullable(String())),
                ("mechanism_type", Nullable(String())),
                ("mechanism_handled", Nullable(UInt(8))),
            ]),
        ),
        (
            "exception_frames",
            Nested([
                ("abs_path", Nullable(String())),
                ("colno", Nullable(UInt(32))),
                ("filename", Nullable(String())),
                ("function", Nullable(String())),
                ("lineno", Nullable(UInt(32))),
                ("in_app", Nullable(UInt(8))),
                ("package", Nullable(String())),
                ("module", Nullable(String())),
                ("stack_level", Nullable(UInt(16))),
            ]),
        ),
        ("sdk_integrations", Array(String())),
        ("modules", Nested([("name", String()), ("version", String())])),
    ])

    self.__promoted_tag_columns = {
        "environment": "environment",
        "sentry:release": "release",
        "sentry:dist": "dist",
        "sentry:user": "user",
        "transaction": "transaction_name",
        "level": "level",
    }

    schema = ReplacingMergeTreeSchema(
        columns=all_columns,
        local_table_name="errors_local",
        dist_table_name="errors_dist",
        mandatory_conditions=[("deleted", "=", 0)],
        prewhere_candidates=[
            "event_id",
            "group_id",
            "tags[sentry:release]",
            "message",
            "environment",
            "project_id",
        ],
        order_by="(org_id, project_id, toStartOfDay(timestamp), primary_hash_hex, event_hash)",
        partition_by="(toMonday(timestamp), if(retention_days = 30, 30, 90))",
        version_column="deleted",
        sample_expr="event_hash",
        ttl_expr="timestamp + toIntervalDay(retention_days)",
        settings={"index_granularity": "8192"},
    )

    dataset_schemas = DatasetSchemas(
        read_schema=schema,
        write_schema=schema,
    )

    table_writer = TableWriter(
        write_schema=schema,
        stream_loader=KafkaStreamLoader(
            processor=ErrorsProcessor(self.__promoted_tag_columns),
            default_topic="events",
        ),
    )

    super().__init__(
        dataset_schemas=dataset_schemas,
        table_writer=table_writer,
        time_group_columns={
            "time": "timestamp",
            "rtime": "received",
        },
        time_parse_columns=("timestamp", "received"),
    )

    self.__tags_processor = TagColumnProcessor(
        columns=all_columns,
        promoted_columns=self._get_promoted_columns(),
        column_tag_map=self._get_column_tag_map(),
    )