def _get_column(
    column_type: str, default_type: str, default_expr: str, codec_expr: str
) -> ColumnType:
    column: ColumnType = Visitor().visit(grammar.parse(column_type))

    if default_type == "MATERIALIZED":
        column = Materialized(column, _strip_cast(default_expr))
    elif default_type == "DEFAULT":
        column = WithDefault(column, _strip_cast(default_expr))

    if codec_expr:
        column = WithCodecs(column, codec_expr.split(", "))

    return column
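A minimal usage sketch (not part of the original source): the argument values mirror the parametrized test cases further down, and assume `_get_column` is called with the type string, default kind, default expression, and codec expression read from ClickHouse schema metadata.

    # Illustrative only: rebuild a ColumnType, including its modifiers,
    # from the four schema description fields.
    parsed = _get_column("LowCardinality(String)", "DEFAULT", "a", "")
    assert parsed == WithDefault(LowCardinality(String()), "a")

    parsed = _get_column("DateTime", "", "", "DoubleDelta, LZ4")
    assert parsed == WithCodecs(DateTime(), ["DoubleDelta", "LZ4"])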
def test_modifiers() -> None:
    cols = ColumnSet(
        [
            ("col1", WithDefault(String(), "")),
            ("col2", Nullable(Array(String()))),
            ("col3", WithCodecs(Materialized(String(), "something"), ["c"])),
            (
                "col4",
                WithCodecs(Nullable(Materialized(String(), "something")), ["c"]),
            ),
        ]
    )

    assert [WithDefault] == cols["col1"].type.get_all_modifiers()
    assert [Nullable] == cols["col2"].type.get_all_modifiers()
    assert [Materialized, WithCodecs] == cols["col3"].type.get_all_modifiers()
    assert [Materialized, Nullable, WithCodecs] == cols["col4"].type.get_all_modifiers()
f"ALTER TABLE {clickhouse_table} ADD COLUMN http_referer Nullable(String) AFTER http_method" ) return ret columns = ColumnSet( [ ("project_id", UInt(64)), ("event_id", UUID()), ("trace_id", UUID()), ("span_id", UInt(64)), ("transaction_name", LowCardinality(String())), ("transaction_hash", Materialized(UInt(64), "cityHash64(transaction_name)",),), ("transaction_op", LowCardinality(String())), ("transaction_status", WithDefault(UInt(8), str(UNKNOWN_SPAN_STATUS))), ("start_ts", DateTime()), ("start_ms", UInt(16)), ("finish_ts", DateTime()), ("finish_ms", UInt(16)), ("duration", UInt(32)), ("platform", LowCardinality(String())), ("environment", LowCardinality(Nullable(String()))), ("release", LowCardinality(Nullable(String()))), ("dist", LowCardinality(Nullable(String()))), ("ip_address_v4", Nullable(IPv4())), ("ip_address_v6", Nullable(IPv6())), ("user", WithDefault(String(), "''",)), ("user_hash", Materialized(UInt(64), "cityHash64(user)"),), ("user_id", Nullable(String())), ("user_name", Nullable(String())),
("project_id", UInt(64)), ("timestamp", DateTime()), ("event_id", WithCodecs(UUID(), ["NONE"])), ( "event_hash", WithCodecs( Materialized(UInt(64), "cityHash64(toString(event_id))",), ["NONE"], ), ), ("platform", LowCardinality(String())), ("environment", LowCardinality(Nullable(String()))), ("release", LowCardinality(Nullable(String()))), ("dist", LowCardinality(Nullable(String()))), ("ip_address_v4", Nullable(IPv4())), ("ip_address_v6", Nullable(IPv6())), ("user", WithDefault(String(), "''")), ("user_hash", Materialized(UInt(64), "cityHash64(user)"),), ("user_id", Nullable(String())), ("user_name", Nullable(String())), ("user_email", Nullable(String())), ("sdk_name", LowCardinality(Nullable(String()))), ("sdk_version", LowCardinality(Nullable(String()))), ("http_method", LowCardinality(Nullable(String()))), ("http_referer", Nullable(String())), ("tags", Nested([("key", String()), ("value", String())])), ("_tags_flattened", String()), ("_tags_hash_map", Materialized(Array(UInt(64)), TAGS_HASH_MAP_COLUMN)), ("contexts", Nested([("key", String()), ("value", String())])), ("_contexts_flattened", String()), ("transaction_name", WithDefault(LowCardinality(String()), "''")), ("transaction_hash", Materialized(UInt(64), "cityHash64(transaction_name)"),),
from snuba.query.processors.prewhere import PrewhereProcessor
from snuba.web.split import TimeSplitQueryStrategy

columns = ColumnSet(
    [
        ("project_id", UInt(64)),
        ("transaction_id", UUID()),
        ("trace_id", UUID()),
        ("transaction_span_id", UInt(64)),
        ("span_id", UInt(64)),
        ("parent_span_id", Nullable(UInt(64))),
        ("transaction_name", LowCardinality(String())),
        ("description", String()),  # description in span
        ("op", LowCardinality(String())),
        ("status", WithDefault(UInt(8), str(UNKNOWN_SPAN_STATUS))),
        ("start_ts", DateTime()),
        ("start_ns", UInt(32)),
        ("finish_ts", DateTime()),
        ("finish_ns", UInt(32)),
        ("duration_ms", UInt(32)),
        ("tags", Nested([("key", String()), ("value", String())])),
        ("_tags_hash_map", Materialized(Array(UInt(64)), TAGS_HASH_MAP_COLUMN)),
        ("retention_days", UInt(16)),
        ("deleted", UInt(8)),
    ]
)

schema = WritableTableSchema(
    columns=columns,
    local_table_name="spans_experimental_local",
def __init__(self) -> None:
    columns = ColumnSet(
        [
            ("project_id", UInt(64)),
            ("event_id", UUID()),
            ("trace_id", UUID()),
            ("span_id", UInt(64)),
            ("transaction_name", LowCardinality(String())),
            ("transaction_hash", Materialized(UInt(64), "cityHash64(transaction_name)")),
            ("transaction_op", LowCardinality(String())),
            ("transaction_status", WithDefault(UInt(8), UNKNOWN_SPAN_STATUS)),
            ("start_ts", DateTime()),
            ("start_ms", UInt(16)),
            ("_start_date", Materialized(Date(), "toDate(start_ts)")),
            ("finish_ts", DateTime()),
            ("finish_ms", UInt(16)),
            ("_finish_date", Materialized(Date(), "toDate(finish_ts)")),
            ("duration", UInt(32)),
            ("platform", LowCardinality(String())),
            ("environment", LowCardinality(Nullable(String()))),
            ("release", LowCardinality(Nullable(String()))),
            ("dist", LowCardinality(Nullable(String()))),
            ("ip_address_v4", Nullable(IPv4())),
            ("ip_address_v6", Nullable(IPv6())),
            ("user", WithDefault(String(), "''")),
            ("user_hash", Materialized(UInt(64), "cityHash64(user)")),
            ("user_id", Nullable(String())),
            ("user_name", Nullable(String())),
            ("user_email", Nullable(String())),
            ("sdk_name", WithDefault(LowCardinality(String()), "''")),
            ("sdk_version", WithDefault(LowCardinality(String()), "''")),
            ("tags", Nested([("key", String()), ("value", String())])),
            ("_tags_flattened", String()),
            ("contexts", Nested([("key", String()), ("value", String())])),
            ("_contexts_flattened", String()),
            ("partition", UInt(16)),
            ("offset", UInt(64)),
            ("retention_days", UInt(16)),
            ("deleted", UInt(8)),
        ]
    )

    schema = ReplacingMergeTreeSchema(
        columns=columns,
        local_table_name="transactions_local",
        dist_table_name="transactions_dist",
        mandatory_conditions=[],
        prewhere_candidates=["event_id", "project_id"],
        order_by="(project_id, _finish_date, transaction_name, cityHash64(span_id))",
        partition_by="(retention_days, toMonday(_finish_date))",
        version_column="deleted",
        sample_expr=None,
        migration_function=transactions_migrations,
    )

    dataset_schemas = DatasetSchemas(read_schema=schema, write_schema=schema)

    self.__tags_processor = TagColumnProcessor(
        columns=columns,
        promoted_columns=self._get_promoted_columns(),
        column_tag_map=self._get_column_tag_map(),
    )

    super().__init__(
        dataset_schemas=dataset_schemas,
        table_writer=TransactionsTableWriter(
            write_schema=schema,
            stream_loader=KafkaStreamLoader(
                processor=TransactionsMessageProcessor(), default_topic="events",
            ),
        ),
        time_group_columns={
            "bucketed_start": "start_ts",
            "bucketed_end": "finish_ts",
        },
        time_parse_columns=("start_ts", "finish_ts"),
    )
def __init__(self):
    columns = ColumnSet([
        ('project_id', UInt(64)),
        ('event_id', UUID()),
        ('trace_id', UUID()),
        ('span_id', UInt(64)),
        ('transaction_name', String()),
        ('transaction_hash', Materialized(
            UInt(64),
            'cityHash64(transaction_name)',
        )),
        ('transaction_op', LowCardinality(String())),
        ('start_ts', DateTime()),
        ('start_ms', UInt(16)),
        ('finish_ts', DateTime()),
        ('finish_ms', UInt(16)),
        ('duration', Materialized(
            UInt(32),
            '((finish_ts - start_ts) * 1000) + (finish_ms - start_ms)',
        )),
        ('platform', LowCardinality(String())),
        ('environment', Nullable(String())),
        ('release', Nullable(String())),
        ('dist', Nullable(String())),
        ('ip_address_v4', Nullable(IPv4())),
        ('ip_address_v6', Nullable(IPv6())),
        ('user', WithDefault(
            String(),
            "''",
        )),
        ('user_id', Nullable(String())),
        ('user_name', Nullable(String())),
        ('user_email', Nullable(String())),
        ('tags', Nested([
            ('key', String()),
            ('value', String()),
        ])),
        ('contexts', Nested([
            ('key', String()),
            ('value', String()),
        ])),
        ('partition', UInt(16)),
        ('offset', UInt(64)),
        ('retention_days', UInt(16)),
        ('deleted', UInt(8)),
    ])

    schema = ReplacingMergeTreeSchema(
        columns=columns,
        local_table_name='transactions_local',
        dist_table_name='transactions_dist',
        order_by='(project_id, toStartOfDay(start_ts), transaction_hash, start_ts, start_ms, trace_id, span_id)',
        partition_by='(retention_days, toMonday(start_ts))',
        version_column='deleted',
        sample_expr=None,
    )

    dataset_schemas = DatasetSchemas(
        read_schema=schema,
        write_schema=schema,
    )

    super().__init__(
        dataset_schemas=dataset_schemas,
        processor=TransactionsMessageProcessor(),
        default_topic="events",
        time_group_columns={
            'bucketed_start': 'start_ts',
            'bucketed_end': 'finish_ts',
        },
    )
(("Array(Nullable(UUID))", "", "", ""), Array(Nullable(UUID()))), # Nullable (("Nullable(String)", "", "", ""), Nullable(String())), (("Nullable(FixedString(8))", "", "", ""), Nullable(FixedString(8))), (("Nullable(Date)", "", "", ""), Nullable(Date())), # Low cardinality (("LowCardinality(String)", "", "", ""), LowCardinality(String())), (("LowCardinality(Nullable(String))", "", "", ""), LowCardinality(Nullable(String()))), # Materialized (("Date", "MATERIALIZED", "toDate(col1)", ""), Materialized(Date(), "toDate(col1)")), (("UInt64", "MATERIALIZED", "CAST(cityHash64(col1), 'UInt64')", ""), Materialized(UInt(64), "cityHash64(col1)")), # Default value (("LowCardinality(String)", "DEFAULT", "a", ""), WithDefault(LowCardinality(String()), "a")), (("UInt8", "DEFAULT", "2", ""), WithDefault(UInt(8), "2")), # With codecs (("UUID", "", "", "NONE"), WithCodecs(UUID(), ["NONE"])), (("DateTime", "", "", "DoubleDelta, LZ4"), WithCodecs(DateTime(), ["DoubleDelta", "LZ4"])), ] @pytest.mark.parametrize("input, expected_output", test_data) def test_parse_column(input, expected_output): (input_name, input_type, default_expr, codec_expr) = input assert _get_column(input_name, input_type, default_expr, codec_expr) == expected_output
from snuba.clickhouse.columns import Column, DateTime, Enum, String, UInt, WithDefault
from snuba.clusters.storage_sets import StorageSetKey
from snuba.migrations import migration, operations
from snuba.migrations.context import Context
from snuba.migrations.status import Status
from snuba.migrations.table_engines import Distributed, ReplacingMergeTree

columns = [
    Column("group", String()),
    Column("migration_id", String()),
    Column("timestamp", DateTime()),
    Column(
        "status",
        Enum([("completed", 0), ("in_progress", 1), ("not_started", 2)]),
    ),
    Column("version", WithDefault(UInt(64), "1")),
]


class Migration(migration.Migration):
    """
    This migration extends Migration instead of MultiStepMigration since it is
    responsible for bootstrapping the migration system itself. It skips setting
    the in progress status in the forwards method and the not started status in
    the backwards method. Since the migration table doesn't exist yet, we can't
    write any statuses until this migration is completed.
    """

    blocking = False

    def __forwards_local(self) -> Sequence[operations.Operation]:
def __forward_migrations(
    self, table_name: str
) -> Sequence[operations.Operation]:
    return [
        operations.AddColumn(
            storage_set=StorageSetKey.QUERYLOG,
            table_name=table_name,
            column=Column(
                "clickhouse_queries.all_columns",
                WithDefault(
                    Array(Array(LowCardinality(String()))),
                    "arrayResize([['']], length(clickhouse_queries.sql))",
                ),
            ),
            after="clickhouse_queries.consistent",
        ),
        operations.AddColumn(
            storage_set=StorageSetKey.QUERYLOG,
            table_name=table_name,
            column=Column(
                "clickhouse_queries.or_conditions",
                WithDefault(
                    Array(UInt(8)),
                    "arrayResize([0], length(clickhouse_queries.sql))",
                ),
            ),
            after="clickhouse_queries.all_columns",
        ),
        operations.AddColumn(
            storage_set=StorageSetKey.QUERYLOG,
            table_name=table_name,
            column=Column(
                "clickhouse_queries.where_columns",
                WithDefault(
                    Array(Array(LowCardinality(String()))),
                    "arrayResize([['']], length(clickhouse_queries.sql))",
                ),
            ),
            after="clickhouse_queries.or_conditions",
        ),
        operations.AddColumn(
            storage_set=StorageSetKey.QUERYLOG,
            table_name=table_name,
            column=Column(
                "clickhouse_queries.where_mapping_columns",
                WithDefault(
                    Array(Array(LowCardinality(String()))),
                    "arrayResize([['']], length(clickhouse_queries.sql))",
                ),
            ),
            after="clickhouse_queries.where_columns",
        ),
        operations.AddColumn(
            storage_set=StorageSetKey.QUERYLOG,
            table_name=table_name,
            column=Column(
                "clickhouse_queries.groupby_columns",
                WithDefault(
                    Array(Array(LowCardinality(String()))),
                    "arrayResize([['']], length(clickhouse_queries.sql))",
                ),
            ),
            after="clickhouse_queries.where_mapping_columns",
        ),
        operations.AddColumn(
            storage_set=StorageSetKey.QUERYLOG,
            table_name=table_name,
            column=Column(
                "clickhouse_queries.array_join_columns",
                WithDefault(
                    Array(Array(LowCardinality(String()))),
                    "arrayResize([['']], length(clickhouse_queries.sql))",
                ),
            ),
            after="clickhouse_queries.groupby_columns",
        ),
    ]
("clickhouse_queries.duration_ms", Array(UInt(32))), ("clickhouse_queries.stats", Array(String())), ("clickhouse_queries.final", Array(UInt(8))), ("clickhouse_queries.cache_hit", Array(UInt(8))), ("clickhouse_queries.sample", Array(Float(32))), ("clickhouse_queries.max_threads", Array(UInt(8))), ("clickhouse_queries.num_days", Array(UInt(32))), ("clickhouse_queries.clickhouse_table", Array(LowCardinality(String()))), ("clickhouse_queries.query_id", Array(String())), # XXX: ``is_duplicate`` is currently not set when using the # ``Cache.get_readthrough`` query execution path. See GH-902. ("clickhouse_queries.is_duplicate", Array(UInt(8))), ("clickhouse_queries.consistent", Array(UInt(8))), ( "clickhouse_queries.all_columns", WithDefault(Array(Array(LowCardinality(String()))), NESTED_ARRAY_DEFAULT), ), ( "clickhouse_queries.or_conditions", WithDefault( Array(UInt(8)), "arrayResize([0], length(clickhouse_queries.sql))", ), ), ( "clickhouse_queries.where_columns", WithDefault(Array(Array(LowCardinality(String()))), NESTED_ARRAY_DEFAULT), ), ( "clickhouse_queries.where_mapping_columns",
UNKNOWN_SPAN_STATUS = SPAN_STATUS_NAME_TO_CODE["unknown"]

tags_col = Column("tags", Nested([("key", String()), ("value", String())]))

columns = [
    Column("project_id", UInt(64)),
    Column("transaction_id", UUID()),
    Column("trace_id", UUID()),
    Column("transaction_span_id", UInt(64)),
    Column("span_id", UInt(64)),
    Column("parent_span_id", Nullable(UInt(64))),
    Column("transaction_name", LowCardinality(String())),
    Column("description", String()),  # description in span
    Column("op", LowCardinality(String())),
    Column("status", WithDefault(UInt(8), str(UNKNOWN_SPAN_STATUS))),
    Column("start_ts", DateTime()),
    Column("start_ns", UInt(32)),
    Column("finish_ts", DateTime()),
    Column("finish_ns", UInt(32)),
    Column("duration_ms", UInt(32)),
    tags_col,
    Column("retention_days", UInt(16)),
    Column("deleted", UInt(8)),
]


class Migration(migration.MultiStepMigration):
    blocking = False

    def forwards_local(self) -> Sequence[operations.Operation]:
def __init__(self) -> None:
    all_columns = ColumnSet(
        [
            ("org_id", UInt(64)),
            ("project_id", UInt(64)),
            ("timestamp", DateTime()),
            ("event_id", WithCodecs(UUID(), ["NONE"])),
            (
                "event_hash",
                WithCodecs(
                    Materialized(UInt(64), "cityHash64(toString(event_id))"),
                    ["NONE"],
                ),
            ),
            ("platform", LowCardinality(String())),
            ("environment", LowCardinality(Nullable(String()))),
            ("release", LowCardinality(Nullable(String()))),
            ("dist", LowCardinality(Nullable(String()))),
            ("ip_address_v4", Nullable(IPv4())),
            ("ip_address_v6", Nullable(IPv6())),
            ("user", WithDefault(String(), "''")),
            ("user_hash", Materialized(UInt(64), "cityHash64(user)")),
            ("user_id", Nullable(String())),
            ("user_name", Nullable(String())),
            ("user_email", Nullable(String())),
            ("sdk_name", LowCardinality(Nullable(String()))),
            ("sdk_version", LowCardinality(Nullable(String()))),
            ("tags", Nested([("key", String()), ("value", String())])),
            ("_tags_flattened", String()),
            ("contexts", Nested([("key", String()), ("value", String())])),
            ("_contexts_flattened", String()),
            ("transaction_name", WithDefault(LowCardinality(String()), "''")),
            ("transaction_hash", Materialized(UInt(64), "cityHash64(transaction_name)")),
            ("span_id", Nullable(UInt(64))),
            ("trace_id", Nullable(UUID())),
            ("partition", UInt(16)),
            ("offset", WithCodecs(UInt(64), ["DoubleDelta", "LZ4"])),
            ("retention_days", UInt(16)),
            ("deleted", UInt(8)),
            ("group_id", UInt(64)),
            ("primary_hash", FixedString(32)),
            ("primary_hash_hex", Materialized(UInt(64), "hex(primary_hash)")),
            ("event_string", WithCodecs(String(), ["NONE"])),
            ("received", DateTime()),
            ("message", String()),
            ("title", String()),
            ("culprit", String()),
            ("level", LowCardinality(String())),
            ("location", Nullable(String())),
            ("version", LowCardinality(Nullable(String()))),
            ("type", LowCardinality(String())),
            (
                "exception_stacks",
                Nested(
                    [
                        ("type", Nullable(String())),
                        ("value", Nullable(String())),
                        ("mechanism_type", Nullable(String())),
                        ("mechanism_handled", Nullable(UInt(8))),
                    ]
                ),
            ),
            (
                "exception_frames",
                Nested(
                    [
                        ("abs_path", Nullable(String())),
                        ("colno", Nullable(UInt(32))),
                        ("filename", Nullable(String())),
                        ("function", Nullable(String())),
                        ("lineno", Nullable(UInt(32))),
                        ("in_app", Nullable(UInt(8))),
                        ("package", Nullable(String())),
                        ("module", Nullable(String())),
                        ("stack_level", Nullable(UInt(16))),
                    ]
                ),
            ),
            ("sdk_integrations", Array(String())),
            ("modules", Nested([("name", String()), ("version", String())])),
        ]
    )

    self.__promoted_tag_columns = {
        "environment": "environment",
        "sentry:release": "release",
        "sentry:dist": "dist",
        "sentry:user": "user",
        "transaction": "transaction_name",
        "level": "level",
    }

    schema = ReplacingMergeTreeSchema(
        columns=all_columns,
        local_table_name="errors_local",
        dist_table_name="errors_dist",
        mandatory_conditions=[("deleted", "=", 0)],
        prewhere_candidates=[
            "event_id",
            "group_id",
            "tags[sentry:release]",
            "message",
            "environment",
            "project_id",
        ],
        order_by="(org_id, project_id, toStartOfDay(timestamp), primary_hash_hex, event_hash)",
        partition_by="(toMonday(timestamp), if(retention_days = 30, 30, 90))",
        version_column="deleted",
        sample_expr="event_hash",
        ttl_expr="timestamp + toIntervalDay(retention_days)",
        settings={"index_granularity": "8192"},
    )

    dataset_schemas = DatasetSchemas(
        read_schema=schema,
        write_schema=schema,
    )

    table_writer = TableWriter(
        write_schema=schema,
        stream_loader=KafkaStreamLoader(
            processor=ErrorsProcessor(self.__promoted_tag_columns),
            default_topic="events",
        ),
    )

    super().__init__(
        dataset_schemas=dataset_schemas,
        table_writer=table_writer,
        time_group_columns={
            "time": "timestamp",
            "rtime": "received",
        },
        time_parse_columns=("timestamp", "received"),
    )

    self.__tags_processor = TagColumnProcessor(
        columns=all_columns,
        promoted_columns=self._get_promoted_columns(),
        column_tag_map=self._get_column_tag_map(),
    )