def test_joined_columns():
    """Check the columns reported by a JoinedSchema over ``complex_join_structure``.

    The result is compared with a hand-written expectation twice: once on the
    flattened columns and once on the ``repr`` of the structured columns.
    """
    joined_schema = JoinedSchema(complex_join_structure)
    actual = joined_schema.get_columns()

    expected = ColumnSet(
        [
            ("t1.t1c1", UInt(64)),
            ("t1.t1c2", String()),
            ("t1.t1c3", Nested([("t11c4", UInt(64))])),
            ("t2.t2c1", UInt(64)),
            ("t2.t2c2", String()),
            ("t2.t2c3", Nested([("t21c4", UInt(64))])),
            ("t3.t3c1", UInt(64)),
            ("t3.t3c2", String()),
            ("t3.t3c3", Nested([("t31c4", UInt(64))])),
        ]
    )

    # Flattened view: Nested columns are exploded here.
    actual_flat = {col.flattened for col in actual}
    expected_flat = {col.flattened for col in expected}
    assert actual_flat == expected_flat

    # Structured view: Nested columns are not exploded.
    actual_reprs = {repr(col) for col in actual.columns}
    expected_reprs = {repr(col) for col in expected.columns}
    assert actual_reprs == expected_reprs
def __init__(
    self,
    writable_storage_key: StorageKey,
    readable_storage_key: StorageKey,
    value_schema: Sequence[Column[SchemaModifiers]],
    mappers: TranslationMappers,
) -> None:
    """Initialise the parent with one writable and one readable storage.

    :param writable_storage_key: key resolved through ``get_writable_storage``.
    :param readable_storage_key: key resolved through ``get_storage``; this
        storage is the one handed to the query plan builder.
    :param value_schema: extra columns appended after the fixed columns of
        the abstract column set.
    :param mappers: translation mappers concatenated after the built-in
        ``tags`` subscriptable mapper.
    """
    write_target = get_writable_storage(writable_storage_key)
    read_source = get_storage(readable_storage_key)

    # The "tags" subscriptable mapper comes first; caller mappers follow it.
    tags_mappers = TranslationMappers(
        subscriptables=[SubscriptableMapper(None, "tags", None, "tags")],
    )
    pipeline_builder = SimplePipelineBuilder(
        query_plan_builder=SingleStorageQueryPlanBuilder(
            read_source,
            mappers=tags_mappers.concat(mappers),
        )
    )

    columns = ColumnSet(
        [
            Column("org_id", UInt(64)),
            Column("project_id", UInt(64)),
            Column("metric_id", UInt(64)),
            Column("timestamp", DateTime()),
            Column("tags", Nested([("key", UInt(64)), ("value", UInt(64))])),
            *value_schema,
        ]
    )

    super().__init__(
        storages=[write_target, read_source],
        query_pipeline_builder=pipeline_builder,
        abstract_column_set=columns,
        join_relationships={},
        writable_storage=write_target,
        validators=[EntityRequiredColumnValidator({"org_id", "project_id"})],
        required_time_column="timestamp",
    )
def test_schema(self):
    """Check schema rendering and key lookup on a ColumnSet with a Nested column."""
    foo_column = ("foo", UInt(8))
    bar_column = ("bar", Nested([("qux:mux", String())]))
    column_set = ColumnSet([foo_column, bar_column])

    rendered = column_set.for_schema()
    assert rendered == "foo UInt8, bar Nested(`qux:mux` String)"

    # Plain column keeps its declared type.
    assert column_set["foo"].type == UInt(8)
    # The nested sub-column, addressed with a dotted name, compares equal to
    # an Array of the sub-column type.
    assert column_set["bar.qux:mux"].type == Array(String())
def test_schema(self):
    """Check schema rendering and key lookup on a ColumnSet with a Nested column."""
    foo_column = ('foo', UInt(8))
    bar_column = ('bar', Nested([('qux:mux', String())]))
    column_set = ColumnSet([foo_column, bar_column])

    rendered = column_set.for_schema()
    assert rendered == 'foo UInt8, bar Nested(`qux:mux` String)'

    # Plain column keeps its declared type.
    assert column_set['foo'].type == UInt(8)
    # The nested sub-column, addressed with a dotted name, compares equal to
    # an Array of the sub-column type.
    assert column_set['bar.qux:mux'].type == Array(String())
def __init__(self) -> None:
    """Initialise the parent on top of the writable SPANS storage.

    The same storage object is used for reading (query plan builder) and
    writing. A single join relationship, ``contained``, targets
    ``EntityKey.TRANSACTIONS``.
    """
    spans_storage = get_writable_storage(StorageKey.SPANS)

    tags_mappers = TranslationMappers(
        subscriptables=[SubscriptableMapper(None, "tags", None, "tags")],
    )
    pipeline_builder = SimplePipelineBuilder(
        query_plan_builder=SingleStorageQueryPlanBuilder(
            storage=spans_storage,
            mappers=tags_mappers,
        ),
    )

    span_columns = ColumnSet(
        [
            ("project_id", UInt(64)),
            ("transaction_id", UUID()),
            ("trace_id", UUID()),
            ("transaction_span_id", UInt(64)),
            ("span_id", UInt(64)),
            ("parent_span_id", UInt(64, Modifiers(nullable=True))),
            ("transaction_name", String()),
            ("op", String()),
            ("status", UInt(8)),
            ("start_ts", DateTime()),
            ("start_ns", UInt(32)),
            ("finish_ts", DateTime()),
            ("finish_ns", UInt(32)),
            ("duration_ms", UInt(32)),
            ("tags", Nested([("key", String()), ("value", String())])),
        ]
    )

    # Inner join on (project_id, transaction_span_id) == (project_id, span_id).
    contained = JoinRelationship(
        rhs_entity=EntityKey.TRANSACTIONS,
        columns=[
            ("project_id", "project_id"),
            ("transaction_span_id", "span_id"),
        ],
        join_type=JoinType.INNER,
        equivalences=[
            ColumnEquivalence("transaction_id", "event_id"),
            ColumnEquivalence("transaction_name", "transaction_name"),
            ColumnEquivalence("trace_id", "trace_id"),
        ],
    )

    super().__init__(
        storages=[spans_storage],
        query_pipeline_builder=pipeline_builder,
        abstract_column_set=span_columns,
        join_relationships={"contained": contained},
        writable_storage=spans_storage,
        validators=[EntityRequiredColumnValidator({"project_id"})],
        required_time_column=None,
    )
def forwards_dist(self) -> Sequence[operations.Operation]:
    """Return the operation adding the Nested ``measurements`` column to ``transactions_dist``.

    The column is placed after ``_contexts_flattened``.
    """
    measurements = Column(
        "measurements",
        Nested([("key", LowCardinality(String())), ("value", Float(64))]),
    )
    add_measurements = operations.AddColumn(
        storage_set=StorageSetKey.TRANSACTIONS,
        table_name="transactions_dist",
        column=measurements,
        after="_contexts_flattened",
    )
    return [add_measurements]
def __init__(
    self,
    writable_storage_key: Optional[StorageKey],
    readable_storage_key: StorageKey,
    value_schema: Sequence[Column[SchemaModifiers]],
    mappers: TranslationMappers,
    abstract_column_set: Optional[ColumnSet] = None,
    validators: Optional[Sequence[QueryValidator]] = None,
) -> None:
    """Initialise the parent with a readable storage and an optional writable one.

    :param writable_storage_key: when truthy, resolved through
        ``get_writable_storage``; otherwise there is no writable storage.
    :param readable_storage_key: resolved through ``get_storage`` and handed
        to the query plan builder.
    :param value_schema: extra columns appended to the default column set
        (ignored when ``abstract_column_set`` is supplied).
    :param mappers: translation mappers concatenated after the built-in
        ``tags`` subscriptable mapper.
    :param abstract_column_set: overrides the default column set when given.
    :param validators: overrides the default validators when given.
    """
    if writable_storage_key:
        writable_storage = get_writable_storage(writable_storage_key)
    else:
        writable_storage = None
    readable_storage = get_storage(readable_storage_key)

    # The readable storage always comes first; the writable one is optional.
    if writable_storage:
        storages = [readable_storage, writable_storage]
    else:
        storages = [readable_storage]

    column_set = (
        abstract_column_set
        if abstract_column_set is not None
        else ColumnSet(
            [
                Column("org_id", UInt(64)),
                Column("project_id", UInt(64)),
                Column("metric_id", UInt(64)),
                Column("timestamp", DateTime()),
                Column("bucketed_time", DateTime()),
                Column("tags", Nested([("key", UInt(64)), ("value", UInt(64))])),
                *value_schema,
            ]
        )
    )

    query_validators = (
        validators
        if validators is not None
        else [
            EntityRequiredColumnValidator({"org_id", "project_id"}),
            GranularityValidator(minimum=10),
        ]
    )

    tags_mappers = TranslationMappers(
        subscriptables=[SubscriptableMapper(None, "tags", None, "tags")],
    )
    pipeline_builder = SimplePipelineBuilder(
        query_plan_builder=SingleStorageQueryPlanBuilder(
            readable_storage,
            mappers=tags_mappers.concat(mappers),
        )
    )

    super().__init__(
        storages=storages,
        query_pipeline_builder=pipeline_builder,
        abstract_column_set=column_set,
        join_relationships={},
        writable_storage=writable_storage,
        validators=query_validators,
        required_time_column="timestamp",
    )
def forwards_dist(self) -> Sequence[operations.SqlOperation]:
    """Return the operation adding the Nested ``span_op_breakdowns`` column to ``transactions_dist``.

    The column is placed after ``measurements.value``.
    """
    breakdown_fields = [
        ("key", String(Modifiers(low_cardinality=True))),
        ("value", Float(64)),
    ]
    add_breakdowns = operations.AddColumn(
        storage_set=StorageSetKey.TRANSACTIONS,
        table_name="transactions_dist",
        column=Column("span_op_breakdowns", Nested(breakdown_fields)),
        after="measurements.value",
    )
    return [add_breakdowns]
def forwards_local(self) -> Sequence[operations.SqlOperation]:
    """Return the operation adding the Nested ``measurements`` column to ``transactions_local``.

    The column is placed after ``_contexts_flattened``.
    """
    measurement_fields = [
        ("key", String(Modifiers(low_cardinality=True))),
        ("value", Float(64)),
    ]
    add_measurements = operations.AddColumn(
        storage_set=StorageSetKey.TRANSACTIONS,
        table_name="transactions_local",
        column=Column("measurements", Nested(measurement_fields)),
        after="_contexts_flattened",
    )
    return [add_measurements]
from snuba.datasets.storages import StorageKey from snuba.query.processors.arrayjoin_keyvalue_optimizer import ( ArrayJoinKeyValueOptimizer, ) from snuba.query.processors.table_rate_limit import TableRateLimit aggregated_columns = [ Column("org_id", UInt(64)), Column("use_case_id", String()), Column("project_id", UInt(64)), Column("metric_id", UInt(64)), Column("granularity", UInt(8)), Column("timestamp", DateTime()), Column("retention_days", UInt(16)), Column( "tags", Nested([("key", UInt(64)), ("indexed_value", UInt(64)), ("raw_value", String())]), ), Column("_raw_tags_hash", Array(UInt(64), SchemaModifiers(readonly=True))), Column("_indexed_tags_hash", Array(UInt(64), SchemaModifiers(readonly=True))), ] sets_storage = ReadableTableStorage( storage_key=StorageKey.GENERIC_METRICS_SETS, storage_set_key=StorageSetKey.GENERIC_METRICS_SETS, schema=TableSchema( local_table_name="generic_metrics_sets_local", dist_table_name="generic_metrics_sets_dist", storage_set_key=StorageSetKey.GENERIC_METRICS_SETS, columns=ColumnSet([ *aggregated_columns,
def __init__(self) -> None:
    """Declare the transactions columns and schema, then initialise the parent.

    One ``ReplacingMergeTreeSchema`` serves as both read and write schema.
    The tag column processor is created before the parent initialiser runs.
    """
    transaction_columns = ColumnSet(
        [
            ("project_id", UInt(64)),
            ("event_id", UUID()),
            ("trace_id", UUID()),
            ("span_id", UInt(64)),
            ("transaction_name", LowCardinality(String())),
            (
                "transaction_hash",
                Materialized(UInt(64), "cityHash64(transaction_name)"),
            ),
            ("transaction_op", LowCardinality(String())),
            ("transaction_status", WithDefault(UInt(8), UNKNOWN_SPAN_STATUS)),
            ("start_ts", DateTime()),
            ("start_ms", UInt(16)),
            ("_start_date", Materialized(Date(), "toDate(start_ts)")),
            ("finish_ts", DateTime()),
            ("finish_ms", UInt(16)),
            ("_finish_date", Materialized(Date(), "toDate(finish_ts)")),
            ("duration", UInt(32)),
            ("platform", LowCardinality(String())),
            ("environment", LowCardinality(Nullable(String()))),
            ("release", LowCardinality(Nullable(String()))),
            ("dist", LowCardinality(Nullable(String()))),
            ("ip_address_v4", Nullable(IPv4())),
            ("ip_address_v6", Nullable(IPv6())),
            ("user", WithDefault(String(), "''")),
            ("user_hash", Materialized(UInt(64), "cityHash64(user)")),
            ("user_id", Nullable(String())),
            ("user_name", Nullable(String())),
            ("user_email", Nullable(String())),
            ("sdk_name", WithDefault(LowCardinality(String()), "''")),
            ("sdk_version", WithDefault(LowCardinality(String()), "''")),
            ("tags", Nested([("key", String()), ("value", String())])),
            ("_tags_flattened", String()),
            ("contexts", Nested([("key", String()), ("value", String())])),
            ("_contexts_flattened", String()),
            ("partition", UInt(16)),
            ("offset", UInt(64)),
            ("retention_days", UInt(16)),
            ("deleted", UInt(8)),
        ]
    )

    table_schema = ReplacingMergeTreeSchema(
        columns=transaction_columns,
        local_table_name="transactions_local",
        dist_table_name="transactions_dist",
        mandatory_conditions=[],
        prewhere_candidates=["event_id", "project_id"],
        order_by="(project_id, _finish_date, transaction_name, cityHash64(span_id))",
        partition_by="(retention_days, toMonday(_finish_date))",
        version_column="deleted",
        sample_expr=None,
        migration_function=transactions_migrations,
    )

    # Reads and writes go through the same schema object.
    schemas = DatasetSchemas(read_schema=table_schema, write_schema=table_schema)

    self.__tags_processor = TagColumnProcessor(
        columns=transaction_columns,
        promoted_columns=self._get_promoted_columns(),
        column_tag_map=self._get_column_tag_map(),
    )

    writer = TransactionsTableWriter(
        write_schema=table_schema,
        stream_loader=KafkaStreamLoader(
            processor=TransactionsMessageProcessor(),
            default_topic="events",
        ),
    )

    super().__init__(
        dataset_schemas=schemas,
        table_writer=writer,
        time_group_columns={
            "bucketed_start": "start_ts",
            "bucketed_end": "finish_ts",
        },
        time_parse_columns=("start_ts", "finish_ts"),
    )
def __init__(self) -> None:
    """Declare the common, events-only and transactions-only column sets and initialise the parent.

    The abstract column set passed to the parent is the concatenation of the
    three sets. No writable storage is configured.
    """
    self.__common_columns = ColumnSet(
        [
            ("event_id", FixedString(32)),
            ("project_id", UInt(64)),
            ("type", Nullable(String())),
            ("timestamp", DateTime()),
            ("platform", Nullable(String())),
            ("environment", Nullable(String())),
            ("release", Nullable(String())),
            ("dist", Nullable(String())),
            ("user", Nullable(String())),
            ("transaction", Nullable(String())),
            ("message", Nullable(String())),
            ("title", Nullable(String())),
            # User
            ("user_id", Nullable(String())),
            ("username", Nullable(String())),
            ("email", Nullable(String())),
            ("ip_address", Nullable(String())),
            # SDK
            ("sdk_name", Nullable(String())),
            ("sdk_version", Nullable(String())),
            # geo location context
            ("geo_country_code", Nullable(String())),
            ("geo_region", Nullable(String())),
            ("geo_city", Nullable(String())),
            ("http_method", Nullable(String())),
            ("http_referer", Nullable(String())),
            # Other tags and context
            ("tags", Nested([("key", String()), ("value", String())])),
            ("contexts", Nested([("key", String()), ("value", String())])),
        ]
    )

    self.__events_columns = ColumnSet(
        [
            ("group_id", Nullable(UInt(64))),
            ("primary_hash", Nullable(FixedString(32))),
            # Promoted tags
            ("level", Nullable(String())),
            ("logger", Nullable(String())),
            ("server_name", Nullable(String())),
            ("site", Nullable(String())),
            ("url", Nullable(String())),
            ("search_message", Nullable(String())),
            ("location", Nullable(String())),
            ("culprit", Nullable(String())),
            ("received", Nullable(DateTime())),
            ("sdk_integrations", Nullable(Array(String()))),
            ("version", Nullable(String())),
            # exception interface
            (
                "exception_stacks",
                Nested(
                    [
                        ("type", Nullable(String())),
                        ("value", Nullable(String())),
                        ("mechanism_type", Nullable(String())),
                        ("mechanism_handled", Nullable(UInt(8))),
                    ]
                ),
            ),
            (
                "exception_frames",
                Nested(
                    [
                        ("abs_path", Nullable(String())),
                        ("filename", Nullable(String())),
                        ("package", Nullable(String())),
                        ("module", Nullable(String())),
                        ("function", Nullable(String())),
                        ("in_app", Nullable(UInt(8))),
                        ("colno", Nullable(UInt(32))),
                        ("lineno", Nullable(UInt(32))),
                        ("stack_level", UInt(16)),
                    ]
                ),
            ),
            ("modules", Nested([("name", String()), ("version", String())])),
        ]
    )

    self.__transactions_columns = ColumnSet(
        [
            ("trace_id", Nullable(UUID())),
            ("span_id", Nullable(UInt(64))),
            ("transaction_hash", Nullable(UInt(64))),
            ("transaction_op", Nullable(String())),
            ("transaction_status", Nullable(UInt(8))),
            ("duration", Nullable(UInt(32))),
            (
                "measurements",
                Nested([("key", LowCardinality(String())), ("value", Float(64))]),
            ),
        ]
    )

    events = get_storage(StorageKey.EVENTS)
    events_readonly = get_storage(StorageKey.EVENTS_RO)
    transactions = get_storage(StorageKey.TRANSACTIONS)

    self.__time_group_columns: Mapping[str, str] = {}
    self.__time_parse_columns = ("timestamp",)

    # The selector receives all three storages plus the per-storage column sets.
    storage_selector = DiscoverQueryStorageSelector(
        events_table=events,
        events_ro_table=events_readonly,
        abstract_events_columns=self.__events_columns,
        transactions_table=transactions,
        abstract_transactions_columns=self.__transactions_columns,
    )
    every_column = (
        self.__common_columns
        + self.__events_columns
        + self.__transactions_columns
    )

    super().__init__(
        storages=[events, transactions],
        query_plan_builder=SelectedStorageQueryPlanBuilder(
            selector=storage_selector,
        ),
        abstract_column_set=every_column,
        writable_storage=None,
    )
def __init__(self):
    """Declare the transactions columns and schema, then initialise the parent.

    One ``ReplacingMergeTreeSchema`` serves as both read and write schema.
    """
    transaction_columns = ColumnSet(
        [
            ('project_id', UInt(64)),
            ('event_id', UUID()),
            ('trace_id', UUID()),
            ('span_id', UInt(64)),
            ('transaction_name', String()),
            (
                'transaction_hash',
                Materialized(UInt(64), 'cityHash64(transaction_name)'),
            ),
            ('transaction_op', LowCardinality(String())),
            ('start_ts', DateTime()),
            ('start_ms', UInt(16)),
            ('finish_ts', DateTime()),
            ('finish_ms', UInt(16)),
            (
                'duration',
                Materialized(
                    UInt(32),
                    '((finish_ts - start_ts) * 1000) + (finish_ms - start_ms)',
                ),
            ),
            ('platform', LowCardinality(String())),
            ('environment', Nullable(String())),
            ('release', Nullable(String())),
            ('dist', Nullable(String())),
            ('ip_address_v4', Nullable(IPv4())),
            ('ip_address_v6', Nullable(IPv6())),
            ('user', WithDefault(String(), "''")),
            ('user_id', Nullable(String())),
            ('user_name', Nullable(String())),
            ('user_email', Nullable(String())),
            ('tags', Nested([('key', String()), ('value', String())])),
            ('contexts', Nested([('key', String()), ('value', String())])),
            ('partition', UInt(16)),
            ('offset', UInt(64)),
            ('retention_days', UInt(16)),
            ('deleted', UInt(8)),
        ]
    )

    table_schema = ReplacingMergeTreeSchema(
        columns=transaction_columns,
        local_table_name='transactions_local',
        dist_table_name='transactions_dist',
        order_by='(project_id, toStartOfDay(start_ts), transaction_hash, start_ts, start_ms, trace_id, span_id)',
        partition_by='(retention_days, toMonday(start_ts))',
        version_column='deleted',
        sample_expr=None,
    )

    # Reads and writes go through the same schema object.
    schemas = DatasetSchemas(read_schema=table_schema, write_schema=table_schema)

    super().__init__(
        dataset_schemas=schemas,
        processor=TransactionsMessageProcessor(),
        default_topic="events",
        time_group_columns={
            'bucketed_start': 'start_ts',
            'bucketed_end': 'finish_ts',
        },
    )
Column("platform", String(Modifiers(low_cardinality=True))), Column("environment", String(Modifiers(nullable=True, low_cardinality=True))), Column("release", String(Modifiers(nullable=True, low_cardinality=True))), Column("dist", String(Modifiers(nullable=True, low_cardinality=True))), Column("ip_address_v4", IPv4(Modifiers(nullable=True))), Column("ip_address_v6", IPv6(Modifiers(nullable=True))), Column("user", (String(Modifiers(default="''")))), Column("user_hash", UInt(64, Modifiers(materialized="cityHash64(user)"))), Column("user_id", String(Modifiers(nullable=True))), Column("user_name", String(Modifiers(nullable=True))), Column("user_email", String(Modifiers(nullable=True))), Column("sdk_name", String(Modifiers(nullable=True, low_cardinality=True))), Column("sdk_version", String(Modifiers(nullable=True, low_cardinality=True))), Column("tags", Nested([("key", String()), ("value", String())])), Column("_tags_flattened", String()), Column("contexts", Nested([("key", String()), ("value", String())])), Column("_contexts_flattened", String()), Column("transaction_name", String(Modifiers(low_cardinality=True, default="''"))), Column( "transaction_hash", UInt(64, Modifiers(materialized="cityHash64(transaction_name)")), ), Column("span_id", UInt(64, Modifiers(nullable=True))), Column("trace_id", UUID(Modifiers(nullable=True))), Column("partition", UInt(16)), Column("offset", UInt(64, Modifiers(codecs=["DoubleDelta", "LZ4"]))), Column("message_timestamp", DateTime()), Column("retention_days", UInt(16)),
def test_events_promoted_boolean_context() -> None:
    """Check the boolean-context rewrite for a context with a promoted column.

    The query selects ``contexts[device.charging]`` via ``arrayElement`` /
    ``indexOf``. After running ``MappingColumnPromoter`` and then
    ``EventsPromotedBooleanContextsProcessor``, the selected columns must
    equal an ``if(toString(device_charging) IN ('1', 'True'), 'True', 'False')``
    expression carrying the same alias.
    """
    alias = "contexts[device.charging]"

    columns = ColumnSet(
        [
            ("device_charging", UInt(8, Modifier(nullable=True))),
            ("contexts", Nested([("key", String()), ("value", String())])),
        ]
    )

    # Input: contexts.value[indexOf(contexts.key, 'device.charging')].
    key_position = FunctionCall(
        None,
        "indexOf",
        (
            Column(None, None, "contexts.key"),
            Literal(None, "device.charging"),
        ),
    )
    context_lookup = FunctionCall(
        alias,
        "arrayElement",
        (Column(None, None, "contexts.value"), key_position),
    )
    query = ClickhouseQuery(
        Table("events", columns),
        selected_columns=[SelectedExpression(alias, context_lookup)],
    )

    # Expected: the promoted column, stringified and mapped to True/False.
    promoted_is_true = binary_condition(
        ConditionFunctions.IN,
        FunctionCall(
            None,
            "toString",
            (Column(None, None, "device_charging"),),
        ),
        literals_tuple(None, [Literal(None, "1"), Literal(None, "True")]),
    )
    boolean_result = FunctionCall(
        alias,
        "if",
        (promoted_is_true, Literal(None, "True"), Literal(None, "False")),
    )
    expected = ClickhouseQuery(
        Table("events", columns),
        selected_columns=[SelectedExpression(alias, boolean_result)],
    )

    settings = HTTPQuerySettings()
    promoter = MappingColumnPromoter(
        {"contexts": {"device.charging": "device_charging"}}, cast_to_string=True
    )
    promoter.process_query(query, settings)
    boolean_processor = EventsPromotedBooleanContextsProcessor()
    boolean_processor.process_query(query, settings)

    assert query.get_selected_columns() == expected.get_selected_columns()
("dist", String(Modifiers(nullable=True))), ("transaction_name", String()), ("message", String()), ("title", String()), ("user", String()), ("user_hash", UInt(64)), ("user_id", String(Modifiers(nullable=True))), ("user_name", String(Modifiers(nullable=True))), ("user_email", String(Modifiers(nullable=True))), ("ip_address_v4", IPv4(Modifiers(nullable=True))), ("ip_address_v6", IPv6(Modifiers(nullable=True))), ("sdk_name", String(Modifiers(nullable=True))), ("sdk_version", String(Modifiers(nullable=True))), ("http_method", String(Modifiers(nullable=True))), ("http_referer", String(Modifiers(nullable=True))), ("tags", Nested([("key", String()), ("value", String())])), ("_tags_hash_map", Array(UInt(64))), ("contexts", Nested([("key", String()), ("value", String())])), ("trace_id", UUID(Modifiers(nullable=True))), ("deleted", UInt(8)), ] ) schema = TableSchema( columns=columns, local_table_name="discover_local", dist_table_name="discover_dist", storage_set_key=StorageSetKey.DISCOVER, mandatory_conditions=mandatory_conditions, )
def __init__(self) -> None:
    """Declare the common columns and translation mappers, then initialise the parent.

    Everything is served from the single DISCOVER storage. The abstract
    column set is the common columns plus ``EVENTS_COLUMNS`` and
    ``TRANSACTIONS_COLUMNS``. No writable storage is configured.
    """
    self.__common_columns = ColumnSet(
        [
            ("event_id", FixedString(32)),
            ("project_id", UInt(64)),
            ("type", String(Modifiers(nullable=True))),
            ("timestamp", DateTime()),
            ("platform", String(Modifiers(nullable=True))),
            ("environment", String(Modifiers(nullable=True))),
            ("release", String(Modifiers(nullable=True))),
            ("dist", String(Modifiers(nullable=True))),
            ("user", String(Modifiers(nullable=True))),
            ("transaction", String(Modifiers(nullable=True))),
            ("message", String(Modifiers(nullable=True))),
            ("title", String(Modifiers(nullable=True))),
            # User
            ("user_id", String(Modifiers(nullable=True))),
            ("username", String(Modifiers(nullable=True))),
            ("email", String(Modifiers(nullable=True))),
            ("ip_address", String(Modifiers(nullable=True))),
            # SDK
            ("sdk_name", String(Modifiers(nullable=True))),
            ("sdk_version", String(Modifiers(nullable=True))),
            # geo location context
            ("geo_country_code", String(Modifiers(nullable=True))),
            ("geo_region", String(Modifiers(nullable=True))),
            ("geo_city", String(Modifiers(nullable=True))),
            ("http_method", String(Modifiers(nullable=True))),
            ("http_referer", String(Modifiers(nullable=True))),
            # Other tags and context
            ("tags", Nested([("key", String()), ("value", String())])),
            ("contexts", Nested([("key", String()), ("value", String())])),
            ("trace_id", String(Modifiers(nullable=True))),
            ("span_id", UInt(64, Modifiers(nullable=True))),
        ]
    )
    self.__events_columns = EVENTS_COLUMNS
    self.__transactions_columns = TRANSACTIONS_COLUMNS

    discover_storage = get_storage(StorageKey.DISCOVER)

    # Build the mapper chain step by step, in the same order as a single
    # chained .concat() expression would.
    mappers = events_translation_mappers.concat(transaction_translation_mappers)
    mappers = mappers.concat(null_function_translation_mappers)
    mappers = mappers.concat(
        TranslationMappers(
            columns=[
                ColumnToFunction(
                    None,
                    "ip_address",
                    "coalesce",
                    (
                        FunctionCall(
                            None,
                            "IPv4NumToString",
                            (Column(None, None, "ip_address_v4"),),
                        ),
                        FunctionCall(
                            None,
                            "IPv6NumToString",
                            (Column(None, None, "ip_address_v6"),),
                        ),
                    ),
                ),
                ColumnToColumn(None, "transaction", None, "transaction_name"),
                ColumnToColumn(None, "username", None, "user_name"),
                ColumnToColumn(None, "email", None, "user_email"),
                ColumnToMapping(
                    None,
                    "geo_country_code",
                    None,
                    "contexts",
                    "geo.country_code",
                    nullable=True,
                ),
                ColumnToMapping(
                    None,
                    "geo_region",
                    None,
                    "contexts",
                    "geo.region",
                    nullable=True,
                ),
                ColumnToMapping(
                    None,
                    "geo_city",
                    None,
                    "contexts",
                    "geo.city",
                    nullable=True,
                ),
                ColumnToFunction(
                    None,
                    "user",
                    "nullIf",
                    (Column(None, None, "user"), Literal(None, "")),
                ),
            ]
        )
    )
    mappers = mappers.concat(
        TranslationMappers(
            subscriptables=[
                SubscriptableMapper(None, "tags", None, "tags"),
                SubscriptableMapper(None, "contexts", None, "contexts"),
            ],
        )
    )

    discover_storage_plan_builder = SingleStorageQueryPlanBuilder(
        storage=discover_storage,
        mappers=mappers,
    )
    discover_pipeline_builder = SimplePipelineBuilder(
        query_plan_builder=discover_storage_plan_builder
    )

    every_column = (
        self.__common_columns
        + self.__events_columns
        + self.__transactions_columns
    )

    super().__init__(
        storages=[discover_storage],
        query_pipeline_builder=discover_pipeline_builder,
        abstract_column_set=every_column,
        join_relationships={},
        writable_storage=None,
        validators=[EntityRequiredColumnValidator({"project_id"})],
        required_time_column="timestamp",
    )
Column("dataset", LowCardinality(String())), Column("projects", Array(UInt(64))), Column("organization", Nullable(UInt(64))), Column("timestamp", DateTime()), Column("duration_ms", UInt(32)), Column("status", status_type), Column( "clickhouse_queries", Nested([ Column("sql", String()), Column("status", status_type), Column("trace_id", Nullable(UUID())), Column("duration_ms", UInt(32)), Column("stats", String()), Column("final", UInt(8)), Column("cache_hit", UInt(8)), Column("sample", Float(32)), Column("max_threads", UInt(8)), Column("num_days", UInt(32)), Column("clickhouse_table", LowCardinality(String())), Column("query_id", String()), Column("is_duplicate", UInt(8)), Column("consistent", UInt(8)), ]), ), ] class Migration(migration.MultiStepMigration): blocking = False def forwards_local(self) -> Sequence[operations.Operation]:
class Migration(migration.ClickhouseNodeMigration):
    """Drop and re-create the ``generic_metric_sets_aggregation_mv`` materialized view.

    The view selects from ``generic_metric_sets_raw_local`` and writes to
    ``generic_metric_sets_local``. Only local operations are produced; both
    dist methods return an empty list.
    """

    blocking = False
    view_name = "generic_metric_sets_aggregation_mv"
    dest_table_columns: Sequence[Column[Modifiers]] = [
        Column("org_id", UInt(64)),
        Column("project_id", UInt(64)),
        Column("metric_id", UInt(64)),
        Column("granularity", UInt(8)),
        Column("timestamp", DateTime(modifiers=Modifiers(codecs=["DoubleDelta"]))),
        Column("retention_days", UInt(16)),
        Column(
            "tags",
            Nested(
                [
                    ("key", UInt(64)),
                    ("indexed_value", UInt(64)),
                    ("raw_value", String()),
                ]
            ),
        ),
        Column("value", AggregateFunction("uniqCombined64", [UInt(64)])),
        Column("use_case_id", String(Modifiers(low_cardinality=True))),
    ]

    def forwards_local(self) -> Sequence[operations.SqlOperation]:
        """Drop the existing view, then create it again with the query below."""
        drop_existing_view = operations.DropTable(
            storage_set=StorageSetKey.GENERIC_METRICS_SETS,
            table_name=self.view_name,
        )
        create_view = operations.CreateMaterializedView(
            storage_set=StorageSetKey.GENERIC_METRICS_SETS,
            view_name=self.view_name,
            columns=self.dest_table_columns,
            destination_table_name="generic_metric_sets_local",
            query=""" SELECT use_case_id, org_id, project_id, metric_id, arrayJoin(granularities) as granularity, tags.key, tags.indexed_value, tags.raw_value, toDateTime(multiIf(granularity=0,10,granularity=1,60,granularity=2,3600,granularity=3,86400,-1) * intDiv(toUnixTimestamp(timestamp), multiIf(granularity=0,10,granularity=1,60,granularity=2,3600,granularity=3,86400,-1))) as timestamp, retention_days, uniqCombined64State(arrayJoin(set_values)) as value FROM generic_metric_sets_raw_local WHERE materialization_version = 1 AND metric_type = 'set' GROUP BY use_case_id, org_id, project_id, metric_id, tags.key, tags.indexed_value, tags.raw_value, timestamp, granularity, retention_days """,
        )
        return [drop_existing_view, create_view]

    def backwards_local(self) -> Sequence[operations.SqlOperation]:
        """Drop the view."""
        drop_view = operations.DropTable(
            storage_set=StorageSetKey.GENERIC_METRICS_SETS,
            table_name=self.view_name,
        )
        return [drop_view]

    def forwards_dist(self) -> Sequence[operations.SqlOperation]:
        """No distributed-side operations."""
        return []

    def backwards_dist(self) -> Sequence[operations.SqlOperation]:
        """No distributed-side operations."""
        return []
Array, Column, DateTime, Nested, Nullable, String, UInt, ) from snuba.clusters.storage_sets import StorageSetKey from snuba.datasets.storages.tags_hash_map import TAGS_HASH_MAP_COLUMN from snuba.migrations import migration, operations, table_engines from snuba.migrations.columns import LowCardinality, Materialized, WithDefault UNKNOWN_SPAN_STATUS = SPAN_STATUS_NAME_TO_CODE["unknown"] tags_col = Column("tags", Nested([("key", String()), ("value", String())])) columns = [ Column("project_id", UInt(64)), Column("transaction_id", UUID()), Column("trace_id", UUID()), Column("transaction_span_id", UInt(64)), Column("span_id", UInt(64)), Column("parent_span_id", Nullable(UInt(64))), Column("transaction_name", LowCardinality(String())), Column("description", String()), # description in span Column("op", LowCardinality(String())), Column( "status", WithDefault(UInt(8), str(UNKNOWN_SPAN_STATUS)), ),
def test_events_boolean_context() -> None:
    """Check the boolean-context rewrite for a non-promoted context.

    The query selects ``contexts[device.charging]`` via ``arrayElement`` /
    ``indexOf``. After ``EventsBooleanContextsProcessor`` runs, the selected
    columns must equal ``if(<same lookup> IN ('1', 'True'), 'True', 'False')``
    carrying the same alias.
    """
    alias = "contexts[device.charging]"

    def context_lookup(lookup_alias):
        # Builds a fresh contexts.value[indexOf(contexts.key, 'device.charging')]
        # expression on every call so query and expectation share no objects.
        return FunctionCall(
            lookup_alias,
            "arrayElement",
            (
                Column(None, None, "contexts.value"),
                FunctionCall(
                    None,
                    "indexOf",
                    (
                        Column(None, None, "contexts.key"),
                        Literal(None, "device.charging"),
                    ),
                ),
            ),
        )

    columns = ColumnSet(
        [("contexts", Nested([("key", String()), ("value", String())]))]
    )

    query = ClickhouseQuery(
        Table("errors", columns),
        selected_columns=[SelectedExpression(alias, context_lookup(alias))],
    )

    value_is_true = binary_condition(
        ConditionFunctions.IN,
        context_lookup(None),
        literals_tuple(None, [Literal(None, "1"), Literal(None, "True")]),
    )
    boolean_result = FunctionCall(
        alias,
        "if",
        (value_is_true, Literal(None, "True"), Literal(None, "False")),
    )
    expected = ClickhouseQuery(
        Table("errors", columns),
        selected_columns=[SelectedExpression(alias, boolean_result)],
    )

    settings = HTTPQuerySettings()
    processor = EventsBooleanContextsProcessor()
    processor.process_query(query, settings)

    assert query.get_selected_columns() == expected.get_selected_columns()
Nested, ) from snuba.datasets.schemas.tables import MergeTreeSchema from snuba.datasets.schemas.join import ( JoinConditionExpression, JoinCondition, JoinClause, JoinType, TableJoinNode, ) table1 = MergeTreeSchema( columns=ColumnSet([ ("t1c1", UInt(64)), ("t1c2", String()), ("t1c3", Nested([("t11c4", UInt(64))])), ]), local_table_name="table1", dist_table_name="table1", order_by="", partition_by="", ).get_data_source() table2 = MergeTreeSchema( columns=ColumnSet([ ("t2c1", UInt(64)), ("t2c2", String()), ("t2c3", Nested([("t21c4", UInt(64))])), ]), local_table_name="table2", dist_table_name="table2",
class Migration(migration.ClickhouseNodeMigration):
    """Create the ``generic_metric_sets_local`` table and its distributed counterpart.

    The local side also adds two materialized tag-hash columns and three
    bloom-filter indexes. The distributed table is named
    ``generic_metric_sets_aggregated_dist``.
    """

    blocking = False
    granularity = "2048"
    local_table_name = "generic_metric_sets_local"
    columns: Sequence[Column[Modifiers]] = [
        Column("org_id", UInt(64)),
        Column("project_id", UInt(64)),
        Column("metric_id", UInt(64)),
        Column("granularity", UInt(8)),
        Column("timestamp", DateTime(modifiers=Modifiers(codecs=["DoubleDelta"]))),
        Column("retention_days", UInt(16)),
        Column(
            "tags",
            Nested(
                [
                    ("key", UInt(64)),
                    ("indexed_value", UInt(64)),
                    ("raw_value", String()),
                ]
            ),
        ),
        Column("value", AggregateFunction("uniqCombined64", [UInt(64)])),
        Column("use_case_id", String(Modifiers(low_cardinality=True))),
    ]

    def forwards_local(self) -> Sequence[operations.SqlOperation]:
        """Create the local table, add both tag-hash columns, then add the indexes."""
        create_table = operations.CreateTable(
            storage_set=StorageSetKey.GENERIC_METRICS_SETS,
            table_name=self.local_table_name,
            engine=table_engines.AggregatingMergeTree(
                storage_set=StorageSetKey.GENERIC_METRICS_SETS,
                order_by="(org_id, project_id, metric_id, granularity, timestamp, tags.key, tags.indexed_value, tags.raw_value, retention_days, use_case_id)",
                primary_key="(org_id, project_id, metric_id, granularity, timestamp)",
                partition_by="(retention_days, toMonday(timestamp))",
                settings={"index_granularity": self.granularity},
                ttl="timestamp + toIntervalDay(retention_days)",
            ),
            columns=self.columns,
        )

        indexed_hash_expression = hash_map_int_column_definition(
            "tags.key", "tags.indexed_value"
        )
        add_indexed_tags_hash = operations.AddColumn(
            storage_set=StorageSetKey.GENERIC_METRICS_SETS,
            table_name=self.local_table_name,
            column=Column(
                "_indexed_tags_hash",
                Array(UInt(64), Modifiers(materialized=indexed_hash_expression)),
            ),
        )

        raw_hash_expression = hash_map_int_key_str_value_column_definition(
            "tags.key", "tags.raw_value"
        )
        add_raw_tags_hash = operations.AddColumn(
            storage_set=StorageSetKey.GENERIC_METRICS_SETS,
            table_name=self.local_table_name,
            column=Column(
                "_raw_tags_hash",
                Array(UInt(64), Modifiers(materialized=raw_hash_expression)),
            ),
        )

        # (index name, indexed expression); all share type and granularity.
        bloom_filter_targets = (
            ("bf_indexed_tags_hash", "_indexed_tags_hash"),
            ("bf_raw_tags_hash", "_raw_tags_hash"),
            ("bf_tags_key_hash", "tags.key"),
        )
        add_indexes = [
            operations.AddIndex(
                storage_set=StorageSetKey.GENERIC_METRICS_SETS,
                table_name=self.local_table_name,
                index_name=index_name,
                index_expression=index_expression,
                index_type="bloom_filter()",
                granularity=1,
            )
            for index_name, index_expression in bloom_filter_targets
        ]

        return [create_table, add_indexed_tags_hash, add_raw_tags_hash, *add_indexes]

    def backwards_local(self) -> Sequence[operations.SqlOperation]:
        """Drop the local table."""
        drop_local = operations.DropTable(
            storage_set=StorageSetKey.GENERIC_METRICS_SETS,
            table_name=self.local_table_name,
        )
        return [drop_local]

    def forwards_dist(self) -> Sequence[operations.SqlOperation]:
        """Create the distributed table over the local table, with no sharding key."""
        distributed_engine = table_engines.Distributed(
            local_table_name=self.local_table_name, sharding_key=None
        )
        create_dist = operations.CreateTable(
            storage_set=StorageSetKey.GENERIC_METRICS_SETS,
            table_name="generic_metric_sets_aggregated_dist",
            engine=distributed_engine,
            columns=self.columns,
        )
        return [create_dist]

    def backwards_dist(self) -> Sequence[operations.SqlOperation]:
        """Drop the distributed table."""
        drop_dist = operations.DropTable(
            storage_set=StorageSetKey.GENERIC_METRICS_SETS,
            table_name="generic_metric_sets_aggregated_dist",
        )
        return [drop_dist]
("platform", String()), ("environment", String(Modifiers(nullable=True))), ("release", String(Modifiers(nullable=True))), ("dist", String(Modifiers(nullable=True))), ("ip_address_v4", IPv4(Modifiers(nullable=True))), ("ip_address_v6", IPv6(Modifiers(nullable=True))), ("user", String()), ("user_hash", UInt(64, Modifiers(readonly=True))), ("user_id", String(Modifiers(nullable=True))), ("user_name", String(Modifiers(nullable=True))), ("user_email", String(Modifiers(nullable=True))), ("sdk_name", String()), ("sdk_version", String()), ("http_method", String(Modifiers(nullable=True))), ("http_referer", String(Modifiers(nullable=True))), ("tags", Nested([("key", String()), ("value", String())])), ("_tags_flattened", String()), ("_tags_hash_map", Array(UInt(64), Modifiers(readonly=True))), ("contexts", Nested([("key", String()), ("value", String())])), ("_contexts_flattened", String()), ("measurements", Nested([("key", String()), ("value", Float(64))]),), ("span_op_breakdowns", Nested([("key", String()), ("value", Float(64))]),), ("partition", UInt(16)), ("offset", UInt(64)), ("message_timestamp", DateTime()), ("retention_days", UInt(16)), ("deleted", UInt(8)), ("type", String(Modifiers(readonly=True))), ("message", String(Modifiers(readonly=True))), ("title", String(Modifiers(readonly=True))), ("timestamp", DateTime(Modifiers(readonly=True))),
def forwards_local(self) -> Sequence[operations.Operation]:
    """Return the column changes applied to ``sentry_local``.

    Nine columns are added and ``device_model`` is dropped. The operations
    are returned in the order they are listed here.
    """

    def add_column(column, after):
        # Every addition targets the same storage set and table.
        return operations.AddColumn(
            storage_set=StorageSetKey.EVENTS,
            table_name="sentry_local",
            column=column,
            after=after,
        )

    drop_device_model = operations.DropColumn(
        storage_set=StorageSetKey.EVENTS,
        table_name="sentry_local",
        column_name="device_model",
    )

    return [
        add_column(Column("group_id", UInt(64)), "project_id"),
        drop_device_model,
        add_column(Column("sdk_integrations", Array(String())), "exception_frames"),
        add_column(
            Column("modules.name", Nested([("name", String())])),
            "sdk_integrations",
        ),
        add_column(Column("culprit", Nullable(String())), "sdk_integrations"),
        add_column(Column("search_message", Nullable(String())), "received"),
        add_column(Column("title", Nullable(String())), "search_message"),
        add_column(Column("location", Nullable(String())), "title"),
        add_column(Column("_tags_flattened", String()), "tags"),
        add_column(Column("message_timestamp", DateTime()), "partition"),
    ]
def __init__(self):
    """
    Build the events dataset.

    Declares the individual column groups (metadata, promoted tags, promoted
    context tags, promoted contexts and required columns), concatenates them
    into the full column list, and uses that list to create the
    ``ReplacingMergeTreeSchema`` (used as both read and write schema), the
    Kafka-backed ``TableWriter`` and the ``TagColumnProcessor``.
    """
    metadata_columns = ColumnSet([
        # optional stream related data
        ('offset', Nullable(UInt(64))),
        ('partition', Nullable(UInt(16))),
    ])

    promoted_tag_columns = ColumnSet([
        # These are the classic tags, they are saved in Snuba exactly as they
        # appear in the event body.
        ('level', Nullable(String())),
        ('logger', Nullable(String())),
        ('server_name', Nullable(String())),  # future name: device_id?
        ('transaction', Nullable(String())),
        ('environment', Nullable(String())),
        ('sentry:release', Nullable(String())),
        ('sentry:dist', Nullable(String())),
        ('sentry:user', Nullable(String())),
        ('site', Nullable(String())),
        ('url', Nullable(String())),
    ])

    promoted_context_tag_columns = ColumnSet([
        # These are promoted tags that come in in `tags`, but are more closely
        # related to contexts. To avoid naming confusion with Clickhouse nested
        # columns, they are stored in the database with s/./_/
        # promoted tags
        ('app_device', Nullable(String())),
        ('device', Nullable(String())),
        ('device_family', Nullable(String())),
        ('runtime', Nullable(String())),
        ('runtime_name', Nullable(String())),
        ('browser', Nullable(String())),
        ('browser_name', Nullable(String())),
        ('os', Nullable(String())),
        ('os_name', Nullable(String())),
        ('os_rooted', Nullable(UInt(8))),
    ])

    promoted_context_columns = ColumnSet([
        ('os_build', Nullable(String())),
        ('os_kernel_version', Nullable(String())),
        ('device_name', Nullable(String())),
        ('device_brand', Nullable(String())),
        ('device_locale', Nullable(String())),
        ('device_uuid', Nullable(String())),
        ('device_model_id', Nullable(String())),
        ('device_arch', Nullable(String())),
        ('device_battery_level', Nullable(Float(32))),
        ('device_orientation', Nullable(String())),
        ('device_simulator', Nullable(UInt(8))),
        ('device_online', Nullable(UInt(8))),
        ('device_charging', Nullable(UInt(8))),
    ])

    # The only non-nullable scalar columns in the table.
    required_columns = ColumnSet([
        ('event_id', FixedString(32)),
        ('project_id', UInt(64)),
        ('group_id', UInt(64)),
        ('timestamp', DateTime()),
        ('deleted', UInt(8)),
        ('retention_days', UInt(16)),
    ])

    # Full column list. The concatenation order below defines the column
    # order of the table, so do not reorder the groups (see the note on
    # `insert_distributed_sync` further down).
    all_columns = required_columns + [
        # required for non-deleted
        ('platform', Nullable(String())),
        ('message', Nullable(String())),
        ('primary_hash', Nullable(FixedString(32))),
        ('received', Nullable(DateTime())),

        ('search_message', Nullable(String())),
        ('title', Nullable(String())),
        ('location', Nullable(String())),

        # optional user
        ('user_id', Nullable(String())),
        ('username', Nullable(String())),
        ('email', Nullable(String())),
        ('ip_address', Nullable(String())),

        # optional geo
        ('geo_country_code', Nullable(String())),
        ('geo_region', Nullable(String())),
        ('geo_city', Nullable(String())),

        ('sdk_name', Nullable(String())),
        ('sdk_version', Nullable(String())),
        ('type', Nullable(String())),
        ('version', Nullable(String())),
    ] + metadata_columns \
        + promoted_context_columns \
        + promoted_tag_columns \
        + promoted_context_tag_columns \
        + [
            # other tags
            ('tags', Nested([
                ('key', String()),
                ('value', String()),
            ])),

            # other context
            ('contexts', Nested([
                ('key', String()),
                ('value', String()),
            ])),

            # http interface
            ('http_method', Nullable(String())),
            ('http_referer', Nullable(String())),

            # exception interface
            ('exception_stacks', Nested([
                ('type', Nullable(String())),
                ('value', Nullable(String())),
                ('mechanism_type', Nullable(String())),
                ('mechanism_handled', Nullable(UInt(8))),
            ])),
            ('exception_frames', Nested([
                ('abs_path', Nullable(String())),
                ('filename', Nullable(String())),
                ('package', Nullable(String())),
                ('module', Nullable(String())),
                ('function', Nullable(String())),
                ('in_app', Nullable(UInt(8))),
                ('colno', Nullable(UInt(32))),
                ('lineno', Nullable(UInt(32))),
                ('stack_level', UInt(16)),
            ])),

            # These are columns we added later in the life of the (current) production
            # database. They don't necessarily belong here in a logical/readability sense
            # but they are here to match the order of columns in production because
            # `insert_distributed_sync` is very sensitive to column existence and ordering.
            ('culprit', Nullable(String())),
            ('sdk_integrations', Array(String())),
            ('modules', Nested([
                ('name', String()),
                ('version', String()),
            ])),
        ]

    # Shared between the sampling expression and the ORDER BY clause so the
    # two stay in sync.
    sample_expr = 'cityHash64(toString(event_id))'
    schema = ReplacingMergeTreeSchema(
        columns=all_columns,
        local_table_name='sentry_local',
        dist_table_name='sentry_dist',
        mandatory_conditions=[('deleted', '=', 0)],
        order_by='(project_id, toStartOfDay(timestamp), %s)' % sample_expr,
        partition_by='(toMonday(timestamp), if(equals(retention_days, 30), 30, 90))',
        version_column='deleted',
        sample_expr=sample_expr,
        migration_function=events_migrations)

    # The same schema object serves both reads and writes.
    dataset_schemas = DatasetSchemas(
        read_schema=schema,
        write_schema=schema,
    )

    table_writer = TableWriter(
        write_schema=schema,
        stream_loader=KafkaStreamLoader(
            processor=EventsProcessor(promoted_tag_columns),
            default_topic="events",
            replacement_topic="event-replacements",
            commit_log_topic="snuba-commit-log",
        )
    )

    super(EventsDataset, self).__init__(
        dataset_schemas=dataset_schemas,
        table_writer=table_writer,
        time_group_columns={
            'time': 'timestamp',
            'rtime': 'received'
        },
        time_parse_columns=('timestamp', 'received')
    )

    # Keep the individual column groups on the instance.
    self.__metadata_columns = metadata_columns
    self.__promoted_tag_columns = promoted_tag_columns
    self.__promoted_context_tag_columns = promoted_context_tag_columns
    self.__promoted_context_columns = promoted_context_columns
    self.__required_columns = required_columns

    # NOTE(review): the processor is built after the attributes above are
    # assigned; presumably `_get_promoted_columns()` and
    # `_get_column_tag_map()` read them -- confirm before reordering.
    self.__tags_processor = TagColumnProcessor(
        columns=all_columns,
        promoted_columns=self._get_promoted_columns(),
        column_tag_map=self._get_column_tag_map(),
    )
("platform", LowCardinality(String())), ("environment", LowCardinality(Nullable(String()))), ("release", LowCardinality(Nullable(String()))), ("dist", LowCardinality(Nullable(String()))), ("ip_address_v4", Nullable(IPv4())), ("ip_address_v6", Nullable(IPv6())), ("user", WithDefault(String(), "''")), ("user_hash", Materialized(UInt(64), "cityHash64(user)"),), ("user_id", Nullable(String())), ("user_name", Nullable(String())), ("user_email", Nullable(String())), ("sdk_name", LowCardinality(Nullable(String()))), ("sdk_version", LowCardinality(Nullable(String()))), ("http_method", LowCardinality(Nullable(String()))), ("http_referer", Nullable(String())), ("tags", Nested([("key", String()), ("value", String())])), ("_tags_flattened", String()), ("_tags_hash_map", Materialized(Array(UInt(64)), TAGS_HASH_MAP_COLUMN)), ("contexts", Nested([("key", String()), ("value", String())])), ("_contexts_flattened", String()), ("transaction_name", WithDefault(LowCardinality(String()), "''")), ("transaction_hash", Materialized(UInt(64), "cityHash64(transaction_name)"),), ("span_id", Nullable(UInt(64))), ("trace_id", Nullable(UUID())), ("partition", UInt(16)), ("offset", WithCodecs(UInt(64), ["DoubleDelta", "LZ4"])), ("message_timestamp", DateTime()), ("retention_days", UInt(16)), ("deleted", UInt(8)), ("group_id", UInt(64)), ("primary_hash", FixedString(32)),
from snuba.datasets.storages.tags_hash_map import INT_TAGS_HASH_MAP_COLUMN from snuba.migrations import operations, table_engines from snuba.migrations.columns import MigrationModifiers as Modifiers from snuba.utils.schemas import String #: The granularity used for the initial materialized views. #: This might differ from snuba.datasets.metrics.DEFAULT_GRANULARITY at #: a later point. ORIGINAL_GRANULARITY = 60 PRE_VALUE_BUCKETS_COLUMNS: Sequence[Column[Modifiers]] = [ Column("org_id", UInt(64)), Column("project_id", UInt(64)), Column("metric_id", UInt(64)), Column("timestamp", DateTime()), Column("tags", Nested([Column("key", UInt(64)), Column("value", UInt(64))])), ] POST_VALUES_BUCKETS_COLUMNS: Sequence[Column[Modifiers]] = [ Column("materialization_version", UInt(8)), Column("retention_days", UInt(16)), Column("partition", UInt(16)), Column("offset", UInt(64)), ] COL_SCHEMA_DISTRIBUTIONS: Sequence[Column[Modifiers]] = [ Column( "percentiles", AggregateFunction("quantiles(0.5, 0.75, 0.9, 0.95, 0.99)", [Float(64)]), ),
("platform", LowCardinality(String())), ("environment", LowCardinality(Nullable(String()))), ("release", LowCardinality(Nullable(String()))), ("dist", LowCardinality(Nullable(String()))), ("ip_address_v4", Nullable(IPv4())), ("ip_address_v6", Nullable(IPv6())), ("user", WithDefault(String(), "''",)), ("user_hash", Materialized(UInt(64), "cityHash64(user)"),), ("user_id", Nullable(String())), ("user_name", Nullable(String())), ("user_email", Nullable(String())), ("sdk_name", WithDefault(LowCardinality(String()), "''")), ("sdk_version", WithDefault(LowCardinality(String()), "''")), ("http_method", LowCardinality(Nullable(String()))), ("http_referer", Nullable(String())), ("tags", Nested([("key", String()), ("value", String())])), ("_tags_flattened", String()), ("_tags_hash_map", Materialized(Array(UInt(64)), TAGS_HASH_MAP_COLUMN)), ("contexts", Nested([("key", String()), ("value", String())])), ("_contexts_flattened", String()), ( "measurements", Nested([("key", LowCardinality(String())), ("value", Float(64))]), ), ("partition", UInt(16)), ("offset", UInt(64)), ("message_timestamp", DateTime()), ("retention_days", UInt(16)), ("deleted", UInt(8)), ] )
class Migration(migration.ClickhouseNodeMigration):
    """
    Creates the ``generic_metric_sets_raw`` tables.

    The local table uses a MergeTree engine and the distributed table
    forwards to it; both are created with the column layout in ``columns``.
    The backwards steps drop the corresponding table again.
    """

    blocking = False
    local_table_name = "generic_metric_sets_raw_local"
    dist_table_name = "generic_metric_sets_raw_dist"

    # Column layout shared by the local and the distributed table.
    columns: Sequence[Column[Modifiers]] = [
        Column("use_case_id", String(Modifiers(low_cardinality=True))),
        Column("org_id", UInt(64)),
        Column("project_id", UInt(64)),
        Column("metric_id", UInt(64)),
        Column("timestamp", DateTime()),
        Column("retention_days", UInt(16)),
        Column(
            "tags",
            Nested(
                [
                    ("key", UInt(64)),
                    ("indexed_value", UInt(64)),
                    ("raw_value", String()),
                ]
            ),
        ),
        Column("set_values", Array(UInt(64))),
        Column("count_value", Float(64)),
        Column("distribution_values", Array(Float(64))),
        Column("metric_type", String(Modifiers(low_cardinality=True))),
        Column("materialization_version", UInt(8)),
        Column("timeseries_id", UInt(32)),
        Column("partition", UInt(16)),
        Column("offset", UInt(64)),
    ]

    def forwards_local(self) -> Sequence[operations.SqlOperation]:
        """Return the operation that creates the local MergeTree table."""
        storage_set = StorageSetKey.GENERIC_METRICS_SETS
        merge_tree = table_engines.MergeTree(
            storage_set=storage_set,
            order_by="(use_case_id, org_id, project_id, metric_id, timestamp)",
            partition_by="(toStartOfInterval(timestamp, toIntervalDay(3)))",
            ttl="timestamp + toIntervalDay(7)",
        )
        create_local = operations.CreateTable(
            storage_set=storage_set,
            table_name=self.local_table_name,
            engine=merge_tree,
            columns=self.columns,
        )
        return [create_local]

    def backwards_local(self) -> Sequence[operations.SqlOperation]:
        """Return the operation that drops the local table."""
        drop_local = operations.DropTable(
            storage_set=StorageSetKey.GENERIC_METRICS_SETS,
            table_name=self.local_table_name,
        )
        return [drop_local]

    def forwards_dist(self) -> Sequence[operations.SqlOperation]:
        """Return the operation that creates the distributed table."""
        distributed = table_engines.Distributed(
            local_table_name=self.local_table_name,
            sharding_key="cityHash64(timeseries_id)",
        )
        create_dist = operations.CreateTable(
            storage_set=StorageSetKey.GENERIC_METRICS_SETS,
            table_name=self.dist_table_name,
            engine=distributed,
            columns=self.columns,
        )
        return [create_dist]

    def backwards_dist(self) -> Sequence[operations.SqlOperation]:
        """Return the operation that drops the distributed table."""
        drop_dist = operations.DropTable(
            storage_set=StorageSetKey.GENERIC_METRICS_SETS,
            table_name=self.dist_table_name,
        )
        return [drop_dist]