def __init__(
    self,
    schema: TableSchema,
    host,
    port,
    encoder: Callable[[WriterTableRow], bytes],
    options=None,
    table_name=None,
    chunk_size: int = 1,
):
    """
    Builds a writer to send a batch to Clickhouse.

    :param schema: The dataset schema to take the table name from
    :param host: Clickhouse host
    :param port: Clickhouse port
    :param encoder: A function that will be applied to each row to turn it into bytes
    :param options: Options passed to Clickhouse
    :param table_name: Overrides the table coming from the schema (generally used for
        uploading to temporary tables)
    :param chunk_size: The chunk size (in rows). We send data to the server with
        Transfer-Encoding: chunked. If 0 we send the entire content in one chunk.
    """
    self.__pool = HTTPConnectionPool(host, port)
    self.__options = options if options is not None else {}
    self.__table_name = table_name or schema.get_table_name()
    self.__chunk_size = chunk_size
    self.__encoder = encoder
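# A minimal sketch of the chunking behaviour described in the docstring above,
# assuming only what the docstring states: rows already encoded to bytes are
# grouped into chunks of `chunk_size` rows, and a chunk_size of 0 means the
# whole batch goes out as a single chunk. `chunk_rows` is a hypothetical helper
# for illustration, not the writer's actual implementation.
from typing import Iterator, Sequence


def chunk_rows(rows: Sequence[bytes], chunk_size: int) -> Iterator[bytes]:
    if chunk_size == 0:
        # Send the entire content in one chunk.
        yield b"".join(rows)
        return
    # Otherwise group the encoded rows into chunks of `chunk_size` rows each.
    for i in range(0, len(rows), chunk_size):
        yield b"".join(rows[i : i + chunk_size])


# Example: three encoded rows with chunk_size=2 produce two HTTP chunks.
assert list(chunk_rows([b"a\n", b"b\n", b"c\n"], 2)) == [b"a\nb\n", b"c\n"]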
("sdk_name", String(Modifiers(nullable=True))), ("sdk_version", String(Modifiers(nullable=True))), ("http_method", String(Modifiers(nullable=True))), ("http_referer", String(Modifiers(nullable=True))), ("tags", Nested([("key", String()), ("value", String())])), ("_tags_hash_map", Array(UInt(64))), ("contexts", Nested([("key", String()), ("value", String())])), ("trace_id", UUID(Modifiers(nullable=True))), ("deleted", UInt(8)), ] ) schema = TableSchema( columns=columns, local_table_name="discover_local", dist_table_name="discover_dist", storage_set_key=StorageSetKey.DISCOVER, mandatory_conditions=mandatory_conditions, ) storage = ReadableTableStorage( storage_key=StorageKey.DISCOVER, storage_set_key=StorageSetKey.DISCOVER, schema=schema, query_processors=[ MappingColumnPromoter( mapping_specs={ "tags": { "environment": "environment", "sentry:release": "release", "sentry:dist": "dist",
("http_method", String(Modifiers(nullable=True))), ("http_referer", String(Modifiers(nullable=True))), ("tags", Nested([("key", String()), ("value", String())])), ("_tags_hash_map", Array(UInt(64))), ("contexts", Nested([("key", String()), ("value", String())])), ("deleted", UInt(8)), ]) schema = TableSchema( columns=columns, local_table_name="discover_local", dist_table_name="discover_dist", storage_set_key=StorageSetKey.DISCOVER, mandatory_conditions=mandatory_conditions, prewhere_candidates=[ "event_id", "release", "message", "transaction_name", "environment", "project_id", ], ) storage = ReadableTableStorage( storage_key=StorageKey.DISCOVER, storage_set_key=StorageSetKey.DISCOVER, schema=schema, query_processors=[ MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"), EventIdColumnProcessor(),
("sessions_preaggr", AggregateFunction("sumIf", [UInt(32), UInt(8)])), ("sessions_crashed", AggregateFunction("countIf", [UUID(), UInt(8)])), ("sessions_crashed_preaggr", AggregateFunction("sumIf", [UInt(32), UInt(8)])), ("sessions_abnormal", AggregateFunction("countIf", [UUID(), UInt(8)])), ("sessions_abnormal_preaggr", AggregateFunction("sumIf", [UInt(32), UInt(8)])), ("sessions_errored", AggregateFunction("uniqIf", [UUID(), UInt(8)])), ("sessions_errored_preaggr", AggregateFunction("sumIf", [UInt(32), UInt(8)])), ("users", AggregateFunction("uniqIf", [UUID(), UInt(8)])), ("users_crashed", AggregateFunction("uniqIf", [UUID(), UInt(8)])), ("users_abnormal", AggregateFunction("uniqIf", [UUID(), UInt(8)])), ("users_errored", AggregateFunction("uniqIf", [UUID(), UInt(8)])), ] ) read_schema = TableSchema( columns=read_columns, local_table_name=READ_LOCAL_TABLE_NAME, dist_table_name=READ_DIST_TABLE_NAME, storage_set_key=StorageSetKey.SESSIONS, ) materialized_view_schema = TableSchema( local_table_name=READ_LOCAL_MV_NAME, dist_table_name=READ_DIST_MV_NAME, storage_set_key=StorageSetKey.SESSIONS, columns=read_columns, ) # The raw table we write onto, and that potentially we could # query. raw_storage = WritableTableStorage( storage_key=StorageKey.SESSIONS_RAW, storage_set_key=StorageSetKey.SESSIONS, schema=raw_schema,
def process_delete_tag(
    message: Mapping[str, Any],
    schema: TableSchema,
    tag_column_map: Mapping[str, Mapping[str, str]],
    promoted_tags: Mapping[str, Sequence[str]],
) -> Optional[Replacement]:
    tag = message["tag"]
    if not tag:
        return None

    assert isinstance(tag, str)
    timestamp = datetime.strptime(
        message["datetime"], settings.PAYLOAD_DATETIME_FORMAT
    )
    tag_column_name = tag_column_map["tags"].get(tag, tag)
    is_promoted = tag in promoted_tags["tags"]

    where = """\
        WHERE project_id = %(project_id)s
        AND received <= CAST('%(timestamp)s' AS DateTime)
        AND NOT deleted
    """

    if is_promoted:
        prewhere = " PREWHERE %(tag_column)s IS NOT NULL "
    else:
        prewhere = " PREWHERE has(`tags.key`, %(tag_str)s) "

    insert_query_template = (
        """\
        INSERT INTO %(dist_write_table_name)s (%(all_columns)s)
        SELECT %(select_columns)s
        FROM %(dist_read_table_name)s FINAL
    """
        + prewhere
        + where
    )

    all_columns = [
        col
        for col in schema.get_columns()
        if Materialized not in col.type.get_all_modifiers()
    ]
    select_columns = []
    for col in all_columns:
        if is_promoted and col.flattened == tag_column_name:
            select_columns.append("NULL")
        elif col.flattened == "tags.key":
            select_columns.append(
                "arrayFilter(x -> (indexOf(`tags.key`, x) != indexOf(`tags.key`, %s)), `tags.key`)"
                % escape_string(tag)
            )
        elif col.flattened == "tags.value":
            select_columns.append(
                "arrayMap(x -> arrayElement(`tags.value`, x), arrayFilter(x -> x != indexOf(`tags.key`, %s), arrayEnumerate(`tags.value`)))"
                % escape_string(tag)
            )
        elif col.flattened == "_tags_flattened":
            select_columns.append(FLATTENED_COLUMN_TEMPLATE % escape_string(tag))
        else:
            select_columns.append(col.escaped)

    all_column_names = [col.escaped for col in all_columns]
    query_args = {
        "all_columns": ", ".join(all_column_names),
        "select_columns": ", ".join(select_columns),
        "project_id": message["project_id"],
        "tag_str": escape_string(tag),
        "tag_column": escape_identifier(tag_column_name),
        "timestamp": timestamp.strftime(DATETIME_FORMAT),
    }

    count_query_template = (
        """\
        SELECT count()
        FROM %(dist_read_table_name)s FINAL
    """
        + prewhere
        + where
    )

    query_time_flags = (NEEDS_FINAL, message["project_id"])

    return Replacement(
        count_query_template, insert_query_template, query_args, query_time_flags
    )
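# The arrayFilter / arrayMap expressions assembled above are easier to read when
# emulated in plain Python. The sketch below mirrors their effect on the parallel
# `tags.key` / `tags.value` arrays (including ClickHouse's 1-based indexOf, which
# returns 0 when the element is absent). `delete_tag_locally` and the sample data
# are purely illustrative and are not part of the replacer.
from typing import List, Sequence, Tuple


def delete_tag_locally(
    keys: Sequence[str], values: Sequence[str], tag: str
) -> Tuple[List[str], List[str]]:
    # indexOf(`tags.key`, tag): 1-based position of the deleted tag, 0 if missing.
    idx = keys.index(tag) + 1 if tag in keys else 0
    # arrayFilter on `tags.key`: keep keys whose first position differs from idx.
    new_keys = [k for k in keys if keys.index(k) + 1 != idx]
    # arrayFilter(arrayEnumerate(...)) + arrayMap on `tags.value`: keep the values
    # whose (1-based) position is not the position of the deleted key.
    new_values = [values[i - 1] for i in range(1, len(values) + 1) if i != idx]
    return new_keys, new_values


# Deleting the "browser" tag keeps the remaining key/value pairs aligned.
assert delete_tag_locally(["browser", "os"], ["Firefox", "Linux"], "browser") == (
    ["os"],
    ["Linux"],
)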
Column("granularity", UInt(8)), Column("timestamp", DateTime()), Column("retention_days", UInt(16)), Column( "tags", Nested([("key", UInt(64)), ("indexed_value", UInt(64)), ("raw_value", String())]), ), Column("_raw_tags_hash", Array(UInt(64), SchemaModifiers(readonly=True))), Column("_indexed_tags_hash", Array(UInt(64), SchemaModifiers(readonly=True))), ] sets_storage = ReadableTableStorage( storage_key=StorageKey.GENERIC_METRICS_SETS, storage_set_key=StorageSetKey.GENERIC_METRICS_SETS, schema=TableSchema( local_table_name="generic_metrics_sets_local", dist_table_name="generic_metrics_sets_dist", storage_set_key=StorageSetKey.GENERIC_METRICS_SETS, columns=ColumnSet([ *aggregated_columns, Column("value", AggregateFunction("uniqCombined64", [UInt(64)])), ]), ), query_processors=[ ArrayJoinKeyValueOptimizer("tags"), TableRateLimit(), ], )
"sessions_crashed", AggregateFunction("countIf", UUID(), UInt(8)), ), ( "sessions_abnormal", AggregateFunction("countIf", UUID(), UInt(8)), ), ("sessions_errored", AggregateFunction("uniqIf", UUID(), UInt(8))), ("users_crashed", AggregateFunction("uniqIf", UUID(), UInt(8))), ("users_abnormal", AggregateFunction("uniqIf", UUID(), UInt(8))), ("users_errored", AggregateFunction("uniqIf", UUID(), UInt(8))), ]) read_schema = TableSchema( columns=read_columns, local_table_name=READ_LOCAL_TABLE_NAME, dist_table_name=READ_DIST_TABLE_NAME, storage_set_key=StorageSetKey.SESSIONS, prewhere_candidates=["project_id", "org_id"], ) materialized_view_schema = TableSchema( local_table_name=READ_LOCAL_MV_NAME, dist_table_name=READ_DIST_MV_NAME, storage_set_key=StorageSetKey.SESSIONS, prewhere_candidates=["project_id", "org_id"], columns=read_columns, ) # The raw table we write onto, and that potentially we could # query. raw_storage = WritableTableStorage( storage_key=StorageKey.SESSIONS_RAW,
    storage_set_key=StorageSetKey.OUTCOMES,
)

read_columns = ColumnSet([
    ("org_id", UInt(64)),
    ("project_id", UInt(64)),
    ("key_id", UInt(64)),
    ("timestamp", DateTime()),
    ("outcome", UInt(8)),
    ("reason", String()),
    ("times_seen", UInt(64)),
])

read_schema = TableSchema(
    columns=read_columns,
    local_table_name=READ_LOCAL_TABLE_NAME,
    dist_table_name=READ_DIST_TABLE_NAME,
    storage_set_key=StorageSetKey.OUTCOMES,
)

materialized_view_columns = ColumnSet([
    ("org_id", UInt(64)),
    ("project_id", UInt(64)),
    ("key_id", UInt(64)),
    ("timestamp", DateTime()),
    ("outcome", UInt(8)),
    ("reason", String()),
    ("times_seen", UInt(64)),
])

materialized_view_schema = TableSchema(
    local_table_name="outcomes_mv_hourly_local",
from snuba.clusters.storage_sets import StorageSetKey
from snuba.datasets.storage import ReadableTableStorage
from snuba.datasets.schemas.tables import TableSchema
from snuba.datasets.storages import StorageKey
from snuba.datasets.storages.events_common import (
    all_columns,
    mandatory_conditions,
    query_processors,
    query_splitters,
)

schema = TableSchema(
    columns=all_columns,
    local_table_name="sentry_local",
    dist_table_name="sentry_dist_ro",
    storage_set_key=StorageSetKey.EVENTS_RO,
    mandatory_conditions=mandatory_conditions,
)

storage = ReadableTableStorage(
    storage_key=StorageKey.EVENTS_RO,
    storage_set_key=StorageSetKey.EVENTS_RO,
    schema=schema,
    query_processors=query_processors,
    query_splitters=query_splitters,
)
)
from snuba.clusters.storage_sets import StorageSetKey
from snuba.datasets.schemas.tables import TableSchema
from snuba.datasets.schemas.join import (
    JoinConditionExpression,
    JoinCondition,
    JoinClause,
    JoinType,
    TableJoinNode,
)

table1 = TableSchema(
    columns=ColumnSet([
        ("t1c1", UInt(64)),
        ("t1c2", String()),
        ("t1c3", Nested([("t11c4", UInt(64))])),
    ]),
    local_table_name="table1",
    dist_table_name="table1",
    storage_set_key=StorageSetKey.EVENTS,
).get_data_source()

table2 = TableSchema(
    columns=ColumnSet([
        ("t2c1", UInt(64)),
        ("t2c2", String()),
        ("t2c3", Nested([("t21c4", UInt(64))])),
    ]),
    local_table_name="table2",
    dist_table_name="table2",
    storage_set_key=StorageSetKey.EVENTS,
).get_data_source()
from snuba.clusters.storage_sets import StorageSetKey
from snuba.datasets.storage import ReadableTableStorage
from snuba.datasets.schemas.tables import TableSchema
from snuba.datasets.storages import StorageKey
from snuba.datasets.storages.errors_common import (
    all_columns,
    mandatory_conditions,
    prewhere_candidates,
    query_processors,
    query_splitters,
)

schema = TableSchema(
    columns=all_columns,
    local_table_name="errors_local",
    dist_table_name="errors_dist_ro",
    storage_set_key=StorageSetKey.EVENTS_RO,
    mandatory_conditions=mandatory_conditions,
    prewhere_candidates=prewhere_candidates,
)

storage = ReadableTableStorage(
    storage_key=StorageKey.ERRORS_RO,
    storage_set_key=StorageSetKey.EVENTS_RO,
    schema=schema,
    query_processors=query_processors,
    query_splitters=query_splitters,
)
from snuba.clusters.storage_sets import StorageSetKey
from snuba.datasets.schemas.tables import TableSchema
from snuba.datasets.storage import ReadableTableStorage
from snuba.datasets.storages import StorageKey
from snuba.datasets.storages.transactions_common import (
    columns,
    mandatory_condition_checkers,
    query_processors,
    query_splitters,
)

schema = TableSchema(
    columns=columns,
    local_table_name="transactions_local",
    dist_table_name="transactions_dist",
    storage_set_key=StorageSetKey.TRANSACTIONS_RO,
    mandatory_conditions=[],
)

storage = ReadableTableStorage(
    storage_key=StorageKey.TRANSACTIONS_RO,
    storage_set_key=StorageSetKey.TRANSACTIONS_RO,
    schema=schema,
    query_processors=query_processors,
    query_splitters=query_splitters,
    mandatory_condition_checkers=mandatory_condition_checkers,
)
Column("metric_id", UInt(64)), Column("granularity", UInt(32)), Column("timestamp", DateTime()), Column("retention_days", UInt(16)), Column("tags", Nested([("key", UInt(64)), ("value", UInt(64))])), Column("_tags_hash", Array(UInt(64), SchemaModifiers(readonly=True))), ] sets_storage = ReadableTableStorage( storage_key=StorageKey.METRICS_SETS, storage_set_key=StorageSetKey.METRICS, schema=TableSchema( local_table_name="metrics_sets_local", dist_table_name="metrics_sets_dist", storage_set_key=StorageSetKey.METRICS, columns=ColumnSet([ *aggregated_columns, Column("value", AggregateFunction("uniqCombined64", [UInt(64)])), ]), ), query_processors=[ArrayJoinKeyValueOptimizer("tags")], ) counters_storage = ReadableTableStorage( storage_key=StorageKey.METRICS_COUNTERS, storage_set_key=StorageSetKey.METRICS, schema=TableSchema( local_table_name="metrics_counters_local", dist_table_name="metrics_counters_dist", storage_set_key=StorageSetKey.METRICS, columns=ColumnSet([
from snuba.clusters.storage_sets import StorageSetKey
from snuba.datasets.schemas.tables import TableSchema
from snuba.datasets.storage import ReadableTableStorage
from snuba.datasets.storages import StorageKey
from snuba.datasets.storages.errors_common import (
    all_columns,
    mandatory_conditions,
    query_splitters,
)
from snuba.datasets.storages.errors_v2 import query_processors

schema = TableSchema(
    columns=all_columns,
    local_table_name="errors_local",
    dist_table_name="errors_dist_ro",
    storage_set_key=StorageSetKey.ERRORS_V2_RO,
    mandatory_conditions=mandatory_conditions,
)

storage = ReadableTableStorage(
    storage_key=StorageKey.ERRORS_V2_RO,
    storage_set_key=StorageSetKey.ERRORS_V2_RO,
    schema=schema,
    query_processors=query_processors,
    query_splitters=query_splitters,
)
        CounterAggregateProcessor(),
        default_topic=Topic.METRICS,
        dead_letter_queue_policy_creator=produce_policy_creator,
    ),
    write_format=WriteFormat.VALUES,
)

org_counters_storage = ReadableTableStorage(
    storage_key=StorageKey.ORG_METRICS_COUNTERS,
    storage_set_key=StorageSetKey.METRICS,
    schema=TableSchema(
        local_table_name="metrics_counters_v2_local",
        dist_table_name="metrics_counters_v2_dist",
        storage_set_key=StorageSetKey.METRICS,
        columns=ColumnSet([
            Column("org_id", UInt(64)),
            Column("project_id", UInt(64)),
            Column("metric_id", UInt(64)),
            Column("granularity", UInt(32)),
            Column("timestamp", DateTime()),
        ]),
    ),
    query_processors=[TableRateLimit()],
)

distributions_storage = WritableTableStorage(
    storage_key=StorageKey.METRICS_DISTRIBUTIONS,
    storage_set_key=StorageSetKey.METRICS,
    schema=WritableTableSchema(
        local_table_name="metrics_distributions_v2_local",
        dist_table_name="metrics_distributions_v2_dist",
        storage_set_key=StorageSetKey.METRICS,