def test_prewhere(query_body, keys, new_conditions, prewhere_conditions) -> None:
    settings.MAX_PREWHERE_CONDITIONS = 2
    query = Query(
        query_body,
        TableSource("my_table", ColumnSet([]), None, keys),
    )
    request_settings = HTTPRequestSettings()

    processor = PrewhereProcessor()
    processor.process_query(query, request_settings)

    assert query.get_conditions() == new_conditions
    assert query.get_prewhere() == prewhere_conditions

def test_prewhere(
    query_body: MutableMapping[str, Any],
    keys: Sequence[str],
    omit_if_final_keys: Sequence[str],
    new_ast_condition: Optional[Expression],
    new_prewhere_ast_condition: Optional[Expression],
    final: bool,
) -> None:
    settings.MAX_PREWHERE_CONDITIONS = 2
    events = get_dataset("events")

    # HACK until we migrate these tests to SnQL
    query_body["selected_columns"] = ["project_id"]
    query_body["conditions"] += [
        ["timestamp", ">=", "2021-01-01T00:00:00"],
        ["timestamp", "<", "2021-01-02T00:00:00"],
        ["project_id", "=", 1],
    ]
    snql_query = json_to_snql(query_body, "events")
    query, _ = parse_snql_query(str(snql_query), events)
    query = identity_translate(query)
    query.set_from_clause(Table("my_table", all_columns, final=final))

    query_settings = HTTPQuerySettings()
    processor = PrewhereProcessor(keys, omit_if_final=omit_if_final_keys)
    processor.process_query(query, query_settings)

    # HACK until we migrate these tests to SnQL
    def verify_expressions(top_level: Expression, expected: Expression) -> bool:
        actual_conds = get_first_level_and_conditions(top_level)
        expected_conds = get_first_level_and_conditions(expected)
        for cond in expected_conds:
            if cond not in actual_conds:
                return False
        return True

    if new_ast_condition:
        condition = query.get_condition()
        assert condition is not None
        assert verify_expressions(condition, new_ast_condition)

    if new_prewhere_ast_condition:
        prewhere = query.get_prewhere_ast()
        assert prewhere is not None
        assert verify_expressions(prewhere, new_prewhere_ast_condition)

def test_prewhere(
    query_body: MutableMapping[str, Any],
    keys: Sequence[str],
    new_ast_condition: Optional[Expression],
    new_prewhere_ast_condition: Optional[Expression],
) -> None:
    settings.MAX_PREWHERE_CONDITIONS = 2
    events = get_dataset("events")
    query = parse_query(query_body, events)
    query.set_data_source(TableSource("my_table", ColumnSet([]), None, keys))

    request_settings = HTTPRequestSettings()
    processor = PrewhereProcessor()
    processor.process_query(Query(query), request_settings)

    assert query.get_condition_from_ast() == new_ast_condition
    assert query.get_prewhere_ast() == new_prewhere_ast_condition

def test_prewhere(
    query_body: MutableMapping[str, Any],
    keys: Sequence[str],
    omit_if_final_keys: Sequence[str],
    new_ast_condition: Optional[Expression],
    new_prewhere_ast_condition: Optional[Expression],
    final: bool,
) -> None:
    settings.MAX_PREWHERE_CONDITIONS = 2
    events = get_dataset("events")
    query = identity_translate(parse_query(query_body, events))
    query.set_from_clause(Table("my_table", all_columns, final=final))

    request_settings = HTTPRequestSettings()
    processor = PrewhereProcessor(keys, omit_if_final=omit_if_final_keys)
    processor.process_query(query, request_settings)

    assert query.get_condition_from_ast() == new_ast_condition
    assert query.get_prewhere_ast() == new_prewhere_ast_condition

def get_query_processors(self) -> Sequence[QueryProcessor]:
    return [
        BasicFunctionsProcessor(),
        ApdexProcessor(),
        ImpactProcessor(),
        PrewhereProcessor(),
        NestedFieldConditionOptimizer(
            "tags", "_tags_flattened", {"start_ts", "finish_ts"}, BEGINNING_OF_TIME
        ),
        NestedFieldConditionOptimizer(
            "contexts",
            "_contexts_flattened",
            {"start_ts", "finish_ts"},
            BEGINNING_OF_TIME,
        ),
    ]

), ("start_ts", DateTime()), ("start_ns", UInt(32)), ("finish_ts", DateTime()), ("finish_ns", UInt(32)), ("duration_ms", UInt(32)), ("tags", Nested([("key", String()), ("value", String())])), ("_tags_hash_map", Materialized(Array(UInt(64)), TAGS_HASH_MAP_COLUMN)), ("retention_days", UInt(16)), ("deleted", UInt(8)), ]) schema = WritableTableSchema( columns=columns, local_table_name="spans_experimental_local", dist_table_name="spans_experimental_dist", storage_set_key=StorageSetKey.TRANSACTIONS, ) storage = WritableTableStorage( storage_key=StorageKey.SPANS, storage_set_key=StorageSetKey.TRANSACTIONS, schema=schema, query_processors=[PrewhereProcessor()], stream_loader=KafkaStreamLoader( processor=SpansMessageProcessor(), default_topic="events", ), query_splitters=[TimeSplitQueryStrategy(timestamp_col="finish_ts")], )
def get_query_processors(self) -> Sequence[QueryProcessor]:
    return [
        BasicFunctionsProcessor(),
        PrewhereProcessor(),
    ]

def get_query_processors(self) -> Sequence[QueryProcessor]:
    return [
        SimpleJoinOptimizer(),
        PrewhereProcessor(),
    ]

    dist_table_name=READ_DIST_TABLE_NAME,
    storage_set_key=StorageSetKey.SESSIONS,
)

materialized_view_schema = TableSchema(
    local_table_name=READ_LOCAL_MV_NAME,
    dist_table_name=READ_DIST_MV_NAME,
    storage_set_key=StorageSetKey.SESSIONS,
    columns=read_columns,
)

# The raw table we write to, and that we could potentially query.
raw_storage = WritableTableStorage(
    storage_key=StorageKey.SESSIONS_RAW,
    storage_set_key=StorageSetKey.SESSIONS,
    schema=raw_schema,
    query_processors=[],
    stream_loader=build_kafka_stream_loader_from_settings(
        StorageKey.SESSIONS_RAW,
        processor=SessionsProcessor(),
        default_topic_name="ingest-sessions",
    ),
)

# The materialized view we query aggregate data from.
materialized_storage = ReadableTableStorage(
    storage_key=StorageKey.SESSIONS_HOURLY,
    storage_set_key=StorageSetKey.SESSIONS,
    schema=read_schema,
    query_processors=[PrewhereProcessor(["project_id", "org_id"])],
)

("user_id", UInt(64, Modifiers(nullable=True))), ("team_id", UInt(64, Modifiers(nullable=True))), ]) schema = WritableTableSchema( columns=columns, local_table_name="groupassignee_local", dist_table_name="groupassignee_dist", storage_set_key=StorageSetKey.CDC, ) POSTGRES_TABLE = "sentry_groupasignee" storage = CdcStorage( storage_key=StorageKey.GROUPASSIGNEES, storage_set_key=StorageSetKey.CDC, schema=schema, query_processors=[ PrewhereProcessor(["project_id"]), ConsistencyEnforcerProcessor(), ], stream_loader=build_kafka_stream_loader_from_settings( processor=GroupAssigneeProcessor(POSTGRES_TABLE), default_topic=Topic.CDC, pre_filter=CdcTableNameMessageFilter(POSTGRES_TABLE), ), default_control_topic="cdc_control", postgres_table=POSTGRES_TABLE, row_processor=lambda row: GroupAssigneeRow.from_bulk(row).to_clickhouse(), )
    }),
    UserColumnProcessor(),
    UUIDColumnProcessor({"event_id", "primary_hash", "trace_id"}),
    HexIntColumnProcessor({"span_id"}),
    UUIDArrayColumnProcessor({"hierarchical_hashes"}),
    SliceOfMapOptimizer(),
    EventsBooleanContextsProcessor(),
    TypeConditionOptimizer(),
    MappingOptimizer("tags", "_tags_hash_map", "events_tags_hash_map_enabled"),
    EmptyTagConditionProcessor(),
    ArrayJoinKeyValueOptimizer("tags"),
    PrewhereProcessor(
        prewhere_candidates,
        # Environment and release are excluded from prewhere on FINAL queries
        # because of a ClickHouse bug. group_id is excluded as well: since
        # FINAL is applied after prewhere, filtering on it in prewhere could
        # drop rows that should be merged together by the FINAL.
        omit_if_final=["environment", "release", "group_id"],
    ),
    TableRateLimit(),
]

query_splitters = [
    ColumnSplitQueryStrategy(
        id_column="event_id",
        project_column="project_id",
        timestamp_column="timestamp",
    ),
    TimeSplitQueryStrategy(timestamp_col="timestamp"),
]

"tags[sentry:release]", "release", "message", "environment", "project_id", ] query_processors = [ PostReplacementConsistencyEnforcer( project_column="project_id", replacer_state_name=ReplacerState.ERRORS, ), MappingColumnPromoter(mapping_specs={"tags": promoted_tag_columns}), UserColumnProcessor(), EventIdColumnProcessor(), GroupIdColumnProcessor(), MappingOptimizer("tags", "_tags_hash_map", "events_tags_hash_map_enabled"), ArrayJoinKeyValueOptimizer("tags"), PrewhereProcessor(prewhere_candidates, omit_if_final=["environment", "release"]), ] query_splitters = [ ColumnSplitQueryStrategy( id_column="event_id", project_column="project_id", timestamp_column="timestamp", ), TimeSplitQueryStrategy(timestamp_col="timestamp"), ]
def get_query_processors(self) -> Sequence[ClickhouseProcessor]:
    return [PrewhereProcessor()]

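# For contrast with the zero-argument form above: a minimal, illustrative
# sketch (not taken from any specific storage) of configuring the processor
# with explicit prewhere candidate columns and with columns to skip on FINAL
# queries, mirroring the constructor usage seen in the other snippets in this
# collection. The column names below are placeholders, not real candidates.
example_processor = PrewhereProcessor(
    ["event_id", "project_id"],
    omit_if_final=["environment", "release"],
)
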
"sentry:release": "release", "sentry:dist": "dist", "sentry:user": "******", }, "contexts": {"trace.trace_id": "trace_id"}, } ), MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"), ArrayJoinKeyValueOptimizer("tags"), UUIDColumnProcessor(set(["event_id", "trace_id"])), EventsBooleanContextsProcessor(), PrewhereProcessor( [ "event_id", "release", "message", "transaction_name", "environment", "project_id", ] ), ], query_splitters=[ ColumnSplitQueryStrategy( id_column="event_id", project_column="project_id", timestamp_column="timestamp", ), TimeSplitQueryStrategy(timestamp_col="timestamp"), ], )
"contexts": {"trace.trace_id": "trace_id", "trace.span_id": "span_id"}, } ), UUIDColumnProcessor(set(["event_id", "trace_id"])), HexIntColumnProcessor({"span_id"}), EventsBooleanContextsProcessor(), MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"), EmptyTagConditionProcessor(), ArrayJoinKeyValueOptimizer("tags"), ArrayJoinKeyValueOptimizer("measurements"), ArrayJoinKeyValueOptimizer("span_op_breakdowns"), PrewhereProcessor( [ "event_id", "trace_id", "span_id", "transaction_name", "transaction", "title", ] ), TableRateLimit(), ], stream_loader=build_kafka_stream_loader_from_settings( processor=TransactionsMessageProcessor(), default_topic=Topic.EVENTS, commit_log_topic=Topic.COMMIT_LOG, subscription_result_topic=Topic.SUBSCRIPTION_RESULTS_TRANSACTIONS, ), query_splitters=[TimeSplitQueryStrategy(timestamp_col="finish_ts")], mandatory_condition_checkers=[ProjectIdEnforcer()], writer_options={"insert_allow_materialized_columns": 1},
    columns=columns,
    local_table_name="transactions_local",
    dist_table_name="transactions_dist",
    storage_set_key=StorageSetKey.TRANSACTIONS,
    mandatory_conditions=[],
    part_format=[util.PartSegment.RETENTION_DAYS, util.PartSegment.DATE],
)

storage = WritableTableStorage(
    storage_key=StorageKey.TRANSACTIONS,
    storage_set_key=StorageSetKey.TRANSACTIONS,
    schema=schema,
    query_processors=[
        MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"),
        EventIdColumnProcessor(),
        ArrayJoinKeyValueOptimizer("tags"),
        ArrayJoinKeyValueOptimizer("measurements"),
        UUIDColumnProcessor(set(["event_id", "trace_id"])),
        PrewhereProcessor(["event_id", "transaction_name", "transaction", "title"]),
    ],
    stream_loader=build_kafka_stream_loader_from_settings(
        StorageKey.TRANSACTIONS,
        processor=TransactionsMessageProcessor(),
        default_topic_name="events",
        commit_log_topic_name="snuba-commit-log",
    ),
    query_splitters=[TimeSplitQueryStrategy(timestamp_col="finish_ts")],
    writer_options={"insert_allow_materialized_columns": 1},
)

                    for col in promoted_tag_columns
                },
                get_promoted_context_tag_col_mapping(),
            ),
            "contexts": get_promoted_context_col_mapping(),
        },
        cast_to_string=True,
    ),
    # This processor must not be ported to the errors dataset. We should not
    # support promoting tags/contexts with boolean values: there is no way to
    # consistently convert them back to the value the client provided at
    # ingestion time across all the ways tags/contexts are accessed. Once the
    # errors dataset is in use, there will be no boolean promoted
    # tags/contexts, so this constraint will be easy to enforce.
    EventsPromotedBooleanContextsProcessor(),
    MappingOptimizer("tags", "_tags_hash_map", "events_tags_hash_map_enabled"),
    ArrayJoinKeyValueOptimizer("tags"),
    PrewhereProcessor(prewhere_candidates),
    FixedStringArrayColumnProcessor({"hierarchical_hashes"}, 32),
]

query_splitters = [
    ColumnSplitQueryStrategy(
        id_column="event_id",
        project_column="project_id",
        timestamp_column="timestamp",
    ),
    TimeSplitQueryStrategy(timestamp_col="timestamp"),
]

# The raw table we write to, and that we could potentially query.
raw_storage = WritableTableStorage(
    storage_key=StorageKey.SESSIONS_RAW,
    storage_set_key=StorageSetKey.SESSIONS,
    schema=raw_schema,
    query_processors=[MinuteResolutionProcessor(), TableRateLimit()],
    mandatory_condition_checkers=[OrgIdEnforcer(), ProjectIdEnforcer()],
    stream_loader=build_kafka_stream_loader_from_settings(
        processor=SessionsProcessor(),
        default_topic=Topic.SESSIONS,
    ),
)

# The materialized view we query aggregate data from.
materialized_storage = ReadableTableStorage(
    storage_key=StorageKey.SESSIONS_HOURLY,
    storage_set_key=StorageSetKey.SESSIONS,
    schema=read_schema,
    query_processors=[PrewhereProcessor(["project_id", "org_id"]), TableRateLimit()],
    mandatory_condition_checkers=[OrgIdEnforcer(), ProjectIdEnforcer()],
)

org_materialized_storage = ReadableTableStorage(
    storage_key=StorageKey.ORG_SESSIONS,
    storage_set_key=StorageSetKey.SESSIONS,
    schema=read_schema,
    query_processors=[PrewhereProcessor(["project_id", "org_id"]), TableRateLimit()],
    mandatory_condition_checkers=[],
)

    # during create statement
    # (https://github.com/ClickHouse/ClickHouse/issues/12586), so the
    # materialization is added with a migration.
    skipped_cols_on_creation={"_tags_hash_map"},
)

storage = WritableTableStorage(
    storage_key=StorageKey.TRANSACTIONS,
    storage_set_key=StorageSetKey.TRANSACTIONS,
    schema=schema,
    query_processors=[
        NestedFieldConditionOptimizer(
            "contexts",
            "_contexts_flattened",
            {"start_ts", "finish_ts"},
            BEGINNING_OF_TIME,
        ),
        MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"),
        TransactionColumnProcessor(),
        ArrayJoinKeyValueOptimizer("tags"),
        ArrayJoinKeyValueOptimizer("measurements"),
        PrewhereProcessor(),
    ],
    stream_loader=KafkaStreamLoader(
        processor=TransactionsMessageProcessor(),
        default_topic="events",
    ),
    query_splitters=[TimeSplitQueryStrategy(timestamp_col="finish_ts")],
    writer_options={"insert_allow_materialized_columns": 1},
)

    storage_key=StorageKey.SESSIONS_RAW,
    storage_set_key=StorageSetKey.SESSIONS,
    schema=raw_schema,
    query_processors=[MinuteResolutionProcessor(), TableRateLimit()],
    mandatory_condition_checkers=[OrgIdEnforcer(), ProjectIdEnforcer()],
    stream_loader=kafka_stream_loader,
)

# The materialized view we query aggregate data from.
materialized_storage = ReadableTableStorage(
    storage_key=StorageKey.SESSIONS_HOURLY,
    storage_set_key=StorageSetKey.SESSIONS,
    schema=read_schema,
    query_processors=[
        PrewhereProcessor(["project_id", "org_id"]),
        TableRateLimit(),
    ],
    mandatory_condition_checkers=[OrgIdEnforcer(), ProjectIdEnforcer()],
)

org_materialized_storage = ReadableTableStorage(
    storage_key=StorageKey.ORG_SESSIONS,
    storage_set_key=StorageSetKey.SESSIONS,
    schema=read_schema,
    query_processors=[
        PrewhereProcessor(["project_id", "org_id"]),
        TableRateLimit(),
    ],
    mandatory_condition_checkers=[],