Example #1
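# Legacy-query test: PrewhereProcessor moves eligible entries from the flat
# condition list into PREWHERE; the arguments are presumably supplied by a
# pytest parametrize decorator that is not part of this excerpt.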
def test_prewhere(query_body, keys, new_conditions, prewhere_conditions) -> None:
    settings.MAX_PREWHERE_CONDITIONS = 2
    query = Query(query_body, TableSource("my_table", ColumnSet([]), None, keys),)

    request_settings = HTTPRequestSettings()
    processor = PrewhereProcessor()
    processor.process_query(query, request_settings)

    assert query.get_conditions() == new_conditions
    assert query.get_prewhere() == prewhere_conditions
Example #2
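# SnQL-based test: the legacy body is converted to SnQL, parsed, and translated,
# then PrewhereProcessor(keys, omit_if_final=...) is applied and both the WHERE
# and PREWHERE ASTs are compared against the expected conditions.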
def test_prewhere(
    query_body: MutableMapping[str, Any],
    keys: Sequence[str],
    omit_if_final_keys: Sequence[str],
    new_ast_condition: Optional[Expression],
    new_prewhere_ast_condition: Optional[Expression],
    final: bool,
) -> None:
    settings.MAX_PREWHERE_CONDITIONS = 2
    events = get_dataset("events")
    # HACK until we migrate these tests to SnQL
    query_body["selected_columns"] = ["project_id"]
    query_body["conditions"] += [
        ["timestamp", ">=", "2021-01-01T00:00:00"],
        ["timestamp", "<", "2021-01-02T00:00:00"],
        ["project_id", "=", 1],
    ]
    snql_query = json_to_snql(query_body, "events")
    query, _ = parse_snql_query(str(snql_query), events)
    query = identity_translate(query)
    query.set_from_clause(Table("my_table", all_columns, final=final))

    query_settings = HTTPQuerySettings()
    processor = PrewhereProcessor(keys, omit_if_final=omit_if_final_keys)
    processor.process_query(query, query_settings)

    # HACK until we migrate these tests to SnQL
    def verify_expressions(top_level: Expression, expected: Expression) -> bool:
        actual_conds = get_first_level_and_conditions(top_level)
        expected_conds = get_first_level_and_conditions(expected)
        for cond in expected_conds:
            if cond not in actual_conds:
                return False

        return True

    if new_ast_condition:
        condition = query.get_condition()
        assert condition is not None
        assert verify_expressions(condition, new_ast_condition)

    if new_prewhere_ast_condition:
        prewhere = query.get_prewhere_ast()
        assert prewhere is not None
        assert verify_expressions(prewhere, new_prewhere_ast_condition)
Example #3
def test_prewhere(
    query_body: MutableMapping[str, Any],
    keys: Sequence[str],
    new_ast_condition: Optional[Expression],
    new_prewhere_ast_condition: Optional[Expression],
) -> None:
    settings.MAX_PREWHERE_CONDITIONS = 2
    events = get_dataset("events")
    query = parse_query(query_body, events)
    query.set_data_source(TableSource("my_table", ColumnSet([]), None, keys))

    request_settings = HTTPRequestSettings()
    processor = PrewhereProcessor()
    processor.process_query(Query(query), request_settings)

    assert query.get_condition_from_ast() == new_ast_condition
    assert query.get_prewhere_ast() == new_prewhere_ast_condition
Example #4
def test_prewhere(
    query_body: MutableMapping[str, Any],
    keys: Sequence[str],
    omit_if_final_keys: Sequence[str],
    new_ast_condition: Optional[Expression],
    new_prewhere_ast_condition: Optional[Expression],
    final: bool,
) -> None:
    settings.MAX_PREWHERE_CONDITIONS = 2
    events = get_dataset("events")
    query = identity_translate(parse_query(query_body, events))
    query.set_from_clause(Table("my_table", all_columns, final=final))

    request_settings = HTTPRequestSettings()
    processor = PrewhereProcessor(keys, omit_if_final=omit_if_final_keys)
    processor.process_query(query, request_settings)

    assert query.get_condition_from_ast() == new_ast_condition
    assert query.get_prewhere_ast() == new_prewhere_ast_condition
Example #5
def get_query_processors(self) -> Sequence[QueryProcessor]:
    return [
        BasicFunctionsProcessor(),
        ApdexProcessor(),
        ImpactProcessor(),
        PrewhereProcessor(),
        NestedFieldConditionOptimizer(
            "tags", "_tags_flattened", {"start_ts", "finish_ts"}, BEGINNING_OF_TIME
        ),
        NestedFieldConditionOptimizer(
            "contexts",
            "_contexts_flattened",
            {"start_ts", "finish_ts"},
            BEGINNING_OF_TIME,
        ),
    ]
Example #6
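# Excerpt from the spans storage definition: the tail of the column list,
# followed by the writable schema and storage that register PrewhereProcessor()
# without an explicit key list.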
    ),
    ("start_ts", DateTime()),
    ("start_ns", UInt(32)),
    ("finish_ts", DateTime()),
    ("finish_ns", UInt(32)),
    ("duration_ms", UInt(32)),
    ("tags", Nested([("key", String()), ("value", String())])),
    ("_tags_hash_map", Materialized(Array(UInt(64)), TAGS_HASH_MAP_COLUMN)),
    ("retention_days", UInt(16)),
    ("deleted", UInt(8)),
])

schema = WritableTableSchema(
    columns=columns,
    local_table_name="spans_experimental_local",
    dist_table_name="spans_experimental_dist",
    storage_set_key=StorageSetKey.TRANSACTIONS,
)

storage = WritableTableStorage(
    storage_key=StorageKey.SPANS,
    storage_set_key=StorageSetKey.TRANSACTIONS,
    schema=schema,
    query_processors=[PrewhereProcessor()],
    stream_loader=KafkaStreamLoader(
        processor=SpansMessageProcessor(),
        default_topic="events",
    ),
    query_splitters=[TimeSplitQueryStrategy(timestamp_col="finish_ts")],
)
Example #7
def get_query_processors(self) -> Sequence[QueryProcessor]:
    return [
        BasicFunctionsProcessor(),
        PrewhereProcessor(),
    ]
Example #8
def get_query_processors(self) -> Sequence[QueryProcessor]:
    return [
        SimpleJoinOptimizer(),
        PrewhereProcessor(),
    ]
Example #9
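# Excerpt from the sessions storages: the raw write-side table registers no
# query processors, while the hourly materialized view promotes project_id and
# org_id to PREWHERE.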
    dist_table_name=READ_DIST_TABLE_NAME,
    storage_set_key=StorageSetKey.SESSIONS,
)
materialized_view_schema = TableSchema(
    local_table_name=READ_LOCAL_MV_NAME,
    dist_table_name=READ_DIST_MV_NAME,
    storage_set_key=StorageSetKey.SESSIONS,
    columns=read_columns,
)

# The raw table we write onto, and that potentially we could
# query.
raw_storage = WritableTableStorage(
    storage_key=StorageKey.SESSIONS_RAW,
    storage_set_key=StorageSetKey.SESSIONS,
    schema=raw_schema,
    query_processors=[],
    stream_loader=build_kafka_stream_loader_from_settings(
        StorageKey.SESSIONS_RAW,
        processor=SessionsProcessor(),
        default_topic_name="ingest-sessions",
    ),
)
# The materialized view we query aggregate data from.
materialized_storage = ReadableTableStorage(
    storage_key=StorageKey.SESSIONS_HOURLY,
    storage_set_key=StorageSetKey.SESSIONS,
    schema=read_schema,
    query_processors=[PrewhereProcessor(["project_id", "org_id"])],
)
Example #10
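# Excerpt from the group-assignee CDC storage: only project_id is promoted to
# PREWHERE, alongside ConsistencyEnforcerProcessor.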
    ("user_id", UInt(64, Modifiers(nullable=True))),
    ("team_id", UInt(64, Modifiers(nullable=True))),
])

schema = WritableTableSchema(
    columns=columns,
    local_table_name="groupassignee_local",
    dist_table_name="groupassignee_dist",
    storage_set_key=StorageSetKey.CDC,
)

POSTGRES_TABLE = "sentry_groupasignee"

storage = CdcStorage(
    storage_key=StorageKey.GROUPASSIGNEES,
    storage_set_key=StorageSetKey.CDC,
    schema=schema,
    query_processors=[
        PrewhereProcessor(["project_id"]),
        ConsistencyEnforcerProcessor(),
    ],
    stream_loader=build_kafka_stream_loader_from_settings(
        processor=GroupAssigneeProcessor(POSTGRES_TABLE),
        default_topic=Topic.CDC,
        pre_filter=CdcTableNameMessageFilter(POSTGRES_TABLE),
    ),
    default_control_topic="cdc_control",
    postgres_table=POSTGRES_TABLE,
    row_processor=lambda row: GroupAssigneeRow.from_bulk(row).to_clickhouse(),
)
Example #11
    }),
    UserColumnProcessor(),
    UUIDColumnProcessor({"event_id", "primary_hash", "trace_id"}),
    HexIntColumnProcessor({"span_id"}),
    UUIDArrayColumnProcessor({"hierarchical_hashes"}),
    SliceOfMapOptimizer(),
    EventsBooleanContextsProcessor(),
    TypeConditionOptimizer(),
    MappingOptimizer("tags", "_tags_hash_map", "events_tags_hash_map_enabled"),
    EmptyTagConditionProcessor(),
    ArrayJoinKeyValueOptimizer("tags"),
    PrewhereProcessor(
        prewhere_candidates,
        # Environment and release are excluded from prewhere for FINAL
        # queries because of a ClickHouse bug.
        # group_id is excluded because `final` is applied after prewhere;
        # otherwise we could filter out rows that should be merged together
        # by the FINAL pass.
        omit_if_final=["environment", "release", "group_id"],
    ),
    TableRateLimit(),
]

query_splitters = [
    ColumnSplitQueryStrategy(
        id_column="event_id",
        project_column="project_id",
        timestamp_column="timestamp",
    ),
    TimeSplitQueryStrategy(timestamp_col="timestamp"),
]
Example #12
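# Excerpt from an errors storage pipeline: the truncated list at the top is
# apparently the tail of prewhere_candidates, which feeds PrewhereProcessor;
# environment and release are kept out of PREWHERE for FINAL queries via
# omit_if_final.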
    "tags[sentry:release]",
    "release",
    "message",
    "environment",
    "project_id",
]

query_processors = [
    PostReplacementConsistencyEnforcer(
        project_column="project_id",
        replacer_state_name=ReplacerState.ERRORS,
    ),
    MappingColumnPromoter(mapping_specs={"tags": promoted_tag_columns}),
    UserColumnProcessor(),
    EventIdColumnProcessor(),
    GroupIdColumnProcessor(),
    MappingOptimizer("tags", "_tags_hash_map", "events_tags_hash_map_enabled"),
    ArrayJoinKeyValueOptimizer("tags"),
    PrewhereProcessor(prewhere_candidates, omit_if_final=["environment", "release"]),
]

query_splitters = [
    ColumnSplitQueryStrategy(
        id_column="event_id",
        project_column="project_id",
        timestamp_column="timestamp",
    ),
    TimeSplitQueryStrategy(timestamp_col="timestamp"),
]
Example #13
def get_query_processors(self) -> Sequence[ClickhouseProcessor]:
    return [PrewhereProcessor()]
Example #14
                    "sentry:release": "release",
                    "sentry:dist": "dist",
                    "sentry:user": "******",
                },
                "contexts": {"trace.trace_id": "trace_id"},
            }
        ),
        MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"),
        ArrayJoinKeyValueOptimizer("tags"),
        UUIDColumnProcessor(set(["event_id", "trace_id"])),
        EventsBooleanContextsProcessor(),
        PrewhereProcessor(
            [
                "event_id",
                "release",
                "message",
                "transaction_name",
                "environment",
                "project_id",
            ]
        ),
    ],
    query_splitters=[
        ColumnSplitQueryStrategy(
            id_column="event_id",
            project_column="project_id",
            timestamp_column="timestamp",
        ),
        TimeSplitQueryStrategy(timestamp_col="timestamp"),
    ],
)
Example #15
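# Excerpt from the transactions storage: PrewhereProcessor receives an explicit
# candidate list of ID and transaction-name columns, ahead of TableRateLimit.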
             "contexts": {"trace.trace_id": "trace_id", "trace.span_id": "span_id"},
        }
    ),
    UUIDColumnProcessor(set(["event_id", "trace_id"])),
    HexIntColumnProcessor({"span_id"}),
    EventsBooleanContextsProcessor(),
    MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"),
    EmptyTagConditionProcessor(),
    ArrayJoinKeyValueOptimizer("tags"),
    ArrayJoinKeyValueOptimizer("measurements"),
    ArrayJoinKeyValueOptimizer("span_op_breakdowns"),
    PrewhereProcessor(
        [
            "event_id",
            "trace_id",
            "span_id",
            "transaction_name",
            "transaction",
            "title",
        ]
    ),
    TableRateLimit(),
],
stream_loader=build_kafka_stream_loader_from_settings(
    processor=TransactionsMessageProcessor(),
    default_topic=Topic.EVENTS,
    commit_log_topic=Topic.COMMIT_LOG,
    subscription_result_topic=Topic.SUBSCRIPTION_RESULTS_TRANSACTIONS,
),
query_splitters=[TimeSplitQueryStrategy(timestamp_col="finish_ts")],
mandatory_condition_checkers=[ProjectIdEnforcer()],
writer_options={"insert_allow_materialized_columns": 1},
Example #16
    columns=columns,
    local_table_name="transactions_local",
    dist_table_name="transactions_dist",
    storage_set_key=StorageSetKey.TRANSACTIONS,
    mandatory_conditions=[],
    part_format=[util.PartSegment.RETENTION_DAYS, util.PartSegment.DATE],
)

storage = WritableTableStorage(
    storage_key=StorageKey.TRANSACTIONS,
    storage_set_key=StorageSetKey.TRANSACTIONS,
    schema=schema,
    query_processors=[
        MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"),
        EventIdColumnProcessor(),
        ArrayJoinKeyValueOptimizer("tags"),
        ArrayJoinKeyValueOptimizer("measurements"),
        UUIDColumnProcessor(set(["event_id", "trace_id"])),
        PrewhereProcessor(["event_id", "transaction_name", "transaction", "title"]),
    ],
    stream_loader=build_kafka_stream_loader_from_settings(
        StorageKey.TRANSACTIONS,
        processor=TransactionsMessageProcessor(),
        default_topic_name="events",
        commit_log_topic_name="snuba-commit-log",
    ),
    query_splitters=[TimeSplitQueryStrategy(timestamp_col="finish_ts")],
    writer_options={"insert_allow_materialized_columns": 1},
)
Example #17
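# Excerpt from an events storage pipeline: promoted tag/context mappings are
# chained ahead of PrewhereProcessor(prewhere_candidates), which here takes the
# candidate list with no omit_if_final argument.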
                 for col in promoted_tag_columns},
                get_promoted_context_tag_col_mapping(),
            ),
            "contexts": get_promoted_context_col_mapping(),
        },
        cast_to_string=True,
    ),
    # This processor must not be ported to the errors dataset. We should
    # not support promoting tags/contexts with boolean values. There is
    # no way to convert them back consistently to the value provided by
    # the client when the event is ingested, in all ways to access
    # tags/contexts. Once the errors dataset is in use, we will not have
    # boolean promoted tags/contexts so this constraint will be easy
    # to enforce.
    EventsPromotedBooleanContextsProcessor(),
    MappingOptimizer("tags", "_tags_hash_map", "events_tags_hash_map_enabled"),
    ArrayJoinKeyValueOptimizer("tags"),
    PrewhereProcessor(prewhere_candidates),
    FixedStringArrayColumnProcessor({"hierarchical_hashes"}, 32),
]

query_splitters = [
    ColumnSplitQueryStrategy(
        id_column="event_id",
        project_column="project_id",
        timestamp_column="timestamp",
    ),
    TimeSplitQueryStrategy(timestamp_col="timestamp"),
]
Example #18
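# Excerpt from the sessions storages: the hourly and org-level read storages
# promote project_id and org_id to PREWHERE.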

# The raw table we write onto, and that potentially we could
# query.
raw_storage = WritableTableStorage(
    storage_key=StorageKey.SESSIONS_RAW,
    storage_set_key=StorageSetKey.SESSIONS,
    schema=raw_schema,
    query_processors=[MinuteResolutionProcessor(), TableRateLimit()],
    mandatory_condition_checkers=[OrgIdEnforcer(), ProjectIdEnforcer()],
    stream_loader=build_kafka_stream_loader_from_settings(
        processor=SessionsProcessor(), default_topic=Topic.SESSIONS,
    ),
)
# The materialized view we query aggregate data from.
materialized_storage = ReadableTableStorage(
    storage_key=StorageKey.SESSIONS_HOURLY,
    storage_set_key=StorageSetKey.SESSIONS,
    schema=read_schema,
    query_processors=[PrewhereProcessor(["project_id", "org_id"]), TableRateLimit()],
    mandatory_condition_checkers=[OrgIdEnforcer(), ProjectIdEnforcer()],
)

org_materialized_storage = ReadableTableStorage(
    storage_key=StorageKey.ORG_SESSIONS,
    storage_set_key=StorageSetKey.SESSIONS,
    schema=read_schema,
    query_processors=[PrewhereProcessor(["project_id", "org_id"]), TableRateLimit()],
    mandatory_condition_checkers=[],
)
Example #19
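# Excerpt from a transactions storage definition in which PrewhereProcessor()
# is registered without a candidate list, after the nested-field, mapping, and
# array-join optimizers.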
    # during create statement
    # (https://github.com/ClickHouse/ClickHouse/issues/12586), so the
    # materialization is added with a migration.
    skipped_cols_on_creation={"_tags_hash_map"},
)


storage = WritableTableStorage(
    storage_key=StorageKey.TRANSACTIONS,
    storage_set_key=StorageSetKey.TRANSACTIONS,
    schema=schema,
    query_processors=[
        NestedFieldConditionOptimizer(
            "contexts",
            "_contexts_flattened",
            {"start_ts", "finish_ts"},
            BEGINNING_OF_TIME,
        ),
        MappingOptimizer("tags", "_tags_hash_map", "tags_hash_map_enabled"),
        TransactionColumnProcessor(),
        ArrayJoinKeyValueOptimizer("tags"),
        ArrayJoinKeyValueOptimizer("measurements"),
        PrewhereProcessor(),
    ],
    stream_loader=KafkaStreamLoader(
        processor=TransactionsMessageProcessor(), default_topic="events",
    ),
    query_splitters=[TimeSplitQueryStrategy(timestamp_col="finish_ts")],
    writer_options={"insert_allow_materialized_columns": 1},
)
Example #20
    storage_key=StorageKey.SESSIONS_RAW,
    storage_set_key=StorageSetKey.SESSIONS,
    schema=raw_schema,
    query_processors=[MinuteResolutionProcessor(), TableRateLimit()],
    mandatory_condition_checkers=[OrgIdEnforcer(), ProjectIdEnforcer()],
    stream_loader=kafka_stream_loader,
)
# The materialized view we query aggregate data from.
materialized_storage = ReadableTableStorage(
    storage_key=StorageKey.SESSIONS_HOURLY,
    storage_set_key=StorageSetKey.SESSIONS,
    schema=read_schema,
    query_processors=[PrewhereProcessor(["project_id", "org_id"]), TableRateLimit()],
    mandatory_condition_checkers=[OrgIdEnforcer(), ProjectIdEnforcer()],
)

org_materialized_storage = ReadableTableStorage(
    storage_key=StorageKey.ORG_SESSIONS,
    storage_set_key=StorageSetKey.SESSIONS,
    schema=read_schema,
    query_processors=[PrewhereProcessor(["project_id", "org_id"]), TableRateLimit()],
    mandatory_condition_checkers=[],