Example #1
0
def test_serialization() -> None:
    """
    Builds a fully populated ClickhouseQueryProfile and compares the
    output of to_dict() against a literal dictionary in which every
    column set appears as a list.
    """
    where_profile = FilterProfile(
        columns={"timestamp"},
        mapping_cols=set(),
    )
    profile = ClickhouseQueryProfile(
        time_range=10,
        table="events",
        all_columns={"col", "timestamp", "arrayjoin"},
        multi_level_condition=True,
        where_profile=where_profile,
        groupby_cols={"col"},
        array_join_cols={"arrayjoin"},
    )

    # The nested filter profile is expected as a nested dictionary.
    expected_where = {"columns": ["timestamp"], "mapping_cols": []}
    expected = {
        "time_range": 10,
        "table": "events",
        # The three input columns are expected in alphabetical order.
        "all_columns": ["arrayjoin", "col", "timestamp"],
        "multi_level_condition": True,
        "where_profile": expected_where,
        "groupby_cols": ["col"],
        "array_join_cols": ["arrayjoin"],
    }

    assert profile.to_dict() == expected
Example #2
0
def generate_profile(
    query: Union[Query, CompositeQuery[Table]],
) -> ClickhouseQueryProfile:
    """
    Takes a Physical query in, analyzes it and produces the
    ClickhouseQueryProfile data structure.

    Building the profile is best effort: if anything raises while the
    query is being visited or while the profile is being assembled, the
    failure is logged as a warning (with the traceback) and a placeholder
    profile is returned instead (time_range=-1, empty table name, empty
    column sets, multi_level_condition=False).
    """
    try:
        # Visiting the query is part of building the profile, so it is
        # covered by the same guard as the rest: a failure here must not
        # fail the query either.
        collector = TablesCollector()
        collector.visit(query)

        all_condition = collector.get_all_conditions()

        return ClickhouseQueryProfile(
            time_range=collector.get_max_time_range(),
            # Table names are joined in sorted order so the resulting
            # string does not depend on the order they were collected in.
            table=",".join(sorted(collector.get_tables())),
            all_columns=_get_all_columns(collector.get_all_raw_columns()),
            multi_level_condition=collector.has_complex_condition(),
            where_profile=FilterProfile(
                columns=_list_columns_in_condition(all_condition),
                mapping_cols=_list_mappings(all_condition),
            ),
            groupby_cols=_list_columns(collector.get_all_groupby()),
            array_join_cols=_list_columns(collector.get_all_arrayjoin()),
        )
    except Exception:
        # Should never happen, but it is not worth failing queries while
        # rolling this out because we cannot build the profile.
        logger.warning("Failed to build query profile", exc_info=True)
        return ClickhouseQueryProfile(
            time_range=-1,
            table="",
            all_columns=set(),
            multi_level_condition=False,
            where_profile=FilterProfile(
                columns=set(),
                mapping_cols=set(),
            ),
            groupby_cols=set(),
            array_join_cols=set(),
        )
Example #3
0
def generate_profile(query: Query) -> ClickhouseQueryProfile:
    """
    Analyzes a Physical query and returns the ClickhouseQueryProfile
    describing it.

    If any step of the analysis raises, a warning is logged (with the
    traceback) and a placeholder profile is returned: time_range=-1,
    empty table name, empty column sets, multi_level_condition=False.
    """
    condition = query.get_condition_from_ast()
    grouping = query.get_groupby_from_ast()

    try:
        time_range = _get_date_range(query)
        table = _get_table(query)
        all_columns = _get_all_columns(query)
        has_complex_conditions = _has_complex_conditions(query)

        # A query without a condition has nothing to report for the filter.
        if condition is None:
            where_profile = FilterProfile(columns=set(), mapping_cols=set())
        else:
            where_profile = FilterProfile(
                columns=_list_columns(condition),
                mapping_cols=_list_mapping(condition),
            )

        # Likewise for a query without a group by clause.
        if grouping is None:
            groupby_cols = set()
        else:
            groupby_cols = _list_groupby_columns(grouping)

        array_join_cols = _list_array_join(query)

        return ClickhouseQueryProfile(
            time_range=time_range,
            table=table,
            all_columns=all_columns,
            multi_level_condition=has_complex_conditions,
            where_profile=where_profile,
            groupby_cols=groupby_cols,
            array_join_cols=array_join_cols,
        )
    except Exception:
        # This is not expected to happen. While the profile is being
        # rolled out, failing to build it is not a good enough reason to
        # fail the query, so log and fall back to a placeholder.
        logger.warning("Failed to build query profile", exc_info=True)
        empty_filter = FilterProfile(columns=set(), mapping_cols=set())
        return ClickhouseQueryProfile(
            time_range=-1,
            table="",
            all_columns=set(),
            multi_level_condition=False,
            where_profile=empty_filter,
            groupby_cols=set(),
            array_join_cols=set(),
        )
Example #4
0
             ),
             groupby=[
                 Column("column2", None, "column2"),
                 Column("column3", None, "column3"),
             ],
         )),
     ClickhouseQueryProfile(
         time_range=31,
         table="events",
         all_columns={
             "timestamp",
             "column2",
             "column3",
             "contexts.key",
             "tags.key",
             "tags.value",
         },
         multi_level_condition=False,
         where_profile=FilterProfile(
             columns={"timestamp", "tags.key", "tags.value"},
             mapping_cols={"tags.key", "tags.value"},
         ),
         groupby_cols={"column2", "column3"},
         array_join_cols={"contexts.key"},
     ),
     id="Query with timestamp, tags, and arrayjoin",
 ),
 pytest.param(
     ClickhouseQuery(
         Query(
             {},
Example #5
0
def test_simple() -> None:
    """
    Serializes a SnubaQueryMetadata holding one successful ClickHouse
    query, feeds the resulting dictionary to the processor of the
    QUERYLOG writable storage and compares the produced InsertBatch with
    the expected single row.
    """
    sql = "select event_id from sentry_dist sample 0.1 prewhere project_id in (1) limit 50, 100"
    sql_anonymized = "select event_id from sentry_dist sample 0.1 prewhere project_id in ($I) limit 50, 100"

    request_body = {
        "selected_columns": ["event_id"],
        "orderby": "event_id",
        "sample": 0.1,
        "limit": 100,
        "offset": 50,
        "project": 1,
    }

    events_model = get_entity(EntityKey.EVENTS).get_data_model()
    query = Query(Entity(EntityKey.EVENTS, events_model))

    request = Request(
        id=uuid.UUID("a" * 32).hex,
        original_body=request_body,
        query=query,
        snql_anonymized="",
        query_settings=HTTPQuerySettings(referrer="search"),
        attribution_info=AttributionInfo(
            get_app_id("default"), "search", None, None, None
        ),
    )

    clock = TestingClock()
    timer = Timer("test", clock=clock)
    # Move the testing clock forward by 0.01 s; the expected row below
    # reports duration_ms == 10.
    clock.sleep(0.01)

    profile = ClickhouseQueryProfile(
        time_range=10,
        table="events",
        all_columns={"timestamp", "tags"},
        multi_level_condition=False,
        where_profile=FilterProfile(
            columns={"timestamp"},
            mapping_cols={"tags"},
        ),
        groupby_cols=set(),
        array_join_cols=set(),
    )

    message = SnubaQueryMetadata(
        request=request,
        start_timestamp=datetime.utcnow() - timedelta(days=3),
        end_timestamp=datetime.utcnow(),
        dataset="events",
        timer=timer,
        query_list=[
            ClickhouseQueryMetadata(
                sql=sql,
                sql_anonymized=sql_anonymized,
                start_timestamp=datetime.utcnow() - timedelta(days=3),
                end_timestamp=datetime.utcnow(),
                stats={"sample": 10, "error_code": 386},
                status=QueryStatus.SUCCESS,
                profile=profile,
                trace_id="b" * 32,
            )
        ],
        # Differs on purpose from the "project" in the request body: the
        # expected row carries this value, not the one from the body.
        projects={2},
        snql_anonymized=request.snql_anonymized,
        entity=EntityKey.EVENTS.value,
    ).to_dict()

    storage = get_writable_storage(StorageKey.QUERYLOG)
    stream_loader = storage.get_table_writer().get_stream_loader()
    processor = stream_loader.get_processor()

    actual = processor.process_message(
        message, KafkaMessageMetadata(0, 0, datetime.now())
    )

    expected_row = {
        "request_id": str(uuid.UUID("a" * 32)),
        "request_body": '{"limit": 100, "offset": 50, "orderby": "event_id", "project": 1, "sample": 0.1, "selected_columns": ["event_id"]}',
        "referrer": "search",
        "dataset": "events",
        "projects": [2],
        "organization": None,
        "timestamp": timer.for_json()["timestamp"],
        "duration_ms": 10,
        "status": "success",
        "clickhouse_queries.sql": [sql],
        "clickhouse_queries.status": ["success"],
        "clickhouse_queries.trace_id": [str(uuid.UUID("b" * 32))],
        "clickhouse_queries.duration_ms": [0],
        "clickhouse_queries.stats": ['{"error_code": 386, "sample": 10}'],
        "clickhouse_queries.final": [0],
        "clickhouse_queries.cache_hit": [0],
        "clickhouse_queries.sample": [10.0],
        "clickhouse_queries.max_threads": [0],
        "clickhouse_queries.num_days": [10],
        "clickhouse_queries.clickhouse_table": [""],
        "clickhouse_queries.query_id": [""],
        "clickhouse_queries.is_duplicate": [0],
        "clickhouse_queries.consistent": [0],
        "clickhouse_queries.all_columns": [["tags", "timestamp"]],
        "clickhouse_queries.or_conditions": [False],
        "clickhouse_queries.where_columns": [["timestamp"]],
        "clickhouse_queries.where_mapping_columns": [["tags"]],
        "clickhouse_queries.groupby_columns": [[]],
        "clickhouse_queries.array_join_columns": [[]],
    }

    assert actual == InsertBatch([expected_row], None)
Example #6
0
def test_simple() -> None:
    """
    Serializes a SnubaQueryMetadata holding one successful ClickHouse
    query, feeds the resulting dictionary to the processor of the
    QUERYLOG writable storage and compares the produced InsertBatch with
    the expected single row.
    """
    sql = "select event_id from sentry_dist sample 0.1 prewhere project_id in (1) limit 50, 100"

    request_body = {
        "selected_columns": ["event_id"],
        "orderby": "event_id",
        "sample": 0.1,
        "limit": 100,
        "offset": 50,
        "project": 1,
    }

    data_source = get_storage(StorageKey.EVENTS).get_schema().get_data_source()
    query = Query(data_source)

    request = Request(
        uuid.UUID("a" * 32).hex,
        request_body,
        query,
        HTTPRequestSettings(),
        "search",
    )

    clock = TestingClock()
    timer = Timer("test", clock=clock)
    # Move the testing clock forward by 0.01 s; the expected row below
    # reports duration_ms == 10.
    clock.sleep(0.01)

    profile = ClickhouseQueryProfile(
        time_range=10,
        table="events",
        all_columns={"timestamp", "tags"},
        multi_level_condition=False,
        where_profile=FilterProfile(
            columns={"timestamp"},
            mapping_cols={"tags"},
        ),
        groupby_cols=set(),
        array_join_cols=set(),
    )

    message = SnubaQueryMetadata(
        request=request,
        dataset="events",
        timer=timer,
        query_list=[
            ClickhouseQueryMetadata(
                sql=sql,
                stats={"sample": 10},
                status=QueryStatus.SUCCESS,
                profile=profile,
                trace_id="b" * 32,
            )
        ],
    ).to_dict()

    storage = get_writable_storage(StorageKey.QUERYLOG)
    stream_loader = storage.get_table_writer().get_stream_loader()
    processor = stream_loader.get_processor()

    actual = processor.process_message(
        message, KafkaMessageMetadata(0, 0, datetime.now())
    )

    expected_row = {
        "request_id": str(uuid.UUID("a" * 32)),
        "request_body": '{"limit": 100, "offset": 50, "orderby": "event_id", "project": 1, "sample": 0.1, "selected_columns": ["event_id"]}',
        "referrer": "search",
        "dataset": "events",
        "projects": [1],
        "organization": None,
        "timestamp": timer.for_json()["timestamp"],
        "duration_ms": 10,
        "status": "success",
        "clickhouse_queries.sql": [sql],
        "clickhouse_queries.status": ["success"],
        "clickhouse_queries.trace_id": [str(uuid.UUID("b" * 32))],
        "clickhouse_queries.duration_ms": [0],
        "clickhouse_queries.stats": ['{"sample": 10}'],
        "clickhouse_queries.final": [0],
        "clickhouse_queries.cache_hit": [0],
        "clickhouse_queries.sample": [10.0],
        "clickhouse_queries.max_threads": [0],
        "clickhouse_queries.num_days": [10],
        "clickhouse_queries.clickhouse_table": [""],
        "clickhouse_queries.query_id": [""],
        "clickhouse_queries.is_duplicate": [0],
        "clickhouse_queries.consistent": [0],
        "clickhouse_queries.all_columns": [["tags", "timestamp"]],
        "clickhouse_queries.or_conditions": [False],
        "clickhouse_queries.where_columns": [["timestamp"]],
        "clickhouse_queries.where_mapping_columns": [["tags"]],
        "clickhouse_queries.groupby_columns": [[]],
        "clickhouse_queries.array_join_columns": [[]],
    }

    assert actual == InsertBatch([expected_row])
Example #7
0
         groupby=[
             Column("column2", None, "column2"),
             Column("column3", None, "column3"),
         ],
     ),
     ClickhouseQueryProfile(
         time_range=31,
         table="events",
         all_columns={
             "events.timestamp",
             "events.column2",
             "events.column3",
             "events.contexts.key",
             "events.tags.key",
             "events.tags.value",
         },
         multi_level_condition=False,
         where_profile=FilterProfile(
             columns={
                 "events.timestamp", "events.tags.key", "events.tags.value"
             },
             mapping_cols={"events.tags.key", "events.tags.value"},
         ),
         groupby_cols={"events.column2", "events.column3"},
         array_join_cols={"events.contexts.key"},
     ),
     id="Query with timestamp, tags, and arrayjoin",
 ),
 pytest.param(
     ClickhouseQuery(
         Table("events", ColumnSet([])),