Example #1
    def col_split(dataset, request: Request,
                  column_split_spec: ColumnSplitSpec, *args, **kwargs):
        """
        Split query in 2 steps if a large number of columns is being selected.
            - First query only selects event_id and project_id.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        # The query function may mutate the request body during query
        # evaluation, so we need to copy the body to ensure that the query has
        # not been modified by the time we're ready to run the full query.
        minimal_request = copy.deepcopy(request)
        minimal_request.query.set_selected_columns(
            column_split_spec.get_min_columns())
        result = query_func(dataset, minimal_request, *args, **kwargs)
        del minimal_request

        if result.result["data"]:
            request = copy.deepcopy(request)

            event_ids = list({
                event[column_split_spec.id_column]
                for event in result.result["data"]
            })
            request.query.add_conditions(
                [(column_split_spec.id_column, "IN", event_ids)])
            request.query.set_offset(0)
            request.query.set_limit(len(event_ids))

            project_ids = list({
                event[column_split_spec.project_column]
                for event in result.result["data"]
            })
            request.extensions["project"]["project"] = project_ids

            timestamp_field = column_split_spec.timestamp_column
            timestamps = [
                event[timestamp_field] for event in result.result["data"]
            ]
            request.extensions["timeseries"]["from_date"] = util.parse_datetime(
                min(timestamps)).isoformat()
            # We add 1 second since this gets translated to ('timestamp', '<', to_date)
            # and events are stored with a granularity of 1 second.
            request.extensions["timeseries"]["to_date"] = (
                util.parse_datetime(max(timestamps)) +
                timedelta(seconds=1)).isoformat()

        return query_func(dataset, request, *args, **kwargs)
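`query_func` above is a free variable: `col_split` is the inner function of a decorator, applied as `@split_query` in Examples #8 and #20. A minimal sketch of one plausible enclosing shape, matching the simpler signature of Example #12 (the wrapper itself is not shown in these excerpts):

from functools import wraps

def split_query(query_func):
    # Wrap a raw query function so wide selects run in two steps; query_func
    # stays in scope for the inner closure, as in the snippet above.
    @wraps(query_func)
    def col_split(dataset, request, *args, **kwargs):
        ...  # two-step split logic as shown above
        return query_func(dataset, request, *args, **kwargs)

    return col_split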
Example #2
    def validate(self, value) -> Request:
        value = validate_jsonschema(value, self.__composite_schema)

        query_body = {
            key: value.pop(key)
            for key in self.__query_schema['properties'].keys() if key in value
        }
        settings = {
            key: value.pop(key)
            for key in self.__settings_schema['properties'].keys()
            if key in value
        }

        extensions = {}
        for extension_name, extension_schema in self.__extension_schemas.items():
            extensions[extension_name] = {
                key: value.pop(key)
                for key in extension_schema['properties'].keys()
                if key in value
            }

        return Request(
            Query(query_body),
            RequestSettings(settings['turbo'], settings['consistent'],
                            settings['debug']), extensions)
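The pop-by-schema partition above, reduced to a standalone sketch (schemas simplified to plain property-name lists; everything here is illustrative):

def partition_body(body, query_props, settings_props):
    # Destructively split a flat request body into its query part and its
    # settings part, leaving any remaining keys (e.g. extensions) in body.
    query_body = {key: body.pop(key) for key in query_props if key in body}
    settings = {key: body.pop(key) for key in settings_props if key in body}
    return query_body, settings

body = {"selected_columns": ["event_id"], "turbo": False, "limit": 10}
query_body, settings = partition_body(
    body, query_props=["selected_columns", "limit"], settings_props=["turbo"])
assert query_body == {"selected_columns": ["event_id"], "limit": 10}
assert settings == {"turbo": False}
assert body == {}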
Example #3
def test() -> None:
    cv = threading.Condition()
    query_result = QueryResult({}, {"stats": {}, "sql": ""})
    mock_query_runner = Mock(return_value=query_result)

    def callback_func(args: List[Tuple[str, QueryResult]]) -> None:
        with cv:
            cv.notify()

    mock_callback = Mock(side_effect=callback_func)

    query_body = {
        "selected_columns": ["type", "project_id"],
    }

    events = get_dataset("events")
    query = parse_query(query_body, events)

    events_pipeline = SimplePipelineBuilder(
        query_plan_builder=SingleStorageQueryPlanBuilder(
            storage=get_storage(StorageKey.EVENTS)),
    )

    errors_pipeline = SimplePipelineBuilder(
        query_plan_builder=SingleStorageQueryPlanBuilder(
            storage=get_storage(StorageKey.ERRORS)),
    )

    delegator = PipelineDelegator(
        query_pipeline_builders={
            "events": events_pipeline,
            "errors": errors_pipeline
        },
        selector_func=lambda query, referrer: ("events", ["errors"]),
        callback_func=mock_callback,
    )

    with cv:
        request_settings = HTTPRequestSettings()
        delegator.build_execution_pipeline(
            Request(
                "",
                query_body,
                query,
                request_settings,
                "ref",
            ),
            mock_query_runner,
        ).execute()
        cv.wait(timeout=5)

    assert mock_query_runner.call_count == 2

    assert mock_callback.call_args == call(
        query,
        request_settings,
        "ref",
        [
            Result("events", query_result, ANY),
            Result("errors", query_result, ANY)
        ],
    )
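As the test shows, `selector_func` returns the name of the primary pipeline plus a list of secondary pipelines to shadow-run; both runners are invoked (hence `call_count == 2`) and the callback receives one Result per pipeline. A referrer-based selector might look like this (the referrer value is hypothetical):

def referrer_based_selector(query, referrer):
    # Promote the errors pipeline to primary for a canary referrer while
    # shadow-running events; otherwise keep events primary.
    if referrer == "errors-canary":  # hypothetical referrer name
        return "errors", ["events"]
    return "events", ["errors"]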
Example #4
def test_col_split_conditions(
    id_column: str, project_column: str, timestamp_column: str, query, expected_result
) -> None:
    dataset = get_dataset("events")
    query = parse_query(query, dataset)
    splitter = ColumnSplitQueryStrategy(id_column, project_column, timestamp_column)
    request = Request("a", query, HTTPRequestSettings(), {}, "r")
    entity = get_entity(query.get_from_clause().key)
    plan = entity.get_query_plan_builder().build_plan(request)

    def do_query(
        query: ClickhouseQuery, request_settings: RequestSettings,
    ) -> QueryResult:
        return QueryResult(
            {
                "data": [
                    {
                        id_column: "asd123",
                        project_column: 123,
                        timestamp_column: "2019-10-01 22:33:42",
                    }
                ]
            },
            {},
        )

    assert (
        splitter.execute(plan.query, HTTPRequestSettings(), do_query) is not None
    ) == expected_result
Example #5
    def validate(self, value, dataset: Dataset, referrer: str) -> Request:
        try:
            value = validate_jsonschema(value, self.__composite_schema)
        except jsonschema.ValidationError as error:
            raise JsonSchemaValidationException(str(error)) from error

        query_body = {
            key: value.pop(key)
            for key in self.__query_schema["properties"].keys() if key in value
        }
        settings = {
            key: value.pop(key)
            for key in self.__settings_schema["properties"].keys()
            if key in value
        }

        extensions = {}
        for extension_name, extension_schema in self.__extension_schemas.items():
            extensions[extension_name] = {
                key: value.pop(key)
                for key in extension_schema["properties"].keys()
                if key in value
            }

        query = parse_query(query_body, dataset)

        request_id = uuid.uuid4().hex
        return Request(request_id, query, self.__setting_class(**settings),
                       extensions, referrer)
Example #6
    def validate(self, value, dataset: Dataset, referrer: str) -> Request:
        value = validate_jsonschema(value, self.__composite_schema)

        query_body = {
            key: value.pop(key)
            for key in self.__query_schema["properties"].keys() if key in value
        }
        settings = {
            key: value.pop(key)
            for key in self.__settings_schema["properties"].keys()
            if key in value
        }

        extensions = {}
        for extension_name, extension_schema in self.__extension_schemas.items():
            extensions[extension_name] = {
                key: value.pop(key)
                for key in extension_schema["properties"].keys()
                if key in value
            }

        query = parse_query(query_body, dataset)
        return Request(query, self.__setting_class(**settings), extensions,
                       referrer)
Example #7
def test_select_storage(query_body: MutableMapping[str, Any],
                        is_subscription: bool, expected_table: str) -> None:
    sessions = get_dataset("sessions")
    snql_query = json_to_snql(query_body, "sessions")
    query, snql_anonymized = parse_snql_query(str(snql_query), sessions)
    query_body = json.loads(snql_query.snuba())
    # Pick the settings class; this is HTTPQuerySettings in the non-subscription case.
    settings_class = (SubscriptionQuerySettings
                      if is_subscription else HTTPQuerySettings)

    request = Request(
        id="a",
        original_body=query_body,
        query=query,
        snql_anonymized=snql_anonymized,
        query_settings=subscription_settings(referrer=""),
        attribution_info=AttributionInfo(get_app_id("default"), "blah", None,
                                         None, None),
    )

    def query_runner(query: Query, settings: QuerySettings,
                     reader: Reader) -> QueryResult:
        assert query.get_from_clause().table_name == expected_table
        return QueryResult({}, {})

    entity = sessions.get_default_entity()
    entity.get_query_pipeline_builder().build_execution_pipeline(
        request, query_runner).execute()
Example #8
def test_no_split(dataset_name: str):
    events = get_dataset(dataset_name)
    query = Query(
        {
            "selected_columns": ["event_id"],
            "conditions": [""],
            "orderby": "event_id",
            "sample": 10,
            "limit": 100,
            "offset": 50,
        },
        events.get_dataset_schemas().get_read_schema().get_data_source()
    )

    @split_query
    def do_query(dataset: Dataset, request: Request, timer: Timer):
        assert request.query == query

    request = Request(
        query,
        RequestSettings(False, False, False),
        {},
    )

    do_query(events, request, None)
Example #9
def test_alias_validation(query_body: MutableMapping[str, Any],
                          expected_result: bool) -> None:
    events = get_dataset("events")
    query = parse_query(query_body, events)
    query_plan = events.get_query_plan_builder().build_plan(
        Request("", query, HTTPRequestSettings(), {}, ""))

    assert query_plan.query.validate_aliases() == expected_result
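For a body that should make `expected_result` False, reusing one alias for two different expressions is the canonical trigger; an illustrative legacy-format body (not taken from the test's actual parameters):

invalid_query_body = {
    "selected_columns": ["project_id"],
    "aggregations": [
        ["count()", "", "times_seen"],
        ["uniq", "project_id", "times_seen"],  # same alias, different expression
    ],
}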
Example #10
def test_events_processing() -> None:
    query_body = {
        "query": """
        MATCH (events)
        SELECT tags[transaction], contexts[browser.name]
        WHERE project_id = 1
        AND timestamp >= toDateTime('2020-01-01 12:00:00')
        AND timestamp < toDateTime('2020-01-02 12:00:00')
        """,
        "dataset": "events",
    }

    events_dataset = get_dataset("events")
    events_entity = events_dataset.get_default_entity()

    query, snql_anonymized = parse_snql_query(query_body["query"],
                                              events_dataset)
    request = Request(
        id="",
        original_body=query_body,
        query=query,
        snql_anonymized=snql_anonymized,
        query_settings=HTTPQuerySettings(referrer=""),
        attribution_info=AttributionInfo(get_app_id("blah"), "blah", None,
                                         None, None),
    )

    def query_runner(query: Query, settings: QuerySettings,
                     reader: Reader) -> QueryResult:
        assert query.get_selected_columns() == [
            SelectedExpression(
                "tags[transaction]",
                Column("_snuba_tags[transaction]", None, "transaction_name"),
            ),
            SelectedExpression(
                "contexts[browser.name]",
                FunctionCall(
                    "_snuba_contexts[browser.name]",
                    "arrayElement",
                    (
                        Column(None, None, "contexts.value"),
                        FunctionCall(
                            None,
                            "indexOf",
                            (
                                Column(None, None, "contexts.key"),
                                Literal(None, "browser.name"),
                            ),
                        ),
                    ),
                ),
            ),
        ]
        return QueryResult({}, {})

    events_entity.get_query_pipeline_builder().build_execution_pipeline(
        request, query_runner).execute()
Example #11
def test_sessions_processing() -> None:
    query_body = {
        "selected_columns": ["duration_quantiles", "sessions", "users"],
        "conditions": [
            ["org_id", "=", 1],
            ["project_id", "=", 1],
            ["started", ">", "2020-01-01 12:00:00"],
        ],
    }

    sessions = get_dataset("sessions")
    query = parse_query(query_body, sessions)
    request = Request("", query_body, query, HTTPRequestSettings(), "")

    def query_runner(query: Query, settings: RequestSettings,
                     reader: Reader) -> QueryResult:
        quantiles = tuple(
            Literal(None, quant) for quant in [0.5, 0.75, 0.9, 0.95, 0.99, 1])
        assert query.get_selected_columns() == [
            SelectedExpression(
                "duration_quantiles",
                CurriedFunctionCall(
                    "_snuba_duration_quantiles",
                    FunctionCall(
                        None,
                        "quantilesIfMerge",
                        quantiles,
                    ),
                    (Column(None, None, "duration_quantiles"), ),
                ),
            ),
            SelectedExpression(
                "sessions",
                FunctionCall(
                    "_snuba_sessions",
                    "plus",
                    (
                        FunctionCall(None, "countIfMerge",
                                     (Column(None, None, "sessions"), )),
                        FunctionCall(
                            None,
                            "sumIfMerge",
                            (Column(None, None, "sessions_preaggr"), ),
                        ),
                    ),
                ),
            ),
            SelectedExpression(
                "users",
                FunctionCall("_snuba_users", "uniqIfMerge",
                             (Column(None, None, "users"), )),
            ),
        ]
        return QueryResult({}, {})

    entity = sessions.get_default_entity()
    entity.get_query_pipeline_builder().build_execution_pipeline(
        request, query_runner).execute()
Example #12
    def col_split(dataset, request: Request, *args, **kwargs):
        """
        Split query in 2 steps if a large number of columns is being selected.
            - First query only selects event_id and project_id.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        # The query function may mutate the request body during query
        # evaluation, so we need to copy the body to ensure that the query has
        # not been modified by the time we're ready to run the full query.
        minimal_request = copy.deepcopy(request)
        minimal_request.query.set_selected_columns(MIN_COLS)
        result, status = query_func(dataset, minimal_request, *args, **kwargs)
        del minimal_request

        # If something failed, just return
        if status != 200:
            return result, status

        if result['data']:
            request = copy.deepcopy(request)

            event_ids = list(
                {event['event_id'] for event in result['data']})
            request.query.add_conditions([('event_id', 'IN', event_ids)])
            request.query.set_offset(0)
            request.query.set_limit(len(event_ids))

            project_ids = list(
                {event['project_id'] for event in result['data']})
            request.extensions['project']['project'] = project_ids

            timestamps = [event['timestamp'] for event in result['data']]
            request.extensions['timeseries']['from_date'] = util.parse_datetime(
                min(timestamps)).isoformat()
            # We add 1 second since this gets translated to ('timestamp', '<', to_date)
            # and events are stored with a granularity of 1 second.
            request.extensions['timeseries']['to_date'] = (
                util.parse_datetime(max(timestamps)) +
                timedelta(seconds=1)).isoformat()

        return query_func(dataset, request, *args, **kwargs)
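The date-range shrink at the end of both split variants, in isolation; `datetime.fromisoformat` stands in for `util.parse_datetime`, which is not shown in these excerpts:

from datetime import datetime, timedelta

def shrink_range(timestamps):
    # Narrow [from_date, to_date) to just cover the observed events. to_date
    # is exclusive ('timestamp' < to_date), so pad the newest timestamp by
    # the 1-second storage granularity to keep that event inside the window.
    from_date = datetime.fromisoformat(min(timestamps))
    to_date = datetime.fromisoformat(max(timestamps)) + timedelta(seconds=1)
    return from_date.isoformat(), to_date.isoformat()

assert shrink_range(["2019-10-01T22:33:42", "2019-10-01T22:40:00"]) == (
    "2019-10-01T22:33:42", "2019-10-01T22:40:01")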
Example #13
def test_tags_processor(query_body, expected_query) -> None:
    state.set_config("ast_tag_processor_enabled", 1)
    dataset = get_dataset("transactions")
    query = parse_query(query_body, dataset)
    request_settings = HTTPRequestSettings()
    request = Request("a", query, request_settings, {}, "r")
    _ = dataset.get_query_plan_builder().build_plan(request)

    formatted = DictClickhouseQuery(dataset, query, request_settings).format_sql()
    assert formatted == expected_query
Example #14
def parse_and_process(query_body: MutableMapping[str, Any]) -> ClickhouseQuery:
    dataset = get_dataset("transactions")
    query = parse_query(query_body, dataset)
    request = Request("a", query, HTTPRequestSettings(), {}, "r")
    for p in dataset.get_query_processors():
        p.process_query(query, request.settings)
    plan = dataset.get_query_plan_builder().build_plan(request)

    ArrayJoinKeyValueOptimizer("tags").process_query(plan.query,
                                                     request.settings)
    return plan.query
Example #15
    def validate(
        self, value: MutableMapping[str, Any], dataset: Dataset, referrer: str
    ) -> Request:
        try:
            value = validate_jsonschema(value, self.__composite_schema)
        except jsonschema.ValidationError as error:
            raise JsonSchemaValidationException(str(error)) from error

        query_body = {
            key: value.pop(key)
            for key in self.__query_schema["properties"].keys()
            if key in value
        }
        settings = {
            key: value.pop(key)
            for key in self.__settings_schema["properties"].keys()
            if key in value
        }

        settings_class = self.__setting_class
        # Compare the classes directly: isinstance(cls, type(HTTPRequestSettings))
        # is true for any class, since type(HTTPRequestSettings) is just `type`.
        if settings_class == HTTPRequestSettings:
            settings_obj: Union[
                HTTPRequestSettings, SubscriptionRequestSettings
            ] = settings_class(**settings)
        elif settings_class == SubscriptionRequestSettings:
            settings_obj = settings_class()

        extensions = {}
        for extension_name, extension_schema in self.__extension_schemas.items():
            extensions[extension_name] = {
                key: value.pop(key)
                for key in extension_schema["properties"].keys()
                if key in value
            }

        if self.__language == Language.SNQL:
            query = parse_snql_query(query_body["query"], dataset)
        else:
            query = parse_query(query_body, dataset)
            apply_query_extensions(query, extensions, settings_obj)

        request_id = uuid.uuid4().hex
        return Request(
            request_id,
            # TODO: Replace this with the actual query raw body.
            # this can have an impact on subscriptions so we need
            # to be careful with the change.
            ChainMap(query_body, *extensions.values()),
            query,
            settings_obj,
            referrer,
        )
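The two parser branches expect differently shaped bodies; minimal illustrative examples of each (values invented):

snql_body = {"query": "MATCH (events) SELECT count() WHERE project_id = 1"}
legacy_body = {
    "selected_columns": ["event_id"],
    "conditions": [["project_id", "=", 1]],
}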
Example #16
def test_select_storage(query_body: MutableMapping[str, Any],
                        expected_table: str) -> None:
    sessions = get_dataset("sessions")
    query = parse_query(query_body, sessions)
    request = Request("", query_body, query, HTTPRequestSettings(), "")

    def query_runner(query: Query, settings: RequestSettings,
                     reader: Reader) -> QueryResult:
        assert query.get_from_clause().table_name == expected_table
        return QueryResult({}, {})

    entity = sessions.get_default_entity()
    entity.get_query_pipeline_builder().build_execution_pipeline(
        request, query_runner).execute()
Example #17
def build_request(
    body: MutableMapping[str, Any],
    parser: Parser,
    settings_class: Union[Type[HTTPRequestSettings], Type[SubscriptionRequestSettings]],
    schema: RequestSchema,
    dataset: Dataset,
    timer: Timer,
    referrer: str,
) -> Request:
    with sentry_sdk.start_span(description="build_request", op="validate") as span:
        try:
            request_parts = schema.validate(body)
            if settings_class == HTTPRequestSettings:
                settings = {
                    **request_parts.settings,
                    "consistent": _consistent_override(
                        request_parts.settings.get("consistent", False), referrer
                    ),
                }
                settings_obj: Union[
                    HTTPRequestSettings, SubscriptionRequestSettings
                ] = settings_class(**settings)
            elif settings_class == SubscriptionRequestSettings:
                settings_obj = settings_class(
                    consistent=_consistent_override(True, referrer)
                )

            query = parser(request_parts, settings_obj, dataset)

            request_id = uuid.uuid4().hex
            request = Request(
                request_id,
                # TODO: Replace this with the actual query raw body.
                # this can have an impact on subscriptions so we need
                # to be careful with the change.
                ChainMap(request_parts.query, *request_parts.extensions.values()),
                query,
                settings_obj,
                referrer,
            )
        except (InvalidJsonRequestException, InvalidQueryException) as exception:
            record_invalid_request(timer, referrer)
            raise exception
        except Exception as exception:
            record_error_building_request(timer, referrer)
            raise exception

        span.set_data("snuba_query", request.body)

        timer.mark("validate_schema")
        return request
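`_consistent_override` is not shown; from both call sites it takes the requested consistency flag and the referrer and returns the effective flag. A hypothetical stand-in consistent with that contract:

# Hypothetical per-referrer override table; the real lookup source is not
# shown in these excerpts.
CONSISTENT_OVERRIDES = {"some-referrer": False}

def _consistent_override(consistent: bool, referrer: str) -> bool:
    # Let configuration force consistency on or off for specific referrers,
    # otherwise honor whatever the caller requested.
    return CONSISTENT_OVERRIDES.get(referrer, consistent)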
Example #18
def test_events_processing() -> None:
    query_body = {
        "selected_columns": ["tags[transaction]", "contexts[browser.name]"]
    }

    events_dataset = get_dataset("events")
    events_entity = events_dataset.get_default_entity()
    events_storage = events_entity.get_writable_storage()

    query = parse_query(query_body, events_dataset)
    request = Request("", query_body, query, HTTPRequestSettings(), "")

    def query_runner(query: Query, settings: RequestSettings,
                     reader: Reader) -> QueryResult:

        if events_storage.get_storage_key() == StorageKey.EVENTS:
            transaction_col_name = "transaction"
        else:
            transaction_col_name = "transaction_name"

        assert query.get_selected_columns_from_ast() == [
            SelectedExpression(
                "tags[transaction]",
                Column("_snuba_tags[transaction]", None, transaction_col_name),
            ),
            SelectedExpression(
                "contexts[browser.name]",
                FunctionCall(
                    "_snuba_contexts[browser.name]",
                    "arrayElement",
                    (
                        Column(None, None, "contexts.value"),
                        FunctionCall(
                            None,
                            "indexOf",
                            (
                                Column(None, None, "contexts.key"),
                                Literal(None, "browser.name"),
                            ),
                        ),
                    ),
                ),
            ),
        ]
        return QueryResult({}, {})

    events_entity.get_query_pipeline_builder().build_execution_pipeline(
        request, query_runner).execute()
Example #19
def parse_and_process(query_body: MutableMapping[str, Any]) -> ClickhouseQuery:
    dataset = get_dataset("transactions")
    query = parse_query(query_body, dataset)
    request = Request("a", query_body, query, HTTPRequestSettings(), "r")
    entity = get_entity(query.get_from_clause().key)
    for p in entity.get_query_processors():
        p.process_query(query, request.settings)

    ArrayJoinKeyValueOptimizer("tags").process_query(query, request.settings)

    query_plan = SingleStorageQueryPlanBuilder(
        storage=entity.get_writable_storage(),
        mappers=transaction_translator,
    ).build_and_rank_plans(query, request.settings)[0]

    return query_plan.query
Example #20
def test_col_split(
    dataset_name: str,
    first_query_data: Mapping[str, Any],
    second_query_data: Mapping[str, Any],
):
    @split_query
    def do_query(dataset: Dataset, request: Request, timer: Timer):
        selected_cols = request.query.get_selected_columns()
        if selected_cols == list(first_query_data[0].keys()):
            return RawQueryResult({"data": first_query_data}, {})
        elif selected_cols == list(second_query_data[0].keys()):
            return RawQueryResult({"data": second_query_data}, {})
        else:
            raise ValueError(f"Unexpected selected columns: {selected_cols}")

    events = get_dataset(dataset_name)
    query = Query(
        {
            "selected_columns": list(second_query_data[0].keys()),
            "conditions": [""],
            "orderby": "events.event_id",
            "sample": 10,
            "limit": 100,
            "offset": 50,
        },
        events.get_all_storages()[0]
        .get_schemas().get_read_schema().get_data_source(),
    )

    request = Request(
        uuid.uuid4().hex,
        query,
        HTTPRequestSettings(),
        {
            "project": {
                "project": 1
            },
            "timeseries": {
                "from_date": "2019-09-19T10:00:00",
                "to_date": "2019-09-19T12:00:00",
                "granularity": 3600,
            },
        },
        "tests",
    )

    do_query(events, request, None)
Example #21
def test_nested_optimizer(query_body, expected_condition) -> None:
    transactions = get_dataset("transactions")
    query = parse_query(query_body, transactions)
    request_settings = HTTPRequestSettings()
    request = Request("", query, request_settings, {}, "")

    query_plan = transactions.get_query_plan_builder().build_plan(request)
    processor = NestedFieldConditionOptimizer(
        nested_col="tags",
        flattened_col="tags_map",
        timestamp_cols={"start_ts", "finish_ts"},
        beginning_of_time=datetime(2019, 12, 11, 0, 0, 0),
    )
    clickhouse_query = query_plan.query
    processor.process_query(clickhouse_query, request_settings)

    assert clickhouse_query.get_conditions() == expected_condition
Example #22
            def run_non_consistent() -> Result:
                request_copy = Request(
                    id=request.id,
                    body=copy.deepcopy(request.body),
                    query=copy.deepcopy(request.query),
                    settings=SubscriptionRequestSettings(consistent=False),
                    referrer=request.referrer,
                )

                return parse_and_run_query(
                    self.__dataset,
                    request_copy,
                    timer,
                    robust=True,
                    concurrent_queries_gauge=(
                        self.__concurrent_clickhouse_gauge
                        if not is_consistent_query else None),
                ).result
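`run_non_consistent` is a closure inside a larger method; a plausible pairing runs it alongside a consistent sibling so the two results can be compared (a sketch under that assumption; `run_consistent` is hypothetical):

from concurrent.futures import ThreadPoolExecutor

def run_both(run_consistent, run_non_consistent):
    # Execute the consistent query and its eventually-consistent shadow copy
    # concurrently, so the shadow run does not delay the primary result.
    with ThreadPoolExecutor(max_workers=2) as pool:
        primary = pool.submit(run_consistent)
        shadow = pool.submit(run_non_consistent)
        return primary.result(), shadow.result()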
Example #23
def test_data_source(
    query_body: MutableMapping[str, Any], expected_table: str,
) -> None:
    request_settings = HTTPRequestSettings()
    dataset = get_dataset("discover")
    query = parse_query(query_body, dataset)
    request = Request("a", query, request_settings, {}, "r")
    for processor in get_dataset("discover").get_query_processors():
        processor.process_query(request.query, request.settings)

    plan = dataset.get_query_plan_builder().build_plan(request)

    for physical_processor in plan.plan_processors:
        physical_processor.process_query(plan.query, request.settings)

    assert plan.query.get_data_source().format_from() == expected_table, json.dumps(
        query_body
    )
Example #24
def test_events_processing() -> None:
    query_body = {"selected_columns": ["tags[transaction]", "contexts[browser.name]"]}

    events = get_dataset("events")
    query = parse_query(query_body, events)
    request = Request("", query, HTTPRequestSettings(), {}, "")

    query_plan = (
        events.get_default_entity().get_query_plan_builder().build_plan(request)
    )
    for clickhouse_processor in query_plan.plan_processors:
        clickhouse_processor.process_query(query_plan.query, request.settings)

    def query_runner(
        query: Query, settings: RequestSettings, reader: Reader[SqlQuery]
    ) -> QueryResult:
        assert query.get_selected_columns_from_ast() == [
            SelectedExpression(
                "tags[transaction]", Column("tags[transaction]", None, "transaction")
            ),
            SelectedExpression(
                "contexts[browser.name]",
                FunctionCall(
                    "contexts[browser.name]",
                    "arrayElement",
                    (
                        Column(None, None, "contexts.value"),
                        FunctionCall(
                            None,
                            "indexOf",
                            (
                                Column(None, None, "contexts.key"),
                                Literal(None, "browser.name"),
                            ),
                        ),
                    ),
                ),
            ),
        ]
        return QueryResult({}, {})

    query_plan.execution_strategy.execute(
        query_plan.query, request.settings, query_runner
    )
Example #25
def test_sessions_processing() -> None:
    query_body = {
        "selected_columns": ["duration_quantiles", "sessions", "users"]
    }

    sessions = get_dataset("sessions")
    query = parse_query(query_body, sessions)
    request = Request("", query, HTTPRequestSettings(), {}, "")

    query_plan = (
        sessions.get_default_entity().get_query_plan_builder().build_plan(request)
    )
    for clickhouse_processor in query_plan.plan_processors:
        clickhouse_processor.process_query(query_plan.query, request.settings)

    def query_runner(query: Query, settings: RequestSettings,
                     reader: Reader[SqlQuery]) -> QueryResult:
        assert query.get_selected_columns_from_ast() == [
            SelectedExpression(
                "duration_quantiles",
                CurriedFunctionCall(
                    "duration_quantiles",
                    FunctionCall(
                        None,
                        "quantilesIfMerge",
                        (Literal(None, 0.5), Literal(None, 0.9)),
                    ),
                    (Column(None, None, "duration_quantiles"), ),
                ),
            ),
            SelectedExpression(
                "sessions",
                FunctionCall("sessions", "countIfMerge",
                             (Column(None, None, "sessions"), )),
            ),
            SelectedExpression(
                "users",
                FunctionCall("users", "uniqIfMerge",
                             (Column(None, None, "users"), )),
            ),
        ]
        return QueryResult({}, {})

    query_plan.execution_strategy.execute(query_plan.query, request.settings,
                                          query_runner)
Example #26
def test_col_split(
    dataset_name: str,
    first_query_data: Mapping[str, Any],
    second_query_data: Mapping[str, Any],
):
    @split_query
    def do_query(dataset: Dataset, request: Request, timer: Timer):
        selected_cols = request.query.get_selected_columns()
        if selected_cols == list(first_query_data[0].keys()):
            return QueryResult({"data": first_query_data}, 200)
        elif selected_cols == list(second_query_data[0].keys()):
            return QueryResult({"data": second_query_data}, 200)
        else:
            raise ValueError(f"Unexpected selected columns: {selected_cols}")

    query = Query({
        "selected_columns": list(second_query_data[0].keys()),
        "conditions": [""],
        "orderby": "events.event_id",
        "sample": 10,
        "limit": 100,
        "offset": 50,
    })

    request = Request(
        query,
        RequestSettings(False, False, False),
        {
            "project": {
                "project": 1
            },
            "timeseries": {
                "from_date": "2019-09-19T10:00:00",
                "to_date": "2019-09-19T12:00:00",
                "granularity": 3600,
            }
        },
    )

    events = get_dataset(dataset_name)
    do_query(events, request, None)
Example #27
def test_no_split(dataset_name: str):
    events = get_dataset(dataset_name)
    query = Query(
        {
            "selected_columns": ["event_id"],
            "conditions": [""],
            "orderby": "event_id",
            "sample": 10,
            "limit": 100,
            "offset": 50,
        },
        events.get_all_storages()[0]
        .get_schemas().get_read_schema().get_data_source(),
    )

    @split_query
    def do_query(dataset: Dataset, request: Request, timer: Timer):
        assert request.query == query

    request = Request(uuid.uuid4().hex, query, HTTPRequestSettings(), {},
                      "tests")

    do_query(events, request, None)
Example #28
def test_metrics_processing(
    entity_name: str,
    column_name: str,
    entity_key: EntityKey,
    translated_value: Expression,
) -> None:
    settings.ENABLE_DEV_FEATURES = True
    settings.DISABLED_DATASETS = set()

    importlib.reload(factory)
    importlib.reload(storage_factory)
    importlib.reload(cluster)

    query_body = {
        "query": (f"MATCH ({entity_name}) "
                  f"SELECT {column_name} BY org_id, project_id, tags[10] "
                  "WHERE "
                  "timestamp >= toDateTime('2021-05-17 19:42:01') AND "
                  "timestamp < toDateTime('2021-05-17 23:42:01') AND "
                  "org_id = 1 AND "
                  "project_id = 1"),
    }

    metrics_dataset = get_dataset("metrics")
    query = parse_snql_query(query_body["query"], [], metrics_dataset)

    request = Request("", query_body, query, HTTPRequestSettings(), "")

    def query_runner(query: Query, settings: RequestSettings,
                     reader: Reader) -> QueryResult:
        assert query.get_selected_columns() == [
            SelectedExpression(
                "org_id",
                Column("_snuba_org_id", None, "org_id"),
            ),
            SelectedExpression(
                "project_id",
                Column("_snuba_project_id", None, "project_id"),
            ),
            SelectedExpression(
                "tags[10]",
                FunctionCall(
                    "_snuba_tags[10]",
                    "arrayElement",
                    (
                        Column(None, None, "tags.value"),
                        FunctionCall(
                            None,
                            "indexOf",
                            (Column(None, None, "tags.key"), Literal(None,
                                                                     10)),
                        ),
                    ),
                ),
            ),
            SelectedExpression(
                column_name,
                translated_value,
            ),
        ]
        return QueryResult({}, {})

    entity = get_entity(entity_key)
    entity.get_query_pipeline_builder().build_execution_pipeline(
        request, query_runner).execute()
Example #29
def test_simple():
    request_body = {
        "selected_columns": ["event_id"],
        "orderby": "event_id",
        "sample": 0.1,
        "limit": 100,
        "offset": 50,
        "project": 1,
    }

    query = Query(
        request_body,
        get_storage("events").get_schemas().get_read_schema().get_data_source(),
    )

    request = Request(
        uuid.UUID("a" * 32).hex, query, HTTPRequestSettings(), {}, "search")

    time = TestingClock()

    timer = Timer("test", clock=time)
    time.sleep(0.01)

    message = SnubaQueryMetadata(
        request=request,
        dataset=get_dataset("events"),
        timer=timer,
        query_list=[
            ClickhouseQueryMetadata(
                sql=("select event_id from sentry_dist sample 0.1 "
                     "prewhere project_id in (1) limit 50, 100"),
                stats={"sample": 10},
                status="success",
                trace_id="b" * 32,
            )
        ],
    ).to_dict()

    processor = (enforce_table_writer(get_dataset("querylog"))
                 .get_stream_loader().get_processor())

    assert processor.process_message(message) == ProcessedMessage(
        ProcessorAction.INSERT,
        [{
            "request_id": str(uuid.UUID("a" * 32)),
            "request_body":
            '{"limit": 100, "offset": 50, "orderby": "event_id", "project": 1, "sample": 0.1, "selected_columns": ["event_id"]}',
            "referrer": "search",
            "dataset": get_dataset("events"),
            "projects": [1],
            "organization": None,
            "timestamp": timer.for_json()["timestamp"],
            "duration_ms": 10,
            "status": "success",
            "clickhouse_queries.sql": [
                "select event_id from sentry_dist sample 0.1 prewhere project_id in (1) limit 50, 100"
            ],
            "clickhouse_queries.status": ["success"],
            "clickhouse_queries.trace_id": [str(uuid.UUID("b" * 32))],
            "clickhouse_queries.duration_ms": [0],
            "clickhouse_queries.stats": ['{"sample": 10}'],
            "clickhouse_queries.final": [0],
            "clickhouse_queries.cache_hit": [0],
            "clickhouse_queries.sample": [10.0],
            "clickhouse_queries.max_threads": [0],
            "clickhouse_queries.num_days": [0],
            "clickhouse_queries.clickhouse_table": [""],
            "clickhouse_queries.query_id": [""],
            "clickhouse_queries.is_duplicate": [0],
            "clickhouse_queries.consistent": [0],
        }],
    )
Example #30
def build_request(
    body: MutableMapping[str, Any],
    parser: Parser,
    settings_class: Union[Type[HTTPQuerySettings], Type[SubscriptionQuerySettings]],
    schema: RequestSchema,
    dataset: Dataset,
    timer: Timer,
    referrer: str,
    custom_processing: Optional[CustomProcessors] = None,
) -> Request:
    with sentry_sdk.start_span(description="build_request", op="validate") as span:
        try:
            request_parts = schema.validate(body)
            if settings_class == HTTPQuerySettings:
                query_settings: MutableMapping[str, bool | str] = {
                    **request_parts.query_settings,
                    "consistent": _consistent_override(
                        request_parts.query_settings.get("consistent", False), referrer
                    ),
                }
                query_settings["referrer"] = referrer
                # TODO: referrer probably doesn't need to be passed in, it should be from the body
                settings_obj: Union[
                    HTTPQuerySettings, SubscriptionQuerySettings
                ] = settings_class(
                    **query_settings,
                )
            elif settings_class == SubscriptionQuerySettings:
                settings_obj = settings_class(
                    consistent=_consistent_override(True, referrer),
                )
            query, snql_anonymized = parser(
                request_parts, settings_obj, dataset, custom_processing
            )

            project_ids = get_object_ids_in_query_ast(query, "project_id")
            if project_ids is not None and len(project_ids) == 1:
                sentry_sdk.set_tag("snuba_project_id", project_ids.pop())

            org_ids = get_object_ids_in_query_ast(query, "org_id")
            if org_ids is not None and len(org_ids) == 1:
                sentry_sdk.set_tag("snuba_org_id", org_ids.pop())
            attribution_info = dict(request_parts.attribution_info)
            # TODO: clean this up
            attribution_info["app_id"] = get_app_id(
                request_parts.attribution_info["app_id"]
            )
            attribution_info["referrer"] = referrer

            request_id = uuid.uuid4().hex
            request = Request(
                id=request_id,
                # TODO: Replace this with the actual query raw body.
                # this can have an impact on subscriptions so we need
                # to be careful with the change.
                original_body=body,
                query=query,
                attribution_info=AttributionInfo(**attribution_info),
                query_settings=settings_obj,
                snql_anonymized=snql_anonymized,
            )
        except (InvalidJsonRequestException, InvalidQueryException) as exception:
            record_invalid_request(timer, referrer)
            raise exception
        except Exception as exception:
            record_error_building_request(timer, referrer)
            raise exception

        span.set_data(
            "snuba_query_parsed",
            repr(query).split("\n"),
        )
        span.set_data(
            "snuba_query_raw",
            textwrap.wrap(repr(request.original_body), 100, break_long_words=False),
        )
        sentry_sdk.add_breadcrumb(
            category="query_info",
            level="info",
            message="snuba_query_raw",
            data={
                "query": textwrap.wrap(
                    repr(request.original_body), 100, break_long_words=False
                )
            },
        )

        timer.mark("validate_schema")
        return request