def build_request(
    self, dataset: Dataset, timestamp: datetime, offset: Optional[int], timer: Timer
) -> Request:
    """
    Build a Request that can be executed through `parse_and_run_query`.

    :param dataset: The Dataset to build the request for
    :param timestamp: Date that the query should run up until
    :param offset: Maximum offset we should query for
    """
    request_schema = RequestSchema.build_with_extensions(
        dataset.get_extensions(), SubscriptionRequestSettings,
    )
    # When an offset cap is provided, constrain rows to those at or below it;
    # otherwise no extra condition is added.
    offset_conditions: Sequence[Condition] = (
        [] if offset is None else [[["ifnull", ["offset", 0]], "<=", offset]]
    )
    payload = {
        "project": self.project_id,
        "conditions": [*self.conditions, *offset_conditions],
        "aggregations": self.aggregations,
        "from_date": (timestamp - self.time_window).isoformat(),
        "to_date": timestamp.isoformat(),
    }
    return validate_request_content(
        payload, request_schema, timer, dataset, SUBSCRIPTION_REFERRER,
    )
def dataset_query(dataset: Dataset, body, timer: Timer) -> Response:
    """
    Execute a POSTed query body against the dataset and wrap either the
    result or the failure details in a JSON HTTP response.
    """
    assert http_request.method == "POST"

    with sentry_sdk.start_span(description="build_schema", op="validate"):
        schema = RequestSchema.build_with_extensions(
            dataset.get_extensions(), HTTPRequestSettings
        )
    request = build_request(body, schema, timer, dataset, http_request.referrer)

    try:
        result = parse_and_run_query(dataset, request, timer)
    except QueryException as exception:
        # Map the chained cause onto an HTTP status and an error payload.
        cause = exception.__cause__
        status = 500
        details: Mapping[str, Any]
        if isinstance(cause, RateLimitExceeded):
            status = 429
            details = {
                "type": "rate-limited",
                "message": "rate limit exceeded",
            }
        elif isinstance(cause, ClickhouseError):
            details = {
                "type": "clickhouse",
                "message": str(cause),
                "code": cause.code,
            }
        elif isinstance(cause, Exception):
            details = {
                "type": "unknown",
                "message": str(cause),
            }
        else:
            raise  # exception should have been chained
        error_body = {"error": details, "timing": timer.for_json(), **exception.extra}
        return Response(
            json.dumps(error_body), status, {"Content-Type": "application/json"}
        )

    payload: MutableMapping[str, Any] = {**result.result, "timing": timer.for_json()}
    # Extra execution stats are only exposed when configured or in debug mode.
    if settings.STATS_IN_RESPONSE or request.settings.get_debug():
        payload.update(result.extra)
    return Response(json.dumps(payload), 200, {"Content-Type": "application/json"})
def dataset_query_view(*, dataset: Dataset, timer: Timer):
    """
    HTTP entry point for dataset queries: GET renders the interactive query
    template, POST parses the body and runs the query.
    """
    method = http_request.method
    if method == "POST":
        return dataset_query(dataset, parse_request_body(http_request), timer)
    if method == "GET":
        schema = RequestSchema.build_with_extensions(
            dataset.get_extensions(), HTTPRequestSettings
        )
        template = json.dumps(schema.generate_template(), indent=4,)
        return render_template("query.html", query_template=template)
    assert False, "unexpected fallthrough"
def dataset_query(dataset: Dataset, body, timer: Timer) -> Response:
    """
    Validate a POSTed query body against the dataset's schema, run it, and
    return the formatted result.
    """
    assert http_request.method == "POST"
    ensure_table_exists(dataset)
    schema = RequestSchema.build_with_extensions(
        dataset.get_extensions(), HTTPRequestSettings
    )
    request = validate_request_content(
        body, schema, timer, dataset, http_request.referrer
    )
    return format_result(run_query(dataset, request, timer))
def parse_and_run_query(
    dataset: Dataset, request: Request, timer: Timer
) -> ClickhouseQueryResult:
    """
    Process a validated Request against a Dataset and execute it on
    Clickhouse, returning the raw query result.

    Applies, in order: the dataset's extension processors, the dataset's
    default conditions, turbo adjustments, the dataset query processors and
    the data source's mandatory conditions, then builds the Clickhouse query
    and runs it through `raw_query`.

    :param dataset: Dataset the request targets.
    :param request: Already-validated request (query, settings, extensions).
    :param timer: Timer used to mark pipeline phases.
    """
    from_date, to_date = TimeSeriesExtensionProcessor.get_time_limit(
        request.extensions["timeseries"]
    )

    # Sampling without turbo mode defeats the purpose of sampling; count how
    # often it happens, per referrer.
    if (
        request.query.get_sample() is not None and request.query.get_sample() != 1.0
    ) and not request.settings.get_turbo():
        metrics.increment("sample_without_turbo", tags={"referrer": request.referrer})

    extensions = dataset.get_extensions()
    for name, extension in extensions.items():
        extension.get_processor().process_query(
            request.query, request.extensions[name], request.settings
        )
    request.query.add_conditions(dataset.default_conditions())

    if request.settings.get_turbo():
        request.query.set_final(False)

    for processor in dataset.get_query_processors():
        processor.process_query(request.query, request.settings)

    relational_source = request.query.get_data_source()
    request.query.add_conditions(relational_source.get_mandatory_conditions())
    source = relational_source.format_from()

    with sentry_sdk.start_span(description="create_query", op="db"):
        # TODO: consider moving the performance logic and the pre_where generation into
        # ClickhouseQuery since they are Clickhouse specific
        query = DictClickhouseQuery(dataset, request.query, request.settings)
    timer.mark("prepare_query")

    num_days = (to_date - from_date).days
    stats = {
        "clickhouse_table": source,
        "final": request.query.get_final(),
        "referrer": request.referrer,
        "num_days": num_days,
        "sample": request.query.get_sample(),
    }

    with sentry_sdk.configure_scope() as scope:
        if scope.span:
            scope.span.set_tag("dataset", type(dataset).__name__)
            # Use the referrer carried by the Request rather than the Flask
            # http_request: this function is also reached outside an HTTP
            # context (e.g. subscription queries), and the rest of this
            # function already reports request.referrer.
            scope.span.set_tag("referrer", request.referrer)
            scope.span.set_tag("timeframe_days", num_days)

    with sentry_sdk.start_span(description=query.format_sql(), op="db") as span:
        span.set_tag("dataset", type(dataset).__name__)
        span.set_tag("table", source)
        try:
            span.set_tag(
                "ast_query",
                AstClickhouseQuery(request.query, request.settings).format_sql(),
            )
        except Exception:
            # Best effort: the AST rendering is informational only and must
            # not fail the query.
            logger.exception("Failed to format ast query")
        result = raw_query(
            request, query, NativeDriverReader(clickhouse_ro), timer, stats
        )

    with sentry_sdk.configure_scope() as scope:
        if scope.span:
            # raw_query may have recorded the thread count it used.
            if "max_threads" in stats:
                scope.span.set_tag("max_threads", stats["max_threads"])

    return result
def _run_query_pipeline(
    dataset: Dataset,
    request: Request,
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
) -> RawQueryResult:
    """
    Run the full processing and execution pipeline for a Snuba query,
    turning a Dataset plus a Request into query results.

    Stages:
    - apply dataset specific syntax extensions (QueryExtension)
    - apply dataset query processors to the abstract Snuba query
    - build a StorageQueryPlan through the dataset's StorageQueryPlanBuilder,
      translating the Snuba Query into the Storage Query (contextual to the
      storage/s); nothing past this point should depend on the dataset
    - run the storage specific query processors
    - hand the built Query and a QueryRunner to the QueryExecutionStrategy
      to actually run the DB query
    """
    # TODO: this will work perfectly with datasets that are not time series. Remove it.
    from_date, to_date = TimeSeriesExtensionProcessor.get_time_limit(
        request.extensions["timeseries"]
    )

    if (
        request.query.get_sample() is not None and request.query.get_sample() != 1.0
    ) and not request.settings.get_turbo():
        metrics.increment("sample_without_turbo", tags={"referrer": request.referrer})

    for name, extension in dataset.get_extensions().items():
        extension.get_processor().process_query(
            request.query, request.extensions[name], request.settings
        )

    # TODO: Fit this in a query processor. All query transformations should be driven by
    # datasets/storages and never hardcoded.
    if request.settings.get_turbo():
        request.query.set_final(False)

    for dataset_processor in dataset.get_query_processors():
        dataset_processor.process_query(request.query, request.settings)

    plan = dataset.get_query_plan_builder().build_plan(request)

    # TODO: This below should be a storage specific query processor.
    data_source = request.query.get_data_source()
    request.query.add_conditions(data_source.get_mandatory_conditions())

    for storage_processor in plan.query_processors:
        storage_processor.process_query(request.query, request.settings)

    runner = partial(
        _format_storage_query_and_run,
        dataset,
        timer,
        query_metadata,
        from_date,
        to_date,
    )
    return plan.execution_strategy.execute(request, runner)
def _run_query_pipeline(
    dataset: Dataset,
    request: Request,
    timer: Timer,
    query_metadata: SnubaQueryMetadata,
) -> QueryResult:
    """
    Run the full processing and execution pipeline for a Snuba query,
    turning a Dataset plus a Request into query results.

    Stages:
    - apply dataset specific syntax extensions (QueryExtension)
    - apply dataset query processors to the abstract Snuba query
    - build a ClickhouseQueryPlan through the dataset's
      ClickhouseQueryPlanBuilder, translating the Snuba Query into the
      Storage Query (contextual to the storage/s); nothing past this point
      should depend on the dataset
    - run the plan specific query processors
    - hand the built Query, the processors to run for each DB query and a
      QueryRunner to the QueryExecutionStrategy to actually run the DB query
    """
    # TODO: this will work perfectly with datasets that are not time series. Remove it.
    from_date, to_date = TimeSeriesExtensionProcessor.get_time_limit(
        request.extensions["timeseries"]
    )

    if (
        request.query.get_sample() is not None and request.query.get_sample() != 1.0
    ) and not request.settings.get_turbo():
        metrics.increment("sample_without_turbo", tags={"referrer": request.referrer})

    for name, extension in dataset.get_extensions().items():
        with sentry_sdk.start_span(
            description=type(extension.get_processor()).__name__, op="extension"
        ):
            extension.get_processor().process_query(
                request.query, request.extensions[name], request.settings
            )

    # TODO: Fit this in a query processor. All query transformations should be driven by
    # datasets/storages and never hardcoded.
    if request.settings.get_turbo():
        request.query.set_final(False)

    for dataset_processor in dataset.get_query_processors():
        with sentry_sdk.start_span(
            description=type(dataset_processor).__name__, op="processor"
        ):
            dataset_processor.process_query(request.query, request.settings)

    plan = dataset.get_query_plan_builder().build_plan(request)
    # From this point on. The logical query should not be used anymore by anyone.
    # The Clickhouse Query is the one to be used to run the rest of the query pipeline.

    # TODO: Break the Query Plan execution out of this method. With the division
    # between plan specific processors and DB query specific processors and with
    # the soon to come ClickhouseCluster, there is more coupling between the
    # components of the query plan.
    for plan_processor in plan.plan_processors:
        with sentry_sdk.start_span(
            description=type(plan_processor).__name__, op="processor"
        ):
            plan_processor.process_query(plan.query, request.settings)

    runner = partial(
        _format_storage_query_and_run,
        timer,
        query_metadata,
        from_date,
        to_date,
        request.referrer,
    )
    return plan.execution_strategy.execute(plan.query, request.settings, runner)