def test_get_time_range() -> None:
    """
    Check the bounds get_time_range reports for the `timestamp` column.

    The query is parsed from a legacy body, run through every
    TimeSeriesProcessor of the default entity of the events dataset and
    then translated with identity_translate. The assertions expect
    2019-09-18T10:00:00 as the lower bound and 2019-09-19T12:00:00 as the
    upper bound.
    """
    legacy_body = {
        "selected_columns": ["event_id"],
        "conditions": [
            # Cannot test complex conditions based on explicit calls
            # the `and` and `or` functions, because they would not be
            # parsed as datetime by the old parser.
            ("timestamp", ">=", "2019-09-18T10:00:00"),
            ("timestamp", ">=", "2000-09-18T10:00:00"),
            ("timestamp", "<", "2019-09-19T12:00:00"),
            [("timestamp", "<", "2019-09-18T12:00:00"), ("project_id", "IN", [1])],
            ("project_id", "IN", [1]),
        ],
    }

    dataset = get_dataset("events")
    query = parse_query(legacy_body, dataset)

    # Only the time series processors are applied; every other processor
    # is skipped.
    for candidate in dataset.get_default_entity().get_query_processors():
        if not isinstance(candidate, TimeSeriesProcessor):
            continue
        candidate.process_query(query, HTTPRequestSettings())

    lower, upper = get_time_range(identity_translate(query), "timestamp")

    assert lower is not None
    assert isinstance(lower, datetime)
    assert lower.isoformat() == "2019-09-18T10:00:00"

    assert upper is not None
    assert isinstance(upper, datetime)
    assert upper.isoformat() == "2019-09-19T12:00:00"
def _get_date_range(query: Query) -> Optional[int]:
    """
    Best-effort estimate of the number of days covered by the query.

    The query condition is scanned for the first `>` / `>=` comparison
    between an unqualified column and a datetime literal. That column is
    then handed to get_time_range and the `days` component of the
    resulting interval is returned.

    Returns None when the query has no condition, when no such comparison
    is found, or when either bound of the range is missing.
    """
    condition = query.get_condition_from_ast()
    if condition is None:
        return None

    lower_bound_pattern = FunctionCall(
        Or([String(ConditionFunctions.GT), String(ConditionFunctions.GTE)]),
        (Column(None, Param("col_name", Any(str))), Literal(Any(datetime))),
    )

    for expression in condition:
        match = lower_bound_pattern.match(expression)
        if match is None:
            continue

        # Only the first matching column is considered.
        from_date, to_date = get_time_range(query, match.string("col_name"))
        if from_date is None or to_date is None:
            return None
        return (to_date - from_date).days

    return None
def test_get_time_range() -> None:
    """
    Check the bounds get_time_range reports for the `timestamp` column.

    The query is parsed from a legacy body, run through every
    TimeSeriesProcessor of the events dataset and wrapped in a
    ClickhouseQuery. The assertions expect 2019-09-18T10:00:00 as the
    lower bound and 2019-09-19T12:00:00 as the upper bound.
    """
    legacy_body = {
        "selected_columns": ["event_id"],
        "conditions": [
            ("timestamp", ">=", "2019-09-18T10:00:00"),
            ("timestamp", ">=", "2000-09-18T10:00:00"),
            ("timestamp", "<", "2019-09-19T12:00:00"),
            [("timestamp", "<", "2019-09-18T12:00:00"), ("project_id", "IN", [1])],
            ("project_id", "IN", [1]),
        ],
    }

    dataset = get_dataset("events")
    query = parse_query(legacy_body, dataset)

    # Only the time series processors are applied; every other processor
    # is skipped.
    for candidate in dataset.get_query_processors():
        if not isinstance(candidate, TimeSeriesProcessor):
            continue
        candidate.process_query(query, HTTPRequestSettings())

    lower, upper = get_time_range(ClickhouseQuery(query), "timestamp")

    assert lower is not None
    assert isinstance(lower, datetime)
    assert lower.isoformat() == "2019-09-18T10:00:00"

    assert upper is not None
    assert isinstance(upper, datetime)
    assert upper.isoformat() == "2019-09-19T12:00:00"
def do_query(
    query: ClickhouseQuery,
    request_settings: RequestSettings,
) -> QueryResult:
    """
    Query runner stub that cross-checks the AST and legacy time bounds.

    The `timestamp` range extracted by get_time_range must consist of two
    datetimes whose ISO representation equals the value of the first
    legacy `timestamp >=` / `timestamp <` conditions. The range is then
    appended to `found_timestamps` and an empty result is returned.
    """
    from_date_ast, to_date_ast = get_time_range(query, "timestamp")
    assert isinstance(from_date_ast, datetime)
    assert isinstance(to_date_ast, datetime)

    legacy_conditions = query.get_conditions() or []

    # First legacy lower bound on timestamp, if any.
    from_date_str = None
    for condition in legacy_conditions:
        if condition[0] == "timestamp" and condition[1] == ">=":
            from_date_str = condition[2]
            break

    # First legacy upper bound on timestamp, if any.
    to_date_str = None
    for condition in legacy_conditions:
        if condition[0] == "timestamp" and condition[1] == "<":
            to_date_str = condition[2]
            break

    ast_range = (from_date_ast.isoformat(), to_date_ast.isoformat())
    assert from_date_str == ast_range[0]
    assert to_date_str == ast_range[1]

    found_timestamps.append(ast_range)
    return QueryResult({"data": []}, {})
def test_get_time_range() -> None:
    """
    Check the bounds get_time_range reports for the `timestamp` column.

    The query is parsed from SnQL, run through every TimeSeriesProcessor
    of the default entity of the events dataset and then translated with
    identity_translate. The assertions expect 2019-09-18T10:00:00 as the
    lower bound and 2019-09-19T12:00:00 as the upper bound.
    """
    snql = (
        " MATCH (events)"
        " SELECT event_id"
        " WHERE timestamp >= toDateTime('2019-09-18T10:00:00')"
        " AND timestamp >= toDateTime('2000-09-18T10:00:00')"
        " AND timestamp < toDateTime('2019-09-19T12:00:00')"
        " AND (timestamp < toDateTime('2019-09-18T12:00:00')"
        " OR project_id IN tuple(1))"
        " AND project_id IN tuple(1) "
    )

    dataset = get_dataset("events")
    query, _ = parse_snql_query(snql, dataset)

    # Only the time series processors are applied; every other processor
    # is skipped.
    for candidate in dataset.get_default_entity().get_query_processors():
        if not isinstance(candidate, TimeSeriesProcessor):
            continue
        candidate.process_query(query, HTTPQuerySettings())

    lower, upper = get_time_range(identity_translate(query), "timestamp")

    assert lower is not None
    assert isinstance(lower, datetime)
    assert lower.isoformat() == "2019-09-18T10:00:00"

    assert upper is not None
    assert isinstance(upper, datetime)
    assert upper.isoformat() == "2019-09-19T12:00:00"
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """
    Reject queries whose `started` range is missing or longer than 7 hours.

    Raises:
        ValidationException: when either bound of the `started` range is
            not available or the range exceeds seven hours.
    """
    # NOTE: the product side is restricted to a 6h window, however it rounds
    # outwards, which extends the window to 7h.
    max_window = timedelta(hours=7)

    from_date, to_date = get_time_range(query, "started")
    within_window = (
        from_date and to_date and (to_date - from_date) <= max_window
    )
    if not within_window:
        raise ValidationException(
            "Minute-resolution queries are restricted to a 7-hour time window."
        )
def do_query(
    query: ClickhouseQuery,
    request_settings: RequestSettings,
) -> QueryResult:
    """
    Query runner stub that records the `timestamp` range of each query.

    Both bounds returned by get_time_range must be datetimes. Their ISO
    representations are appended to `found_timestamps` as a tuple and an
    empty result is returned.
    """
    from_date_ast, to_date_ast = get_time_range(query, "timestamp")
    assert isinstance(from_date_ast, datetime)
    assert isinstance(to_date_ast, datetime)

    observed_range = (from_date_ast.isoformat(), to_date_ast.isoformat())
    found_timestamps.append(observed_range)

    return QueryResult({"data": []}, {})
def parse_and_run_query(
    dataset: Dataset,
    request: Request,
    timer: Timer,
    robust: bool = False,
    concurrent_queries_gauge: Optional[Gauge] = None,
) -> QueryResult:
    """
    Runs a Snuba Query, then records the metadata about each split query
    that was run.

    For logical queries the entity name and, when the entity declares a
    required time column, the time range over that column are attached to
    the metadata. Otherwise the entity is reported as "unknown" and the
    start/end timestamps are left as None.

    On success the query is recorded unless the request is a dry run. On
    QueryException the query is always recorded and the exception is
    re-raised.
    """
    query = request.query

    entity_name = "unknown"
    start, end = None, None
    if isinstance(query, LogicalQuery):
        entity_key = query.get_from_clause().key
        time_column = get_entity(entity_key).required_time_column
        entity_name = entity_key.value
        if time_column is not None:
            start, end = get_time_range(query, time_column)

    query_metadata = SnubaQueryMetadata(
        request=request,
        start_timestamp=start,
        end_timestamp=end,
        dataset=get_dataset_name(dataset),
        entity=entity_name,
        timer=timer,
        query_list=[],
        projects=ProjectsFinder().visit(query),
        snql_anonymized=request.snql_anonymized,
    )

    try:
        result = _run_query_pipeline(
            dataset=dataset,
            request=request,
            timer=timer,
            query_metadata=query_metadata,
            robust=robust,
            concurrent_queries_gauge=concurrent_queries_gauge,
        )
        _set_query_final(request, result.extra)
        is_dry_run = request.query_settings.get_dry_run()
        if not is_dry_run:
            record_query(request, timer, query_metadata, result.extra)
    except QueryException as error:
        _set_query_final(request, error.extra)
        record_query(request, timer, query_metadata, error.extra)
        raise error

    return result
def _query_overlaps_replacements(
    self,
    query: Query,
    latest_replacement_time: Optional[datetime],
) -> bool:
    """
    Given a Query and the latest replacement time for any project this
    query touches, returns whether or not this Query's time range overlaps
    that replacement.

    The query overlaps when the latest replacement happened after the
    lower bound of the query's `timestamp` range. When either value is
    unavailable an overlap is assumed.
    """
    query_from, _ = get_time_range(query, "timestamp")

    if not latest_replacement_time or not query_from:
        # Not enough information to rule an overlap out.
        return True

    return latest_replacement_time > query_from
def v2_selector_function(query: Query, referrer: str) -> Tuple[str, List[str]]:
    """
    Choose the primary transactions storage name and any secondary ones.

    "transactions_v1" with no secondary is returned when the upgrade start
    setting is unset, when the query is not a ProcessableQuery, or when
    the lower bound of the query's `timestamp` range is missing or earlier
    than the upgrade start. Otherwise the choice is delegated to a
    RolloutSelector keyed on the referrer.
    """
    upgrade_start = settings.TRANSACTIONS_UPGRADE_BEGINING_OF_TIME
    if upgrade_start is None or not isinstance(query, ProcessableQuery):
        return ("transactions_v1", [])

    from_date, _ = get_time_range(query, "timestamp")
    if from_date is None or from_date < upgrade_start:
        return ("transactions_v1", [])

    storage_names = {
        Option.TRANSACTIONS: "transactions_v1",
        Option.TRANSACTIONS_V2: "transactions_v2",
    }
    selector = RolloutSelector(
        Option.TRANSACTIONS, Option.TRANSACTIONS_V2, "transactions"
    )
    choice = selector.choose(referrer)

    secondaries = (
        [] if choice.secondary is None else [storage_names[choice.secondary]]
    )
    return (storage_names[choice.primary], secondaries)
def select_storage(self, query: Query,
                   query_settings: QuerySettings) -> StorageAndMappers:
    """
    Pick between the materialized and the raw storage for this query.

    The selected storage is reported through the "query.selector" metric
    and returned together with the matching translators.
    """
    if isinstance(query_settings, SubscriptionQuerySettings):
        # If the passed in `query_settings` arg is an instance of
        # `SubscriptionQuerySettings`, then it is a crash rate alert
        # subscription, and hence we decide on whether to use the
        # materialized storage or the raw storage by examining the
        # time_window. If the `time_window` <=1h, then select the raw storage
        # otherwise select materialized storage.
        # NOTE: If we were to support other types of subscriptions over the
        # sessions dataset that do not follow this method used to identify
        # which storage to use, we would need to find a different way to
        # distinguish them.
        from_date, to_date = get_time_range(query, "started")
        if not (from_date and to_date):
            use_materialized_storage = True
        else:
            window = to_date - from_date
            use_materialized_storage = window > timedelta(hours=1)
    else:
        # Regular queries are routed on granularity: only whole multiples of
        # one hour use the materialized storage. A missing granularity is
        # treated as 3600.
        granularity = extract_granularity_from_query(query, "started") or 3600
        use_materialized_storage = (
            granularity >= 3600 and (granularity % 3600) == 0
        )

    selected_storage = "materialized" if use_materialized_storage else "raw"
    metrics.increment(
        "query.selector",
        tags={"selected_storage": selected_storage},
    )

    if not use_materialized_storage:
        return StorageAndMappers(self.raw_storage, sessions_raw_translators)
    return StorageAndMappers(self.materialized_storage,
                             sessions_hourly_translators)
def execute(
    self,
    query: Query,
    request_settings: RequestSettings,
    runner: SplitQueryRunner,
) -> Optional[QueryResult]:
    """
    If a query is:
        - ORDER BY timestamp DESC
        - has no grouping
        - has an offset/limit
        - has a large time range
    We know we have to reverse-sort the entire set of rows to return the small
    chunk at the end of the time range, so optimistically split the time range
    into smaller increments, and start with the last one, so that we can potentially
    avoid querying the entire range.

    The time bounds used for splitting come from the legacy conditions; the
    AST range is only used to report a mismatch on the start date.

    Returns None when the query is not eligible: no limit, a group by is
    present, the offset is 1000 or more, the first order by entry is not the
    descending timestamp column, or a legacy `>=` / `<` condition on the
    timestamp column is missing. None is also returned when the splitting
    loop never runs. Otherwise the result of the first split is returned,
    with the "data" rows of the later splits appended to it.
    """
    limit = query.get_limit()
    if limit is None or query.get_groupby():
        return None

    if query.get_offset() >= 1000:
        return None

    # Legacy order by entries are strings; a leading "-" marks descending.
    orderby = query.get_orderby()
    if not orderby or orderby[0] != f"-{self.__timestamp_col}":
        return None

    # First legacy lower/upper bound on the timestamp column, if present.
    conditions = query.get_conditions() or []
    from_date_str = next(
        (condition[2] for condition in conditions
         if _identify_condition(condition, self.__timestamp_col, ">=")),
        None,
    )

    to_date_str = next(
        (condition[2] for condition in conditions
         if _identify_condition(condition, self.__timestamp_col, "<")),
        None,
    )
    # Only from_date_ast is used below; to_date_ast is never compared.
    from_date_ast, to_date_ast = get_time_range(query, self.__timestamp_col)

    if not from_date_str or not to_date_str:
        return None

    date_align, split_step = state.get_configs([("date_align_seconds", 1),
                                                ("split_step", 3600)
                                                ]  # default 1 hour
                                               )
    to_date = util.parse_datetime(to_date_str, date_align)
    from_date = util.parse_datetime(from_date_str, date_align)

    # Report (but tolerate) a disagreement between the legacy and the AST
    # start date. The legacy value is the one used for splitting.
    # NOTE(review): exc_info=True is passed although no exception is being
    # handled in this function -- confirm this is intended.
    if from_date != from_date_ast:
        logger.warning(
            "Mismatch in start date on time splitter.",
            extra={
                "ast": str(from_date_ast),
                "legacy": str(from_date)
            },
            exc_info=True,
        )
        metrics.increment("mismatch.ast_from_date")

    remaining_offset = query.get_offset()

    overall_result = None
    # Start with the most recent window of split_step seconds, clamped to the
    # start of the requested range.
    split_end = to_date
    split_start = max(split_end - timedelta(seconds=split_step), from_date)
    total_results = 0
    while split_start < split_end and total_results < limit:
        # We need to make a copy to use during the query execution because we replace
        # the start-end conditions on the query at each iteration of this loop.
        split_query = copy.deepcopy(query)

        # Both the legacy and the AST representation of the bounds are
        # replaced.
        _replace_condition(split_query, self.__timestamp_col, ">=",
                           split_start.isoformat())
        _replace_ast_condition(split_query, self.__timestamp_col, ">=",
                               LiteralExpr(None, split_start))
        _replace_condition(split_query, self.__timestamp_col, "<",
                           split_end.isoformat())
        _replace_ast_condition(split_query, self.__timestamp_col, "<",
                               LiteralExpr(None, split_end))

        # Because it's paged, we have to ask for (limit+offset) results
        # and set offset=0 so we can then trim them ourselves.
        split_query.set_offset(0)
        split_query.set_limit(limit - total_results + remaining_offset)

        # At every iteration we only append the "data" key from the results returned by
        # the runner. The "extra" key is only populated at the first iteration of the
        # loop and never changed.
        result = runner(split_query, request_settings)

        if overall_result is None:
            overall_result = result
        else:
            overall_result.result["data"].extend(result.result["data"])

        # Consume the requested offset from the front of the accumulated rows.
        # NOTE(review): on the first iteration overall_result and result are
        # the same object, so this reassignment is also visible through
        # result.result["data"] in the step estimation below.
        if remaining_offset > 0 and len(overall_result.result["data"]) > 0:
            to_trim = min(remaining_offset, len(overall_result.result["data"]))
            overall_result.result["data"] = overall_result.result["data"][
                to_trim:]
            remaining_offset -= to_trim

        total_results = len(overall_result.result["data"])

        if total_results < limit:
            if len(result.result["data"]) == 0:
                # If we got nothing from the last query, expand the range by a static factor
                split_step = split_step * STEP_GROWTH
            else:
                # If we got some results but not all of them, estimate how big the time
                # range should be for the next query based on how many results we got for
                # our last query and its time range, and how many we have left to fetch.
                remaining = limit - total_results
                split_step = split_step * math.ceil(
                    remaining / float(len(result.result["data"])))

            # Set the start and end of the next query based on the new range.
            split_end = split_start
            try:
                split_start = max(
                    split_end - timedelta(seconds=split_step), from_date)
            except OverflowError:
                # The grown step is too large to subtract; fall back to the
                # start of the requested range.
                split_start = from_date

    return overall_result
def execute(
    self,
    query: Query,
    request_settings: RequestSettings,
    runner: SplitQueryRunner,
) -> Optional[QueryResult]:
    """
    If a query is:
        - ORDER BY timestamp DESC
        - has no grouping
        - has an offset/limit
        - has a large time range
    We know we have to reverse-sort the entire set of rows to return the small
    chunk at the end of the time range, so optimistically split the time range
    into smaller increments, and start with the last one, so that we can potentially
    avoid querying the entire range.

    Returns None when the query is not eligible: no limit, a group by is
    present, the offset is 1000 or more, the first order by entry is not the
    timestamp column in descending direction, or either bound of the time
    range is missing. None is also returned when the splitting loop never
    runs. Otherwise the result of the first split is returned, with the
    "data" rows of the later splits appended to it.
    """
    limit = query.get_limit()
    if limit is None or query.get_groupby_from_ast():
        return None

    if query.get_offset() >= 1000:
        return None

    # Only the first order by entry is inspected: it must be a plain column
    # expression on the timestamp column, sorted descending.
    orderby = query.get_orderby_from_ast()
    if (not orderby or orderby[0].direction != OrderByDirection.DESC
            or not isinstance(orderby[0].expression, ColumnExpr)
            or not orderby[0].expression.column_name == self.__timestamp_col):
        return None

    from_date_ast, to_date_ast = get_time_range(query, self.__timestamp_col)

    if from_date_ast is None or to_date_ast is None:
        return None

    # date_align is fetched together with split_step but is not used in this
    # function.
    date_align, split_step = state.get_configs([("date_align_seconds", 1),
                                                ("split_step", 3600)
                                                ]  # default 1 hour
                                               )
    assert isinstance(split_step, int)

    remaining_offset = query.get_offset()

    overall_result: Optional[QueryResult] = None
    # Start with the most recent window of split_step seconds, clamped to the
    # start of the requested range.
    split_end = to_date_ast
    split_start = max(split_end - timedelta(seconds=split_step), from_date_ast)
    total_results = 0
    while split_start < split_end and total_results < limit:
        # We need to make a copy to use during the query execution because we replace
        # the start-end conditions on the query at each iteration of this loop.
        split_query = copy.deepcopy(query)

        _replace_ast_condition(split_query, self.__timestamp_col, ">=",
                               LiteralExpr(None, split_start))
        _replace_ast_condition(split_query, self.__timestamp_col, "<",
                               LiteralExpr(None, split_end))

        # Because it's paged, we have to ask for (limit+offset) results
        # and set offset=0 so we can then trim them ourselves.
        split_query.set_offset(0)
        split_query.set_limit(limit - total_results + remaining_offset)

        # At every iteration we only append the "data" key from the results returned by
        # the runner. The "extra" key is only populated at the first iteration of the
        # loop and never changed.
        result = runner(split_query, request_settings)

        if overall_result is None:
            overall_result = result
        else:
            overall_result.result["data"].extend(result.result["data"])

        # Consume the requested offset from the front of the accumulated rows.
        # NOTE(review): on the first iteration overall_result and result are
        # the same object, so this reassignment is also visible through
        # result.result["data"] in the step estimation below.
        if remaining_offset > 0 and len(overall_result.result["data"]) > 0:
            to_trim = min(remaining_offset, len(overall_result.result["data"]))
            overall_result.result["data"] = overall_result.result["data"][
                to_trim:]
            remaining_offset -= to_trim

        total_results = len(overall_result.result["data"])

        if total_results < limit:
            if len(result.result["data"]) == 0:
                # If we got nothing from the last query, expand the range by a static factor
                split_step = split_step * STEP_GROWTH
            else:
                # If we got some results but not all of them, estimate how big the time
                # range should be for the next query based on how many results we got for
                # our last query and its time range, and how many we have left to fetch.
                remaining = limit - total_results
                split_step = split_step * math.ceil(
                    remaining / float(len(result.result["data"])))

            # Set the start and end of the next query based on the new range.
            split_end = split_start
            try:
                split_start = max(
                    split_end - timedelta(seconds=split_step), from_date_ast)
            except OverflowError:
                # The grown step is too large to subtract; fall back to the
                # start of the requested range.
                split_start = from_date_ast

    return overall_result