def test_join_optimizer_two_tables(
    selected_cols: Sequence[Any],
    conditions: Sequence[Condition],
    groupby: Groupby,
    expected: str,
) -> None:
    """The optimizer should rewrite the two-table join so the query's FROM
    clause formats to the expected string."""
    body = {
        "selected_columns": selected_cols,
        "conditions": conditions,
        "arrayjoin": None,
        "having": [],
        "groupby": groupby,
        "aggregations": [],
        "orderby": None,
        "limitby": None,
        "sample": 10,
        "limit": 100,
        "offset": 50,
        "totals": True,
        "granularity": 60,
    }
    query = Query(body, simple_join_structure)
    SimpleJoinOptimizer().process_query(query, HTTPRequestSettings())
    assert query.get_data_source().format_from() == expected
def test_full_query():
    """Every field set in the query body should round-trip unchanged
    through the corresponding Query getter."""
    body = {
        "selected_columns": ["c1", "c2", "c3"],
        "conditions": [["c1", "=", "a"]],
        "arrayjoin": "tags",
        "having": [["c4", "=", "c"]],
        "groupby": ["project_id"],
        "aggregations": [["count()", "", "count"]],
        "orderby": "event_id",
        "limitby": (100, "environment"),
        "sample": 10,
        "limit": 100,
        "offset": 50,
        "totals": True,
        "granularity": 60,
    }
    query = Query(body, TableSource("my_table", ColumnSet([])))

    # (actual, expected) pairs for all equality-checked getters.
    for actual, expected in [
        (query.get_selected_columns(), ["c1", "c2", "c3"]),
        (query.get_aggregations(), [["count()", "", "count"]]),
        (query.get_groupby(), ["project_id"]),
        (query.get_conditions(), [["c1", "=", "a"]]),
        (query.get_arrayjoin(), "tags"),
        (query.get_having(), [["c4", "=", "c"]]),
        (query.get_orderby(), "event_id"),
        (query.get_limitby(), (100, "environment")),
        (query.get_sample(), 10),
        (query.get_limit(), 100),
        (query.get_offset(), 50),
        (query.get_granularity(), 60),
        (query.get_data_source().format_from(), "my_table"),
    ]:
        assert actual == expected
    # Identity check kept separate: the getter must return the bool singleton.
    assert query.has_totals() is True
def process_query(
    self,
    query: Query,
    request_settings: RequestSettings,
) -> None:
    """Collapse a JOIN data source down to a single table when the query
    only references columns from one of the joined tables.

    If the data source is not a join, or more than one table alias is
    referenced, the query is left untouched.
    """
    from_clause = query.get_data_source()
    if not isinstance(from_clause, JoinClause):
        return

    referenced_columns = query.get_all_referenced_columns()
    referenced_aliases = set()
    for qualified_column in referenced_columns:
        # This will be much better when we will represent columns
        # with a more structured data type than strings.
        match = QUALIFIED_COLUMN_REGEX.match(qualified_column)
        if match:
            # match[1] is the first parenthesized group in the regex, thus
            # the table alias.
            table_alias = match[1]
            referenced_aliases.add(table_alias)

    # Fixed typo in the assertion message ("otpimize" -> "optimize").
    assert (
        len(referenced_aliases) > 0
    ), "Trying to optimize a join query without aliases"
    if len(referenced_aliases) > 1:
        return

    from_tables = from_clause.get_tables()
    table = from_tables[referenced_aliases.pop()]
    query.set_data_source(table)
def test_prewhere(initial_table, consistent, expected_table) -> None:
    """ReadOnlyTableSelector should swap the table name while preserving the
    source's columns, prewhere candidates and mandatory conditions."""
    state.set_config("enable_events_readonly_table", True)
    conditions = [
        ["d", "=", "1"],
        ["c", "=", "3"],
        ["a", "=", "1"],
        ["b", "=", "2"],
    ]
    cols = ColumnSet([("col", String())])
    query = Query(
        {"conditions": conditions},
        TableSource(initial_table, cols, [["time", "=", "1"]], ["c1"]),
    )

    processor = ReadOnlyTableSelector("sentry_dist", "sentry_ro")
    processor.process_query(query, HTTPRequestSettings(consistent=consistent))

    source = query.get_data_source()
    assert isinstance(source, TableSource)
    assert source.format_from() == expected_table
    assert source.get_columns() == cols
    assert source.get_prewhere_candidates() == ["c1"]
    assert source.get_mandatory_conditions() == [["time", "=", "1"]]
def test_data_source(
    self, query_body: MutableMapping[str, Any], expected_dataset: str
):
    """Running every discover query processor should leave the query pointing
    at the data source of the expected dataset."""
    query = Query(query_body, get_dataset_source("discover"))
    settings = HTTPRequestSettings()
    for processor in get_dataset("discover").get_query_processors():
        processor.process_query(query, settings)

    expected_source = get_dataset_source(expected_dataset).format_from()
    assert query.get_data_source().format_from() == expected_source
def test_empty_query():
    """An empty body should yield None/empty defaults from every getter."""
    query = Query({}, TableSource("my_table", ColumnSet([])))

    # All of these default to None when absent from the body.
    for getter in (
        query.get_selected_columns,
        query.get_aggregations,
        query.get_groupby,
        query.get_conditions,
        query.get_arrayjoin,
        query.get_orderby,
        query.get_limitby,
        query.get_sample,
        query.get_limit,
    ):
        assert getter() is None

    # These have non-None defaults.
    assert query.get_having() == []
    assert query.get_offset() == 0
    assert query.has_totals() is False
    assert query.get_data_source().format_from() == "my_table"
def process_query(self, query: Query, request_settings: RequestSettings,) -> None:
    """Move eligible top-level conditions from WHERE into PREWHERE.

    A condition is a PREWHERE candidate when it is a simple (non OR-nested)
    condition and at least one of the columns it references is in the data
    source's prewhere candidate list. Candidates are ranked by the earliest
    position of their columns in that list, and at most
    ``max_prewhere_conditions`` of them are moved.
    """
    # Instance-level override takes precedence over the global setting.
    max_prewhere_conditions: int = (
        self.__max_prewhere_conditions or settings.MAX_PREWHERE_CONDITIONS
    )
    prewhere_keys = query.get_data_source().get_prewhere_candidates()
    if not prewhere_keys:
        # The data source declares no prewhere-eligible columns; nothing to do.
        return
    prewhere_conditions: Sequence[Condition] = []
    # Add any condition to PREWHERE if:
    # - It is a single top-level condition (not OR-nested), and
    # - Any of its referenced columns are in prewhere_keys
    conditions = query.get_conditions()
    if not conditions:
        return
    # Pair each eligible condition with the columns it references so the
    # ranking step below does not have to recompute them.
    prewhere_candidates = [
        (util.columns_in_expr(cond[0]), cond)
        for cond in conditions
        if util.is_condition(cond)
        and any(col in prewhere_keys for col in util.columns_in_expr(cond[0]))
    ]
    # Use the condition that has the highest priority (based on the
    # position of its columns in the prewhere keys list)
    prewhere_candidates = sorted(
        [
            (
                min(
                    prewhere_keys.index(col)
                    for col in cols
                    if col in prewhere_keys
                ),
                cond,
            )
            for cols, cond in prewhere_candidates
        ],
        key=lambda priority_and_col: priority_and_col[0],
    )
    if prewhere_candidates:
        # Keep only the highest-priority conditions, capped at the limit,
        # and remove them from the regular WHERE clause.
        prewhere_conditions = [cond for _, cond in prewhere_candidates][
            :max_prewhere_conditions
        ]
        query.set_conditions(
            list(filter(lambda cond: cond not in prewhere_conditions, conditions))
        )
    # NOTE(review): set unconditionally — an empty list when no candidate was
    # found (implied by the up-front initialization of prewhere_conditions).
    query.set_prewhere(prewhere_conditions)
def process_query(self, query: Query, request_settings: RequestSettings) -> None:
    """Redirect the query to the read-only replica table.

    The swap happens only when the runtime flag is enabled, the request does
    not demand consistent reads, and the query currently targets the table
    this processor was configured to replace. Columns, mandatory conditions
    and prewhere candidates are carried over to the new source.
    """
    if not state.get_config("enable_events_readonly_table", False):
        return
    if request_settings.get_consistent():
        # Consistent reads must keep hitting the primary table.
        return

    source = query.get_data_source()
    if source.format_from() != self.__table_to_replace:
        return

    query.set_data_source(
        TableSource(
            table_name=self.__read_only_table,
            columns=source.get_columns(),
            mandatory_conditions=source.get_mandatory_conditions(),
            prewhere_candidates=source.get_prewhere_candidates(),
        )
    )
def __init__(
    self,
    query: Query,
    settings: RequestSettings,
) -> None:
    """Snapshot everything needed to format the query later.

    The AST fields are immutable, so holding direct references is safe and
    lets formatting proceed independently of the Snuba Query object.
    """
    # Core AST clauses.
    self.__selected_columns = query.get_selected_columns_from_ast()
    self.__condition = query.get_condition_from_ast()
    self.__groupby = query.get_groupby_from_ast()
    self.__having = query.get_having_from_ast()
    self.__orderby = query.get_orderby_from_ast()
    self.__data_source = query.get_data_source()
    self.__arrayjoin = query.get_arrayjoin_from_ast()

    # Pagination and granularity.
    self.__granularity = query.get_granularity()
    self.__limit = query.get_limit()
    self.__limitby = query.get_limitby()
    self.__offset = query.get_offset()

    # HAVING is only meaningful with a GROUP BY.
    if self.__having:
        assert self.__groupby, "found HAVING clause with no GROUP BY"

    # Clickhouse specific fields. Some are still in the Snuba
    # query and have to be moved.
    self.__turbo = settings.get_turbo()
    self.__final = query.get_final()
    self.__sample = query.get_sample()
    self.__hastotals = query.has_totals()
    # TODO: Pre where processing will become a step in Clickhouse Query
    # processing instead of being pulled from the Snuba Query.
    self.__prewhere = query.get_prewhere_ast()

    self.__settings = settings
    # Lazily populated by the formatter.
    self.__formatted_query: Optional[str] = None
def __init__(
    self,
    dataset: Dataset,
    query: Query,
    settings: RequestSettings,
) -> None:
    """Format the Snuba Query into a single ClickHouse SQL string.

    Builds each clause (SELECT, FROM, ARRAY JOIN, PREWHERE, WHERE,
    GROUP BY, HAVING, ORDER BY, LIMIT BY, LIMIT) independently and joins
    the non-empty ones, in that order, into ``self.__formatted_query``.
    """
    parsing_context = ParsingContext()
    # SELECT list: group-by expressions first, then aggregates, then plain
    # selected columns.
    aggregate_exprs = [
        column_expr(dataset, col, query, parsing_context, alias, agg)
        for (agg, col, alias) in query.get_aggregations()
    ]
    groupby = util.to_list(query.get_groupby())
    group_exprs = [
        column_expr(dataset, gb, query, parsing_context) for gb in groupby
    ]
    column_names = query.get_selected_columns() or []
    selected_cols = [
        column_expr(dataset, util.tuplify(colname), query, parsing_context)
        for colname in column_names
    ]
    select_clause = u"SELECT {}".format(
        ", ".join(group_exprs + aggregate_exprs + selected_cols))
    from_clause = u"FROM {}".format(query.get_data_source().format_from())
    if query.get_final():
        from_clause = u"{} FINAL".format(from_clause)
    # SAMPLE is only emitted when the data source supports it; an explicit
    # sample rate wins over the turbo default.
    if not query.get_data_source().supports_sample():
        sample_rate = None
    else:
        if query.get_sample():
            sample_rate = query.get_sample()
        elif settings.get_turbo():
            sample_rate = snuba_settings.TURBO_SAMPLE_RATE
        else:
            sample_rate = None
    if sample_rate:
        from_clause = u"{} SAMPLE {}".format(from_clause, sample_rate)
    join_clause = ""
    if query.get_arrayjoin():
        join_clause = u"ARRAY JOIN {}".format(query.get_arrayjoin())
    where_clause = ""
    if query.get_conditions():
        where_clause = u"WHERE {}".format(
            conditions_expr(dataset, query.get_conditions(), query, parsing_context))
    prewhere_clause = ""
    if query.get_prewhere():
        prewhere_clause = u"PREWHERE {}".format(
            conditions_expr(dataset, query.get_prewhere(), query, parsing_context))
    group_clause = ""
    if groupby:
        group_clause = "GROUP BY ({})".format(", ".join(
            column_expr(dataset, gb, query, parsing_context) for gb in groupby))
        if query.has_totals():
            group_clause = "{} WITH TOTALS".format(group_clause)
    having_clause = ""
    having_conditions = query.get_having()
    if having_conditions:
        # HAVING without GROUP BY is invalid; fail loudly.
        assert groupby, "found HAVING clause with no GROUP BY"
        having_clause = u"HAVING {}".format(
            conditions_expr(dataset, having_conditions, query, parsing_context))
    order_clause = ""
    if query.get_orderby():
        orderby = [
            column_expr(dataset, util.tuplify(ob), query, parsing_context)
            for ob in util.to_list(query.get_orderby())
        ]
        # A leading "-" on the column marks descending order.
        orderby = [
            u"{} {}".format(ob.lstrip("-"), "DESC" if ob.startswith("-") else "ASC")
            for ob in orderby
        ]
        order_clause = u"ORDER BY {}".format(", ".join(orderby))
    limitby_clause = ""
    if query.get_limitby() is not None:
        limitby_clause = "LIMIT {} BY {}".format(*query.get_limitby())
    limit_clause = ""
    if query.get_limit() is not None:
        # ClickHouse "LIMIT offset, count" form.
        limit_clause = "LIMIT {}, {}".format(query.get_offset(), query.get_limit())
    # Join only the clauses that were actually produced.
    self.__formatted_query = " ".join([
        c for c in [
            select_clause,
            from_clause,
            join_clause,
            prewhere_clause,
            where_clause,
            group_clause,
            having_clause,
            order_clause,
            limitby_clause,
            limit_clause,
        ] if c
    ])
def __init__(
    self,
    dataset: Dataset,
    query: Query,
    settings: RequestSettings,
    prewhere_conditions: Sequence[str],
) -> None:
    """Format the Snuba Query into a single ClickHouse SQL string.

    Unlike the newer variant, the PREWHERE conditions are passed in
    explicitly rather than read from the query, and the data source is not
    consulted for sampling support. Non-empty clauses are joined in SQL
    order into ``self.__formatted_query``.
    """
    parsing_context = ParsingContext()
    # SELECT list: group-by expressions first, then aggregates, then plain
    # selected columns.
    aggregate_exprs = [
        column_expr(dataset, col, query, parsing_context, alias, agg)
        for (agg, col, alias) in query.get_aggregations()
    ]
    groupby = util.to_list(query.get_groupby())
    group_exprs = [
        column_expr(dataset, gb, query, parsing_context) for gb in groupby
    ]
    column_names = query.get_selected_columns() or []
    selected_cols = [
        column_expr(dataset, util.tuplify(colname), query, parsing_context)
        for colname in column_names
    ]
    select_clause = u'SELECT {}'.format(
        ', '.join(group_exprs + aggregate_exprs + selected_cols))
    from_clause = u'FROM {}'.format(query.get_data_source().format_from())
    if query.get_final():
        from_clause = u'{} FINAL'.format(from_clause)
    # Explicit sample rate wins over the turbo default.
    if query.get_sample():
        sample_rate = query.get_sample()
    elif settings.get_turbo():
        sample_rate = snuba_settings.TURBO_SAMPLE_RATE
    else:
        sample_rate = None
    if sample_rate:
        from_clause = u'{} SAMPLE {}'.format(from_clause, sample_rate)
    join_clause = ''
    if query.get_arrayjoin():
        join_clause = u'ARRAY JOIN {}'.format(query.get_arrayjoin())
    where_clause = ''
    if query.get_conditions():
        where_clause = u'WHERE {}'.format(
            conditions_expr(dataset, query.get_conditions(), query, parsing_context))
    prewhere_clause = ''
    if prewhere_conditions:
        prewhere_clause = u'PREWHERE {}'.format(
            conditions_expr(dataset, prewhere_conditions, query, parsing_context))
    group_clause = ''
    if groupby:
        group_clause = 'GROUP BY ({})'.format(', '.join(
            column_expr(dataset, gb, query, parsing_context) for gb in groupby))
        if query.has_totals():
            group_clause = '{} WITH TOTALS'.format(group_clause)
    having_clause = ''
    having_conditions = query.get_having()
    if having_conditions:
        # HAVING without GROUP BY is invalid; fail loudly.
        assert groupby, 'found HAVING clause with no GROUP BY'
        having_clause = u'HAVING {}'.format(
            conditions_expr(dataset, having_conditions, query, parsing_context))
    order_clause = ''
    if query.get_orderby():
        orderby = [
            column_expr(dataset, util.tuplify(ob), query, parsing_context)
            for ob in util.to_list(query.get_orderby())
        ]
        # A leading '-' on the column marks descending order.
        orderby = [
            u'{} {}'.format(ob.lstrip('-'), 'DESC' if ob.startswith('-') else 'ASC')
            for ob in orderby
        ]
        order_clause = u'ORDER BY {}'.format(', '.join(orderby))
    limitby_clause = ''
    if query.get_limitby() is not None:
        limitby_clause = 'LIMIT {} BY {}'.format(*query.get_limitby())
    limit_clause = ''
    if query.get_limit() is not None:
        # ClickHouse 'LIMIT offset, count' form.
        limit_clause = 'LIMIT {}, {}'.format(query.get_offset(), query.get_limit())
    # Join only the clauses that were actually produced.
    self.__formatted_query = ' '.join([
        c for c in [
            select_clause,
            from_clause,
            join_clause,
            prewhere_clause,
            where_clause,
            group_clause,
            having_clause,
            order_clause,
            limitby_clause,
            limit_clause
        ] if c
    ])