Ejemplo n.º 1
0
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        """
        Rewrite every reference to the ``group_id`` and ``message`` columns.

        ``group_id`` becomes ``nullIf(group_id, 0)`` and ``message`` becomes
        ``coalesce(search_message, message)``. The replacement carries the
        alias of the column it replaces; any other expression is returned
        unchanged. ``request_settings`` is not used.
        """
        def process_column(exp: Expression) -> Expression:
            # Only plain column references are candidates for a rewrite.
            if not isinstance(exp, Column):
                return exp

            name = exp.column_name
            if name == "group_id":
                arguments = (
                    Column(None, exp.table_name, name),
                    Literal(None, 0),
                )
                return FunctionCall(exp.alias, "nullIf", arguments)

            if name == "message":
                # message was renamed to search_message without a backfill,
                # so a record has one or the other of these fields.
                # TODO this can be removed once all data has search_message filled in.
                arguments = (
                    Column(None, exp.table_name, "search_message"),
                    Column(None, exp.table_name, name),
                )
                return FunctionCall(exp.alias, "coalesce", arguments)

            return exp

        query.transform_expressions(process_column)
Ejemplo n.º 2
0
 def process_query(self, query: Query,
                   query_settings: QuerySettings) -> None:
     """
     Check that every ``uniq`` call used in HAVING also appears in SELECT.

     The HAVING clause is scanned for ``uniq`` function calls. Each selected
     expression is then visited with an ``_ExpressionOrAliasMatcher`` built
     from those calls. If the matcher does not report all of them as found,
     a ``MismatchedAggregationException`` is either raised or logged as a
     warning, depending on the ``throw_on_uniq_select_and_having`` config
     value (default ``False``).

     Nothing is checked when the query has no HAVING clause or when the
     HAVING clause contains no ``uniq`` call. ``query_settings`` is not used.

     Raises:
         MismatchedAggregationException: when a mismatch is detected and
             ``throw_on_uniq_select_and_having`` is truthy.
     """
     having_clause = query.get_having()
     if not having_clause:
         return

     uniq_matcher = Param("function", FunctionCallMatch(String("uniq")))
     matches = (uniq_matcher.match(exp) for exp in having_clause)
     found_functions = [
         match.expression("function") for match in matches
         if match is not None
     ]
     # A list is never None, so test for emptiness: without any uniq() call
     # in HAVING there is nothing to verify against SELECT.
     if not found_functions:
         return

     matcher = _ExpressionOrAliasMatcher(found_functions)
     for col in query.get_selected_columns():
         col.expression.accept(matcher)
     if all(matcher.found_expressions):
         return

     should_throw = get_config("throw_on_uniq_select_and_having", False)
     error = MismatchedAggregationException(
         "Aggregation is in HAVING clause but not SELECT",
         query=str(query))
     if should_throw:
         raise error

     # NOTE(review): exc_info=True is passed although no exception is being
     # handled at this point -- confirm this is the intended log output.
     logging.warning(
         "Aggregation is in HAVING clause but not SELECT",
         exc_info=True,
         extra=cast(Dict[str, Any], error.to_dict()),
     )
Ejemplo n.º 3
0
def test_hexint_column_processor(unprocessed: Expression,
                                 formatted_value: str) -> None:
    """
    Run ``HexIntColumnProcessor`` on a query selecting ``column1`` and check
    that the selected column is wrapped as ``lower(hex(column1))`` and that
    the processed condition formats to ``formatted_value``.
    """
    source = Table("transactions", ColumnSet([]))
    selected = [SelectedExpression("column1", Column(None, None, "column1"))]
    unprocessed_query = Query(
        source,
        selected_columns=selected,
        condition=unprocessed,
    )

    processor = HexIntColumnProcessor({"column1"})
    processor.process_query(unprocessed_query, HTTPQuerySettings())

    actual_columns = unprocessed_query.get_selected_columns()
    hex_call = FunctionCall(None, "hex", (Column(None, None, "column1"), ))
    lowered = FunctionCall(None, "lower", (hex_call, ))
    assert actual_columns == [SelectedExpression("column1", lowered)]

    condition = unprocessed_query.get_condition()
    assert condition is not None
    formatted = condition.accept(ClickhouseExpressionFormatter())
    assert formatted == formatted_value
Ejemplo n.º 4
0
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        """
        Replace subscripted mapping accesses with their promoted columns.

        For every expression recognised by ``match_subscriptable_reference``
        whose column and key have an entry in the promotion specs, the
        expression is replaced by the promoted column (keeping the original
        alias). Expressions without a matching entry are left unchanged.
        ``request_settings`` is not used.
        """
        def transform_nested_column(exp: Expression) -> Expression:
            subscript = match_subscriptable_reference(exp)
            if subscript is None or subscript.column_name not in self.__specs:
                return exp

            promoted = self.__specs[subscript.column_name].get(subscript.key)
            if promoted is None:
                return exp

            column_type = query.get_from_clause().get_columns().get(
                promoted, None)
            type_name = str(column_type) if column_type else None
            is_plain_string = (type_name and "String" in type_name
                               and "FixedString" not in type_name)

            # Mapping columns only hold strings and clients expect strings
            # back, so a promoted column that is not a plain String is
            # wrapped in toString when casting is enabled.
            if self.__cast_to_string and not is_plain_string:
                promoted_column = Column(None, subscript.table_name, promoted)
                return FunctionCall(exp.alias, "toString",
                                    (promoted_column, ))

            return Column(exp.alias, subscript.table_name, promoted)

        query.transform_expressions(transform_nested_column)
Ejemplo n.º 5
0
    def process_query(self, query: Query, request_settings: RequestSettings) -> None:
        """
        Experimentally rewrite conditions on ``tags.key`` into ``has`` calls.

        A condition matched by ``CONDITION_PATTERN`` whose key column is
        ``tags.key`` gets a replacement ``has(tags.key, <rhs>)``, wrapped in
        ``not(...)`` when the matched function is ``ConditionFunctions.EQ``.
        Whether the replacement is used is tracked through the
        ``empty-string-tag-condition`` experiment on the query: a previously
        recorded value is reused, otherwise the replacement is applied when
        ``settings.TESTING`` is truthy or a random draw is below 0.5, and the
        outcome is recorded. ``request_settings`` is not used.
        """
        experiment = "empty-string-tag-condition"

        def process_condition(exp: Expression) -> Expression:
            matched = CONDITION_PATTERN.match(exp)
            if matched is None:
                return exp
            if matched.optional_string(KEY_COL_MAPPING_PARAM) != "tags.key":
                return exp

            rhs = matched.optional_string(KEY_MAPPING_PARAM)
            table_name = matched.optional_string(TABLE_MAPPING_PARAM)
            replacement = FunctionCall(
                exp.alias,
                "has",
                (Column(None, table_name, "tags.key"), Literal(None, rhs)),
            )

            assert isinstance(exp, FunctionCall)
            if exp.function_name == ConditionFunctions.EQ:
                replacement = FunctionCall(exp.alias, "not", (replacement,))

            # Stick to the decision already taken for this query, if any.
            previous = query.get_experiment_value(experiment)
            if previous is not None:
                return replacement if previous == "true" else exp

            use_replacement = settings.TESTING or random.random() < 0.5
            query.add_experiment(experiment, "true" if use_replacement else "false")
            return replacement if use_replacement else exp

        where_clause = query.get_condition()
        if where_clause is None:
            return
        query.set_ast_condition(where_clause.transform(process_condition))
Ejemplo n.º 6
0
def test_not_many_groups_to_exclude(query: ClickhouseQuery) -> None:
    """
    With three excluded groups and ``max_group_ids_exclude`` set to 5, the
    enforcer adds a ``notIn`` condition on ``group_id`` and does not mark the
    query as final.
    """
    state.set_config("max_group_ids_exclude", 5)
    set_project_exclude_groups(2, [100, 101, 102], ReplacerState.EVENTS)

    enforcer = PostReplacementConsistencyEnforcer(
        "project_id", ReplacerState.EVENTS
    )
    enforcer.process_query(query, HTTPRequestSettings())

    actual = query.get_condition_from_ast()

    group_id = FunctionCall(
        None, "assumeNotNull", (Column(None, None, "group_id"),)
    )
    excluded_groups = FunctionCall(
        None,
        "tuple",
        tuple(Literal(None, group) for group in (100, 101, 102)),
    )
    not_excluded = FunctionCall(None, "notIn", (group_id, excluded_groups))
    expected = FunctionCall(
        None,
        BooleanFunctions.AND,
        (not_excluded, build_in("project_id", [2])),
    )

    assert actual == expected
    assert not query.get_from_clause().final
Ejemplo n.º 7
0
    def process_query(self, query: Query, request_settings: RequestSettings,) -> None:
        """
        Replace a join data source with a single table when only one of the
        joined tables is referenced by the query's columns.

        Nothing happens when the data source is not a ``JoinClause`` or when
        the referenced columns span more than one table alias.
        ``request_settings`` is not used.

        Raises:
            AssertionError: when no referenced column carries a table alias.
        """
        join = query.get_data_source()
        if not isinstance(join, JoinClause):
            return

        # This will be much better when we will represent columns
        # with a more structured data type than strings.
        matches = (
            QUALIFIED_COLUMN_REGEX.match(name)
            for name in query.get_all_referenced_columns()
        )
        # Group 1 is the first parenthesized group in the regex, thus the
        # table alias.
        referenced_aliases = {found[1] for found in matches if found}

        assert (
            len(referenced_aliases) > 0
        ), "Trying to otpimize a join query without aliases"
        if len(referenced_aliases) > 1:
            return

        tables_by_alias = join.get_tables()
        only_table = tables_by_alias[referenced_aliases.pop()]

        query.set_data_source(only_table)
Ejemplo n.º 8
0
def test_query_data_source() -> None:
    """
    Tests using the Query as a data source: the columns it exposes are named
    after the selected expressions, with a generated name for the unnamed one.
    """
    source = Table("my_table", ColumnSet([]))
    named_column = SelectedExpression(
        "col1", Column(alias="col1", table_name=None, column_name="col1")
    )
    named_function = SelectedExpression(
        "some_func",
        FunctionCall(
            "some_func",
            "f",
            (Column(alias="col1", table_name=None, column_name="col1"),),
        ),
    )
    unnamed_column = SelectedExpression(
        None, Column(alias="col2", table_name=None, column_name="col2")
    )

    query = Query(
        source,
        selected_columns=[named_column, named_function, unnamed_column],
    )

    exposed = query.get_columns()
    expected_names = ("col1", "some_func", "_invalid_alias_2")
    assert exposed == ColumnSet([(name, Any()) for name in expected_names])
Ejemplo n.º 9
0
def _replace_ast_condition(
    query: Query, field: str, operator: str, new_operand: Expression
) -> None:
    """
    Swap the right-hand operand of matching top level conditions.

    Only the conditions that make up the top level AND of the query
    condition are inspected. Each one that applies ``operator`` to the
    column ``field`` is rebuilt with ``new_operand`` as its second
    parameter; all others are kept as they are. Does nothing when the
    query has no condition.
    """

    def replace_condition(expression: Expression) -> Expression:
        pattern = FunctionCall(
            String(OPERATOR_TO_FUNCTION[operator]),
            (Param("column", Column(None, String(field))), AnyExpression()),
        )
        found = pattern.match(expression)
        if found is None:
            return expression
        return replace(
            expression, parameters=(found.expression("column"), new_operand)
        )

    condition = query.get_condition_from_ast()
    if condition is None:
        return

    rewritten = list(
        map(replace_condition, get_first_level_and_conditions(condition))
    )
    query.set_ast_condition(combine_and_conditions(rewritten))
Ejemplo n.º 10
0
def test_without_turbo_without_projects_needing_final(query: ClickhouseQuery) -> None:
    """
    With no replacer state configured, the enforcer leaves the condition
    untouched and does not mark the query as final.
    """
    enforcer = PostReplacementConsistencyEnforcer("project_id", None)
    enforcer.process_query(query, HTTPRequestSettings())

    actual_condition = query.get_condition_from_ast()
    assert actual_condition == build_in("project_id", [2])
    assert not query.get_from_clause().final
Ejemplo n.º 11
0
def test_multiple_not_too_many_excludes(
    query_with_multiple_group_ids: ClickhouseQuery, ) -> None:
    """
    The query asks for several groups, the number of groups to exclude is
    within the limit, and fewer groups are queried for than were replaced.
    """
    target = query_with_multiple_group_ids
    enforcer = PostReplacementConsistencyEnforcer("project_id",
                                                  ReplacerState.ERRORS)

    # Arbitrary replacement type, no impact on tests
    replacement_type = ReplacementType.EXCLUDE_GROUPS
    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        replacement_type,
    )

    enforcer._set_query_final(target, True)
    state.set_config("max_group_ids_exclude", 5)

    enforcer.process_query(target, HTTPQuerySettings())

    actual = target.get_condition()
    excluded = build_not_in("group_id", [101, 102])
    original = build_and(build_in("project_id", [2]),
                         build_in("group_id", [101, 102]))
    assert actual == build_and(excluded, original)
    assert not target.get_from_clause().final
Ejemplo n.º 12
0
def test_single_too_many_exclude(
        query_with_single_group_id: ClickhouseQuery) -> None:
    """
    The query asks for a single group that has been replaced, while the
    number of groups to exclude is above the configured limit.
    """
    target = query_with_single_group_id
    enforcer = PostReplacementConsistencyEnforcer("project_id",
                                                  ReplacerState.ERRORS)

    # Arbitrary replacement type, no impact on tests
    replacement_type = ReplacementType.EXCLUDE_GROUPS
    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        replacement_type,
    )

    enforcer._set_query_final(target, True)
    state.set_config("max_group_ids_exclude", 2)

    enforcer.process_query(target, HTTPQuerySettings())

    actual = target.get_condition()
    excluded = build_not_in("group_id", [101])
    original = build_and(build_in("project_id", [2]),
                         build_in("group_id", [101]))
    assert actual == build_and(excluded, original)
    assert not target.get_from_clause().final
Ejemplo n.º 13
0
    def process_query(self, query: Query,
                      query_settings: QuerySettings) -> None:
        """
        Rewrite conditions on ``tags.key`` into ``has`` calls.

        A condition matched by ``CONDITION_PATTERN`` whose key column is
        ``tags.key`` is replaced by ``has(tags.key, <rhs>)``, wrapped in
        ``not(...)`` when the matched function is ``ConditionFunctions.EQ``.
        Other conditions are left unchanged, and nothing happens when the
        query has no condition. ``query_settings`` is not used.
        """
        def process_condition(exp: Expression) -> Expression:
            matched = CONDITION_PATTERN.match(exp)
            if matched is None:
                return exp
            if matched.optional_string(KEY_COL_MAPPING_PARAM) != "tags.key":
                return exp

            rhs = matched.optional_string(KEY_MAPPING_PARAM)
            table_name = matched.optional_string(TABLE_MAPPING_PARAM)
            has_key = FunctionCall(
                exp.alias,
                "has",
                (Column(None, table_name, "tags.key"), Literal(None, rhs)),
            )

            assert isinstance(exp, FunctionCall)
            if exp.function_name == ConditionFunctions.EQ:
                return FunctionCall(exp.alias, "not", (has_key, ))
            return has_key

        where_clause = query.get_condition()
        if where_clause is None:
            return
        query.set_ast_condition(where_clause.transform(process_condition))
Ejemplo n.º 14
0
def test_not_many_groups_to_exclude(query: ClickhouseQuery) -> None:
    """
    With three excluded groups and ``max_group_ids_exclude`` set to 5, the
    enforcer adds a ``notIn`` condition on ``group_id`` and does not mark the
    query as final.
    """
    state.set_config("max_group_ids_exclude", 5)
    # Arbitrary replacement type, no impact on tests
    replacement_type = ReplacementType.EXCLUDE_GROUPS
    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        replacement_type,
    )

    enforcer = PostReplacementConsistencyEnforcer("project_id",
                                                  ReplacerState.ERRORS)
    enforcer.process_query(query, HTTPQuerySettings())

    actual = query.get_condition()

    group_id = FunctionCall(None, "assumeNotNull",
                            (Column(None, None, "group_id"), ))
    excluded_groups = FunctionCall(
        None,
        "tuple",
        tuple(Literal(None, group) for group in (100, 101, 102)),
    )
    not_excluded = FunctionCall(None, "notIn", (group_id, excluded_groups))

    assert actual == build_and(not_excluded, build_in("project_id", [2]))
    assert not query.get_from_clause().final
Ejemplo n.º 15
0
def test_query_overlaps_replacements_processor(
    query: ClickhouseQuery,
    query_with_timestamp: ClickhouseQuery,
    query_with_future_timestamp: ClickhouseQuery,
) -> None:
    """
    Walk the enforcer through four scenarios and check, after each run,
    whether the processed query ends up marked as final.
    """
    enforcer = PostReplacementConsistencyEnforcer("project_id",
                                                  ReplacerState.ERRORS)

    def final_after_processing(target: ClickhouseQuery,
                               initial_final: bool) -> bool:
        # Start from a known FINAL state, run the enforcer, report the result.
        enforcer._set_query_final(target, initial_final)
        enforcer.process_query(target, HTTPQuerySettings())
        return target.get_from_clause().final

    # replacement time unknown, default to "overlaps" but no groups to exclude so shouldn't be final
    assert not final_after_processing(query_with_timestamp, True)

    # overlaps replacement and should be final due to too many groups to exclude
    state.set_config("max_group_ids_exclude", 2)
    # Arbitrary replacement type, no impact on tests
    replacement_type = ReplacementType.EXCLUDE_GROUPS
    set_project_exclude_groups(
        2,
        [100, 101, 102],
        ReplacerState.ERRORS,
        replacement_type,
    )
    assert final_after_processing(query_with_timestamp, False)

    # query time range unknown and should be final due to too many groups to exclude
    assert final_after_processing(query, False)

    # doesn't overlap replacements
    assert not final_after_processing(query_with_future_timestamp, True)
Ejemplo n.º 16
0
def get_filtered_mapping_keys(query: Query, column_name: str) -> Set[str]:
    """
    Collect the mapping keys the arrayFilter optimization can be applied to.

    The optimization is applicable when the arrayJoin on ``column_name`` is
    in the select clause, there are one or more top level AND conditions on
    the arrayJoin and there is no OR condition in the query. An empty set
    is returned whenever that is not the case.
    """

    def selects_array_join() -> bool:
        for selected in query.get_selected_columns_from_ast() or []:
            for sub_expression in selected.expression:
                if array_join_pattern(column_name).match(
                        sub_expression) is not None:
                    return True
        return False

    if not selects_array_join():
        return set()

    where_clause = query.get_condition_from_ast()
    if where_clause is None:
        where_keys = set()
    else:
        where_keys = _get_mapping_keys_in_condition(where_clause, column_name)
        if where_keys is None:
            # None means an OR was found. Cowardly we give up even though
            # there could be cases where the condition is still optimizable.
            return set()

    having_clause = query.get_having_from_ast()
    if having_clause is None:
        having_keys = set()
    else:
        having_keys = _get_mapping_keys_in_condition(having_clause,
                                                     column_name)
        if having_keys is None:
            # Same as above
            return set()

    return where_keys | having_keys
Ejemplo n.º 17
0
def test_mand_conditions(table: str, mand_conditions: List[FunctionCall]) -> None:
    """
    ``MandatoryConditionApplier`` must produce the same condition as adding
    the table's mandatory conditions to an untouched copy of the query.
    """
    source = Table(
        table,
        ColumnSet([]),
        final=False,
        sampling_rate=None,
        mandatory_conditions=mand_conditions,
    )
    d_equals_1 = binary_condition(
        OPERATOR_TO_FUNCTION["="], Column("d", None, "d"), Literal(None, "1"),
    )
    c_equals_3 = binary_condition(
        OPERATOR_TO_FUNCTION["="], Column("c", None, "c"), Literal(None, "3"),
    )
    query = Query(
        source,
        None,
        None,
        binary_condition(BooleanFunctions.AND, d_equals_1, c_equals_3),
    )

    # Keep an unprocessed copy to build the expected condition from.
    expected_query = copy.deepcopy(query)

    request_settings = HTTPRequestSettings(consistent=True)
    MandatoryConditionApplier().process_query(query, request_settings)

    expected_query.add_condition_to_ast(combine_and_conditions(mand_conditions))

    actual_condition = query.get_condition_from_ast()
    assert actual_condition == expected_query.get_condition_from_ast()
def get_filtered_mapping_keys(
    query: Query,
    extractors: Sequence[Extractor[T]],
    is_skippable_condition: Callable[[Expression], bool],
) -> Sequence[T]:
    """
    Collect, in sorted order, the keys the arrayFilter optimization can use.

    Keys are gathered from the query condition and from the HAVING clause
    through ``get_mapping_keys_in_condition``. A missing clause contributes
    no keys. When either lookup returns ``None`` an empty list is returned.
    """
    where_keys: Optional[Set[T]] = set()
    where_clause = query.get_condition()
    if where_clause is not None:
        where_keys = get_mapping_keys_in_condition(
            where_clause, extractors, is_skippable_condition
        )
    if where_keys is None:
        # This means we found an OR. Cowardly we give up even though there
        # could be cases where this condition is still optimizable.
        return []

    having_keys: Optional[Set[T]] = set()
    having_clause = query.get_having()
    if having_clause is not None:
        having_keys = get_mapping_keys_in_condition(
            having_clause, extractors, is_skippable_condition
        )
    if having_keys is None:
        # Same as above
        return []

    return sorted(where_keys | having_keys)
Ejemplo n.º 19
0
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        """
        Verify that the query satisfies every mandatory condition checker.

        The top level AND conditions of both the query condition and the
        prewhere clause are offered to each checker. Checkers that accept
        none of them are reported: when the ``mandatory_condition_enforce``
        config value is truthy an ``AssertionError`` is raised, otherwise
        an error is logged. ``request_settings`` is not used.

        Raises:
            AssertionError: when checkers are unsatisfied and enforcement is
                enabled. It is raised explicitly rather than through an
                ``assert`` statement, so enforcement is not dropped when
                Python runs with ``-O``.
        """
        missing_checkers = set(self.__condition_checkers)

        def inspect_expression(condition: Expression) -> None:
            for clause in get_first_level_and_conditions(condition):
                for checker in self.__condition_checkers:
                    # Skip checkers that an earlier clause already satisfied.
                    if checker in missing_checkers and checker.check(clause):
                        missing_checkers.remove(checker)

        condition = query.get_condition()
        if condition is not None:
            inspect_expression(condition)

        prewhere = query.get_prewhere_ast()
        if prewhere is not None:
            inspect_expression(prewhere)

        if not missing_checkers:
            return

        missing_ids = {checker.get_id() for checker in missing_checkers}
        if get_config("mandatory_condition_enforce", 0):
            raise AssertionError(
                f"Missing mandatory columns in query. Missing {missing_ids}"
            )

        logger.error(
            "Query is missing mandatory columns",
            extra={"missing_checkers": missing_ids},
        )
Ejemplo n.º 20
0
def _replace_condition(query: Query, field: str, operator: str,
                       new_literal: Union[str, List[AnyType]]) -> None:
    """
    Rebuild the query conditions, substituting ``[field, operator,
    new_literal]`` for every condition that ``_identify_condition``
    recognises for the given field and operator.
    """
    updated = []
    for cond in query.get_conditions() or []:
        if _identify_condition(cond, field, operator):
            updated.append([field, operator, new_literal])
        else:
            updated.append(cond)
    query.set_conditions(updated)
Ejemplo n.º 21
0
    def process_query(self, query: Query,
                      query_settings: QuerySettings) -> None:
        """
        Add the mandatory conditions of the query's FROM clause, combined
        with AND, to the query condition. Does nothing when the FROM clause
        declares none. ``query_settings`` is not used.
        """
        mandatory_conditions = query.get_from_clause().mandatory_conditions
        if len(mandatory_conditions) == 0:
            return

        combined = combine_and_conditions(mandatory_conditions)
        query.add_condition_to_ast(combined)
Ejemplo n.º 22
0
def test_format_expressions(
    name: str, query: ClickhouseQuery, expected_query: ClickhouseQuery
) -> None:
    """
    After ``MappingColumnPromoter`` runs with ``tags[promoted_tag]`` mapped
    to ``promoted``, the selected columns must match the expected query's.
    """
    promoter = MappingColumnPromoter({"tags": {"promoted_tag": "promoted"}})
    promoter.process_query(query, HTTPQuerySettings())

    actual_columns = query.get_selected_columns()
    assert actual_columns == expected_query.get_selected_columns()
Ejemplo n.º 23
0
 def _set_query_final(self, query: Query, final: bool) -> None:
     """
     Set the 'final' flag on the FROM clause of a Query.

     A final query forces ClickHouse to merge the results of the query,
     which is very performance heavy, so it should be avoided whenever
     possible.
     """
     current_source = query.get_from_clause()
     updated_source = replace(current_source, final=final)
     query.set_from_clause(updated_source)
Ejemplo n.º 24
0
    def process_query(self, query: Query,
                      query_settings: QuerySettings) -> None:
        """
        Rewrite the arrayJoin expressions of the query into optimized forms.

        Every expression matched by the array join pattern is replaced
        according to which array joins the query contains:

        - all of ``self.all_columns``: a mapping-tuples expression, filtered
          when ``get_filtered_arrays`` reported any filter, unfiltered
          otherwise;
        - exactly one array join that is one of ``self.key_columns`` and has
          a single-column filter: a filtered mapping-keys expression;
        - anything else: the expression is left unchanged.

        ``query_settings`` is not used.
        """
        # presumably the set of column names wrapped in arrayJoin in the
        # query -- it is compared with set(self.all_columns) below; verify
        # against __get_array_joins_in_query.
        array_joins_in_query = self.__get_array_joins_in_query(query)

        tuple_alias = self.__get_unused_alias(query)

        # single_filtered is keyed by column name, multiple_filtered by a
        # collection of column names (see how both are iterated below).
        single_filtered, multiple_filtered = self.get_filtered_arrays(
            query, self.key_columns)

        def replace_expression(expr: Expression) -> Expression:
            match = self.__array_join_pattern.match(expr)

            # The arrayJoins we are looking for are not present, so skip this entirely
            if match is None:
                return expr

            # All of the possible array joins are present
            if array_joins_in_query == set(self.all_columns):
                tuple_index = self.__find_tuple_index(match.string("col"))

                # Re-key the filters by tuple index instead of column name.
                single_index_filtered = {
                    self.__find_tuple_index(column_name): filtered
                    for column_name, filtered in single_filtered.items()
                }

                multiple_indices_filtered = {
                    tuple(
                        self.__find_tuple_index(column)
                        for column in column_names): filtered
                    for column_names, filtered in multiple_filtered.items()
                }

                if single_filtered or multiple_filtered:
                    return filtered_mapping_tuples(
                        expr.alias,
                        tuple_alias,
                        tuple_index,
                        self.all_columns,
                        single_index_filtered,
                        multiple_indices_filtered,
                    )

                return unfiltered_mapping_tuples(expr.alias, tuple_alias,
                                                 tuple_index, self.all_columns)

            # Only array join present is one of the key columns
            elif len(array_joins_in_query) == 1 and any(
                    column in array_joins_in_query
                    for column in self.key_columns):
                # NOTE(review): pop() empties the set shared by every call of
                # this closure, so a later matching expression no longer
                # reaches this branch -- confirm that only the first match
                # is meant to be rewritten.
                column_name = array_joins_in_query.pop()
                if column_name in single_filtered:
                    return filtered_mapping_keys(expr.alias, column_name,
                                                 single_filtered[column_name])

            # No viable optimization
            return expr

        query.transform_expressions(replace_expression)
Ejemplo n.º 25
0
    def _update_conditions(self, query: Query,
                           prewhere_conditions: Sequence[Condition]) -> None:
        """
        Move ``prewhere_conditions`` out of the query conditions and set
        them as the query's prewhere clause.
        """
        current = query.get_conditions()
        # This should never be None at this point, but mypy cannot rule it out.
        assert current is not None

        remaining = []
        for cond in current:
            if cond not in prewhere_conditions:
                remaining.append(cond)

        query.set_conditions(remaining)
        query.set_prewhere(prewhere_conditions)
Ejemplo n.º 26
0
    def process_query(self, query: Query,
                      query_settings: QuerySettings) -> None:
        """
        Add a bloom filter condition built from the filtered arrays of the
        query, when ``generate_bloom_filter_condition`` produces one.
        ``query_settings`` is not used.
        """
        filtered_arrays = self.get_filtered_arrays(query, self.key_columns)
        single_filtered, multiple_filtered = filtered_arrays

        extra_condition = generate_bloom_filter_condition(
            self.column_name, single_filtered, multiple_filtered)
        if not extra_condition:
            return

        query.add_condition_to_ast(extra_condition)
Ejemplo n.º 27
0
def test_tags_processor(query_body: MutableMapping[str, Any],
                        expected_query: ClickhouseQuery) -> None:
    """
    Tests the whole processing in some notable cases: selected columns,
    condition and having clause must all match the expected query.
    """
    actual_query = parse_and_process(query_body)

    actual_columns = actual_query.get_selected_columns()
    assert actual_columns == expected_query.get_selected_columns()

    actual_condition = actual_query.get_condition()
    assert actual_condition == expected_query.get_condition()

    actual_having = actual_query.get_having()
    assert actual_having == expected_query.get_having()
Ejemplo n.º 28
0
def test_too_many_groups_to_exclude(query: ClickhouseQuery) -> None:
    """
    With three excluded groups and ``max_group_ids_exclude`` set to 2, the
    condition stays untouched and the query is marked as final.
    """
    state.set_config("max_group_ids_exclude", 2)
    set_project_exclude_groups(2, [100, 101, 102], ReplacerState.EVENTS)

    enforcer = PostReplacementConsistencyEnforcer(
        "project_id", ReplacerState.EVENTS
    )
    enforcer.process_query(query, HTTPRequestSettings())

    actual_condition = query.get_condition_from_ast()
    assert actual_condition == build_in("project_id", [2])
    assert query.get_from_clause().final
Ejemplo n.º 29
0
def test_without_turbo_with_projects_needing_final(
        query: ClickhouseQuery) -> None:
    """
    When project 2 is flagged as needing final, the condition stays
    untouched and the query is marked as final.
    """
    set_project_needs_final(2, ReplacerState.EVENTS)

    enforcer = PostReplacementConsistencyEnforcer("project_id",
                                                  ReplacerState.EVENTS)
    enforcer.process_query(query, HTTPRequestSettings())

    actual_condition = query.get_condition_from_ast()
    assert actual_condition == build_in("project_id", [2])
    assert query.get_final()
Ejemplo n.º 30
0
def _list_array_join(query: Query) -> Columnset:
    """
    Collect the columns referenced by the query's arrayjoin clause and by
    every ``arrayJoin`` function call found among the query's expressions.
    """
    columns = set()

    explicit_array_join = query.get_arrayjoin_from_ast()
    if explicit_array_join is not None:
        columns |= _get_columns_from_expression(explicit_array_join)

    array_join_calls = (
        expression for expression in query.get_all_expressions()
        if isinstance(expression, FunctionCallExpr)
        and expression.function_name == "arrayJoin"
    )
    for call in array_join_calls:
        columns |= _get_columns_from_expression(call)

    return columns