Beispiel #1
0
    def get_project_ids_in_condition(
            condition: Expression) -> Optional[Set[int]]:
        """
        Extract project ids from an expression. Returns None if no project
        if condition is found. It returns an empty set of conflicting project_id
        conditions are found.
        """
        match = FunctionCall(
            None,
            String(ConditionFunctions.EQ),
            (
                Column(column_name=String(project_column)),
                Literal(value=Param("project_id", Any(int))),
            ),
        ).match(condition)
        if match is not None:
            return {match.integer("project_id")}

        match = is_in_condition_pattern(
            Column(column_name=String(project_column))).match(condition)
        if match is not None:
            projects = match.expression("tuple")
            assert isinstance(projects, FunctionCallExpr)
            return {
                l.value
                for l in projects.parameters
                if isinstance(l, LiteralExpr) and isinstance(l.value, int)
            }

        match = FunctionCall(
            None,
            Param(
                "operator",
                Or([String(BooleanFunctions.AND),
                    String(BooleanFunctions.OR)]),
            ),
            (Param("lhs", AnyExpression()), Param("rhs", AnyExpression())),
        ).match(condition)
        if match is not None:
            lhs_projects = get_project_ids_in_condition(
                match.expression("lhs"))
            rhs_projects = get_project_ids_in_condition(
                match.expression("rhs"))
            if lhs_projects is None:
                return rhs_projects
            elif rhs_projects is None:
                return lhs_projects
            else:
                return (lhs_projects & rhs_projects if match.string("operator")
                        == BooleanFunctions.AND else lhs_projects
                        | rhs_projects)

        return None
Beispiel #2
0
    def replace_condition(expression: Expression) -> Expression:
        match = FunctionCall(
            String(OPERATOR_TO_FUNCTION[operator]),
            (Param("column", Column(None, String(field))), AnyExpression()),
        ).match(expression)

        return (expression if match is None else replace(
            expression, parameters=(match.expression("column"), new_operand)))
Beispiel #3
0
 ),
 (
     "Matches a column with all fields",
     Column(
         Param("table_name", AnyOptionalString()),
         Param("column_name", Any(str)),
     ),
     ColumnExpr("alias", "table_name", "test_col"),
     MatchResult({
         "column_name": "test_col",
         "table_name": "table_name"
     }),
 ),
 (
     "Match anything",
     AnyExpression(),
     ColumnExpr(None, None, "something_irrelevant"),
     MatchResult(),
 ),
 (
     "Match a string through Any(str)",
     Column(Param("p_table_name", Any(str)), None),
     ColumnExpr("irrelevant", "table_name", "irrelevant"),
     MatchResult({"p_table_name": "table_name"}),
 ),
 (
     "Match a None string through Any",
     Column(Param("p_table_name", Any(type(None))), None),
     ColumnExpr("irrelevant", None, "irrelevant"),
     MatchResult({"p_table_name": None}),
 ),
Beispiel #4
0
    return binary_condition(alias, function, lhs, literals_tuple(None, rhs))


def __set_condition_pattern(lhs: Pattern[Expression],
                            operator: str) -> FunctionCallPattern:
    return FunctionCallPattern(
        String(operator),
        (
            Param("lhs", lhs),
            Param("tuple", FunctionCallPattern(String("tuple"), None)),
        ),
    )


set_condition_pattern = {
    op: __set_condition_pattern(AnyExpression(), op)
    for op in FUNCTION_TO_OPERATOR
}


def __is_set_condition(exp: Expression, operator: str) -> bool:
    if is_binary_condition(exp, operator):
        if operator in set_condition_pattern:
            if set_condition_pattern[operator].match(exp) is not None:
                assert isinstance(exp, FunctionCall)  # mypy
                assert isinstance(exp.parameters[1], FunctionCall)  # mypy
                # Matchers can't currently match arbitrary numbers of parameters, so test this directly
                return all(
                    isinstance(c, Literal)
                    for c in exp.parameters[1].parameters)
Beispiel #5
0
        String(operator),
        (
            Param("lhs", lhs),
            Param(
                "sequence",
                Or([
                    FunctionCallPattern(String("tuple"), None),
                    FunctionCallPattern(String("array"), None),
                ]),
            ),
        ),
    )


set_condition_pattern = {
    op: __set_condition_pattern(AnyExpression(), op)
    for op in FUNCTION_TO_OPERATOR
}


def __is_set_condition(exp: Expression, operator: str) -> bool:
    if is_any_binary_condition(exp, operator):
        if operator in set_condition_pattern:
            if set_condition_pattern[operator].match(exp) is not None:
                assert isinstance(exp, FunctionCall)  # mypy
                assert isinstance(exp.parameters[1], FunctionCall)  # mypy
                # Matchers can't currently match arbitrary numbers of parameters, so test this directly
                return all(
                    isinstance(c, Literal)
                    for c in exp.parameters[1].parameters)
Beispiel #6
0
    def execute(
        self,
        query: Query,
        request_settings: RequestSettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        Split query in 2 steps if a large number of columns is being selected.
            - First query only selects event_id, project_id and timestamp.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        limit = query.get_limit()
        if (limit is None or limit == 0 or query.get_groupby()
                or query.get_aggregations()
                or not query.get_selected_columns()):
            return None

        if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
            metrics.increment("column_splitter.query_above_limit")
            return None

        # Do not split if there is already a = or IN condition on an ID column
        id_column_matcher = FunctionCall(
            Or([String(ConditionFunctions.EQ),
                String(ConditionFunctions.IN)]),
            (
                Column(None, String(self.__id_column)),
                AnyExpression(),
            ),
        )

        for expr in query.get_condition_from_ast() or []:
            match = id_column_matcher.match(expr)

            if match:
                return None

        # We need to count the number of table/column name pairs
        # not the number of distinct Column objects in the query
        # so to avoid counting aliased columns multiple times.
        total_columns = {(col.table_name, col.column_name)
                         for col in query.get_all_ast_referenced_columns()}

        minimal_query = copy.deepcopy(query)
        minimal_query.set_selected_columns(
            [self.__id_column, self.__project_column, self.__timestamp_column])
        # TODO: provide the table alias name to this splitter if we ever use it
        # in joins.
        minimal_query.set_ast_selected_columns([
            SelectedExpression(self.__id_column,
                               ColumnExpr(None, None, self.__id_column)),
            SelectedExpression(self.__project_column,
                               ColumnExpr(None, None, self.__project_column)),
            SelectedExpression(
                self.__timestamp_column,
                ColumnExpr(None, None, self.__timestamp_column),
            ),
        ])

        for exp in minimal_query.get_all_expressions():
            if exp.alias in (
                    self.__id_column,
                    self.__project_column,
                    self.__timestamp_column,
            ) and not (isinstance(exp, ColumnExpr)
                       and exp.column_name == exp.alias):
                logger.warning(
                    "Potential alias shadowing due to column splitter",
                    extra={"expression": exp},
                    exc_info=True,
                )

        minimal_columns = {
            (col.table_name, col.column_name)
            for col in minimal_query.get_all_ast_referenced_columns()
        }
        if len(total_columns) <= len(minimal_columns):
            return None

        # Ensures the AST minimal query is actually runnable on its own.
        if not minimal_query.validate_aliases():
            return None

        legacy_references = set(minimal_query.get_all_referenced_columns())
        ast_column_names = {
            c.column_name
            for c in minimal_query.get_all_ast_referenced_columns()
        }
        # Ensures the legacy minimal query (which does not expand alias references)
        # does not contain alias references we removed when creating minimal_query.
        if legacy_references - ast_column_names:
            metrics.increment("columns.skip_invalid_legacy_query")
            return None

        result = runner(minimal_query, request_settings)
        del minimal_query

        if not result.result["data"]:
            return None

        # Making a copy just in case runner returned None (which would drive the execution
        # strategy to ignore the result of this splitter and try the next one).
        query = copy.deepcopy(query)

        event_ids = list(
            set([event[self.__id_column] for event in result.result["data"]]))
        if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
            # We may be runing a query that is beyond clickhouse maximum query size,
            # so we cowardly abandon.
            metrics.increment(
                "column_splitter.intermediate_results_beyond_limit")
            return None

        query.add_conditions([(self.__id_column, "IN", event_ids)])
        query.add_condition_to_ast(
            in_condition(
                None,
                ColumnExpr(None, None, self.__id_column),
                [LiteralExpr(None, e_id) for e_id in event_ids],
            ))
        query.set_offset(0)
        # TODO: This is technically wrong. Event ids are unique per project, not globally.
        # So, if the minimal query only returned the same event_id from two projects, we
        # would be underestimating the limit here.
        query.set_limit(len(event_ids))

        project_ids = list(
            set([
                event[self.__project_column] for event in result.result["data"]
            ]))
        _replace_condition(
            query,
            self.__project_column,
            "IN",
            project_ids,
        )
        _replace_ast_condition(
            query,
            self.__project_column,
            "IN",
            literals_tuple(None,
                           [LiteralExpr(None, p_id) for p_id in project_ids]),
        )

        timestamps = [
            event[self.__timestamp_column] for event in result.result["data"]
        ]
        _replace_condition(
            query,
            self.__timestamp_column,
            ">=",
            util.parse_datetime(min(timestamps)).isoformat(),
        )
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            ">=",
            LiteralExpr(None, util.parse_datetime(min(timestamps))),
        )
        # We add 1 second since this gets translated to ('timestamp', '<', to_date)
        # and events are stored with a granularity of 1 second.
        _replace_condition(
            query,
            self.__timestamp_column,
            "<",
            (util.parse_datetime(max(timestamps)) +
             timedelta(seconds=1)).isoformat(),
        )
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            "<",
            LiteralExpr(
                None,
                (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
            ),
        )

        return runner(query, request_settings)
Beispiel #7
0

def __set_condition_pattern(
    lhs: Pattern[Expression], operator: str
) -> FunctionCallPattern:
    return FunctionCallPattern(
        String(operator),
        (
            Param("lhs", lhs),
            Param("tuple", FunctionCallPattern(String("tuple"), None)),
        ),
    )


set_condition_pattern = {
    op: __set_condition_pattern(AnyExpression(), op) for op in FUNCTION_TO_OPERATOR
}


def __is_set_condition(exp: Expression, operator: str) -> bool:
    if is_any_binary_condition(exp, operator):
        if operator in set_condition_pattern:
            if set_condition_pattern[operator].match(exp) is not None:
                assert isinstance(exp, FunctionCall)  # mypy
                assert isinstance(exp.parameters[1], FunctionCall)  # mypy
                # Matchers can't currently match arbitrary numbers of parameters, so test this directly
                return all(isinstance(c, Literal) for c in exp.parameters[1].parameters)

    return False

Beispiel #8
0
    def execute(
        self,
        query: Query,
        query_settings: QuerySettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        Split query in 2 steps if a large number of columns is being selected.
            - First query only selects event_id, project_id and timestamp.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        limit = query.get_limit()
        if (limit is None or limit == 0 or query.get_groupby()
                or not query.get_selected_columns()):
            return None

        if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
            metrics.increment("column_splitter.query_above_limit")
            return None

        # Do not split if there is already a = or IN condition on an ID column
        id_column_matcher = FunctionCall(
            Or([String(ConditionFunctions.EQ),
                String(ConditionFunctions.IN)]),
            (
                Column(None, String(self.__id_column)),
                AnyExpression(),
            ),
        )

        for expr in query.get_condition() or []:
            match = id_column_matcher.match(expr)

            if match:
                return None

        # We need to count the number of table/column name pairs
        # not the number of distinct Column objects in the query
        # so to avoid counting aliased columns multiple times.
        selected_columns = {
            (col.table_name, col.column_name)
            for col in query.get_columns_referenced_in_select()
        }

        if len(selected_columns) < settings.COLUMN_SPLIT_MIN_COLS:
            metrics.increment("column_splitter.main_query_min_threshold")
            return None

        minimal_query = copy.deepcopy(query)

        # TODO: provide the table alias name to this splitter if we ever use it
        # in joins.
        minimal_query.set_ast_selected_columns([
            SelectedExpression(
                self.__id_column,
                ColumnExpr(self.__id_column, None, self.__id_column),
            ),
            SelectedExpression(
                self.__project_column,
                ColumnExpr(self.__project_column, None, self.__project_column),
            ),
            SelectedExpression(
                self.__timestamp_column,
                ColumnExpr(self.__timestamp_column, None,
                           self.__timestamp_column),
            ),
        ])

        for exp in minimal_query.get_all_expressions():
            if exp.alias in (
                    self.__id_column,
                    self.__project_column,
                    self.__timestamp_column,
            ) and not (isinstance(exp, ColumnExpr)
                       and exp.column_name == exp.alias):
                logger.warning(
                    "Potential alias shadowing due to column splitter",
                    extra={"expression": exp},
                    exc_info=True,
                )

        # Ensures the AST minimal query is actually runnable on its own.
        if not minimal_query.validate_aliases():
            return None

        # There is a Clickhouse bug where if functions in the ORDER BY clause are not in the SELECT,
        # they fail on distributed tables. For that specific case, skip the query splitter.
        for orderby in minimal_query.get_orderby():
            if isinstance(orderby.expression,
                          (FunctionCallExpr, CurriedFunctionCallExpr)):
                metrics.increment("column_splitter.orderby_has_a_function")
                return None

        result = runner(minimal_query, query_settings)
        del minimal_query

        if not result.result["data"]:
            metrics.increment("column_splitter.no_data_from_minimal_query")
            return None

        # Making a copy just in case runner returned None (which would drive the execution
        # strategy to ignore the result of this splitter and try the next one).
        query = copy.deepcopy(query)

        event_ids = list(
            set([event[self.__id_column] for event in result.result["data"]]))
        if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
            # We may be runing a query that is beyond clickhouse maximum query size,
            # so we cowardly abandon.
            metrics.increment(
                "column_splitter.intermediate_results_beyond_limit")
            return None

        query.add_condition_to_ast(
            in_condition(
                ColumnExpr(None, None, self.__id_column),
                [LiteralExpr(None, e_id) for e_id in event_ids],
            ))
        query.set_offset(0)
        query.set_limit(len(result.result["data"]))

        project_ids = list(
            set([
                event[self.__project_column] for event in result.result["data"]
            ]))
        _replace_ast_condition(
            query,
            self.__project_column,
            "IN",
            literals_tuple(None,
                           [LiteralExpr(None, p_id) for p_id in project_ids]),
        )

        timestamps = [
            event[self.__timestamp_column] for event in result.result["data"]
        ]
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            ">=",
            LiteralExpr(None, util.parse_datetime(min(timestamps))),
        )
        # We add 1 second since this gets translated to ('timestamp', '<', to_date)
        # and events are stored with a granularity of 1 second.
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            "<",
            LiteralExpr(
                None,
                (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
            ),
        )

        return runner(query, query_settings)
Beispiel #9
0
            date_string = result.expression("date_string")
            assert isinstance(date_string, Literal)  # mypy
            assert isinstance(date_string.value, str)  # mypy
            return Literal(exp.alias, parse_datetime(date_string.value))

        return exp

    query.transform_expressions(parse)


ARRAY_JOIN_MATCH = FunctionCallMatch(
    Param("function_name", Or([StringMatch("arrayExists"), StringMatch("arrayAll")])),
    (
        Param("column", ColumnMatch(AnyOptionalString(), AnyMatch(str))),
        Param("op", Or([LiteralMatch(StringMatch(op)) for op in OPERATOR_TO_FUNCTION])),
        Param("value", AnyExpression()),
    ),
)


def _array_join_transformation(
    query: Union[CompositeQuery[QueryEntity], LogicalQuery]
) -> None:
    def parse(exp: Expression) -> Expression:
        result = ARRAY_JOIN_MATCH.match(exp)
        if result:
            function_name = result.string("function_name")
            column = result.expression("column")
            assert isinstance(column, Column)
            op_literal = result.expression("op")
            assert isinstance(op_literal, Literal)
Beispiel #10
0
def __set_condition_pattern(
    lhs: Pattern[Expression], operator: str
) -> FunctionCallPattern:
    return FunctionCallPattern(
        None,
        String(operator),
        (
            Param("lhs", lhs),
            Param("tuple", FunctionCallPattern(None, String("tuple"), None)),
        ),
    )


set_condition_pattern = {
    op: __set_condition_pattern(AnyExpression(), op) for op in FUNCTION_TO_OPERATOR
}


def __is_set_condition(exp: Expression, operator: str) -> bool:
    if is_binary_condition(exp, operator):
        if operator in set_condition_pattern:
            if set_condition_pattern[operator].match(exp) is not None:
                assert isinstance(exp, FunctionCall)  # mypy
                assert isinstance(exp.parameters[1], FunctionCall)  # mypy
                # Matchers can't currently match arbitrary numbers of parameters, so test this directly
                return all(isinstance(c, Literal) for c in exp.parameters[1].parameters)

    return False