Code example #1
File: split.py (project: ruezetle/snuba)
    def col_split(dataset, request: Request,
                  column_split_spec: ColumnSplitSpec, *args, **kwargs):
        """
        Split query in 2 steps if a large number of columns is being selected.
            - First query only selects event_id and project_id.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        # The query function may mutate the request body during query
        # evaluation, so we need to copy the body to ensure that the query has
        # not been modified by the time we're ready to run the full query.
        minimal_request = copy.deepcopy(request)
        minimal_request.query.set_selected_columns(
            column_split_spec.get_min_columns())
        result = query_func(dataset, minimal_request, *args, **kwargs)
        del minimal_request

        if result.result["data"]:
            request = copy.deepcopy(request)

            event_ids = list(
                set([
                    event[column_split_spec.id_column]
                    for event in result.result["data"]
                ]))
            request.query.add_conditions([(column_split_spec.id_column, "IN",
                                           event_ids)])
            request.query.set_offset(0)
            request.query.set_limit(len(event_ids))

            project_ids = list(
                set([
                    event[column_split_spec.project_column]
                    for event in result.result["data"]
                ]))
            request.extensions["project"]["project"] = project_ids

            timestamp_field = column_split_spec.timestamp_column
            timestamps = [
                event[timestamp_field] for event in result.result["data"]
            ]
            request.extensions[
                "timeseries"]["from_date"] = util.parse_datetime(
                    min(timestamps)).isoformat()
            # We add 1 second since this gets translated to ('timestamp', '<', to_date)
            # and events are stored with a granularity of 1 second.
            request.extensions["timeseries"]["to_date"] = (
                util.parse_datetime(max(timestamps)) +
                timedelta(seconds=1)).isoformat()

        return query_func(dataset, request, *args, **kwargs)
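
The two-step split above can be illustrated outside of Snuba's Request/Query machinery. Below is a minimal sketch, assuming plain dict rows and a hypothetical narrow_filters helper; it only shows how the second query's filters are derived from the minimal result, including the one-second padding of the upper time bound.

from datetime import datetime, timedelta

# Toy rows standing in for the minimal query's result["data"].
MINIMAL_ROWS = [
    {"event_id": "a", "project_id": 1, "timestamp": "2020-01-01T10:00:00"},
    {"event_id": "b", "project_id": 2, "timestamp": "2020-01-01T10:05:00"},
]

def narrow_filters(rows):
    """Derive the filters for the second, full query from the minimal result."""
    event_ids = sorted({row["event_id"] for row in rows})
    project_ids = sorted({row["project_id"] for row in rows})
    timestamps = [datetime.fromisoformat(row["timestamp"]) for row in rows]
    return {
        "event_id_in": event_ids,
        "project_id_in": project_ids,
        "from_date": min(timestamps).isoformat(),
        # One extra second because the upper bound becomes ('timestamp', '<', to_date)
        # and events are stored with a granularity of one second.
        "to_date": (max(timestamps) + timedelta(seconds=1)).isoformat(),
    }

print(narrow_filters(MINIMAL_ROWS))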
Code example #2
    def get_time_limit(
        cls, timeseries_extension: Mapping[str, Any]
    ) -> Tuple[datetime, datetime]:
        max_days, date_align = state.get_configs(
            [("max_days", None), ("date_align_seconds", 1)]
        )

        to_date = parse_datetime(timeseries_extension["to_date"], date_align)
        from_date = parse_datetime(timeseries_extension["from_date"], date_align)
        assert from_date <= to_date

        if max_days is not None and (to_date - from_date).days > max_days:
            from_date = to_date - timedelta(days=max_days)

        return (from_date, to_date)
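
The clamping in get_time_limit reduces to one piece of date arithmetic: if the requested window is longer than max_days, move from_date forward so the window still ends at to_date. A standalone sketch of that arithmetic, with max_days passed in directly instead of being read from state.get_configs:

from datetime import datetime, timedelta
from typing import Tuple

def clamp_time_range(
    from_date: datetime, to_date: datetime, max_days: int
) -> Tuple[datetime, datetime]:
    """Shrink [from_date, to_date) so it spans at most max_days days."""
    assert from_date <= to_date
    if (to_date - from_date).days > max_days:
        from_date = to_date - timedelta(days=max_days)
    return from_date, to_date

# A 120-day request clamped to the most recent 90 days.
print(clamp_time_range(datetime(2020, 1, 1), datetime(2020, 4, 30), 90))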
Code example #3
 def process_condition(self, condition) -> Tuple[str, str, Any]:
     lhs, op, lit = condition
     if (lhs in self.__time_parse_columns
             and op in (">", "<", ">=", "<=", "=", "!=")
             and isinstance(lit, str)):
         lit = parse_datetime(lit)
     return lhs, op, lit
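
process_condition only rewrites the literal when the left-hand side is a known time column, the operator is a comparison, and the literal is a string. A toy illustration of the same guard, using datetime.fromisoformat as a stand-in for parse_datetime and a hard-coded, assumed set of time columns:

from datetime import datetime
from typing import Any, Tuple

TIME_PARSE_COLUMNS = {"timestamp", "received"}  # assumed column set
COMPARISON_OPS = (">", "<", ">=", "<=", "=", "!=")

def process_condition(condition: Tuple[str, str, Any]) -> Tuple[str, str, Any]:
    lhs, op, lit = condition
    if lhs in TIME_PARSE_COLUMNS and op in COMPARISON_OPS and isinstance(lit, str):
        lit = datetime.fromisoformat(lit)  # stand-in for parse_datetime
    return lhs, op, lit

print(process_condition(("timestamp", ">=", "2020-01-01T00:00:00")))
print(process_condition(("message", "=", "2020-01-01T00:00:00")))  # left untouched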
Code example #4
 def process_condition(self, condition) -> Tuple[str, str, Any]:
     lhs, op, lit = condition
     if (lhs in self.__time_parse_columns
             and op in ('>', '<', '>=', '<=', '=', '!=')
             and isinstance(lit, str)):
         lit = parse_datetime(lit)
     return lhs, op, lit
Code example #5
File: parser.py (project: pombredanne/snuba)
    def parse(exp: Expression) -> Expression:
        result = DATETIME_MATCH.match(exp)
        if result is not None:
            date_string = result.expression("date_string")
            assert isinstance(date_string, Literal)  # mypy
            assert isinstance(date_string.value, str)  # mypy
            return Literal(exp.alias, parse_datetime(date_string.value))

        return exp
Code example #6
    def col_split(dataset, request: Request, *args, **kwargs):
        """
        Split query in 2 steps if a large number of columns is being selected.
            - First query only selects event_id and project_id.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        # The query function may mutate the request body during query
        # evaluation, so we need to copy the body to ensure that the query has
        # not been modified by the time we're ready to run the full query.
        minimal_request = copy.deepcopy(request)
        minimal_request.query.set_selected_columns(MIN_COLS)
        result, status = query_func(dataset, minimal_request, *args, **kwargs)
        del minimal_request

        # If something failed, just return
        if status != 200:
            return result, status

        if result['data']:
            request = copy.deepcopy(request)

            event_ids = list(
                set([event['event_id'] for event in result['data']]))
            request.query.add_conditions([('event_id', 'IN', event_ids)])
            request.query.set_offset(0)
            request.query.set_limit(len(event_ids))

            project_ids = list(
                set([event['project_id'] for event in result['data']]))
            request.extensions['project']['project'] = project_ids

            timestamps = [event['timestamp'] for event in result['data']]
            request.extensions[
                'timeseries']['from_date'] = util.parse_datetime(
                    min(timestamps)).isoformat()
            # We add 1 second since this gets translated to ('timestamp', '<', to_date)
            # and events are stored with a granularity of 1 second.
            request.extensions['timeseries']['to_date'] = (
                util.parse_datetime(max(timestamps)) +
                timedelta(seconds=1)).isoformat()

        return query_func(dataset, request, *args, **kwargs)
Code example #7
    def __process_condition(self, exp: Expression) -> Expression:
        result = self.condition_match.match(exp)
        if result is not None:
            literal = result.expression("literal")
            assert isinstance(exp, FunctionCall)  # mypy
            assert isinstance(literal, Literal)  # mypy
            try:
                value = parse_datetime(str(literal.value))
            except ValueError as err:
                column_name = result.string("column_name")
                raise InvalidQueryException(
                    f"Illegal datetime in condition on column {column_name}: '{literal.value}'"
                ) from err

            return FunctionCall(
                exp.alias,
                exp.function_name,
                (exp.parameters[0], Literal(literal.alias, value)),
            )

        return exp
Code example #8
         Column("my_time", None, "time"),
         Literal(None, "2020-01-01"),
     ),
     FunctionCall(
         "my_time",
         "toStartOfHour",
         (Column(None, None, "finish_ts"), Literal(None, "Universal")),
     ),
     binary_condition(
         ConditionFunctions.EQ,
         FunctionCall(
             "my_time",
             "toStartOfHour",
             (Column(None, None, "finish_ts"), Literal(None, "Universal")),
         ),
         Literal(None, parse_datetime("2020-01-01")),
     ),
     "(toStartOfHour(finish_ts, 'Universal') AS my_time)",
     "equals((toStartOfHour(finish_ts, 'Universal') AS my_time), toDateTime('2020-01-01T00:00:00', 'Universal'))",
     id="granularity-3600-simple-condition",
 ),
 pytest.param(
     60,
     binary_condition(
         BooleanFunctions.AND,
         binary_condition(
             ConditionFunctions.EQ,
             Column("my_time", None, "time"),
             Literal(None, "2020-01-01"),
         ),
         binary_condition(
Code example #9
File: tagsmap.py (project: anthonynsimon/snuba)
    def process_query(self, query: Query,
                      request_settings: RequestSettings) -> None:
        conditions = query.get_conditions()
        if not conditions:
            return

        # Enable the processor only if we have enough data in the flattened
        # columns, which have been deployed at BEGINNING_OF_TIME. If the query
        # starts earlier than that, we do not apply the optimization.
        if self.__beginning_of_time:
            apply_optimization = False
            for condition in conditions:
                if (is_condition(condition) and isinstance(condition[0], str)
                        and condition[0] in self.__timestamp_cols
                        and condition[1] in (">=", ">")
                        and isinstance(condition[2], str)):
                    try:
                        start_ts = parse_datetime(condition[2])
                        if (start_ts -
                                self.__beginning_of_time).total_seconds() > 0:
                            apply_optimization = True
                    except Exception:
                        # We should not get here; it means the from timestamp is
                        # malformed. Returning here is just for safety.
                        logger.error(
                            "Cannot parse start date for NestedFieldOptimizer: %r",
                            condition,
                        )
                        return
            if not apply_optimization:
                return

        # Do not use flattened tags if tags are being unpacked anyway. In that case,
        # using flattened tags only implies loading an additional column, thus making
        # the query heavier and slower.
        if self.__has_tags(query.get_arrayjoin_from_ast()):
            return
        if query.get_groupby_from_ast():
            for expression in query.get_groupby_from_ast():
                if self.__has_tags(expression):
                    return
        if self.__has_tags(query.get_having_from_ast()):
            return

        if query.get_orderby_from_ast():
            for orderby in query.get_orderby_from_ast():
                if self.__has_tags(orderby.expression):
                    return

        new_conditions = []
        positive_like_expression: List[str] = []
        negative_like_expression: List[str] = []

        for c in conditions:
            keyvalue = self.__is_optimizable(c, self.__nested_col)
            if not keyvalue:
                new_conditions.append(c)
            else:
                expression = f"{escape_field(keyvalue.nested_col_key)}={escape_field(keyvalue.value)}"
                if keyvalue.operand == Operand.EQ:
                    positive_like_expression.append(expression)
                else:
                    negative_like_expression.append(expression)

        if positive_like_expression:
            # Positive conditions "=" are all merged together in one LIKE expression
            positive_like_expression = sorted(positive_like_expression)
            like_formatted = f"%|{'|%|'.join(positive_like_expression)}|%"
            new_conditions.append(
                [self.__flattened_col, "LIKE", like_formatted])

        for expression in negative_like_expression:
            # Negative conditions "!=" cannot be merged together. We can still transform
            # them into NOT LIKE statements, but each condition has to be one
            # statement.
            not_like_formatted = f"%|{expression}|%"
            new_conditions.append(
                [self.__flattened_col, "NOT LIKE", not_like_formatted])

        query.set_conditions(new_conditions)
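
The rewrite above collapses all positive tag conditions into a single LIKE over the flattened column and emits one NOT LIKE per negative condition. A toy sketch of just the pattern construction, using a hypothetical flattened column name and pre-escaped 'key=value' strings (the real processor escapes keys and values with escape_field first):

from typing import List, Tuple

def build_tag_conditions(
    positive_pairs: List[str], negative_pairs: List[str]
) -> List[Tuple[str, str, str]]:
    """positive_pairs / negative_pairs are pre-escaped 'key=value' strings."""
    conditions = []
    if positive_pairs:
        # All '=' conditions are merged into one LIKE expression.
        like_formatted = f"%|{'|%|'.join(sorted(positive_pairs))}|%"
        conditions.append(("_tags_flattened", "LIKE", like_formatted))
    for pair in negative_pairs:
        # '!=' conditions cannot be merged; each becomes its own NOT LIKE.
        conditions.append(("_tags_flattened", "NOT LIKE", f"%|{pair}|%"))
    return conditions

print(build_tag_conditions(["env=prod", "release=1.0"], ["env=dev"]))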
Code example #10
File: split.py (project: ruezetle/snuba)
    def time_split(dataset, request: Request, *args, **kwargs):
        """
        If a query is:
            - ORDER BY timestamp DESC
            - has no grouping
            - has an offset/limit
            - has a large time range
        We know we have to reverse-sort the entire set of rows to return the small
        chunk at the end of the time range, so optimistically split the time range
        into smaller increments, and start with the last one, so that we can potentially
        avoid querying the entire range.
        """
        date_align, split_step = state.get_configs([("date_align_seconds", 1),
                                                    ("split_step", 3600)
                                                    ]  # default 1 hour
                                                   )

        query_limit = request.query.get_limit()
        limit = query_limit if query_limit is not None else 0
        remaining_offset = request.query.get_offset()

        to_date = util.parse_datetime(
            request.extensions["timeseries"]["to_date"], date_align)
        from_date = util.parse_datetime(
            request.extensions["timeseries"]["from_date"], date_align)

        overall_result = None
        split_end = to_date
        split_start = max(split_end - timedelta(seconds=split_step), from_date)
        total_results = 0
        while split_start < split_end and total_results < limit:
            request.extensions["timeseries"][
                "from_date"] = split_start.isoformat()
            request.extensions["timeseries"]["to_date"] = split_end.isoformat()
            # Because it's paged, we have to ask for (limit+offset) results
            # and set offset=0 so we can then trim them ourselves.
            request.query.set_offset(0)
            request.query.set_limit(limit - total_results + remaining_offset)

            # The query function may mutate the request body during query
            # evaluation, so we need to copy the body to ensure that the query
            # has not been modified in between this call and the next loop
            # iteration, if needed.
            # XXX: The extra data is carried across from the initial response
            # and never updated.
            result = query_func(dataset, copy.deepcopy(request), *args,
                                **kwargs)

            if overall_result is None:
                overall_result = result
            else:
                overall_result.result["data"].extend(result.result["data"])

            if remaining_offset > 0 and len(overall_result.result["data"]) > 0:
                to_trim = min(remaining_offset,
                              len(overall_result.result["data"]))
                overall_result.result["data"] = overall_result.result["data"][
                    to_trim:]
                remaining_offset -= to_trim

            total_results = len(overall_result.result["data"])

            if total_results < limit:
                if len(result.result["data"]) == 0:
                    # If we got nothing from the last query, expand the range by a static factor
                    split_step = split_step * STEP_GROWTH
                else:
                    # If we got some results but not all of them, estimate how big the time
                    # range should be for the next query based on how many results we got for
                    # our last query and its time range, and how many we have left to fetch.
                    remaining = limit - total_results
                    split_step = split_step * math.ceil(
                        remaining / float(len(result.result["data"])))

                # Set the start and end of the next query based on the new range.
                split_end = split_start
                try:
                    split_start = max(
                        split_end - timedelta(seconds=split_step), from_date)
                except OverflowError:
                    split_start = from_date

        return overall_result
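
The window sizing inside the loop is the interesting part: an empty chunk grows the window by a static factor, while a partially full chunk scales it by the ratio of rows still needed to rows just returned. A sketch of that sizing logic in isolation, assuming a hypothetical STEP_GROWTH of 10 (the real constant is defined elsewhere in the module):

import math

STEP_GROWTH = 10  # assumed value for illustration

def next_split_step(split_step: float, limit: int, total_results: int,
                    last_chunk_rows: int) -> float:
    """Return the time window (in seconds) to use for the next chunk."""
    if last_chunk_rows == 0:
        # The last chunk returned nothing: expand the range by a static factor.
        return split_step * STEP_GROWTH
    # Otherwise scale the window by how many rows are still needed relative
    # to how many the last window produced.
    remaining = limit - total_results
    return split_step * math.ceil(remaining / float(last_chunk_rows))

print(next_split_step(3600, limit=1000, total_results=100, last_chunk_rows=100))  # 32400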
Code example #11
File: split.py (project: anthonynsimon/snuba)
    def execute(
        self,
        query: Query,
        request_settings: RequestSettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        If a query is:
            - ORDER BY timestamp DESC
            - has no grouping
            - has an offset/limit
            - has a large time range
        We know we have to reverse-sort the entire set of rows to return the small
        chunk at the end of the time range, so optimistically split the time range
        into smaller increments, and start with the last one, so that we can potentially
        avoid querying the entire range.
        """
        limit = query.get_limit()
        if limit is None or query.get_groupby():
            return None

        if query.get_offset() >= 1000:
            return None

        orderby = query.get_orderby()
        if not orderby or orderby[0] != f"-{self.__timestamp_col}":
            return None

        conditions = query.get_conditions() or []
        from_date_str = next(
            (condition[2] for condition in conditions
             if _identify_condition(condition, self.__timestamp_col, ">=")),
            None,
        )

        to_date_str = next(
            (condition[2] for condition in conditions
             if _identify_condition(condition, self.__timestamp_col, "<")),
            None,
        )
        from_date_ast, to_date_ast = get_time_range(query,
                                                    self.__timestamp_col)

        if not from_date_str or not to_date_str:
            return None

        date_align, split_step = state.get_configs([("date_align_seconds", 1),
                                                    ("split_step", 3600)
                                                    ]  # default 1 hour
                                                   )
        to_date = util.parse_datetime(to_date_str, date_align)
        from_date = util.parse_datetime(from_date_str, date_align)

        if from_date != from_date_ast:
            logger.warning(
                "Mismatch in start date on time splitter.",
                extra={
                    "ast": str(from_date_ast),
                    "legacy": str(from_date)
                },
                exc_info=True,
            )
            metrics.increment("mismatch.ast_from_date")

        remaining_offset = query.get_offset()

        overall_result = None
        split_end = to_date
        split_start = max(split_end - timedelta(seconds=split_step), from_date)
        total_results = 0
        while split_start < split_end and total_results < limit:
            # We need to make a copy to use during the query execution because we replace
            # the start-end conditions on the query at each iteration of this loop.
            split_query = copy.deepcopy(query)

            _replace_condition(split_query, self.__timestamp_col, ">=",
                               split_start.isoformat())
            _replace_ast_condition(split_query, self.__timestamp_col, ">=",
                                   LiteralExpr(None, split_start))
            _replace_condition(split_query, self.__timestamp_col, "<",
                               split_end.isoformat())
            _replace_ast_condition(split_query, self.__timestamp_col, "<",
                                   LiteralExpr(None, split_end))

            # Because it's paged, we have to ask for (limit+offset) results
            # and set offset=0 so we can then trim them ourselves.
            split_query.set_offset(0)
            split_query.set_limit(limit - total_results + remaining_offset)

            # At every iteration we only append the "data" key from the results returned by
            # the runner. The "extra" key is only populated at the first iteration of the
            # loop and never changed.
            result = runner(split_query, request_settings)

            if overall_result is None:
                overall_result = result
            else:
                overall_result.result["data"].extend(result.result["data"])

            if remaining_offset > 0 and len(overall_result.result["data"]) > 0:
                to_trim = min(remaining_offset,
                              len(overall_result.result["data"]))
                overall_result.result["data"] = overall_result.result["data"][
                    to_trim:]
                remaining_offset -= to_trim

            total_results = len(overall_result.result["data"])

            if total_results < limit:
                if len(result.result["data"]) == 0:
                    # If we got nothing from the last query, expand the range by a static factor
                    split_step = split_step * STEP_GROWTH
                else:
                    # If we got some results but not all of them, estimate how big the time
                    # range should be for the next query based on how many results we got for
                    # our last query and its time range, and how many we have left to fetch.
                    remaining = limit - total_results
                    split_step = split_step * math.ceil(
                        remaining / float(len(result.result["data"])))

                # Set the start and end of the next query based on the new range.
                split_end = split_start
                try:
                    split_start = max(
                        split_end - timedelta(seconds=split_step), from_date)
                except OverflowError:
                    split_start = from_date

        return overall_result
Code example #12
File: split.py (project: anthonynsimon/snuba)
    def execute(
        self,
        query: Query,
        request_settings: RequestSettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        Split query in 2 steps if a large number of columns is being selected.
            - First query only selects event_id, project_id and timestamp.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        limit = query.get_limit()
        if (limit is None or limit == 0 or query.get_groupby()
                or query.get_aggregations()
                or not query.get_selected_columns()):
            return None

        if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
            metrics.increment("column_splitter.query_above_limit")
            return None

        # Do not split if there is already a = or IN condition on an ID column
        id_column_matcher = FunctionCall(
            Or([String(ConditionFunctions.EQ),
                String(ConditionFunctions.IN)]),
            (
                Column(None, String(self.__id_column)),
                AnyExpression(),
            ),
        )

        for expr in query.get_condition_from_ast() or []:
            match = id_column_matcher.match(expr)

            if match:
                return None

        # We need to count the number of table/column name pairs,
        # not the number of distinct Column objects in the query,
        # so as to avoid counting aliased columns multiple times.
        total_columns = {(col.table_name, col.column_name)
                         for col in query.get_all_ast_referenced_columns()}

        minimal_query = copy.deepcopy(query)
        minimal_query.set_selected_columns(
            [self.__id_column, self.__project_column, self.__timestamp_column])
        # TODO: provide the table alias name to this splitter if we ever use it
        # in joins.
        minimal_query.set_ast_selected_columns([
            SelectedExpression(self.__id_column,
                               ColumnExpr(None, None, self.__id_column)),
            SelectedExpression(self.__project_column,
                               ColumnExpr(None, None, self.__project_column)),
            SelectedExpression(
                self.__timestamp_column,
                ColumnExpr(None, None, self.__timestamp_column),
            ),
        ])

        for exp in minimal_query.get_all_expressions():
            if exp.alias in (
                    self.__id_column,
                    self.__project_column,
                    self.__timestamp_column,
            ) and not (isinstance(exp, ColumnExpr)
                       and exp.column_name == exp.alias):
                logger.warning(
                    "Potential alias shadowing due to column splitter",
                    extra={"expression": exp},
                    exc_info=True,
                )

        minimal_columns = {
            (col.table_name, col.column_name)
            for col in minimal_query.get_all_ast_referenced_columns()
        }
        if len(total_columns) <= len(minimal_columns):
            return None

        # Ensures the AST minimal query is actually runnable on its own.
        if not minimal_query.validate_aliases():
            return None

        legacy_references = set(minimal_query.get_all_referenced_columns())
        ast_column_names = {
            c.column_name
            for c in minimal_query.get_all_ast_referenced_columns()
        }
        # Ensures the legacy minimal query (which does not expand alias references)
        # does not contain alias references we removed when creating minimal_query.
        if legacy_references - ast_column_names:
            metrics.increment("columns.skip_invalid_legacy_query")
            return None

        result = runner(minimal_query, request_settings)
        del minimal_query

        if not result.result["data"]:
            return None

        # Making a copy just in case runner returned None (which would drive the execution
        # strategy to ignore the result of this splitter and try the next one).
        query = copy.deepcopy(query)

        event_ids = list(
            set([event[self.__id_column] for event in result.result["data"]]))
        if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
            # We may be running a query that is beyond the Clickhouse maximum query size,
            # so we cowardly abandon.
            metrics.increment(
                "column_splitter.intermediate_results_beyond_limit")
            return None

        query.add_conditions([(self.__id_column, "IN", event_ids)])
        query.add_condition_to_ast(
            in_condition(
                None,
                ColumnExpr(None, None, self.__id_column),
                [LiteralExpr(None, e_id) for e_id in event_ids],
            ))
        query.set_offset(0)
        # TODO: This is technically wrong. Event ids are unique per project, not globally.
        # So, if the minimal query only returned the same event_id from two projects, we
        # would be underestimating the limit here.
        query.set_limit(len(event_ids))

        project_ids = list(
            set([
                event[self.__project_column] for event in result.result["data"]
            ]))
        _replace_condition(
            query,
            self.__project_column,
            "IN",
            project_ids,
        )
        _replace_ast_condition(
            query,
            self.__project_column,
            "IN",
            literals_tuple(None,
                           [LiteralExpr(None, p_id) for p_id in project_ids]),
        )

        timestamps = [
            event[self.__timestamp_column] for event in result.result["data"]
        ]
        _replace_condition(
            query,
            self.__timestamp_column,
            ">=",
            util.parse_datetime(min(timestamps)).isoformat(),
        )
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            ">=",
            LiteralExpr(None, util.parse_datetime(min(timestamps))),
        )
        # We add 1 second since this gets translated to ('timestamp', '<', to_date)
        # and events are stored with a granularity of 1 second.
        _replace_condition(
            query,
            self.__timestamp_column,
            "<",
            (util.parse_datetime(max(timestamps)) +
             timedelta(seconds=1)).isoformat(),
        )
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            "<",
            LiteralExpr(
                None,
                (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
            ),
        )

        return runner(query, request_settings)
Code example #13
File: split.py (project: getsentry/snuba)
    def execute(
        self,
        query: Query,
        query_settings: QuerySettings,
        runner: SplitQueryRunner,
    ) -> Optional[QueryResult]:
        """
        Split query in 2 steps if a large number of columns is being selected.
            - First query only selects event_id, project_id and timestamp.
            - Second query selects all fields for only those events.
            - Shrink the date range.
        """
        limit = query.get_limit()
        if (limit is None or limit == 0 or query.get_groupby()
                or not query.get_selected_columns()):
            return None

        if limit > settings.COLUMN_SPLIT_MAX_LIMIT:
            metrics.increment("column_splitter.query_above_limit")
            return None

        # Do not split if there is already a = or IN condition on an ID column
        id_column_matcher = FunctionCall(
            Or([String(ConditionFunctions.EQ),
                String(ConditionFunctions.IN)]),
            (
                Column(None, String(self.__id_column)),
                AnyExpression(),
            ),
        )

        for expr in query.get_condition() or []:
            match = id_column_matcher.match(expr)

            if match:
                return None

        # We need to count the number of table/column name pairs,
        # not the number of distinct Column objects in the query,
        # so as to avoid counting aliased columns multiple times.
        selected_columns = {
            (col.table_name, col.column_name)
            for col in query.get_columns_referenced_in_select()
        }

        if len(selected_columns) < settings.COLUMN_SPLIT_MIN_COLS:
            metrics.increment("column_splitter.main_query_min_threshold")
            return None

        minimal_query = copy.deepcopy(query)

        # TODO: provide the table alias name to this splitter if we ever use it
        # in joins.
        minimal_query.set_ast_selected_columns([
            SelectedExpression(
                self.__id_column,
                ColumnExpr(self.__id_column, None, self.__id_column),
            ),
            SelectedExpression(
                self.__project_column,
                ColumnExpr(self.__project_column, None, self.__project_column),
            ),
            SelectedExpression(
                self.__timestamp_column,
                ColumnExpr(self.__timestamp_column, None,
                           self.__timestamp_column),
            ),
        ])

        for exp in minimal_query.get_all_expressions():
            if exp.alias in (
                    self.__id_column,
                    self.__project_column,
                    self.__timestamp_column,
            ) and not (isinstance(exp, ColumnExpr)
                       and exp.column_name == exp.alias):
                logger.warning(
                    "Potential alias shadowing due to column splitter",
                    extra={"expression": exp},
                    exc_info=True,
                )

        # Ensures the AST minimal query is actually runnable on its own.
        if not minimal_query.validate_aliases():
            return None

        # There is a Clickhouse bug where if functions in the ORDER BY clause are not in the SELECT,
        # they fail on distributed tables. For that specific case, skip the query splitter.
        for orderby in minimal_query.get_orderby():
            if isinstance(orderby.expression,
                          (FunctionCallExpr, CurriedFunctionCallExpr)):
                metrics.increment("column_splitter.orderby_has_a_function")
                return None

        result = runner(minimal_query, query_settings)
        del minimal_query

        if not result.result["data"]:
            metrics.increment("column_splitter.no_data_from_minimal_query")
            return None

        # Making a copy just in case runner returned None (which would drive the execution
        # strategy to ignore the result of this splitter and try the next one).
        query = copy.deepcopy(query)

        event_ids = list(
            set([event[self.__id_column] for event in result.result["data"]]))
        if len(event_ids) > settings.COLUMN_SPLIT_MAX_RESULTS:
            # We may be running a query that is beyond the Clickhouse maximum query size,
            # so we cowardly abandon.
            metrics.increment(
                "column_splitter.intermediate_results_beyond_limit")
            return None

        query.add_condition_to_ast(
            in_condition(
                ColumnExpr(None, None, self.__id_column),
                [LiteralExpr(None, e_id) for e_id in event_ids],
            ))
        query.set_offset(0)
        query.set_limit(len(result.result["data"]))

        project_ids = list(
            set([
                event[self.__project_column] for event in result.result["data"]
            ]))
        _replace_ast_condition(
            query,
            self.__project_column,
            "IN",
            literals_tuple(None,
                           [LiteralExpr(None, p_id) for p_id in project_ids]),
        )

        timestamps = [
            event[self.__timestamp_column] for event in result.result["data"]
        ]
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            ">=",
            LiteralExpr(None, util.parse_datetime(min(timestamps))),
        )
        # We add 1 second since this gets translated to ('timestamp', '<', to_date)
        # and events are stored with a granularity of 1 second.
        _replace_ast_condition(
            query,
            self.__timestamp_column,
            "<",
            LiteralExpr(
                None,
                (util.parse_datetime(max(timestamps)) + timedelta(seconds=1)),
            ),
        )

        return runner(query, query_settings)
Code example #14
    def wrapper(*args, **kwargs):
        body = args[0]
        use_split, date_align, split_step = state.get_configs([
            ('use_split', 0),
            ('date_align_seconds', 1),
            ('split_step', 3600),  # default 1 hour
        ])
        to_date = util.parse_datetime(body['to_date'], date_align)
        from_date = util.parse_datetime(body['from_date'], date_align)
        limit = body.get('limit', 0)
        remaining_offset = body.get('offset', 0)

        if (use_split and limit and not body.get('groupby')
                and body.get('orderby') == '-timestamp'):
            overall_result = None
            split_end = to_date
            split_start = max(split_end - timedelta(seconds=split_step),
                              from_date)
            total_results = 0
            status = 0
            while split_start < split_end and total_results < limit:
                body['from_date'] = split_start.isoformat()
                body['to_date'] = split_end.isoformat()
                # Because it's paged, we have to ask for (limit+offset) results
                # and set offset=0 so we can then trim them ourselves.
                body['offset'] = 0
                body['limit'] = limit - total_results + remaining_offset
                result, status = query_func(*args, **kwargs)

                # If something failed, discard all progress and just return that
                if status != 200:
                    overall_result = result
                    break

                if overall_result is None:
                    overall_result = result
                else:
                    overall_result['data'].extend(result['data'])

                if remaining_offset > 0 and len(overall_result['data']) > 0:
                    to_trim = min(remaining_offset,
                                  len(overall_result['data']))
                    overall_result['data'] = overall_result['data'][to_trim:]
                    remaining_offset -= to_trim

                total_results = len(overall_result['data'])

                if total_results < limit:
                    if len(result['data']) == 0:
                        # If we got nothing from the last query, jump straight to the max time range
                        split_end = split_start
                        split_start = from_date
                    else:
                        # Estimate how big the time range should be for the next query based on
                        # how many results we got for our last query and its time range, and how
                        # many we have left to fetch
                        remaining = limit - total_results
                        split_step = split_step * math.ceil(
                            remaining / float(len(result['data'])))
                        split_end = split_start
                        try:
                            split_start = max(
                                split_end - timedelta(seconds=split_step),
                                from_date)
                        except OverflowError:
                            split_start = from_date
            return overall_result, status
        else:
            return query_func(*args, **kwargs)
Code example #15
def parse_and_run_query(validated_body, timer):
    body = deepcopy(validated_body)
    turbo = body.get('turbo', False)
    max_days, table, date_align, config_sample, force_final, max_group_ids_exclude = state.get_configs([
        ('max_days', None),
        ('clickhouse_table', settings.CLICKHOUSE_TABLE),
        ('date_align_seconds', 1),
        ('sample', 1),
        # 1: always use FINAL, 0: never use final, undefined/None: use project setting.
        ('force_final', 0 if turbo else None),
        ('max_group_ids_exclude', settings.REPLACER_MAX_GROUP_IDS_TO_EXCLUDE),
    ])
    stats = {}
    to_date = util.parse_datetime(body['to_date'], date_align)
    from_date = util.parse_datetime(body['from_date'], date_align)
    assert from_date <= to_date

    if max_days is not None and (to_date - from_date).days > max_days:
        from_date = to_date - timedelta(days=max_days)

    where_conditions = body.get('conditions', [])
    where_conditions.extend([
        ('timestamp', '>=', from_date),
        ('timestamp', '<', to_date),
        ('deleted', '=', 0),
    ])
    # NOTE: we rely entirely on the schema to make sure that regular snuba
    # queries are required to send a project_id filter. Some other special
    # internal query types do not require a project_id filter.
    project_ids = util.to_list(body['project'])
    if project_ids:
        where_conditions.append(('project_id', 'IN', project_ids))

    having_conditions = body.get('having', [])

    aggregate_exprs = [
        util.column_expr(col, body, alias, agg)
        for (agg, col, alias) in body['aggregations']
    ]
    groupby = util.to_list(body['groupby'])
    group_exprs = [util.column_expr(gb, body) for gb in groupby]

    selected_cols = [util.column_expr(util.tuplify(colname), body)
                     for colname in body.get('selected_columns', [])]

    select_exprs = group_exprs + aggregate_exprs + selected_cols
    select_clause = u'SELECT {}'.format(', '.join(select_exprs))

    from_clause = u'FROM {}'.format(table)

    # For now, we only need FINAL if:
    #    1. The project has been marked as needing FINAL (in redis) because of recent
    #       replacements (and it affects too many groups for us just to exclude
    #       those groups from the query)
    #    OR
    #    2. the force_final setting = 1
    needs_final, exclude_group_ids = get_projects_query_flags(project_ids)
    if len(exclude_group_ids) > max_group_ids_exclude:
        # Cap the number of groups to exclude by query and flip to using FINAL if necessary
        needs_final = True
        exclude_group_ids = []

    used_final = False
    if force_final == 1 or (force_final is None and needs_final):
        from_clause = u'{} FINAL'.format(from_clause)
        used_final = True
    elif exclude_group_ids:
        where_conditions.append(('group_id', 'NOT IN', exclude_group_ids))

    sample = body.get('sample', settings.TURBO_SAMPLE_RATE if turbo else config_sample)
    if sample != 1:
        from_clause = u'{} SAMPLE {}'.format(from_clause, sample)

    joins = []

    if 'arrayjoin' in body:
        joins.append(u'ARRAY JOIN {}'.format(body['arrayjoin']))
    join_clause = ' '.join(joins)

    where_clause = ''
    if where_conditions:
        where_conditions = list(set(util.tuplify(where_conditions)))
        where_clause = u'WHERE {}'.format(util.conditions_expr(where_conditions, body))

    prewhere_conditions = []
    if settings.PREWHERE_KEYS:
        # Add any condition to PREWHERE if:
        # - It is a single top-level condition (not OR-nested), and
        # - Any of its referenced columns are in PREWHERE_KEYS
        prewhere_candidates = [
            (util.columns_in_expr(cond[0]), cond)
            for cond in where_conditions if util.is_condition(cond) and
            any(col in settings.PREWHERE_KEYS for col in util.columns_in_expr(cond[0]))
        ]
        # Use the condition that has the highest priority (based on the
        # position of its columns in the PREWHERE_KEYS list)
        prewhere_candidates = sorted([
            (min(settings.PREWHERE_KEYS.index(col) for col in cols if col in settings.PREWHERE_KEYS), cond)
            for cols, cond in prewhere_candidates
        ])
        if prewhere_candidates:
            prewhere_conditions = [cond for _, cond in prewhere_candidates][:settings.MAX_PREWHERE_CONDITIONS]

    prewhere_clause = ''
    if prewhere_conditions:
        prewhere_clause = u'PREWHERE {}'.format(util.conditions_expr(prewhere_conditions, body))

    having_clause = ''
    if having_conditions:
        assert groupby, 'found HAVING clause with no GROUP BY'
        having_clause = u'HAVING {}'.format(util.conditions_expr(having_conditions, body))

    group_clause = ', '.join(util.column_expr(gb, body) for gb in groupby)
    if group_clause:
        if body.get('totals', False):
            group_clause = 'GROUP BY ({}) WITH TOTALS'.format(group_clause)
        else:
            group_clause = 'GROUP BY ({})'.format(group_clause)

    order_clause = ''
    if body.get('orderby'):
        orderby = [util.column_expr(util.tuplify(ob), body) for ob in util.to_list(body['orderby'])]
        orderby = [u'{} {}'.format(
            ob.lstrip('-'),
            'DESC' if ob.startswith('-') else 'ASC'
        ) for ob in orderby]
        order_clause = u'ORDER BY {}'.format(', '.join(orderby))

    limitby_clause = ''
    if 'limitby' in body:
        limitby_clause = 'LIMIT {} BY {}'.format(*body['limitby'])

    limit_clause = ''
    if 'limit' in body:
        limit_clause = 'LIMIT {}, {}'.format(body.get('offset', 0), body['limit'])

    sql = ' '.join([c for c in [
        select_clause,
        from_clause,
        join_clause,
        prewhere_clause,
        where_clause,
        group_clause,
        having_clause,
        order_clause,
        limitby_clause,
        limit_clause
    ] if c])

    timer.mark('prepare_query')

    stats.update({
        'clickhouse_table': table,
        'final': used_final,
        'referrer': request.referrer,
        'num_days': (to_date - from_date).days,
        'num_projects': len(project_ids),
        'sample': sample,
    })

    return util.raw_query(
        validated_body, sql, clickhouse_ro, timer, stats
    )
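
The PREWHERE block above keeps only top-level conditions that reference at least one key in PREWHERE_KEYS, ranks them by the best (lowest) index any of their columns has in that list, and takes the first MAX_PREWHERE_CONDITIONS. A minimal sketch of that selection with hard-coded, assumed settings and conditions represented as (referenced_columns, condition) pairs:

PREWHERE_KEYS = ["event_id", "project_id", "timestamp"]  # assumed ordering
MAX_PREWHERE_CONDITIONS = 1

def pick_prewhere_conditions(conditions):
    """conditions: list of (referenced_columns, condition) pairs."""
    candidates = [
        (cols, cond) for cols, cond in conditions
        if any(col in PREWHERE_KEYS for col in cols)
    ]
    # Rank each candidate by the highest-priority (lowest-index) key it touches.
    ranked = sorted(
        (min(PREWHERE_KEYS.index(col) for col in cols if col in PREWHERE_KEYS), cond)
        for cols, cond in candidates
    )
    return [cond for _, cond in ranked][:MAX_PREWHERE_CONDITIONS]

conditions = [
    (["timestamp"], ("timestamp", ">=", "2020-01-01")),
    (["project_id"], ("project_id", "IN", [1, 2])),
    (["message"], ("message", "LIKE", "%error%")),
]
print(pick_prewhere_conditions(conditions))  # [('project_id', 'IN', [1, 2])]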