Esempio n. 1
0
    def test_alias_in_alias(self):
        body = {'groupby': ['tags_key', 'tags_value']}
        assert column_expr('tags_key', body) == (
            '(((arrayJoin(arrayMap((x,y) -> [x,y], tags.key, tags.value)) '
            'AS all_tags))[1] AS tags_key)')

        # If we want to use `tags_key` again, make sure we use the
        # already-created alias verbatim
        assert column_expr('tags_key', body) == 'tags_key'
        # If we also want to use `tags_value`, make sure that we use
        # the `all_tags` alias instead of re-expanding the tags arrayJoin
        assert column_expr('tags_value',
                           body) == '((all_tags)[2] AS tags_value)'
Esempio n. 2
0
 def test_duplicate_expression_alias(self):
     body = {
         'aggregations': [
             ['topK(3)', 'logger', 'dupe_alias'],
             ['uniq', 'environment', 'dupe_alias'],
         ]
     }
     # In the case where 2 different expressions are aliased
     # to the same thing, one ends up overwriting the other.
     # This may not be ideal as it may mask bugs in query conditions
     exprs = [
         column_expr(col, body, alias, agg)
         for (agg, col, alias) in body['aggregations']
     ]
     assert exprs == ['(topK(3)(logger) AS dupe_alias)', 'dupe_alias']
Esempio n. 3
0
    def test_column_expr(self):
        body = {'granularity': 86400}
        # Single tag expression
        assert column_expr('tags[foo]', body.copy()) ==\
            "(tags.value[indexOf(tags.key, \'foo\')] AS `tags[foo]`)"

        # Promoted tag expression / no translation
        assert column_expr('tags[server_name]', body.copy()) ==\
            "(server_name AS `tags[server_name]`)"

        # Promoted tag expression / with translation
        assert column_expr('tags[app.device]', body.copy()) ==\
            "(app_device AS `tags[app.device]`)"

        # All tag keys expression
        assert column_expr(
            'tags_key', body.copy()) == ('(arrayJoin(tags.key) AS tags_key)')

        # If we are going to use both tags_key and tags_value, expand both
        tag_group_body = {'groupby': ['tags_key', 'tags_value']}
        assert column_expr('tags_key', tag_group_body) == (
            '(((arrayJoin(arrayMap((x,y) -> [x,y], tags.key, tags.value)) '
            'AS all_tags))[1] AS tags_key)')

        assert column_expr('time', body.copy()) ==\
            "(toDate(timestamp) AS time)"

        assert column_expr('col', body.copy(), aggregate='sum') ==\
            "(sum(col) AS col)"

        assert column_expr(None, body.copy(), alias='sum', aggregate='sum') ==\
            "sum"  # This should probably be an error as its an aggregate with no column

        assert column_expr('col', body.copy(), alias='summation', aggregate='sum') ==\
            "(sum(col) AS summation)"

        # Special cases where count() doesn't need a column
        assert column_expr('', body.copy(), alias='count', aggregate='count()') ==\
            "(count() AS count)"

        assert column_expr('', body.copy(), alias='aggregate', aggregate='count()') ==\
            "(count() AS aggregate)"

        # Columns that need escaping
        assert column_expr('sentry:release', body.copy()) == '`sentry:release`'

        # Columns that start with a negative sign (used in orderby to signify
        # sort order) retain the '-' sign outside the escaping backticks (if any)
        assert column_expr('-timestamp', body.copy()) == '-timestamp'
        assert column_expr('-sentry:release',
                           body.copy()) == '-`sentry:release`'

        # A 'column' that is actually a string literal
        assert column_expr('\'hello world\'', body.copy()) == '\'hello world\''

        # Complex expressions (function calls) involving both string and column arguments
        assert column_expr(tuplify(['concat', ['a', '\':\'', 'b']]),
                           body.copy()) == 'concat(a, \':\', b)'

        group_id_body = body.copy()
        assert column_expr('issue', group_id_body) == '(group_id AS issue)'
Esempio n. 4
0
    def test_conditions_expr(self):
        conditions = [['a', '=', 1]]
        assert conditions_expr(conditions, {}) == 'a = 1'

        conditions = [[['a', '=', 1]]]
        assert conditions_expr(conditions, {}) == 'a = 1'

        conditions = [['a', '=', 1], ['b', '=', 2]]
        assert conditions_expr(conditions, {}) == 'a = 1 AND b = 2'

        conditions = [[['a', '=', 1], ['b', '=', 2]]]
        assert conditions_expr(conditions, {}) == '(a = 1 OR b = 2)'

        conditions = [[['a', '=', 1], ['b', '=', 2]], ['c', '=', 3]]
        assert conditions_expr(conditions, {}) == '(a = 1 OR b = 2) AND c = 3'

        conditions = [[['a', '=', 1], ['b', '=', 2]],
                      [['c', '=', 3], ['d', '=', 4]]]
        assert conditions_expr(conditions,
                               {}) == '(a = 1 OR b = 2) AND (c = 3 OR d = 4)'

        # Malformed condition input
        conditions = [[['a', '=', 1], []]]
        assert conditions_expr(conditions, {}) == 'a = 1'

        # Test column expansion
        conditions = [[['tags[foo]', '=', 1], ['b', '=', 2]]]
        expanded = column_expr('tags[foo]', {})
        assert conditions_expr(conditions,
                               {}) == '({} = 1 OR b = 2)'.format(expanded)

        # Test using alias if column has already been expanded in SELECT clause
        reuse_body = {}
        conditions = [[['tags[foo]', '=', 1], ['b', '=', 2]]]
        column_expr('tags[foo]',
                    reuse_body)  # Expand it once so the next time is aliased
        assert conditions_expr(conditions,
                               reuse_body) == '(`tags[foo]` = 1 OR b = 2)'

        # Test special output format of LIKE
        conditions = [['primary_hash', 'LIKE', '%foo%']]
        assert conditions_expr(conditions, {}) == 'primary_hash LIKE \'%foo%\''

        conditions = tuplify(
            [[['notEmpty', ['arrayElement', ['exception_stacks.type', 1]]],
              '=', 1]])
        assert conditions_expr(
            conditions,
            {}) == 'notEmpty(arrayElement(exception_stacks.type, 1)) = 1'

        conditions = tuplify([[['notEmpty', ['tags[sentry:user]']], '=', 1]])
        assert conditions_expr(
            conditions,
            {}) == 'notEmpty((`sentry:user` AS `tags[sentry:user]`)) = 1'

        conditions = tuplify([[['notEmpty', ['tags_key']], '=', 1]])
        assert conditions_expr(
            conditions,
            {}) == 'notEmpty((arrayJoin(tags.key) AS tags_key)) = 1'

        conditions = tuplify([
            [[['notEmpty', ['tags[sentry:environment]']], '=', 'dev'],
             [['notEmpty', ['tags[sentry:environment]']], '=', 'prod']],
            [[['notEmpty', ['tags[sentry:user]']], '=', 'joe'],
             [['notEmpty', ['tags[sentry:user]']], '=', 'bob']],
        ])
        assert conditions_expr(conditions, {}) == \
                """(notEmpty((tags.value[indexOf(tags.key, 'sentry:environment')] AS `tags[sentry:environment]`)) = 'dev' OR notEmpty(`tags[sentry:environment]`) = 'prod') AND (notEmpty((`sentry:user` AS `tags[sentry:user]`)) = 'joe' OR notEmpty(`tags[sentry:user]`) = 'bob')"""

        # Test scalar condition on array column is expanded as an iterator.
        conditions = [['exception_frames.filename', 'LIKE', '%foo%']]
        assert conditions_expr(
            conditions, {}
        ) == 'arrayExists(x -> assumeNotNull(x LIKE \'%foo%\'), exception_frames.filename)'

        # Test negative scalar condition on array column is expanded as an all() type iterator.
        conditions = [['exception_frames.filename', 'NOT LIKE', '%foo%']]
        assert conditions_expr(
            conditions, {}
        ) == 'arrayAll(x -> assumeNotNull(x NOT LIKE \'%foo%\'), exception_frames.filename)'
Esempio n. 5
0
    def test_column_expr(self):
        body = {'granularity': 86400}
        # Single tag expression
        assert column_expr(self.dataset, 'tags[foo]', body.copy()) ==\
            "(tags.value[indexOf(tags.key, \'foo\')] AS `tags[foo]`)"

        # Promoted tag expression / no translation
        assert column_expr(self.dataset, 'tags[server_name]', body.copy()) ==\
            "(server_name AS `tags[server_name]`)"

        # Promoted tag expression / with translation
        assert column_expr(self.dataset, 'tags[app.device]', body.copy()) ==\
            "(app_device AS `tags[app.device]`)"

        # All tag keys expression
        assert column_expr(
            self.dataset, 'tags_key',
            body.copy()) == ('(arrayJoin(tags.key) AS tags_key)')

        # If we are going to use both tags_key and tags_value, expand both
        tag_group_body = {'groupby': ['tags_key', 'tags_value']}
        assert column_expr(self.dataset, 'tags_key', tag_group_body) == (
            '(((arrayJoin(arrayMap((x,y) -> [x,y], tags.key, tags.value)) '
            'AS all_tags))[1] AS tags_key)')

        assert column_expr(self.dataset, 'time', body.copy()) ==\
            "(toDate(timestamp) AS time)"

        assert column_expr(self.dataset, 'rtime', body.copy()) ==\
            "(toDate(received) AS rtime)"

        assert column_expr(self.dataset, 'col', body.copy(), aggregate='sum') ==\
            "(sum(col) AS col)"

        assert column_expr(self.dataset, 'col', body.copy(), alias='summation', aggregate='sum') ==\
            "(sum(col) AS summation)"

        # Special cases where count() doesn't need a column
        assert column_expr(self.dataset, '', body.copy(), alias='count', aggregate='count()') ==\
            "(count() AS count)"

        assert column_expr(self.dataset, '', body.copy(), alias='aggregate', aggregate='count()') ==\
            "(count() AS aggregate)"

        # Columns that need escaping
        assert column_expr(self.dataset, 'sentry:release',
                           body.copy()) == '`sentry:release`'

        # Columns that start with a negative sign (used in orderby to signify
        # sort order) retain the '-' sign outside the escaping backticks (if any)
        assert column_expr(self.dataset, '-timestamp',
                           body.copy()) == '-timestamp'
        assert column_expr(self.dataset, '-sentry:release',
                           body.copy()) == '-`sentry:release`'

        # A 'column' that is actually a string literal
        assert column_expr(self.dataset, '\'hello world\'',
                           body.copy()) == '\'hello world\''

        # Complex expressions (function calls) involving both string and column arguments
        assert column_expr(self.dataset,
                           tuplify(['concat', ['a', '\':\'', 'b']]),
                           body.copy()) == 'concat(a, \':\', b)'

        group_id_body = body.copy()
        assert column_expr(self.dataset, 'issue',
                           group_id_body) == '(nullIf(group_id, 0) AS issue)'
        assert column_expr(
            self.dataset, 'group_id',
            group_id_body) == '(nullIf(group_id, 0) AS group_id)'

        # turn uniq() into ifNull(uniq(), 0) so it doesn't return null where a number was expected.
        assert column_expr(self.dataset,
                           'tags[environment]',
                           body.copy(),
                           alias='unique_envs',
                           aggregate='uniq'
                           ) == "(ifNull(uniq(environment), 0) AS unique_envs)"
Esempio n. 6
0
    def format(self) -> str:
        """Generate a SQL string from the parameters."""
        body = self.__request.body
        query = self.__request.query
        source = self.__dataset \
            .get_dataset_schemas() \
            .get_read_schema() \
            .get_data_source()

        aggregate_exprs = [
            util.column_expr(self.__dataset, col, body, alias, agg)
            for (agg, col, alias) in query.get_aggregations()
        ]
        groupby = util.to_list(query.get_groupby())
        group_exprs = [
            util.column_expr(self.__dataset, gb, body) for gb in groupby
        ]
        column_names = query.get_selected_columns() or []
        selected_cols = [
            util.column_expr(self.__dataset, util.tuplify(colname), body)
            for colname in column_names
        ]
        select_clause = u'SELECT {}'.format(
            ', '.join(group_exprs + aggregate_exprs + selected_cols))

        from_clause = u'FROM {}'.format(source)
        if self.__final:
            from_clause = u'{} FINAL'.format(from_clause)
        if query.get_sample():
            from_clause = u'{} SAMPLE {}'.format(from_clause,
                                                 query.get_sample())

        join_clause = ''
        if 'arrayjoin' in body:
            join_clause = u'ARRAY JOIN {}'.format(body['arrayjoin'])

        where_clause = ''
        if query.get_conditions():
            where_clause = u'WHERE {}'.format(
                util.conditions_expr(self.__dataset, query.get_conditions(),
                                     body))

        prewhere_clause = ''
        if self.__prewhere_conditions:
            prewhere_clause = u'PREWHERE {}'.format(
                util.conditions_expr(self.__dataset,
                                     self.__prewhere_conditions, body))

        group_clause = ''
        if groupby:
            group_clause = 'GROUP BY ({})'.format(', '.join(
                util.column_expr(self.__dataset, gb, body) for gb in groupby))
            if body.get('totals', False):
                group_clause = '{} WITH TOTALS'.format(group_clause)

        having_clause = ''
        having_conditions = body.get('having', [])
        if having_conditions:
            assert groupby, 'found HAVING clause with no GROUP BY'
            having_clause = u'HAVING {}'.format(
                util.conditions_expr(self.__dataset, having_conditions, body))

        order_clause = ''
        if query.get_orderby():
            orderby = [
                util.column_expr(self.__dataset, util.tuplify(ob), body)
                for ob in util.to_list(query.get_orderby())
            ]
            orderby = [
                u'{} {}'.format(ob.lstrip('-'),
                                'DESC' if ob.startswith('-') else 'ASC')
                for ob in orderby
            ]
            order_clause = u'ORDER BY {}'.format(', '.join(orderby))

        limitby_clause = ''
        if 'limitby' in body:
            limitby_clause = 'LIMIT {} BY {}'.format(*body['limitby'])

        limit_clause = ''
        if 'limit' in body:
            limit_clause = 'LIMIT {}, {}'.format(query.get_offset(),
                                                 body['limit'])

        return ' '.join([
            c for c in [
                select_clause, from_clause, join_clause, prewhere_clause,
                where_clause, group_clause, having_clause, order_clause,
                limitby_clause, limit_clause
            ] if c
        ])
Esempio n. 7
0
def parse_and_run_query(validated_body, timer):
    body = deepcopy(validated_body)
    turbo = body.get('turbo', False)
    max_days, table, date_align, config_sample, force_final, max_group_ids_exclude = state.get_configs([
        ('max_days', None),
        ('clickhouse_table', settings.CLICKHOUSE_TABLE),
        ('date_align_seconds', 1),
        ('sample', 1),
        # 1: always use FINAL, 0: never use final, undefined/None: use project setting.
        ('force_final', 0 if turbo else None),
        ('max_group_ids_exclude', settings.REPLACER_MAX_GROUP_IDS_TO_EXCLUDE),
    ])
    stats = {}
    to_date = util.parse_datetime(body['to_date'], date_align)
    from_date = util.parse_datetime(body['from_date'], date_align)
    assert from_date <= to_date

    if max_days is not None and (to_date - from_date).days > max_days:
        from_date = to_date - timedelta(days=max_days)

    where_conditions = body.get('conditions', [])
    where_conditions.extend([
        ('timestamp', '>=', from_date),
        ('timestamp', '<', to_date),
        ('deleted', '=', 0),
    ])
    # NOTE: we rely entirely on the schema to make sure that regular snuba
    # queries are required to send a project_id filter. Some other special
    # internal query types do not require a project_id filter.
    project_ids = util.to_list(body['project'])
    if project_ids:
        where_conditions.append(('project_id', 'IN', project_ids))

    having_conditions = body.get('having', [])

    aggregate_exprs = [
        util.column_expr(col, body, alias, agg)
        for (agg, col, alias) in body['aggregations']
    ]
    groupby = util.to_list(body['groupby'])
    group_exprs = [util.column_expr(gb, body) for gb in groupby]

    selected_cols = [util.column_expr(util.tuplify(colname), body)
                     for colname in body.get('selected_columns', [])]

    select_exprs = group_exprs + aggregate_exprs + selected_cols
    select_clause = u'SELECT {}'.format(', '.join(select_exprs))

    from_clause = u'FROM {}'.format(table)

    # For now, we only need FINAL if:
    #    1. The project has been marked as needing FINAL (in redis) because of recent
    #       replacements (and it affects too many groups for us just to exclude
    #       those groups from the query)
    #    OR
    #    2. the force_final setting = 1
    needs_final, exclude_group_ids = get_projects_query_flags(project_ids)
    if len(exclude_group_ids) > max_group_ids_exclude:
        # Cap the number of groups to exclude by query and flip to using FINAL if necessary
        needs_final = True
        exclude_group_ids = []

    used_final = False
    if force_final == 1 or (force_final is None and needs_final):
        from_clause = u'{} FINAL'.format(from_clause)
        used_final = True
    elif exclude_group_ids:
        where_conditions.append(('group_id', 'NOT IN', exclude_group_ids))

    sample = body.get('sample', settings.TURBO_SAMPLE_RATE if turbo else config_sample)
    if sample != 1:
        from_clause = u'{} SAMPLE {}'.format(from_clause, sample)

    joins = []

    if 'arrayjoin' in body:
        joins.append(u'ARRAY JOIN {}'.format(body['arrayjoin']))
    join_clause = ' '.join(joins)

    where_clause = ''
    if where_conditions:
        where_conditions = list(set(util.tuplify(where_conditions)))
        where_clause = u'WHERE {}'.format(util.conditions_expr(where_conditions, body))

    prewhere_conditions = []
    if settings.PREWHERE_KEYS:
        # Add any condition to PREWHERE if:
        # - It is a single top-level condition (not OR-nested), and
        # - Any of its referenced columns are in PREWHERE_KEYS
        prewhere_candidates = [
            (util.columns_in_expr(cond[0]), cond)
            for cond in where_conditions if util.is_condition(cond) and
            any(col in settings.PREWHERE_KEYS for col in util.columns_in_expr(cond[0]))
        ]
        # Use the condition that has the highest priority (based on the
        # position of its columns in the PREWHERE_KEYS list)
        prewhere_candidates = sorted([
            (min(settings.PREWHERE_KEYS.index(col) for col in cols if col in settings.PREWHERE_KEYS), cond)
            for cols, cond in prewhere_candidates
        ])
        if prewhere_candidates:
            prewhere_conditions = [cond for _, cond in prewhere_candidates][:settings.MAX_PREWHERE_CONDITIONS]

    prewhere_clause = ''
    if prewhere_conditions:
        prewhere_clause = u'PREWHERE {}'.format(util.conditions_expr(prewhere_conditions, body))

    having_clause = ''
    if having_conditions:
        assert groupby, 'found HAVING clause with no GROUP BY'
        having_clause = u'HAVING {}'.format(util.conditions_expr(having_conditions, body))

    group_clause = ', '.join(util.column_expr(gb, body) for gb in groupby)
    if group_clause:
        if body.get('totals', False):
            group_clause = 'GROUP BY ({}) WITH TOTALS'.format(group_clause)
        else:
            group_clause = 'GROUP BY ({})'.format(group_clause)

    order_clause = ''
    if body.get('orderby'):
        orderby = [util.column_expr(util.tuplify(ob), body) for ob in util.to_list(body['orderby'])]
        orderby = [u'{} {}'.format(
            ob.lstrip('-'),
            'DESC' if ob.startswith('-') else 'ASC'
        ) for ob in orderby]
        order_clause = u'ORDER BY {}'.format(', '.join(orderby))

    limitby_clause = ''
    if 'limitby' in body:
        limitby_clause = 'LIMIT {} BY {}'.format(*body['limitby'])

    limit_clause = ''
    if 'limit' in body:
        limit_clause = 'LIMIT {}, {}'.format(body.get('offset', 0), body['limit'])

    sql = ' '.join([c for c in [
        select_clause,
        from_clause,
        join_clause,
        prewhere_clause,
        where_clause,
        group_clause,
        having_clause,
        order_clause,
        limitby_clause,
        limit_clause
    ] if c])

    timer.mark('prepare_query')

    stats.update({
        'clickhouse_table': table,
        'final': used_final,
        'referrer': request.referrer,
        'num_days': (to_date - from_date).days,
        'num_projects': len(project_ids),
        'sample': sample,
    })

    return util.raw_query(
        validated_body, sql, clickhouse_ro, timer, stats
    )