def visitColumn(self, exp: Column) -> str:
    """Render a Column AST node as an escaped identifier with its alias applied."""
    # Qualify with the table name only when one is present.
    pieces = []
    if exp.table_name:
        pieces = [escape_identifier(exp.table_name) or "", "."]
    pieces.append(escape_identifier(exp.column_name) or "")
    return self.__alias("".join(pieces), exp.alias)
def __string_col(self, col: str) -> str:
    """Return an expression yielding `col` as a String, with the identifier escaped.

    Plain String columns pass through untouched; everything else (including
    FixedString) is wrapped in toString().
    """
    escaped = escape_identifier(col)
    raw_type = self.__columns.get(col, None)
    type_name = str(raw_type) if raw_type else None
    is_plain_string = (
        type_name is not None
        and "String" in type_name
        and "FixedString" not in type_name
    )
    if is_plain_string:
        return escaped
    return "toString({})".format(escaped)
def function_expr(fn: str, args_expr: str = "") -> str:
    """
    DEPRECATED. Please do not add anything else here. In order to manipulate the
    query, create a QueryProcessor and register it into your dataset.

    Generate an expression for a given function name and an already-evaluated
    args expression. This is a place to define convenience functions that
    evaluate to more complex expressions.
    """

    def build_apdex(col: str, satisfied: str) -> str:
        # Apdex formula shared by the apdex() and impact() pseudo functions:
        # satisfied requests plus half the tolerated ones over the total count.
        # The tolerated threshold is four times the satisfied threshold.
        return (
            "(countIf({col} <= {satisfied}) + "
            "(countIf(({col} > {satisfied}) AND ({col} <= {tolerated})) / 2)) / count()"
        ).format(
            col=escape_identifier(col),
            satisfied=satisfied,
            tolerated=int(satisfied) * 4,
        )

    if fn.startswith("apdex("):
        match = APDEX_FUNCTION_RE.match(fn)
        if not match:
            raise ValueError("Invalid format for apdex()")
        return build_apdex(match.group(1), match.group(2))

    if fn.startswith("impact("):
        match = IMPACT_FUNCTION_RE.match(fn)
        if not match:
            raise ValueError("Invalid format for impact()")
        return "(1 - {apdex}) + ((1 - (1 / sqrt(uniq({user_col})))) * 3)".format(
            apdex=build_apdex(match.group(1), match.group(2)),
            user_col=escape_identifier(match.group(3)),
        )

    # For functions with no args (or static args) we allow them to already
    # include them as part of the function name, eg, "count()" or "sleep(1)".
    if not args_expr and fn.endswith(")"):
        return fn

    # Convenience topK function eg "top10", "top3" etc.
    topk_match = TOPK_FUNCTION_RE.match(fn)
    if topk_match:
        return "topK({})({})".format(topk_match.group(1), args_expr)

    # turn uniq() into ifNull(uniq(), 0) so it doesn't return null where
    # a number was expected.
    if fn == "uniq":
        return "ifNull({}({}), 0)".format(fn, args_expr)

    # emptyIfNull(col) is a simple pseudo function supported by Snuba that
    # expands to the actual clickhouse function ifNull(col, '') until we figure
    # out the best way to disambiguate column names from string literals in
    # complex functions.
    if fn == "emptyIfNull" and args_expr:
        return "ifNull({}, '')".format(args_expr)

    # default: just return fn(args_expr)
    return "{}({})".format(fn, args_expr)
def __init__(self, base_name: Optional[str], name: str, type: ColumnType) -> None:
    """A column flattened out of any nesting, keeping its escaped full name."""
    self.base_name = base_name
    self.name = name
    self.type = type
    # Nested columns are addressed as "<base>.<name>"; top-level ones by name.
    if self.base_name:
        self.flattened = "{}.{}".format(self.base_name, self.name)
    else:
        self.flattened = self.name
    self.escaped = escape_identifier(self.flattened)
def visit_column(self, exp: Column) -> str:
    """Render a Column node, dropping the alias when it equals the raw name."""
    escaped_parts = []
    raw_parts = []
    if exp.table_name:
        escaped_parts.extend([escape_identifier(exp.table_name) or "", "."])
        raw_parts.extend([exp.table_name or "", "."])
    escaped_parts.append(escape_identifier(exp.column_name) or "")
    raw_parts.append(exp.column_name)

    rendered = "".join(escaped_parts)
    # De-clutter the output query by not applying an alias to a column when the
    # column name is the same as the alias. This happens often since column
    # aliases are applied during parsing, so names are preserved through query
    # processing.
    if exp.alias == "".join(raw_parts):
        return rendered
    return self.__alias(rendered, exp.alias)
def __init__(
    self, base_name: Optional[str], name: str, type: ColumnType[TModifiers]
) -> None:
    """A flattened column; the escaped name is guaranteed non-None."""
    self.base_name = base_name
    self.name = name
    self.type = type
    self.flattened = (
        self.name
        if not self.base_name
        else "{}.{}".format(self.base_name, self.name)
    )
    escaped = escape_identifier(self.flattened)
    # escape_identifier only returns None for a None input; flattened is a str.
    assert escaped is not None
    self.escaped: str = escaped
def column_expr(
    self,
    column_name: str,
    query: Query,
    parsing_context: ParsingContext,
    table_alias: str = "",
) -> Union[None, Any]:
    """
    Return an expression for the column name. Handle special column aliases
    that evaluate to something else.
    """
    qualified = qualified_column(column_name, table_alias)
    return escape_identifier(qualified)
def test_escape_identifier(self):
    """escape_identifier passes safe names through and backtick-quotes the rest."""
    assert escape_identifier(None) is None

    cases = [
        ("", ""),
        ("foo", "foo"),
        ("foo.bar", "foo.bar"),
        ("foo:bar", "`foo:bar`"),
        # Even though backtick characters in columns should be disallowed by
        # the query schema, make sure we dont allow injection anyway.
        ("`", r"`\``"),
        ("production`; --", r"`production\`; --`"),
    ]
    for raw, expected in cases:
        assert escape_identifier(raw) == expected
def visit_column(self, exp: Column) -> str:
    """Render a Column node; table-qualified columns use alias escaping rules."""
    if exp.table_name:
        # If there is a table name and the column name contains a ".", the
        # column name must be escaped using alias regex rules to clearly
        # demarcate the table and columns.
        escaped = "{}.{}".format(
            escape_identifier(exp.table_name) or "",
            escape_alias(exp.column_name) or "",
        )
        unescaped = "{}.{}".format(exp.table_name or "", exp.column_name)
    else:
        escaped = escape_identifier(exp.column_name) or ""
        unescaped = exp.column_name

    # De-clutter the output query by not applying an alias to a column when the
    # column name is the same as the alias. This happens often since column
    # aliases are applied during parsing, so names are preserved through query
    # processing.
    if exp.alias == unescaped:
        return escaped
    return self._alias(escaped, exp.alias)
def for_schema(self) -> str:
    """Render this column as a "<name> <type>" DDL fragment."""
    name = escape_identifier(self.name)
    return "{} {}".format(name, self.type.for_schema())
def process_delete_tag(
    message: Mapping[str, Any],
    schema: TableSchema,
    tag_column_map: Mapping[str, Mapping[str, str]],
    promoted_tags: Mapping[str, Sequence[str]],
) -> Optional[Replacement]:
    """Build a Replacement that deletes one tag from all rows of a project.

    Returns None when the message carries an empty tag. The replacement
    rewrites each matching row via INSERT ... SELECT, removing the tag from
    the tags.key/tags.value arrays, nulling its promoted column when it has
    one, and recomputing the flattened tags column.
    """
    tag = message["tag"]
    if not tag:
        return None

    assert isinstance(tag, str)
    timestamp = datetime.strptime(message["datetime"], settings.PAYLOAD_DATETIME_FORMAT)
    tag_column_name = tag_column_map["tags"].get(tag, tag)
    is_promoted = tag in promoted_tags["tags"]

    where = """\
WHERE project_id = %(project_id)s
AND received <= CAST('%(timestamp)s' AS DateTime)
AND NOT deleted
"""

    if is_promoted:
        # Promoted tags can be located via their dedicated column directly.
        prewhere = " PREWHERE %(tag_column)s IS NOT NULL "
    else:
        prewhere = " PREWHERE has(`tags.key`, %(tag_str)s) "

    insert_query_template = (
        """\
INSERT INTO %(dist_write_table_name)s (%(all_columns)s)
SELECT %(select_columns)s
FROM %(dist_read_table_name)s FINAL
"""
        + prewhere
        + where
    )

    # Materialized columns are computed by ClickHouse and must not be written.
    all_columns = [
        col
        for col in schema.get_columns()
        if Materialized not in col.type.get_all_modifiers()
    ]
    select_columns = []
    for col in all_columns:
        if is_promoted and col.flattened == tag_column_name:
            # Clear the promoted column for the deleted tag.
            select_columns.append("NULL")
        elif col.flattened == "tags.key":
            # Drop the deleted key from the key array.
            select_columns.append(
                "arrayFilter(x -> (indexOf(`tags.key`, x) != indexOf(`tags.key`, %s)), `tags.key`)"
                % escape_string(tag)
            )
        elif col.flattened == "tags.value":
            # Drop the value at the deleted key's index, keeping tags.key and
            # tags.value aligned.
            select_columns.append(
                "arrayMap(x -> arrayElement(`tags.value`, x), arrayFilter(x -> x != indexOf(`tags.key`, %s), arrayEnumerate(`tags.value`)))"
                % escape_string(tag)
            )
        elif col.flattened == "_tags_flattened":
            # Recompute the flattened representation without the deleted tag.
            select_columns.append(FLATTENED_COLUMN_TEMPLATE % escape_string(tag))
        else:
            select_columns.append(col.escaped)

    all_column_names = [col.escaped for col in all_columns]
    query_args = {
        "all_columns": ", ".join(all_column_names),
        "select_columns": ", ".join(select_columns),
        "project_id": message["project_id"],
        "tag_str": escape_string(tag),
        "tag_column": escape_identifier(tag_column_name),
        "timestamp": timestamp.strftime(DATETIME_FORMAT),
    }

    count_query_template = (
        """\
SELECT count()
FROM %(dist_read_table_name)s FINAL
"""
        + prewhere
        + where
    )

    # NEEDS_FINAL: reads must use FINAL until the replacement rows are merged.
    query_time_flags = (NEEDS_FINAL, message["project_id"])
    return Replacement(
        count_query_template, insert_query_template, query_args, query_time_flags
    )
def function_expr(fn: str, args_expr: str = "") -> str:
    """
    DEPRECATED. Please do not add anything else here. In order to manipulate the
    query, create a QueryProcessor and register it into your dataset.

    Generate an expression for a given function name and an already-evaluated
    args expression. This is a place to define convenience functions that
    evaluate to more complex expressions.
    """

    def build_apdex(col: str, satisfied: str) -> str:
        # Apdex formula shared by the apdex() and impact() pseudo functions:
        # satisfied requests plus half the tolerated ones over the total count.
        # The tolerated threshold is four times the satisfied threshold.
        return (
            "(countIf({col} <= {satisfied}) + "
            "(countIf(({col} > {satisfied}) AND ({col} <= {tolerated})) / 2)) / count()"
        ).format(
            col=escape_identifier(col),
            satisfied=satisfied,
            tolerated=int(satisfied) * 4,
        )

    if fn.startswith("apdex("):
        match = APDEX_FUNCTION_RE.match(fn)
        if not match:
            raise ValueError("Invalid format for apdex()")
        return build_apdex(match.group(1), match.group(2))

    if fn.startswith("impact("):
        match = IMPACT_FUNCTION_RE.match(fn)
        if not match:
            raise ValueError("Invalid format for impact()")
        return "(1 - {apdex}) + ((1 - (1 / sqrt(uniq({user_col})))) * 3)".format(
            apdex=build_apdex(match.group(1), match.group(2)),
            user_col=escape_identifier(match.group(3)),
        )

    if fn.startswith("failure_rate("):
        match = FAILURE_RATE_FUNCTION_RE.match(fn)
        if not match:
            raise ValueError("Invalid format for failure_rate()")
        # Failures are all statuses outside ok/cancelled/unknown.
        return "countIf(notIn(transaction_status, tuple({ok}, {cancelled}, {unknown}))) / count()".format(
            ok=SPAN_STATUS_NAME_TO_CODE["ok"],
            cancelled=SPAN_STATUS_NAME_TO_CODE["cancelled"],
            unknown=SPAN_STATUS_NAME_TO_CODE["unknown"],
        )

    # For functions with no args (or static args) we allow them to already
    # include them as part of the function name, eg, "count()" or "sleep(1)".
    if not args_expr and fn.endswith(")"):
        return fn

    # Convenience topK function eg "top10", "top3" etc.
    topk_match = TOPK_FUNCTION_RE.match(fn)
    if topk_match:
        return "topK({})({})".format(topk_match.group(1), args_expr)

    # turn uniq() into ifNull(uniq(), 0) so it doesn't return null where
    # a number was expected.
    if fn == "uniq":
        return "ifNull({}({}), 0)".format(fn, args_expr)

    # emptyIfNull(col) is a simple pseudo function supported by Snuba that
    # expands to the actual clickhouse function ifNull(col, '') until we figure
    # out the best way to disambiguate column names from string literals in
    # complex functions.
    if fn == "emptyIfNull" and args_expr:
        return "ifNull({}, '')".format(args_expr)

    # Workaround for https://github.com/ClickHouse/ClickHouse/issues/11622
    # Some distributed queries fail when arrays are passed as array(1,2,3)
    # and work when they are passed as [1, 2, 3]
    if get_config("format_clickhouse_arrays", 1) and fn == "array":
        return f"[{args_expr}]"

    # default: just return fn(args_expr)
    return "{}({})".format(fn, args_expr)
def process_delete_tag(
    message: ReplacementMessage,
    all_columns: Sequence[FlattenedColumn],
    tag_column_map: Mapping[str, Mapping[str, str]],
    promoted_tags: Mapping[str, Sequence[str]],
    use_promoted_prewhere: bool,
    schema: WritableTableSchema,
) -> Optional[Replacement]:
    """Build a Replacement that deletes one tag from all rows of a project.

    Returns None when the message carries an empty tag. The replacement
    rewrites each matching row via INSERT ... SELECT, removing the tag from
    the tags.key/tags.value arrays and resetting its promoted column (to NULL
    or '' depending on the column's nullability in the schema).
    """
    tag = message.data["tag"]
    if not tag:
        return None

    assert isinstance(tag, str)
    timestamp = datetime.strptime(
        message.data["datetime"], settings.PAYLOAD_DATETIME_FORMAT
    )
    tag_column_name = tag_column_map["tags"].get(tag, tag)
    is_promoted = tag in promoted_tags["tags"]

    where = """\
WHERE project_id = %(project_id)s
AND received <= CAST('%(timestamp)s' AS DateTime)
AND NOT deleted
"""

    # Promoted tags can be located via their dedicated column, but only when
    # the caller opts in; otherwise fall back to scanning the tag key array.
    if is_promoted and use_promoted_prewhere:
        prewhere = " PREWHERE %(tag_column)s IS NOT NULL "
    else:
        prewhere = " PREWHERE has(`tags.key`, %(tag_str)s) "

    insert_query_template = (
        """\
INSERT INTO %(table_name)s (%(all_columns)s)
SELECT %(select_columns)s
FROM %(table_name)s FINAL
"""
        + prewhere
        + where
    )

    select_columns = []
    for col in all_columns:
        if is_promoted and col.flattened == tag_column_name:
            # Promoted tag columns are Nullable in some schemas but not in
            # others (presumably events vs. errors — the original comment here
            # was garbled). We check the column against the schema to determine
            # whether to write an empty string or NULL.
            column_type = schema.get_data_source().get_columns().get(tag_column_name)
            assert column_type is not None
            is_nullable = column_type.type.has_modifier(Nullable)
            if is_nullable:
                select_columns.append("NULL")
            else:
                select_columns.append("''")
        elif col.flattened == "tags.key":
            # Drop the deleted key from the key array.
            select_columns.append(
                "arrayFilter(x -> (indexOf(`tags.key`, x) != indexOf(`tags.key`, %s)), `tags.key`)"
                % escape_string(tag)
            )
        elif col.flattened == "tags.value":
            # Drop the value at the deleted key's index, keeping tags.key and
            # tags.value aligned.
            select_columns.append(
                "arrayMap(x -> arrayElement(`tags.value`, x), arrayFilter(x -> x != indexOf(`tags.key`, %s), arrayEnumerate(`tags.value`)))"
                % escape_string(tag)
            )
        else:
            select_columns.append(col.escaped)

    all_column_names = [col.escaped for col in all_columns]
    query_args = {
        "all_columns": ", ".join(all_column_names),
        "select_columns": ", ".join(select_columns),
        "project_id": message.data["project_id"],
        "tag_str": escape_string(tag),
        "tag_column": escape_identifier(tag_column_name),
        "timestamp": timestamp.strftime(DATETIME_FORMAT),
    }

    count_query_template = (
        """\
SELECT count()
FROM %(table_name)s FINAL
"""
        + prewhere
        + where
    )

    # NEEDS_FINAL: reads must use FINAL until the replacement rows are merged.
    query_time_flags = (NEEDS_FINAL, message.data["project_id"])
    return LegacyReplacement(
        count_query_template,
        insert_query_template,
        query_args,
        query_time_flags,
        replacement_type=message.action_type,
        replacement_message_metadata=message.metadata,
    )
def __escape_identifier_enforce(self, expr: str) -> str:
    """escape_identifier wrapper whose return is known to be str, not Optional."""
    escaped = escape_identifier(expr)
    # escape_identifier can return None only for a None input; expr is a
    # non-None str here, so the assert merely narrows the type for the checker.
    assert escaped is not None
    return escaped