def test_nested_parenthesis_boolean(): actual = WhereClauseParser.parse( to_tokens("(col = 1 and col2=4) or (col = 2 and (col =6 or col=9))")) expected = WhereClause( BooleanCondition( "or", Parenthesis( BooleanCondition( "and", Condition(Column("col"), "=", Integer(1)), Condition(Column("col2"), "=", Integer(4)), )), Parenthesis( BooleanCondition( "and", Condition(Column("col"), "=", Integer(2)), Parenthesis( BooleanCondition( "or", Condition(Column("col"), "=", Integer(6)), Condition(Column("col"), "=", Integer(9)), )), )), )) assert actual == expected
def test_multiple_args_boolean_condition(): actual = WhereClauseParser.parse( to_tokens("(col = 1 and col2=4 and col3=4)")) expected = WhereClause( Parenthesis( BooleanCondition( "and", Condition(Column("col"), "=", Integer(1)), BooleanCondition( "and", Condition(Column("col2"), "=", Integer(4)), Condition(Column("col3"), "=", Integer(4)), ), ), )) assert actual == expected
def test_parenthesis_boolean_where_clause(): actual = WhereClauseParser.parse( to_tokens("(col = 3 and field = 5) or (f2 or f3)")) expected = WhereClause( BooleanCondition( "or", Parenthesis( BooleanCondition( "and", Condition(Column("col"), "=", Integer(3)), Condition(Column("field"), "=", Integer(5)), )), Parenthesis(BooleanCondition("or", Column("f2"), Column("f3"))), )) assert actual == expected
def test_boolean_condition(): sql = BooleanCondition( "and", Condition(Column("col"), "=", Integer(1)), Condition(Column("col2"), "=", Integer(4)), ) expected = "col = 1 AND col2 = 4" assert transform(sql) == expected.strip()
def test_boolean_where_clause(): actual = WhereClauseParser.parse(to_tokens("col = 3 and field = 5")) expected = WhereClause( BooleanCondition( "and", Condition(Column("col"), "=", Integer(3)), Condition(Column("field"), "=", Integer(5)), )) assert actual == expected
def test_boolean_condition(): actual, _ = ExpressionParser.parse(to_tokens("f1 IS NOT NULL AND f2 > 0 fnew")) expected = Alias( BooleanCondition( "AND", Condition(Column("f1"), "is not", Null()), Condition(Column("f2"), ">", Integer(0)), ), with_as=False, alias="fnew", ) assert actual == expected
def test_between_where_clause(): actual = WhereClauseParser.parse(to_tokens("col between 3 and 5")) expected = WhereClause( Condition( Column("col"), "between", BooleanCondition( "and", Integer(3), Integer(5), ), )) assert actual == expected
def test_consecutive_parenthesis(): actual = ExpressionParser.parse(to_tokens("((col+1) = 3 AND col2=4)")) expected = Parenthesis( BooleanCondition( "and", Condition(Parenthesis(Addition(Column("col"), Integer(1))), "=", Integer(3)), Condition( Column("col2"), "=", Integer(4), ), )) assert actual == expected
def test_boolean_condition_as_expression(): sql = "field is not null and col > 0" actual, _ = ExpressionParser.parse(to_tokens(sql)) expected = BooleanCondition( "and", Condition( Column("field"), "is not", Null(), ), Condition( Column("col"), ">", Integer(0), ), ) assert actual == expected
def test_select_boolean_condition_expression(): sql = "select field is not null and col > 0 from t;" actual = SQLStatementParser.parse(to_tokens(sql)) expected = SelectStatement( expressions=[ BooleanCondition( "and", Condition( Column("field"), "is not", Null(), ), Condition( Column("col"), ">", Integer(0), ), ) ], from_statement=Table(Column("t")), ) assert actual == expected
def parse( tokens, is_right_hand=False, can_be_type=False, can_alias=True, until_one_of=None, first_token=None, is_chained_columns=False, ) -> Tuple[Expression, Any]: until_one_of = until_one_of or [] main_token = first_token or next(tokens) next_token = None if main_token in String.QUOTES: expression = StringParser.parse(tokens, main_token) elif main_token.isdigit(): expression = Integer(main_token) elif main_token.replace(".", "").isdigit(): expression = Float(main_token) elif lower(main_token) in Boolean.BOOLEAN_VALUES: expression = Boolean(main_token) elif lower(main_token) in Null.VALUES: expression = Null() elif lower(main_token) == Negation.PREDICATE: rest_expression, next_token = ExpressionParser.parse( tokens, is_right_hand=True, until_one_of=until_one_of, ) expression = Negation(rest_expression) elif main_token == "(": argument_tokens = get_tokens_until_closing_parenthesis(tokens) arguments = ExpressionListParser.parse(iter(argument_tokens)) expression = Parenthesis(*arguments) elif main_token == "[": argument_tokens, next_token = get_tokens_until_one_of( tokens, stop_words=["]"]) assert next_token == "]", next_token arguments = ExpressionListParser.parse(iter(argument_tokens)) expression = Array(*arguments) next_token = next(tokens, None) elif lower(main_token) == "case": argument_tokens, next_token = get_tokens_until_one_of( tokens, ["end"]) assert lower(next_token) == "end" next_token = next(tokens, None) expression = CaseParser.parse(iter(argument_tokens)) elif lower(main_token) == "select": argument_tokens, next_token = get_tokens_until_one_of(tokens, []) next_token = next(tokens, None) expression = SelectStatementParser.parse(iter(argument_tokens)) else: expression = None if next_token is None: next_token = next(tokens, None) # Expressions that need the next_token to be read if expression is None: if next_token is not None and next_token == "(": if lower(main_token) == "cast": column_tokens, next_token = get_tokens_until_one_of( tokens, stop_words=["as"]) column, _ = ExpressionParser.parse( iter(column_tokens), is_right_hand=True, until_one_of=until_one_of, ) assert lower(next_token) == "as", next_token next_token = next(tokens) cast_type = Type(next_token) expression = CastFunctionCall(column, cast_type) next_token = next(tokens) assert lower(next_token) == ")", next_token elif lower(main_token) == "array_agg": next_token = next(tokens) if lower(next_token) == "distinct": distinct = True first_token = None else: distinct = False first_token = next_token column_tokens, next_token = get_tokens_until_one_of( tokens, stop_words=[ ")", "ignore", "respects", "order", "limit" ], first_token=first_token, ) column, _ = ExpressionParser.parse( iter(column_tokens), until_one_of=until_one_of) ignore_nulls = respect_nulls = False if lower(next_token) == "ignore": next_token = next(tokens) assert lower(next_token) == "nulls" ignore_nulls = True next_token = next(tokens) elif lower(next_token) == "respect": next_token = next(tokens) assert lower(next_token) == "nulls" respect_nulls = True next_token = next(tokens) if lower(next_token) == "order": next_token = next(tokens) assert lower(next_token) == "by" expression_tokens, next_token = get_tokens_until_one_of( tokens, ["limit", ")"]) order_bys = OrderByParser.parse( iter(expression_tokens)) else: order_bys = None limit = None if lower(next_token) == "limit": next_token = next(tokens) limit = int(next_token) next_token = next(tokens) assert lower(next_token) == ")", next_token expression = ArrayAggFunctionCall( column=column, distinct=distinct, ignore_nulls=ignore_nulls, respect_nulls=respect_nulls, order_bys=order_bys, limit=limit, ) elif lower(main_token) == "count": next_token = next(tokens) if lower(next_token) == "distinct": distinct = True first_token = None else: distinct = False first_token = next_token argument_tokens = get_tokens_until_closing_parenthesis( tokens, first_token=first_token) arguments = ExpressionListParser.parse( iter(argument_tokens)) expression = CountFunctionCall(*arguments, distinct=distinct) else: argument_tokens = get_tokens_until_closing_parenthesis( tokens) arguments_can_be_type = can_be_type or any( lower(t) == "timestamp_trunc" for t in argument_tokens) arguments = ExpressionListParser.parse( iter(argument_tokens), can_be_type=arguments_can_be_type) expression = FunctionCall(main_token, *arguments) next_token = next(tokens, None) if next_token and lower(next_token) == "filter": next_next_token = next(tokens) assert next_next_token == "(", next_next_token argument_tokens = get_tokens_until_closing_parenthesis( tokens) assert lower( argument_tokens[0]) == "where", argument_tokens filter_condition, next_token = ExpressionParser.parse( iter(argument_tokens[1:]), can_alias=False, ) expression = FilteredFunctionCall(expression, filter_condition) next_token = next(tokens, None) elif (next_token is not None and lower(main_token) in DatePartExtraction.PARTS and lower(next_token) == "from"): rest_expression, next_token = ExpressionParser.parse( tokens, until_one_of=until_one_of) expression = DatePartExtraction(main_token, rest_expression) elif lower(main_token) in Type.VALUES and can_be_type: expression = Type(main_token) elif next_token is not None and next_token == "[": argument_tokens, next_token = get_tokens_until_one_of( tokens, stop_words=["]"]) arguments = ExpressionListParser.parse(iter(argument_tokens)) expression = Index( Column(main_token), arguments) # left item will not always be a column next_token = next(tokens, None) elif next_token is not None and main_token == "-" and next_token.isdigit( ): expression = Integer(-int(next_token)) next_token = next(tokens, None) elif (next_token is not None and main_token == "-" and next_token.replace(".", "").isdigit()): expression = Float(-float(next_token)) next_token = next(tokens, None) elif (lower(main_token) in String.PREFIXES and next_token is not None and lower(next_token) in String.QUOTES): expression = StringParser.parse(tokens, start_quote=next_token, prefix=main_token) else: expression = Column(main_token) if lower(next_token) == "over": opening_parenthesis = next(tokens, None) if opening_parenthesis != "(": raise ParsingError("expected '('") argument_tokens = iter( get_tokens_until_closing_parenthesis(tokens)) argument_next_token = next(argument_tokens, None) if lower(argument_next_token) == "partition": argument_next_token = next(argument_tokens, None) if not argument_next_token or lower( argument_next_token) != "by": raise ParsingError("Missing BY after PARTITION") expression_tokens, argument_next_token = get_tokens_until_one_of( argument_tokens, ["order", "rows", "range"]) partition_by = ExpressionListParser.parse( iter(expression_tokens)) else: partition_by = None if lower(argument_next_token) == "order": argument_next_token = next(argument_tokens, None) if not argument_next_token or lower( argument_next_token) != "by": raise ParsingError("Missing BY after ORDER") expression_tokens, argument_next_token = get_tokens_until_one_of( argument_tokens, ["rows", "range"]) order_by = OrderByParser.parse(iter(expression_tokens)) else: order_by = None if lower(argument_next_token) in ("rows", "range"): rows_range = argument_next_token expression_tokens, _ = get_tokens_until_one_of( argument_tokens, []) frame_clause: Optional[WindowFrameClause] = WindowFrameClause( rows_range, " ".join(expression_tokens)) else: frame_clause = None expression = AnalyticsClause( expression, partition_by=partition_by, order_by=order_by, frame_clause=frame_clause, ) next_token = next(tokens, None) while next_token == ".": right_hand, next_token = ExpressionParser.parse( tokens, until_one_of=until_one_of, is_chained_columns=True) expression = ChainedColumns(expression, right_hand) if next_token and next_token in ("+", "-", "*", "/") and not is_chained_columns: left_hand = expression symbol = next_token right_hand, next_token = ExpressionParser.parse( tokens, is_right_hand=True, until_one_of=until_one_of, ) expression = ArithmaticOperator(symbol, left_hand, right_hand) while next_token == "[": argument_tokens, next_token = get_tokens_until_one_of( tokens, stop_words=["]"]) arguments = ExpressionListParser.parse(iter(argument_tokens)) expression = Index(expression, arguments) next_token = next(tokens, None) if is_right_hand or is_chained_columns: return expression, next_token if lower(next_token) in Condition.PREDICATES: first_token = None symbol = next_token if lower(next_token) == "is": next_next_token = next(tokens) if lower(next_next_token) == "not": symbol = "is not" else: first_token = next_next_token elif lower(next_token) == "not": next_next_token = next(tokens) if lower(next_next_token) == "in": symbol = "not in" else: first_token = next_next_token right_hand, next_token = ExpressionParser.parse( tokens, is_right_hand=True, until_one_of=until_one_of, first_token=first_token, ) expression = Condition(expression, symbol, right_hand) elif lower(next_token) == "between": symbol = next_token right_hand_left, next_token = ExpressionParser.parse( tokens, is_right_hand=True, until_one_of=until_one_of) if lower(next_token) != "and": raise ParsingError("expected AND") right_hand_right, next_token = ExpressionParser.parse( tokens, is_right_hand=True, until_one_of=until_one_of) right_hand = BooleanCondition( "and", right_hand_left, right_hand_right, ) expression = Condition(expression, symbol, right_hand) elif next_token in BitwiseOperation.OPERATORS: operator = next_token right_hand, next_token = ExpressionParser.parse( tokens, is_right_hand=True, until_one_of=until_one_of) expression = BitwiseOperation(expression, operator, right_hand) if lower(next_token) in BooleanCondition.PREDICATES: left_hand = expression symbol = next_token right_hand, next_token = ExpressionParser.parse( tokens, until_one_of=until_one_of) right_alias = None if isinstance(right_hand, Alias): right_alias = right_hand right_hand = right_hand.expression expression = BooleanCondition(symbol, left_hand, right_hand) if right_alias is not None: right_alias.expression = expression expression = right_alias if lower(next_token) == "except": opening_parenthesis = next(tokens, None) if opening_parenthesis != "(": raise ParsingError("expected '('") argument_tokens = get_tokens_until_closing_parenthesis(tokens) arguments = ExpressionListParser.parse(iter(argument_tokens)) expression = ExceptClause(expression, arguments) next_token = next(tokens, None) if (next_token is not None and next_token != ")" and not (next_token in String.QUOTES and isinstance(expression, String)) and next_token != ";" and lower(next_token) not in until_one_of and can_alias): if lower(next_token) == "as": with_as = True alias, _ = ExpressionParser.parse(tokens, is_right_hand=True, until_one_of=until_one_of) else: with_as = False alias = next_token if alias in String.QUOTES: alias = StringParser.parse(tokens, alias) return Alias(expression, alias, with_as), next(tokens, None) return expression, next_token
def parse(tokens, is_right_hand=False): main_token = next(tokens) next_token = None if main_token in String.QUOTES: expression = StringParser.parse(tokens, main_token) elif main_token.isdigit(): expression = Integer(main_token) elif main_token in Boolean.BOOLEAN_VALUES: expression = Boolean(main_token) elif main_token in Null.VALUES: expression = Null() elif main_token in Type.VALUES: expression = Type(main_token) elif main_token == "(": argument_tokens = get_tokens_until_closing_parenthesis(tokens) arguments = ExpressionListParser.parse(iter(argument_tokens)) expression = Parenthesis(*arguments) elif main_token == "case": argument_tokens, next_token = get_tokens_until_one_of( tokens, ["end"]) assert next_token == "end" next_token = next(tokens, None) expression = CaseParser.parse(iter(argument_tokens)) elif main_token == "select": argument_tokens, next_token = get_tokens_until_one_of(tokens, []) next_token = next(tokens, None) expression = SelectStatementParser.parse(iter(argument_tokens)) else: expression = None if next_token is None: next_token = next(tokens, None) # Expressions that need the next_token to be read if expression is None: if next_token is not None and next_token == "(": argument_tokens = get_tokens_until_closing_parenthesis(tokens) arguments = ExpressionListParser.parse(iter(argument_tokens)) expression = FunctionCall(main_token, *arguments) next_token = next(tokens, None) elif next_token is not None and next_token == "[": argument_tokens, next_token = get_tokens_until_one_of( tokens, stop_words=["]"]) arguments = ExpressionListParser.parse(iter(argument_tokens)) expression = Index( Column(main_token), arguments) # left item will not always be a column next_token = next(tokens, None) elif next_token is not None and main_token == "-" and next_token.isdigit( ): expression = Integer(-int(next_token)) next_token = next(tokens, None) elif (main_token in String.PREFIXES and next_token is not None and next_token in String.QUOTES): expression = StringParser.parse(tokens, start_quote=next_token, prefix=main_token) else: expression = Column(main_token) if next_token == "over": opening_parenthesis = next(tokens, None) if opening_parenthesis != "(": raise ParsingError("expected '('") argument_tokens = iter( get_tokens_until_closing_parenthesis(tokens)) argument_next_token = next(argument_tokens, None) if argument_next_token == "partition": argument_next_token = next(argument_tokens, None) if not argument_next_token or argument_next_token != "by": raise ParsingError("Missing BY after PARTITION") expression_tokens, argument_next_token = get_tokens_until_one_of( argument_tokens, ["order", "rows", "range"]) partition_by = ExpressionListParser.parse( iter(expression_tokens)) else: partition_by = None if argument_next_token == "order": argument_next_token = next(argument_tokens, None) if not argument_next_token or argument_next_token != "by": raise ParsingError("Missing BY after ORDER") expression_tokens, argument_next_token = get_tokens_until_one_of( argument_tokens, ["rows", "range"]) order_by = OrderByParser.parse(iter(expression_tokens)) else: order_by = None if argument_next_token in ("rows", "range"): rows_range = argument_next_token expression_tokens, _ = get_tokens_until_one_of( argument_tokens, []) frame_clause = WindowFrameClause(rows_range, " ".join(expression_tokens)) else: frame_clause = None expression = AnalyticsClause( expression, partition_by=partition_by, order_by=order_by, frame_clause=frame_clause, ) next_token = next(tokens, None) if next_token and next_token in ("+", "-", "*", "/"): left_hand = expression symbol = next_token right_hand, next_token = ExpressionParser.parse(tokens, is_right_hand=True) expression = ArithmaticOperator(symbol, left_hand, right_hand) if is_right_hand: return expression, next_token if next_token in Condition.PREDICATES: symbol = next_token if next_token == "is": next_next_token = next(tokens) if next_next_token == "not": symbol = "is not" else: tokens, _ = get_tokens_until_one_of( tokens, [], first_token=next_next_token) tokens = iter(tokens) right_hand, next_token = ExpressionParser.parse(tokens, is_right_hand=True) expression = Condition(expression, symbol, right_hand) elif next_token == "between": symbol = next_token right_hand_left, next_token = ExpressionParser.parse( tokens, is_right_hand=True) if next_token != "and": raise ParsingError("expected AND") right_hand_right, next_token = ExpressionParser.parse( tokens, is_right_hand=True) right_hand = BooleanCondition( "and", right_hand_left, right_hand_right, ) expression = Condition(expression, symbol, right_hand) if next_token in BooleanCondition.PREDICATES: left_hand = expression symbol = next_token right_hand = ExpressionParser.parse(tokens) expression = BooleanCondition(symbol, left_hand, right_hand) next_token = next(tokens, None) if next_token == "except": opening_parenthesis = next(tokens, None) if opening_parenthesis != "(": raise ParsingError("expected '('") argument_tokens = get_tokens_until_closing_parenthesis(tokens) arguments = ExpressionListParser.parse(iter(argument_tokens)) expression = ExceptClause(expression, arguments) next_token = next(tokens, None) if (next_token is not None and next_token != ")" and next_token != "'" and next_token != '"' and next_token != "`" and next_token != ";"): if next_token == "as": with_as = True alias, _ = ExpressionParser.parse(tokens, is_right_hand=True) else: with_as = False alias = next_token return Alias(expression, alias, with_as) return expression
def test_nested_joins(): sql = """ SELECT COALESCE(sq_1.col, sq_2.col) f0_ FROM (SELECT ANY_VALUE(col) col, LAST_VALUE(ANY_VALUE(col2)) OVER (PARTITION BY ANY_VALUE(col) ORDER BY SUM(clicks) ASC, SUM(metric) ASC RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) last, hash FROM (SELECT * FROM `events` WHERE _TABLE_SUFFIX BETWEEN '20200410' AND '20200510') JOIN (SELECT * EXCEPT (hash) FROM (SELECT *, ROW_NUMBER() OVER (PARTITION BY hash) AS rn FROM `test-table` WHERE _TABLE_SUFFIX BETWEEN '20200401' AND '20200501') WHERE rn = 1) USING (hash) GROUP BY hash) sq_1 FULL OUTER JOIN (SELECT ANY_VALUE(col) col, hash FROM (SELECT * FROM `events` WHERE _TABLE_SUFFIX BETWEEN '20200310' AND '20200410') JOIN (SELECT * EXCEPT (hash), FROM (SELECT *, ROW_NUMBER() OVER (PARTITION BY hash) AS rn FROM `test-table` WHERE _TABLE_SUFFIX BETWEEN '20200301' AND '20200401') WHERE rn = 1) USING (hash) GROUP BY hash) sq_2 ON sq_1.hash = sq_2.hash WHERE sq_1.last = 1 GROUP BY f0_ """ # noqa actual = SQLStatementParser.parse(to_tokens(sql)) expected = SelectStatement( expressions=[ Alias( FunctionCall("coalesce", Column("sq_1.col"), Column("sq_2.col")), "f0_", with_as=False, ) ], from_statement=Join( "FULL OUTER JOIN", left_from=Alias( Parenthesis( SelectStatement( expressions=[ Alias( FunctionCall("any_value", Column("col")), "col", with_as=False, ), Alias( AnalyticsClause( FunctionCall( "last_value", FunctionCall("any_value", Column("col2")), ), partition_by=[ FunctionCall("any_value", Column("col")) ], order_by=OrderByClause( OrderByItem( FunctionCall( "sum", Column("clicks")), has_asc=True, ), OrderByItem( FunctionCall( "sum", Column("metric")), has_asc=True, ), ), frame_clause=WindowFrameClause( "range", "between unbounded preceding and unbounded following", # noqa ), ), "last", with_as=False, ), Column("hash"), ], from_statement=Join( join_type="JOIN", left_from=Parenthesis( SelectStatement( expressions=[Column("*")], from_statement=Table( String("events", quotes="`")), where_clause=WhereClause( Condition( Column("_table_suffix"), "between", BooleanCondition( "and", String("20200410", quotes="'"), String("20200510", quotes="'"), ), )), semi_colon=False, )), right_from=Parenthesis( SelectStatement( expressions=[ ExceptClause(Column("*"), [Column("hash")]) ], from_statement=Parenthesis( SelectStatement( expressions=[ Column("*"), Alias( AnalyticsClause( FunctionCall( "row_number"), partition_by=[ Column("hash") ], order_by=None, frame_clause=None, ), Column("rn"), with_as=True, ), ], from_statement=Table( String("test-table", quotes="`")), where_clause=WhereClause( Condition( Column("_table_suffix"), "between", BooleanCondition( "and", String("20200401", quotes="'"), String("20200501", quotes="'"), ), )), semi_colon=False, )), where_clause=WhereClause( Condition(Column("rn"), "=", Integer(1))), semi_colon=False, )), on=None, using=UsingClause(Parenthesis(Column("hash"))), ), group_by_clause=GroupByClause(Column("hash")), semi_colon=False, )), "sq_1", with_as=False, ), right_from=Alias( Parenthesis( SelectStatement( expressions=[ Alias( FunctionCall("any_value", Column("col")), "col", with_as=False, ), Column("hash"), ], from_statement=Join( join_type="JOIN", left_from=Parenthesis( SelectStatement( expressions=[Column("*")], from_statement=Table( String("events", quotes="`")), where_clause=WhereClause( Condition( Column("_table_suffix"), "between", BooleanCondition( "and", String("20200310", quotes="'"), String("20200410", quotes="'"), ), )), semi_colon=False, )), right_from=Parenthesis( SelectStatement( expressions=[ ExceptClause(Column("*"), [Column("hash")]) ], from_statement=Parenthesis( SelectStatement( expressions=[ Column("*"), Alias( AnalyticsClause( FunctionCall( "row_number"), partition_by=[ Column("hash") ], order_by=None, frame_clause=None, ), Column("rn"), with_as=True, ), ], from_statement=Table( String("test-table", quotes="`")), where_clause=WhereClause( Condition( Column("_table_suffix"), "between", BooleanCondition( "and", String("20200301", quotes="'"), String("20200401", quotes="'"), ), )), semi_colon=False, ), ), where_clause=WhereClause( Condition(Column("rn"), "=", Integer(1))), semi_colon=False, )), on=None, using=UsingClause(Parenthesis(Column("hash"))), ), group_by_clause=GroupByClause(Column("hash")), semi_colon=False, )), "sq_2", with_as=False, ), on=OnClause( Condition(Column("sq_1.hash"), "=", Column("sq_2.hash"))), using=None, ), where_clause=WhereClause( Condition(Column("sq_1.last"), "=", Integer(1))), group_by_clause=GroupByClause(Column("f0_")), semi_colon=False, ) assert actual == expected