def test_multiple_indices_access():
    """Several comma-separated indices followed by a bare alias are parsed."""
    parsed = ExpressionParser.parse(to_tokens("array[index(0), 'foo'] alias"))

    indices = [FunctionCall("index", Integer(0)), String("foo", quotes="'")]
    indexed_column = Index(Column("array"), indices)
    wanted = Alias(indexed_column, with_as=False, alias="alias")

    assert parsed == wanted
def parse(
    tokens,
    is_right_hand=False,
    can_be_type=False,
    can_alias=True,
    until_one_of=None,
    first_token=None,
    is_chained_columns=False,
) -> Tuple[Expression, Any]:
    """Parse a single SQL expression from the ``tokens`` iterator.

    The parser reads one "main" token, decides what kind of expression it
    starts, and then keeps consuming tokens for as long as they extend that
    expression (function call, window clause, chained columns, arithmetic,
    indexing, conditions, boolean conditions, ``EXCEPT`` and finally an alias).

    Args:
        tokens: Iterator of token strings. It is consumed as parsing proceeds.
        is_right_hand: When true, return as soon as the operand itself
            (including chained columns, arithmetic and index access) has been
            read, without looking for conditions, ``EXCEPT`` or an alias.
        can_be_type: Allow a main token found in ``Type.VALUES`` to be parsed
            as a ``Type``. Also forwarded to the arguments of generic function
            calls.
        can_alias: When false, a trailing token is never taken as an alias.
        until_one_of: Lower-cased tokens that must not be read as an alias.
            Forwarded to most nested ``ExpressionParser.parse`` calls.
        first_token: Token to use as the main token instead of reading one
            from ``tokens`` (only honoured when truthy).
        is_chained_columns: Set while parsing the right side of a ``.``;
            disables arithmetic parsing and returns early like
            ``is_right_hand``.

    Returns:
        A ``(expression, next_token)`` tuple where ``next_token`` is the first
        token read after the expression, or ``None`` once ``tokens`` is
        exhausted.

    Raises:
        ParsingError: On a missing ``(`` after ``OVER``/``EXCEPT``, a missing
            ``BY`` after ``PARTITION``/``ORDER`` in a window clause, or a
            missing ``AND`` in ``BETWEEN``.
        AssertionError: On the malformed constructs that are checked with
            ``assert`` (``CAST``, ``ARRAY_AGG``, ``FILTER``, ``CASE``, array
            literals).
    """
    until_one_of = until_one_of or []
    main_token = first_token or next(tokens)
    next_token = None

    # --- Expressions that can be recognised from the main token alone -------
    if main_token in String.QUOTES:
        expression = StringParser.parse(tokens, main_token)
    elif main_token.isdigit():
        expression = Integer(main_token)
    elif main_token.replace(".", "").isdigit():
        expression = Float(main_token)
    elif lower(main_token) in Boolean.BOOLEAN_VALUES:
        expression = Boolean(main_token)
    elif lower(main_token) in Null.VALUES:
        expression = Null()
    elif lower(main_token) == Negation.PREDICATE:
        rest_expression, next_token = ExpressionParser.parse(
            tokens,
            is_right_hand=True,
            until_one_of=until_one_of,
        )
        expression = Negation(rest_expression)
    elif main_token == "(":
        argument_tokens = get_tokens_until_closing_parenthesis(tokens)
        arguments = ExpressionListParser.parse(iter(argument_tokens))
        expression = Parenthesis(*arguments)
    elif main_token == "[":
        argument_tokens, next_token = get_tokens_until_one_of(
            tokens, stop_words=["]"]
        )
        assert next_token == "]", next_token
        arguments = ExpressionListParser.parse(iter(argument_tokens))
        expression = Array(*arguments)
        next_token = next(tokens, None)
    elif lower(main_token) == "case":
        argument_tokens, next_token = get_tokens_until_one_of(tokens, ["end"])
        assert lower(next_token) == "end"
        next_token = next(tokens, None)
        expression = CaseParser.parse(iter(argument_tokens))
    elif lower(main_token) == "select":
        # No stop words: the sub-select takes every remaining token.
        argument_tokens, next_token = get_tokens_until_one_of(tokens, [])
        next_token = next(tokens, None)
        expression = SelectStatementParser.parse(iter(argument_tokens))
    else:
        expression = None

    if next_token is None:
        next_token = next(tokens, None)

    # --- Expressions that need the next_token to be read --------------------
    if expression is None:
        if next_token is not None and next_token == "(":
            if lower(main_token) == "cast":
                column_tokens, next_token = get_tokens_until_one_of(
                    tokens, stop_words=["as"]
                )
                column, _ = ExpressionParser.parse(
                    iter(column_tokens),
                    is_right_hand=True,
                    until_one_of=until_one_of,
                )
                assert lower(next_token) == "as", next_token
                next_token = next(tokens)
                cast_type = Type(next_token)
                expression = CastFunctionCall(column, cast_type)
                next_token = next(tokens)
                assert lower(next_token) == ")", next_token
            elif lower(main_token) == "array_agg":
                next_token = next(tokens)
                if lower(next_token) == "distinct":
                    distinct = True
                    leading_token = None
                else:
                    distinct = False
                    # Already consumed while checking for DISTINCT; hand it
                    # back to the column scan below.
                    leading_token = next_token
                # The stop word must be "respect" (as in RESPECT NULLS) so
                # that it matches the branch below; "respects" never stopped
                # the scan and made that branch unreachable.
                column_tokens, next_token = get_tokens_until_one_of(
                    tokens,
                    stop_words=[")", "ignore", "respect", "order", "limit"],
                    first_token=leading_token,
                )
                column, _ = ExpressionParser.parse(
                    iter(column_tokens), until_one_of=until_one_of
                )
                ignore_nulls = respect_nulls = False
                if lower(next_token) == "ignore":
                    next_token = next(tokens)
                    assert lower(next_token) == "nulls"
                    ignore_nulls = True
                    next_token = next(tokens)
                elif lower(next_token) == "respect":
                    next_token = next(tokens)
                    assert lower(next_token) == "nulls"
                    respect_nulls = True
                    next_token = next(tokens)
                if lower(next_token) == "order":
                    next_token = next(tokens)
                    assert lower(next_token) == "by"
                    expression_tokens, next_token = get_tokens_until_one_of(
                        tokens, ["limit", ")"]
                    )
                    order_bys = OrderByParser.parse(iter(expression_tokens))
                else:
                    order_bys = None
                limit = None
                if lower(next_token) == "limit":
                    next_token = next(tokens)
                    limit = int(next_token)
                    next_token = next(tokens)
                assert lower(next_token) == ")", next_token
                expression = ArrayAggFunctionCall(
                    column=column,
                    distinct=distinct,
                    ignore_nulls=ignore_nulls,
                    respect_nulls=respect_nulls,
                    order_bys=order_bys,
                    limit=limit,
                )
            elif lower(main_token) == "count":
                next_token = next(tokens)
                if lower(next_token) == "distinct":
                    distinct = True
                    leading_token = None
                else:
                    distinct = False
                    leading_token = next_token
                argument_tokens = get_tokens_until_closing_parenthesis(
                    tokens, first_token=leading_token
                )
                arguments = ExpressionListParser.parse(iter(argument_tokens))
                expression = CountFunctionCall(*arguments, distinct=distinct)
            else:
                argument_tokens = get_tokens_until_closing_parenthesis(tokens)
                # Arguments may be types when the caller allows it or when a
                # timestamp_trunc call appears among them.
                arguments_can_be_type = can_be_type or any(
                    lower(t) == "timestamp_trunc" for t in argument_tokens
                )
                arguments = ExpressionListParser.parse(
                    iter(argument_tokens), can_be_type=arguments_can_be_type
                )
                expression = FunctionCall(main_token, *arguments)

            # Every function-call branch stops on or just after its ")".
            next_token = next(tokens, None)
            if next_token and lower(next_token) == "filter":
                next_next_token = next(tokens)
                assert next_next_token == "(", next_next_token
                argument_tokens = get_tokens_until_closing_parenthesis(tokens)
                assert lower(argument_tokens[0]) == "where", argument_tokens
                filter_condition, next_token = ExpressionParser.parse(
                    iter(argument_tokens[1:]),
                    can_alias=False,
                )
                expression = FilteredFunctionCall(expression, filter_condition)
                next_token = next(tokens, None)
        elif (
            next_token is not None
            and lower(main_token) in DatePartExtraction.PARTS
            and lower(next_token) == "from"
        ):
            rest_expression, next_token = ExpressionParser.parse(
                tokens, until_one_of=until_one_of
            )
            expression = DatePartExtraction(main_token, rest_expression)
        elif lower(main_token) in Type.VALUES and can_be_type:
            expression = Type(main_token)
        elif next_token is not None and next_token == "[":
            argument_tokens, next_token = get_tokens_until_one_of(
                tokens, stop_words=["]"]
            )
            arguments = ExpressionListParser.parse(iter(argument_tokens))
            # left item will not always be a column
            expression = Index(Column(main_token), arguments)
            next_token = next(tokens, None)
        elif (
            next_token is not None
            and main_token == "-"
            and next_token.isdigit()
        ):
            expression = Integer(-int(next_token))
            next_token = next(tokens, None)
        elif (
            next_token is not None
            and main_token == "-"
            and next_token.replace(".", "").isdigit()
        ):
            expression = Float(-float(next_token))
            next_token = next(tokens, None)
        elif (
            lower(main_token) in String.PREFIXES
            and next_token is not None
            and lower(next_token) in String.QUOTES
        ):
            # NOTE(review): next_token is left pointing at the opening quote
            # here; the alias guard at the end of this function presumably
            # exists to compensate for that. Confirm before changing either.
            expression = StringParser.parse(
                tokens, start_quote=next_token, prefix=main_token
            )
        else:
            expression = Column(main_token)

    # --- Window clause: <expression> OVER ( ... ) ---------------------------
    if lower(next_token) == "over":
        opening_parenthesis = next(tokens, None)
        if opening_parenthesis != "(":
            raise ParsingError("expected '('")
        argument_tokens = iter(get_tokens_until_closing_parenthesis(tokens))
        argument_next_token = next(argument_tokens, None)

        if lower(argument_next_token) == "partition":
            argument_next_token = next(argument_tokens, None)
            if not argument_next_token or lower(argument_next_token) != "by":
                raise ParsingError("Missing BY after PARTITION")
            expression_tokens, argument_next_token = get_tokens_until_one_of(
                argument_tokens, ["order", "rows", "range"]
            )
            partition_by = ExpressionListParser.parse(iter(expression_tokens))
        else:
            partition_by = None

        if lower(argument_next_token) == "order":
            argument_next_token = next(argument_tokens, None)
            if not argument_next_token or lower(argument_next_token) != "by":
                raise ParsingError("Missing BY after ORDER")
            expression_tokens, argument_next_token = get_tokens_until_one_of(
                argument_tokens, ["rows", "range"]
            )
            order_by = OrderByParser.parse(iter(expression_tokens))
        else:
            order_by = None

        if lower(argument_next_token) in ("rows", "range"):
            rows_range = argument_next_token
            # The frame specification is kept as raw text.
            expression_tokens, _ = get_tokens_until_one_of(argument_tokens, [])
            frame_clause: Optional[WindowFrameClause] = WindowFrameClause(
                rows_range, " ".join(expression_tokens)
            )
        else:
            frame_clause = None

        expression = AnalyticsClause(
            expression,
            partition_by=partition_by,
            order_by=order_by,
            frame_clause=frame_clause,
        )
        next_token = next(tokens, None)

    # --- Chained columns: a.b.c ---------------------------------------------
    while next_token == ".":
        right_hand, next_token = ExpressionParser.parse(
            tokens, until_one_of=until_one_of, is_chained_columns=True
        )
        expression = ChainedColumns(expression, right_hand)

    # --- Arithmetic (left to the outermost call while chaining columns) -----
    if (
        next_token
        and next_token in ("+", "-", "*", "/")
        and not is_chained_columns
    ):
        left_hand = expression
        symbol = next_token
        right_hand, next_token = ExpressionParser.parse(
            tokens,
            is_right_hand=True,
            until_one_of=until_one_of,
        )
        expression = ArithmaticOperator(symbol, left_hand, right_hand)

    # --- Index access on an arbitrary expression: expr[...][...] ------------
    while next_token == "[":
        argument_tokens, next_token = get_tokens_until_one_of(
            tokens, stop_words=["]"]
        )
        arguments = ExpressionListParser.parse(iter(argument_tokens))
        expression = Index(expression, arguments)
        next_token = next(tokens, None)

    if is_right_hand or is_chained_columns:
        return expression, next_token

    # --- Conditions ---------------------------------------------------------
    if lower(next_token) in Condition.PREDICATES:
        # Token read while probing for a two-word predicate that turned out
        # to belong to the right-hand operand; it is handed to the nested
        # parse as its first token.
        pushed_back_token = None
        symbol = next_token
        if lower(next_token) == "is":
            next_next_token = next(tokens)
            if lower(next_next_token) == "not":
                symbol = "is not"
            else:
                pushed_back_token = next_next_token
        elif lower(next_token) == "not":
            next_next_token = next(tokens)
            if lower(next_next_token) == "in":
                symbol = "not in"
            else:
                pushed_back_token = next_next_token
        right_hand, next_token = ExpressionParser.parse(
            tokens,
            is_right_hand=True,
            until_one_of=until_one_of,
            first_token=pushed_back_token,
        )
        expression = Condition(expression, symbol, right_hand)
    elif lower(next_token) == "between":
        symbol = next_token
        right_hand_left, next_token = ExpressionParser.parse(
            tokens, is_right_hand=True, until_one_of=until_one_of
        )
        if lower(next_token) != "and":
            raise ParsingError("expected AND")
        right_hand_right, next_token = ExpressionParser.parse(
            tokens, is_right_hand=True, until_one_of=until_one_of
        )
        right_hand = BooleanCondition(
            "and",
            right_hand_left,
            right_hand_right,
        )
        expression = Condition(expression, symbol, right_hand)
    elif next_token in BitwiseOperation.OPERATORS:
        operator = next_token
        right_hand, next_token = ExpressionParser.parse(
            tokens, is_right_hand=True, until_one_of=until_one_of
        )
        expression = BitwiseOperation(expression, operator, right_hand)

    # --- Boolean conditions -------------------------------------------------
    if lower(next_token) in BooleanCondition.PREDICATES:
        left_hand = expression
        symbol = next_token
        right_hand, next_token = ExpressionParser.parse(
            tokens, until_one_of=until_one_of
        )
        # The nested parse may have attached a trailing alias to the right
        # operand only; re-attach it to the whole boolean condition.
        right_alias = None
        if isinstance(right_hand, Alias):
            right_alias = right_hand
            right_hand = right_hand.expression
        expression = BooleanCondition(symbol, left_hand, right_hand)
        if right_alias is not None:
            right_alias.expression = expression
            expression = right_alias

    # --- <expression> EXCEPT ( ... ) ----------------------------------------
    if lower(next_token) == "except":
        opening_parenthesis = next(tokens, None)
        if opening_parenthesis != "(":
            raise ParsingError("expected '('")
        argument_tokens = get_tokens_until_closing_parenthesis(tokens)
        arguments = ExpressionListParser.parse(iter(argument_tokens))
        expression = ExceptClause(expression, arguments)
        next_token = next(tokens, None)

    # --- Alias --------------------------------------------------------------
    if (
        next_token is not None
        and next_token != ")"
        and not (next_token in String.QUOTES and isinstance(expression, String))
        and next_token != ";"
        and lower(next_token) not in until_one_of
        and can_alias
    ):
        if lower(next_token) == "as":
            with_as = True
            # The nested parse has already read the token that follows the
            # alias. Return that token rather than reading one more, which
            # would silently drop it.
            alias, following_token = ExpressionParser.parse(
                tokens, is_right_hand=True, until_one_of=until_one_of
            )
        else:
            with_as = False
            alias = next_token
            if alias in String.QUOTES:
                alias = StringParser.parse(tokens, alias)
            following_token = next(tokens, None)
        return Alias(expression, alias, with_as), following_token

    return expression, next_token
def parse(tokens, is_right_hand=False):
    """Parse one SQL expression from the ``tokens`` iterator.

    Args:
        tokens: Iterator of token strings; consumed while parsing.
        is_right_hand: When true, stop after the operand (and any arithmetic
            attached to it) and return ``(expression, next_token)``.

    Returns:
        ``(expression, next_token)`` when ``is_right_hand`` is true; otherwise
        the bare expression, wrapped in an ``Alias`` if an alias follows.

    Raises:
        ParsingError: On a missing ``(`` after ``over``/``except``, a missing
            ``by`` in a window clause, or a missing ``and`` in ``between``.
    """
    head = next(tokens)
    lookahead = None
    node = None

    # Expressions recognisable from the first token alone.
    if head in String.QUOTES:
        node = StringParser.parse(tokens, head)
    elif head.isdigit():
        node = Integer(head)
    elif head in Boolean.BOOLEAN_VALUES:
        node = Boolean(head)
    elif head in Null.VALUES:
        node = Null()
    elif head in Type.VALUES:
        node = Type(head)
    elif head == "(":
        inner = get_tokens_until_closing_parenthesis(tokens)
        node = Parenthesis(*ExpressionListParser.parse(iter(inner)))
    elif head == "case":
        inner, lookahead = get_tokens_until_one_of(tokens, ["end"])
        assert lookahead == "end"
        lookahead = next(tokens, None)
        node = CaseParser.parse(iter(inner))
    elif head == "select":
        # No stop words: the sub-select takes all remaining tokens.
        inner, lookahead = get_tokens_until_one_of(tokens, [])
        lookahead = next(tokens, None)
        node = SelectStatementParser.parse(iter(inner))

    if lookahead is None:
        lookahead = next(tokens, None)

    # Expressions that can only be told apart with one token of look-ahead.
    if node is None:
        if lookahead == "(":
            inner = get_tokens_until_closing_parenthesis(tokens)
            call_arguments = ExpressionListParser.parse(iter(inner))
            node = FunctionCall(head, *call_arguments)
            lookahead = next(tokens, None)
        elif lookahead == "[":
            inner, lookahead = get_tokens_until_one_of(tokens, stop_words=["]"])
            index_arguments = ExpressionListParser.parse(iter(inner))
            # left item will not always be a column
            node = Index(Column(head), index_arguments)
            lookahead = next(tokens, None)
        elif head == "-" and lookahead is not None and lookahead.isdigit():
            node = Integer(-int(lookahead))
            lookahead = next(tokens, None)
        elif (
            head in String.PREFIXES
            and lookahead is not None
            and lookahead in String.QUOTES
        ):
            node = StringParser.parse(tokens, start_quote=lookahead, prefix=head)
        else:
            node = Column(head)

    # Window clause: <expression> over ( ... )
    if lookahead == "over":
        if next(tokens, None) != "(":
            raise ParsingError("expected '('")
        window_tokens = iter(get_tokens_until_closing_parenthesis(tokens))
        window_next = next(window_tokens, None)

        partition_by = None
        if window_next == "partition":
            window_next = next(window_tokens, None)
            if not window_next or window_next != "by":
                raise ParsingError("Missing BY after PARTITION")
            part_tokens, window_next = get_tokens_until_one_of(
                window_tokens, ["order", "rows", "range"]
            )
            partition_by = ExpressionListParser.parse(iter(part_tokens))

        order_by = None
        if window_next == "order":
            window_next = next(window_tokens, None)
            if not window_next or window_next != "by":
                raise ParsingError("Missing BY after ORDER")
            order_tokens, window_next = get_tokens_until_one_of(
                window_tokens, ["rows", "range"]
            )
            order_by = OrderByParser.parse(iter(order_tokens))

        frame_clause = None
        if window_next in ("rows", "range"):
            # The frame specification is kept as raw text.
            frame_tokens, _ = get_tokens_until_one_of(window_tokens, [])
            frame_clause = WindowFrameClause(window_next, " ".join(frame_tokens))

        node = AnalyticsClause(
            node,
            partition_by=partition_by,
            order_by=order_by,
            frame_clause=frame_clause,
        )
        lookahead = next(tokens, None)

    # Arithmetic.
    if lookahead in ("+", "-", "*", "/"):
        operator_symbol = lookahead
        operand, lookahead = ExpressionParser.parse(tokens, is_right_hand=True)
        node = ArithmaticOperator(operator_symbol, node, operand)

    if is_right_hand:
        return node, lookahead

    # Conditions.
    if lookahead in Condition.PREDICATES:
        predicate = lookahead
        if lookahead == "is":
            after_is = next(tokens)
            if after_is == "not":
                predicate = "is not"
            else:
                # Push the probed token back by rebuilding the iterator with
                # it in front of the remaining tokens.
                remaining, _ = get_tokens_until_one_of(
                    tokens, [], first_token=after_is
                )
                tokens = iter(remaining)
        operand, lookahead = ExpressionParser.parse(tokens, is_right_hand=True)
        node = Condition(node, predicate, operand)
    elif lookahead == "between":
        predicate = lookahead
        lower_bound, lookahead = ExpressionParser.parse(tokens, is_right_hand=True)
        if lookahead != "and":
            raise ParsingError("expected AND")
        upper_bound, lookahead = ExpressionParser.parse(tokens, is_right_hand=True)
        bounds = BooleanCondition("and", lower_bound, upper_bound)
        node = Condition(node, predicate, bounds)

    # Boolean conditions.
    if lookahead in BooleanCondition.PREDICATES:
        connective = lookahead
        rest = ExpressionParser.parse(tokens)
        node = BooleanCondition(connective, node, rest)
        lookahead = next(tokens, None)

    # <expression> except ( ... )
    if lookahead == "except":
        if next(tokens, None) != "(":
            raise ParsingError("expected '('")
        inner = get_tokens_until_closing_parenthesis(tokens)
        excluded = ExpressionListParser.parse(iter(inner))
        node = ExceptClause(node, excluded)
        lookahead = next(tokens, None)

    # Anything left that is not a terminator or a quote is an alias.
    if lookahead is None or lookahead in (")", "'", '"', "`", ";"):
        return node

    if lookahead == "as":
        with_as = True
        alias, _ = ExpressionParser.parse(tokens, is_right_hand=True)
    else:
        with_as = False
        alias = lookahead
    return Alias(node, alias, with_as)
def test_index_function_access():
    """A function call is accepted as the index of a column."""
    parsed = ExpressionParser.parse(to_tokens("array[index(0)]"))

    index_call = FunctionCall("index", Integer(0))
    wanted = Index(Column("array"), [index_call])

    assert parsed == wanted
def test_index_access_alias():
    """An indexed column followed by a bare word is parsed as an alias."""
    parsed = ExpressionParser.parse(to_tokens("array[0] alias"))

    indexed_column = Index(Column("array"), [Integer(0)])
    wanted = Alias(indexed_column, with_as=False, alias="alias")

    assert parsed == wanted
def test_index_access():
    """A column with a single integer index is parsed as an ``Index``."""
    parsed = ExpressionParser.parse(to_tokens("array[0]"))

    wanted = Index(Column("array"), [Integer(0)])

    assert parsed == wanted