def __vectorize(self, tokenlist):
    token_list = TokenList(list(tokenlist.flatten()))
    for x in token_list:
        if x.ttype is Comparison:
            # Index of the comparison operator
            idx_comp_op = token_list.token_index(x)
            # Name of the attribute: the token preceding the operator
            attr = token_list.token_prev(idx_comp_op, skip_ws=True, skip_cm=True)[1].value
            comp_op = x
            if comp_op.value == '<' or comp_op.value == '<=':
                lit_dir = 'ub'
            elif comp_op.value == '>' or comp_op.value == '>=':
                lit_dir = 'lb'
            else:
                lit_dir = 'bi'
            try:
                # Literal value: the token following the operator
                lit = float(token_list.token_next(idx_comp_op, skip_ws=True, skip_cm=True)[1].value)
            except ValueError:
                print("Possible join, skipping")
                continue
            # An equality predicate constrains both bounds
            if lit_dir == 'bi':
                self.query_vec['_'.join([attr, 'lb'])] = lit
                self.query_vec['_'.join([attr, 'ub'])] = lit
                continue
            self.query_vec['_'.join([attr, lit_dir])] = lit  # lit_dir is either 'lb' or 'ub'
def parse(sql: str) -> SqlMeta:
    if sql is None:
        raise ValueError("A sql statement must be provided.")

    # Tokenize the SQL statement
    statements = sqlparse.parse(sql)

    # We assume only one statement in SQL
    tokens = TokenList(statements[0].tokens)
    log.debug(f"Successfully tokenized sql statement: {tokens}")

    in_tables = []
    out_tables = []
    idx, token = tokens.token_next_by(t=T.Keyword)
    while token:
        if _is_in_table(token):
            idx, in_table = _get_table(tokens, idx)
            in_tables.append(in_table)
        elif _is_out_table(token):
            idx, out_table = _get_table(tokens, idx)
            out_tables.append(out_table)
        idx, token = tokens.token_next_by(t=T.Keyword, idx=idx)

    return SqlMeta(in_tables, out_tables)
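# A minimal, self-contained sketch of the keyword walk that parse() performs,
# shown against a literal statement. It demonstrates token_next_by() only;
# SqlMeta and the _is_in_table / _is_out_table / _get_table helpers are
# assumed to be defined elsewhere in this module.
import sqlparse
from sqlparse.sql import TokenList
from sqlparse import tokens as T

tokens = TokenList(sqlparse.parse("INSERT INTO target SELECT a FROM source")[0].tokens)
idx, token = tokens.token_next_by(t=T.Keyword)
while token:
    print(token.normalized)  # each top-level keyword in turn: INSERT, INTO, SELECT, FROM
    idx, token = tokens.token_next_by(t=T.Keyword, idx=idx)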
def parse_cte(self, idx, tokens: TokenList):
    gidx, group = tokens.token_next(idx, skip_ws=True, skip_cm=True)

    # handle recursive keyword
    if group.match(T.Keyword, values=['RECURSIVE']):
        gidx, group = tokens.token_next(gidx, skip_ws=True, skip_cm=True)

    if not group.is_group:
        return [], None

    # get CTE name
    offset = 1
    cte_name = group.token_first(skip_ws=True, skip_cm=True)
    self.ctes.add(cte_name.value)

    # AS keyword
    offset, as_keyword = group.token_next(offset, skip_ws=True, skip_cm=True)
    if not as_keyword.match(T.Keyword, values=['AS']):
        raise RuntimeError(f"CTE does not have AS keyword at index {gidx}")

    offset, parens = group.token_next(offset, skip_ws=True, skip_cm=True)
    if isinstance(parens, Parenthesis) or parens.is_group:
        # Parse CTE using recursion.
        return cte_name.value, self.recurse(TokenList(parens.tokens)).in_tables
    raise RuntimeError(f"Parens {parens} are not Parenthesis at index {gidx}")
def test_group_parentheses():
    tokens = [
        Token(T.Keyword, 'CREATE'),
        Token(T.Whitespace, ' '),
        Token(T.Keyword, 'TABLE'),
        Token(T.Whitespace, ' '),
        Token(T.Name, 'table_name'),
        Token(T.Whitespace, ' '),
        Token(T.Punctuation, '('),
        Token(T.Name, 'id'),
        Token(T.Whitespace, ' '),
        Token(T.Keyword, 'SERIAL'),
        Token(T.Whitespace, ' '),
        Token(T.Keyword, 'CHECK'),
        Token(T.Punctuation, '('),
        Token(T.Name, 'id'),
        Token(T.Operator, '='),
        Token(T.Number, '0'),
        Token(T.Punctuation, ')'),
        Token(T.Punctuation, ')'),
        Token(T.Punctuation, ';'),
    ]

    expected_tokens = TokenList([
        Token(T.Keyword, 'CREATE'),
        Token(T.Keyword, 'TABLE'),
        Token(T.Name, 'table_name'),
        Parenthesis([
            Token(T.Punctuation, '('),
            Token(T.Name, 'id'),
            Token(T.Keyword, 'SERIAL'),
            Token(T.Keyword, 'CHECK'),
            Parenthesis([
                Token(T.Punctuation, '('),
                Token(T.Name, 'id'),
                Token(T.Operator, '='),
                Token(T.Number, '0'),
                Token(T.Punctuation, ')'),
            ]),
            Token(T.Punctuation, ')'),
        ]),
        Token(T.Punctuation, ';'),
    ])

    grouped_tokens = group_parentheses(tokens)

    # Compare the two token trees via their pretty-printed forms
    stdout = sys.stdout
    try:
        sys.stdout = StringIO()
        expected_tokens._pprint_tree()
        a = sys.stdout.getvalue()
        sys.stdout = StringIO()
        grouped_tokens._pprint_tree()
        b = sys.stdout.getvalue()
    finally:
        sys.stdout = stdout

    assert_multi_line_equal(a, b)
def get_query_tokens(query):
    """
    :type query str
    :rtype: list[sqlparse.sql.Token]
    """
    tokens = TokenList(sqlparse.parse(query)[0].tokens).flatten()
    return [token for token in tokens if token.ttype is not Whitespace]
def get_query_tokens(query: str) -> List[sqlparse.sql.Token]:
    query = preprocess_query(query)
    parsed = sqlparse.parse(query)

    # handle empty queries (#12)
    if not parsed:
        return []

    tokens = TokenList(parsed[0].tokens).flatten()
    return [token for token in tokens if token.ttype is not Whitespace]
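# A quick check of the helper above. preprocess_query() is this module's own
# normalization step; a pass-through stub stands in for it here so the sketch
# runs on its own.
from typing import List

import sqlparse
from sqlparse.sql import TokenList
from sqlparse.tokens import Whitespace

def preprocess_query(query: str) -> str:  # stub for the real helper
    return query

print([t.value for t in get_query_tokens("SELECT id FROM users")])
# -> ['SELECT', 'id', 'FROM', 'users']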
def to_clickhouse(cls, schema: str, query: str):
    """
    Parse a DDL query.

    :param schema:
    :param query:
    :return:
    """
    token_list = TokenList()
    parsed = sqlparse.parse(query)[0]
    token_list = cls._add_token(schema, parsed, parsed.tokens, token_list)
    return str(token_list)
def parse(cls, sql: str, default_schema: Optional[str] = None) -> SqlMeta:
    if sql is None:
        raise ValueError("A sql statement must be provided.")

    # Tokenize the SQL statement
    statements = sqlparse.parse(sql)

    # We assume only one statement in SQL
    tokens = TokenList(statements[0].tokens)
    log.debug(f"Successfully tokenized sql statement: {tokens}")

    parser = cls(default_schema)
    return parser.recurse(tokens)
def _process_statement_tokens(cls, statement_tokens, filter_string):
    """
    This function processes the tokens in a statement to ensure that the correct
    parsing behavior occurs. In typical cases, the statement tokens will contain
    just the comparison - in this case, no additional processing occurs. In the
    case when a filter string contains the IN operator, this function parses
    those tokens into a Comparison object, which will be parsed by
    _get_comparison_for_model_registry.

    :param statement_tokens: List of tokens from a statement
    :param filter_string: Filter string from which the parsed statement tokens
                          originate. Used for informative logging
    :return: List of tokens
    """
    expected = "Expected search filter with single comparison operator. e.g. name='myModelName'"
    token_list = []
    if len(statement_tokens) == 0:
        raise MlflowException(
            "Invalid filter '%s'. Could not be parsed. %s" % (filter_string, expected),
            error_code=INVALID_PARAMETER_VALUE,
        )
    elif len(statement_tokens) == 1:
        if isinstance(statement_tokens[0], Comparison):
            token_list = statement_tokens
        else:
            raise MlflowException(
                "Invalid filter '%s'. Could not be parsed. %s" % (filter_string, expected),
                error_code=INVALID_PARAMETER_VALUE,
            )
    elif len(statement_tokens) > 1:
        comparison_subtokens = []
        for token in statement_tokens:
            if isinstance(token, Comparison):
                raise MlflowException(
                    "Search filter '%s' contains multiple expressions. "
                    "%s " % (filter_string, expected),
                    error_code=INVALID_PARAMETER_VALUE,
                )
            elif cls._is_list_component_token(token):
                comparison_subtokens.append(token)
            elif not token.is_whitespace:
                break
        # if we have fewer than 3, that means we have an incomplete statement.
        if len(comparison_subtokens) == 3:
            token_list = [Comparison(TokenList(comparison_subtokens))]
        else:
            raise MlflowException(
                "Invalid filter '%s'. Could not be parsed. %s" % (filter_string, expected),
                error_code=INVALID_PARAMETER_VALUE,
            )
    return token_list
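# The key step above is wrapping the pieces of an IN clause into a Comparison
# by hand, since sqlparse does not group `name IN (...)` into one on its own.
# A standalone illustration of just that wrapping:
import sqlparse
from sqlparse.sql import Comparison, TokenList

stmt = sqlparse.parse("name IN ('a', 'b')")[0]
subtokens = [t for t in stmt.tokens if not t.is_whitespace]
print(len(subtokens))  # 3: identifier, IN keyword, parenthesized value list
rebuilt = Comparison(TokenList(subtokens))
print(isinstance(rebuilt, Comparison))  # True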
def group_parentheses(tokens):
    # Maintain a stack of open groups; stack[0] collects the top-level tokens.
    stack = [[]]
    for token in tokens:
        if token.is_whitespace:
            continue
        if token.match(T.Punctuation, '('):
            # Open a new group that starts with the '(' token itself
            stack.append([token])
        else:
            stack[-1].append(token)
            if token.match(T.Punctuation, ')'):
                # Close the current group and attach it to its parent
                group = stack.pop()
                stack[-1].append(Parenthesis(group))
    return TokenList(stack[0])
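# Demonstration of group_parentheses() on a hand-built flat stream: the
# parentheses come back as a nested Parenthesis group and whitespace is dropped.
from sqlparse.sql import Token, Parenthesis
from sqlparse import tokens as T

flat = [
    Token(T.Name, 'f'),
    Token(T.Whitespace, ' '),
    Token(T.Punctuation, '('),
    Token(T.Name, 'x'),
    Token(T.Punctuation, ')'),
]
grouped = group_parentheses(flat)
print(len(grouped.tokens))                         # 2: the Name and the group
print(isinstance(grouped.tokens[1], Parenthesis))  # True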
def extract_from_column(self):
    '''
    Collect all tokens between 'DML SELECT' and 'Keyword FROM', e.g.:

    [<DML 'SELECT' at 0x3655A08>, <Whitespace ' ' at 0x3655A68>,
     <IdentifierList 'me.Sap...' at 0x366E228>, <Newline ' ' at 0x3665948>,
     <Keyword 'FROM' at 0x36659A8>, <Whitespace ' ' at 0x3665A08>,
     <IdentifierList 'SODS2....' at 0x366E390>, <Whitespace ' ' at 0x3667228>,
     <IdentifierList 't,SHAR...' at 0x366E480>, <Newline ' ' at 0x3667528>]
    '''
    tokens = self.getTokens()
    tokenlist = TokenList(tokens)
    cols_idx, cols_item = [], []
    cols_group = []

    # cols_item only keeps the columns between SELECT and FROM.
    # Note: a query with UNION / UNION ALL contains several such groups,
    # so cols_group collects one entry per group.
    fetch_col_flag = False
    for idx, item in enumerate(tokens):
        before_idx, before_item = tokenlist.token_prev(idx, skip_ws=True)
        next_idx, next_item = tokenlist.token_next(idx, skip_ws=True)
        if not next_item:
            break

        # capture the first column index
        if (isinstance(item, IdentifierList) or isinstance(item, Identifier)) and \
                (before_item.ttype == Keyword.DML or before_item.value.upper() == 'DISTINCT'):
            cols_idx.append(idx)
            fetch_col_flag = True
            cols_item = []
        if fetch_col_flag:
            cols_item.append(item)

        # capture the last column index
        if (isinstance(item, IdentifierList) or isinstance(item, Identifier)) and \
                next_item.ttype is Keyword and next_item.value.upper() == 'FROM':
            cols_idx.append(idx)
            fetch_col_flag = False
            cols_group.append(''.join([item.value for item in cols_item]))

    # cols_idx is a flat list of [start, end] pairs, e.g. [10, 12, 24, 26]
    # --> flatten the ranges to [10, 11, 12, 24, 25, 26]
    cols_idxes = sum([list(range(cols_idx[2 * i], cols_idx[2 * i + 1] + 1))
                      for i in range(int(len(cols_idx) / 2))], [])
    keep_tokens = [item for idx, item in enumerate(tokens) if idx not in cols_idxes]
    self.tokens = keep_tokens
    self.tokens_val = [item.value for item in tokens]
    return cols_group
def get_query_tokens(query: str) -> List[sqlparse.sql.Token]:
    """
    :type query str
    :rtype: list[sqlparse.sql.Token]
    """
    query = preprocess_query(query)
    parsed = sqlparse.parse(query)

    # handle empty queries (#12)
    if not parsed:
        return []

    tokens = TokenList(parsed[0].tokens).flatten()
    return [token for token in tokens if token.ttype is not Whitespace]
def tokens(self) -> List[SQLToken]:
    """
    Tokenizes the query
    """
    if self._tokens is not None:
        return self._tokens

    parsed = sqlparse.parse(self.query)
    tokens = []
    # handle empty queries (#12)
    if not parsed:
        return tokens

    sqlparse_tokens = TokenList(parsed[0].tokens).flatten()
    non_empty_tokens = [
        token for token in sqlparse_tokens if token.ttype is not Whitespace
    ]
    last_keyword = None
    for index, tok in enumerate(non_empty_tokens):
        token = SQLToken(
            tok=tok,
            index=index,
            subquery_level=self._subquery_level,
            last_keyword=last_keyword,
        )
        if index > 0:
            # create links between consecutive tokens
            token.previous_token = tokens[index - 1]
            tokens[index - 1].next_token = token

        if token.is_left_parenthesis:
            self._determine_opening_parenthesis_type(token=token)
        elif token.is_right_parenthesis:
            self._determine_closing_parenthesis_type(token=token)

        if tok.is_keyword and tok.normalized not in KEYWORDS_IGNORED:
            last_keyword = tok.normalized
        token.is_in_nested_function = self._is_in_nested_function
        tokens.append(token)

    self._tokens = tokens
    return tokens
def parse(cls, sql: str, default_schema: Optional[str] = None) -> SqlMeta:
    if sql is None:
        raise ValueError("A sql statement must be provided.")

    # Tokenize the SQL statement
    sql_statements = sqlparse.parse(sql)
    sql_parser = cls(default_schema)

    sql_meta = SqlMeta([], [])
    for sql_statement in sql_statements:
        tokens = TokenList(sql_statement.tokens)
        log.debug(f"Successfully tokenized sql statement: {tokens}")
        result = sql_parser.recurse(tokens)

        # Add the in / out tables (if any) to the sql meta
        sql_meta.add_in_tables(result.in_tables)
        sql_meta.add_out_tables(result.out_tables)

    return sql_meta
def extract_from_column(self):
    '''
    Pick up all tokens between 'DML SELECT' and 'Keyword FROM', e.g.:

    [<DML 'SELECT' at 0x3655A08>, <Whitespace ' ' at 0x3655A68>,
     <IdentifierList 'me.Sap...' at 0x366E228>, <Newline ' ' at 0x3665948>,
     <Keyword 'FROM' at 0x36659A8>, <Whitespace ' ' at 0x3665A08>,
     <IdentifierList 'SODS2....' at 0x366E390>, <Whitespace ' ' at 0x3667228>,
     <IdentifierList 't,SHAR...' at 0x366E480>, <Newline ' ' at 0x3667528>]
    '''
    tokens = self.getTokens()
    tokenlist = TokenList(tokens)
    cols_idx, cols_item = [], []
    cols_group = []
    fetch_col_flag = False
    for idx, item in enumerate(tokens):
        before_idx, before_item = tokenlist.token_prev(idx, skip_ws=True)
        next_idx, next_item = tokenlist.token_next(idx, skip_ws=True)
        if not next_item:
            break

        # capture the first column index
        if (isinstance(item, IdentifierList) or isinstance(item, Identifier)) and \
                (before_item.ttype == Keyword.DML or before_item.value.upper() == 'DISTINCT'):
            cols_idx.append(idx)
            fetch_col_flag = True
            cols_item = []
        if fetch_col_flag:
            cols_item.append(item)

        # capture the last column index
        if (isinstance(item, IdentifierList) or isinstance(item, Identifier)) and \
                next_item.ttype is Keyword and next_item.value.upper() == 'FROM':
            cols_idx.append(idx)
            fetch_col_flag = False
            cols_group.append(cols_item)

    # flatten the [start, end] pairs in cols_idx into full index ranges
    cols_idxes = sum([list(range(cols_idx[2 * i], cols_idx[2 * i + 1] + 1))
                      for i in range(int(len(cols_idx) / 2))], [])
    left_tokens = [item for idx, item in enumerate(tokens) if idx not in cols_idxes]
def __projections(self, token, tokenlist):
    idx = tokenlist.token_index(token)
    afs_list_idx, afs = tokenlist.token_next(idx, skip_ws=True, skip_cm=True)
    afs_list = TokenList(list(afs.flatten()))
    for af in afs_list:
        # Get aggregate functions
        if af.value.lower() in ['avg', 'count', 'sum', 'min', 'max']:
            af_idx = afs_list.token_index(af)
            # Skip the '(' and read the attribute the function is applied to
            punc_idx, _ = afs_list.token_next(af_idx, skip_ws=True, skip_cm=True)
            attr_idx, attr = afs_list.token_next(punc_idx, skip_ws=True, skip_cm=True)
            if attr.ttype is not Wildcard:
                self.afs.append('_'.join([af.value, attr.value]))
            else:
                self.afs.append(af.value)
def convert_expression_to_python(token):
    if not token.is_group:
        if token.value.upper() == 'TRUE':
            return 'sql.true()'
        elif token.value.upper() == 'FALSE':
            return 'sql.false()'
        elif token.ttype == T.Name:
            return 'sql.literal_column({0!r})'.format(str(token.value))
        else:
            return 'sql.text({0!r})'.format(str(token.value))

    if isinstance(token, Parenthesis):
        return '({0})'.format(convert_expression_to_python(TokenList(token.tokens[1:-1])))
    elif len(token.tokens) == 1:
        return convert_expression_to_python(token.tokens[0])
    elif len(token.tokens) == 3 and token.tokens[1].ttype == T.Comparison:
        lhs = convert_expression_to_python(token.tokens[0])
        rhs = convert_expression_to_python(token.tokens[2])
        op = token.tokens[1].value
        if op == '=':
            op = '=='
        return '{0} {1} {2}'.format(lhs, op, rhs)
    elif len(token.tokens) == 3 and token.tokens[1].match(T.Keyword, 'IN') and isinstance(token.tokens[2], Parenthesis):
        lhs = convert_expression_to_python(token.tokens[0])
        rhs = [convert_expression_to_python(t)
               for t in token.tokens[2].tokens[1:-1]
               if not t.match(T.Punctuation, ',')]
        return '{0}.in_({1!r})'.format(lhs, tuple(rhs))
    elif len(token.tokens) == 4 and token.tokens[1].match(T.Comparison, '~') and token.tokens[2].match(T.Name, 'E') and token.tokens[3].ttype == T.String.Single:
        # PostgreSQL regex match: lhs ~ E'pattern'
        lhs = convert_expression_to_python(token.tokens[0])
        pattern = token.tokens[3].value.replace('\\\\', '\\')
        return 'regexp({0}, {1})'.format(lhs, pattern)
    elif len(token.tokens) == 3 and token.tokens[1].match(T.Keyword, 'IS') and token.tokens[2].match(T.Keyword, 'NULL'):
        lhs = convert_expression_to_python(token.tokens[0])
        return '{0} == None'.format(lhs)
    elif len(token.tokens) == 3 and token.tokens[1].match(T.Keyword, 'IS') and token.tokens[2].match(T.Keyword, 'NOT NULL'):
        lhs = convert_expression_to_python(token.tokens[0])
        return '{0} != None'.format(lhs)
    else:
        # Split the token list on its AND/OR keywords and convert each part.
        parts = []
        op = None
        idx = -1
        while True:
            new_idx, op_token = token.token_next_by(m=(T.Keyword, ('AND', 'OR')), idx=idx)
            if op_token is None:
                break
            if op is None:
                op = op_token.normalized
            # Mixing AND and OR at the same level is not supported
            assert op == op_token.normalized
            new_tokens = token.tokens[idx + 1:new_idx]
            if len(new_tokens) == 1:
                parts.append(convert_expression_to_python(new_tokens[0]))
            else:
                parts.append(convert_expression_to_python(TokenList(new_tokens)))
            idx = new_idx
        if idx == -1:
            raise ValueError('unknown expression - {0}'.format(token))

        # Convert the remainder after the last operator
        new_tokens = token.tokens[idx + 1:]
        if len(new_tokens) == 1:
            parts.append(convert_expression_to_python(new_tokens[0]))
        else:
            parts.append(convert_expression_to_python(TokenList(new_tokens)))
        return 'sql.{0}_({1})'.format(op.lower(), ', '.join(parts))
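# A small driver for convert_expression_to_python(). The dispatch above keys on
# exact child counts, so it expects whitespace-free token lists (like the ones a
# group_parentheses()-style cleanup produces); whitespace is stripped by hand here.
import sqlparse
from sqlparse.sql import TokenList

comparison = sqlparse.parse("x = 1")[0].tokens[0]  # the Comparison group
cleaned = TokenList([t for t in comparison.tokens if not t.is_whitespace])
print(convert_expression_to_python(cleaned))
# -> sql.literal_column('x') == sql.text('1')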
def _get_body_tokens(self):
    idx, body_token = self.token_next_by(i=Parenthesis)
    if body_token is not None:
        # Strip the enclosing '(' and ')' from the parenthesized body
        return TokenList(body_token.tokens[1:-1])
def get_tables(statement):
    aliases = []
    tables = []
    parsed = sqlparse.parse(statement)
    if not parsed or len(parsed) == 0:
        return set()

    # flatten the identifiers
    tokens = [t for t in TokenList(parsed[0].tokens).flatten()]

    curr_name = ''
    alias_candidate = ''
    last_non_whitespace = None
    _state = State.SEARCHING
    for t in tokens:
        if _state == State.SEARCHING:
            if SqlParser.is_table_keyword(t):
                _state = State.FINDING_TABLE
            if (t.is_keyword and t.normalized == 'AS'
                    and last_non_whitespace.ttype is Name):
                alias_candidate = last_non_whitespace
                _state = State.CHECKING_ALIAS
        elif _state == State.FINDING_TABLE:
            if t.ttype is Name or t.ttype is Literal.String.Symbol:
                curr_name += t.value
                _state = State.BUILDING_TABLE
            elif t.ttype is Punctuation or t.is_keyword:
                _state = State.SEARCHING
        elif _state == State.BUILDING_TABLE:
            if t.ttype is Name or t.ttype is Literal.String.Symbol or t.value == '.':
                curr_name += t.value
            else:
                if curr_name not in tables:
                    tables.append(curr_name)
                curr_name = ''
                if t.is_whitespace:
                    _state = State.FINDING_ALIAS
                else:
                    _state = State.SEARCHING
        elif _state == State.FINDING_ALIAS:
            if t.ttype is Name:
                curr_name += t.value
                _state = State.BUILDING_ALIAS
            elif SqlParser.is_table_keyword(t):
                _state = State.FINDING_TABLE
            elif t.ttype is Punctuation or t.is_keyword:
                _state = State.SEARCHING
        elif _state == State.BUILDING_ALIAS:
            if t.ttype is Name or t.value == '.':
                curr_name += t.value
            else:
                aliases.append(curr_name)
                curr_name = ''
                if SqlParser.is_table_keyword(t):
                    _state = State.FINDING_TABLE
                else:
                    _state = State.SEARCHING
        elif _state == State.CHECKING_ALIAS:
            if SqlParser.is_table_keyword(t):
                _state = State.FINDING_TABLE
            elif not t.is_whitespace:
                if t.value == '(':
                    aliases.append(alias_candidate.value)
                _state = State.SEARCHING
        if not t.is_whitespace:
            last_non_whitespace = t

    # Naive way to get rid of aliases
    tables = [t for t in tables if t.split('.')[0] not in aliases]
    return tables
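# get_tables() leans on two names not shown in this snippet: the State enum
# driving the scanner and SqlParser.is_table_keyword(). Plausible minimal
# versions of both are sketched below (the keyword set is an assumption; the
# real one may differ) so the function can be exercised end to end.
from enum import Enum

import sqlparse
from sqlparse.sql import TokenList
from sqlparse.tokens import Literal, Name, Punctuation

class State(Enum):
    SEARCHING = 0
    FINDING_TABLE = 1
    BUILDING_TABLE = 2
    FINDING_ALIAS = 3
    BUILDING_ALIAS = 4
    CHECKING_ALIAS = 5

class SqlParser:
    @staticmethod
    def is_table_keyword(t):
        # keywords that introduce a table name (assumed set)
        return t.is_keyword and t.normalized in (
            'FROM', 'JOIN', 'INNER JOIN', 'LEFT JOIN', 'RIGHT JOIN',
            'FULL JOIN', 'CROSS JOIN', 'INTO', 'UPDATE')

print(get_tables("SELECT u.id FROM users u JOIN orders o ON o.uid = u.id"))
# -> ['users', 'orders']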
def extract(token_list):
    tokens = list(TokenList(token_list).flatten())
    for token in tokens:
        # is_whitespace is an attribute (not a method) in sqlparse >= 0.2
        if token.is_whitespace:
            # normalize any whitespace run (spaces, newlines) to a single space
            token.value = " "
    return TokenList(tokens)
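# extract() makes token streams comparable across formatting differences:
# every whitespace token, including newlines, collapses to a single space.
import sqlparse

stmt = sqlparse.parse("SELECT a,\nb FROM t")[0]
print(str(extract(stmt.tokens)))  # SELECT a, b FROM t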