def parse_cte(self, idx, tokens: TokenList): gidx, group = tokens.token_next(idx, skip_ws=True, skip_cm=True) # handle recursive keyword if group.match(T.Keyword, values=['RECURSIVE']): gidx, group = tokens.token_next(gidx, skip_ws=True, skip_cm=True) if not group.is_group: return [], None # get CTE name offset = 1 cte_name = group.token_first(skip_ws=True, skip_cm=True) self.ctes.add(cte_name.value) # AS keyword offset, as_keyword = group.token_next(offset, skip_ws=True, skip_cm=True) if not as_keyword.match(T.Keyword, values=['AS']): raise RuntimeError(f"CTE does not have AS keyword at index {gidx}") offset, parens = group.token_next(offset, skip_ws=True, skip_cm=True) if isinstance(parens, Parenthesis) or parens.is_group: # Parse CTE using recursion. return cte_name.value, self.recurse(TokenList( parens.tokens)).in_tables raise RuntimeError( f"Parens {parens} are not Parenthesis at index {gidx}")
def __vectorize(self, tokenlist):
    """Populate ``self.query_vec`` with numeric bounds from comparisons.

    Flattens *tokenlist* and, for every comparison-operator token, maps
    the preceding attribute name to its literal bound:

    * ``<`` / ``<=`` sets ``<attr>_ub``
    * ``>`` / ``>=`` sets ``<attr>_lb``
    * any other operator (e.g. ``=``) sets both ``<attr>_lb`` and
      ``<attr>_ub`` to the same value.

    Comparisons whose right-hand side is not a float literal (e.g. join
    predicates ``a.x = b.y``) are skipped.

    :param tokenlist: sqlparse TokenList for one WHERE clause/statement.
    """
    token_list = TokenList(list(tokenlist.flatten()))
    # Fix: the original called token_list.token_index(x) for every
    # comparison token — an O(n) scan per hit, O(n^2) overall.
    # enumerate() yields the index directly in O(1).
    for idx_comp_op, tok in enumerate(token_list.tokens):
        if tok.ttype is not Comparison:
            continue
        # Name of the attribute: previous non-ws/non-comment token
        attr = token_list.token_prev(
            idx_comp_op, skip_ws=True, skip_cm=True)[1].value
        print(attr)
        op = tok.value
        if op == '<' or op == '<=':
            lit_dir = 'ub'
        elif op == '>' or op == '>=':
            lit_dir = 'lb'
        else:
            lit_dir = 'bi'
        try:
            # Literal value: next non-ws/non-comment token after the operator
            lit = float(
                token_list.token_next(
                    idx_comp_op, skip_ws=True, skip_cm=True)[1].value)
        except ValueError:
            print("Possible join, skipping")
            continue
        if lit_dir == 'bi':
            # Equality-style operator: value is both lower and upper bound
            self.query_vec['_'.join([attr, 'lb'])] = lit
            self.query_vec['_'.join([attr, 'ub'])] = lit
            continue
        self.query_vec['_'.join([attr, lit_dir])] = lit  # lit_dir is lb or ub
def __projections(self, token, tokenlist):
    """Collect aggregate-function projections into ``self.afs``.

    Scans the token group following *token* for aggregate function names
    (avg/count/sum/min/max).  For ``af(attr)`` appends ``"af_attr"``;
    for ``af(*)`` appends just ``"af"``.

    :param token: token immediately preceding the projection list.
    :param tokenlist: TokenList containing *token* and the projections.
    """
    start = tokenlist.token_index(token)
    _, projection = tokenlist.token_next(start, skip_ws=True, skip_cm=True)
    flat = TokenList(list(projection.flatten()))
    aggregates = ('avg', 'count', 'sum', 'min', 'max')
    for func_tok in flat:
        if func_tok.value.lower() not in aggregates:
            continue
        pos = flat.token_index(func_tok)
        # Step over the opening punctuation after the function name,
        # then read the operand token.
        punc_pos, _ = flat.token_next(pos, skip_ws=True, skip_cm=True)
        _, operand = flat.token_next(punc_pos, skip_ws=True, skip_cm=True)
        if operand.ttype is Wildcard:
            # af(*) — record the bare function name
            self.afs.append(func_tok.value)
        else:
            self.afs.append('_'.join([func_tok.value, operand.value]))
def filter_identifier_list(tkn_list: TokenList, token: Token):
    """Return True iff *token* sits directly between SELECT and FROM.

    A missing neighbor (*token* at either end of the list) does not
    disqualify it — only an explicit non-matching neighbor does.

    :param tkn_list: token list containing *token*.
    :param token: candidate identifier-list token.
    :return: True when the previous meaningful neighbor (if any) is the
        SELECT keyword and the next (if any) is FROM; False otherwise.
    """
    pos = tkn_list.token_index(token)
    _, prev_tok = tkn_list.token_prev(pos)
    _, next_tok = tkn_list.token_next(pos)
    # token_prev/token_next yield (None, None) past either end.
    if prev_tok is not None and not prev_tok.match(DML, 'SELECT'):
        return False
    if next_tok is not None and not next_tok.match(Keyword, 'FROM'):
        return False
    return True
def extract_from_column(self):
    '''
    Collect every column group between 'DML SELECT' and 'Keyword FROM',
    remove those tokens from self.tokens, and return the joined column
    text for each group (one entry per SELECT, e.g. for UNION queries).

    Example token stream:
    [<DML 'SELECT'>, <Whitespace ' '>, <IdentifierList 'me.Sap...'>,
     <Newline>, <Keyword 'FROM'>, <Whitespace ' '>,
     <IdentifierList 'SODS2....'>, ...]

    :return: list of column-text strings, one per SELECT clause.
    '''
    tokens = self.getTokens()
    tokenlist = TokenList(tokens)
    cols_idx, cols_item = [], []
    cols_group = []
    # cols_item only keeps the columns between SELECT and FROM.  A query
    # with UNION/UNION ALL yields several groups, so cols_group collects
    # each finished group.
    fetch_col_flag = False
    for idx, item in enumerate(tokens):
        before_idx, before_item = tokenlist.token_prev(idx, skip_ws=True)
        next_idx, next_item = tokenlist.token_next(idx, skip_ws=True)
        if not next_item:
            break
        is_ident = isinstance(item, (IdentifierList, Identifier))
        # Capture the first column index.  Fix: token_prev returns
        # (None, None) at the head of the list, so before_item must be
        # None-checked before accessing .ttype (the original crashed
        # with AttributeError when an Identifier opened the stream).
        if is_ident and before_item is not None and \
                (before_item.ttype == Keyword.DML or
                 before_item.value.upper() == 'DISTINCT'):
            cols_idx.append(idx)
            fetch_col_flag = True
            cols_item = []
        if fetch_col_flag:
            cols_item.append(item)
        # Capture the last column index (token just before FROM).
        if is_ident and next_item.ttype is Keyword and \
                next_item.value.upper() == 'FROM':
            cols_idx.append(idx)
            fetch_col_flag = False
            cols_group.append(''.join(tok.value for tok in cols_item))
    # cols_idx holds [start, end] pairs, e.g. [10, 12, 24, 26];
    # expand to every covered index: [10, 11, 12, 24, 25, 26].
    cols_idxes = sum([list(range(cols_idx[2 * i], cols_idx[2 * i + 1] + 1))
                      for i in range(len(cols_idx) // 2)], [])
    keep_tokens = [tok for i, tok in enumerate(tokens)
                   if i not in cols_idxes]
    self.tokens = keep_tokens
    # NOTE(review): tokens_val is built from the ORIGINAL stream, not
    # keep_tokens — confirm this asymmetry with self.tokens is intended.
    self.tokens_val = [tok.value for tok in tokens]
    return cols_group
def _extract_limit_from_query(statement: TokenList) -> Optional[int]:
    """
    Extract limit clause from SQL statement.

    Handles both ``LIMIT <n>`` and ``LIMIT <offset>, <n>`` forms; in the
    latter the value after the comma is the limit.

    :param statement: SQL statement
    :return: Limit extracted from query, None if no limit present in statement
    """
    limit_idx, _ = statement.token_next_by(m=(Keyword, "LIMIT"))
    if limit_idx is None:
        return None
    _, value_token = statement.token_next(idx=limit_idx)
    if not value_token:
        return None
    if isinstance(value_token, IdentifierList):
        # "LIMIT <offset>, <limit>": find the comma and take the first
        # succeeding non-whitespace token as the limit value.
        comma_idx, _ = value_token.token_next_by(
            m=(sqlparse.tokens.Punctuation, ","))
        _, value_token = value_token.token_next(idx=comma_idx)
    if value_token and value_token.ttype == sqlparse.tokens.Literal.Number.Integer:
        return int(value_token.value)
    return None
def extract_from_column(self):
    '''
    pick up all tokens between 'DML SELECT' and 'Keyword FROM'
    [<DML 'SELECT' at 0x3655A08>, <Whitespace ' ' at 0x3655A68>,
     <IdentifierList 'me.Sap...' at 0x366E228>, <Newline ' ' at 0x3665948>,
     <Keyword 'FROM' at 0x36659A8>, <Whitespace ' ' at 0x3665A08>,
     <IdentifierList 'SODS2....' at 0x366E390>, <Whitespace ' ' at 0x3667228>,
     <IdentifierList 't,SHAR...' at 0x366E480>, <Newline ' ' at 0x3667528>]

    NOTE(review): this variant computes cols_group and left_tokens but
    neither returns them nor assigns them to self — the results are
    discarded.  Confirm whether this is dead code or an unfinished
    sibling of the other extract_from_column.
    '''
    tokens = self.getTokens()
    tokenlist = TokenList(tokens)
    cols_idx, cols_item = [], []
    cols_group = []
    fetch_col_flag = False
    for idx, item in enumerate(tokens):
        # NOTE(review): token_prev returns (None, None) at the list head,
        # so before_item.ttype below can raise AttributeError if an
        # Identifier opens the stream — verify against real inputs.
        before_idx, before_item = tokenlist.token_prev(idx, skip_ws=True)
        next_idx, next_item = tokenlist.token_next(idx, skip_ws=True)
        if not next_item:
            break
        # capture up first column index (token right after SELECT/DISTINCT)
        if (isinstance(item, IdentifierList) or isinstance(item, Identifier)) and \
                (before_item.ttype == Keyword.DML or
                 before_item.value.upper() == 'DISTINCT'):
            cols_idx.append(idx)
            fetch_col_flag = True
            cols_item = []
        if fetch_col_flag == True:
            cols_item.append(item)
        # capture up last column index (token right before FROM)
        if (isinstance(item, IdentifierList) or isinstance(item, Identifier)) and \
                next_item.ttype is Keyword and next_item.value.upper() == 'FROM':
            cols_idx.append(idx)
            fetch_col_flag = False
            # here the raw token objects are kept (the other variant joins
            # their .value strings instead)
            cols_group.append(cols_item)
    # cols_idx holds [start, end] pairs; expand to every covered index
    cols_idxes = sum([list(range(cols_idx[2 * i], cols_idx[2 * i + 1] + 1))
                      for i in range(int(len(cols_idx) / 2))], [])
    # tokens with the column spans removed (currently unused — see NOTE above)
    left_tokens = [item for idx, item in enumerate(tokens)
                   if idx not in cols_idxes]
def _next_non_empty_token(
        idx: int, token_list: TokenList) -> Tuple[Optional[int], Optional[Token]]:
    """Return the first token after *idx* that is neither whitespace nor
    a comment, as an ``(index, token)`` pair; ``(None, None)`` when no
    such token exists."""
    result = token_list.token_next(idx, skip_ws=True, skip_cm=True)
    return result