def __vectorize(self, tokenlist):
    """Populate ``self.query_vec`` with numeric bounds from comparison predicates.

    Flattens the parsed SQL statement *tokenlist* and, for every comparison
    operator token, reads the attribute name on its left and the literal on
    its right, then stores the literal under ``'<attr>_lb'`` and/or
    ``'<attr>_ub'`` keys:

    * ``<`` / ``<=`` constrain the upper bound (``ub``),
    * ``>`` / ``>=`` constrain the lower bound (``lb``),
    * any other operator (e.g. ``=``) constrains both bounds.

    Comparisons whose right-hand side is not parseable as a float (e.g. a
    join predicate such as ``a.id = b.id``) are skipped.

    Parameters
    ----------
    tokenlist : sqlparse TokenList (or statement) supporting ``flatten()``.
    """
    token_list = TokenList(list(tokenlist.flatten()))
    for tok in token_list:
        if tok.ttype is not Comparison:
            continue
        idx_comp_op = token_list.token_index(tok)  # position of the operator
        # The attribute name is the nearest non-whitespace, non-comment token
        # preceding the comparison operator.
        attr = token_list.token_prev(
            idx_comp_op, skip_ws=True, skip_cm=True)[1].value
        if tok.value in ('<', '<='):
            lit_dir = 'ub'
        elif tok.value in ('>', '>='):
            lit_dir = 'lb'
        else:
            lit_dir = 'bi'  # e.g. '=', constrains both bounds
        try:
            lit = float(
                token_list.token_next(
                    idx_comp_op, skip_ws=True, skip_cm=True)[1].value)
        except ValueError:
            # Right-hand side is not numeric -- most likely a join predicate.
            print("Possible join, skipping")
            continue
        if lit_dir == 'bi':
            self.query_vec['_'.join([attr, 'lb'])] = lit
            self.query_vec['_'.join([attr, 'ub'])] = lit
        else:
            self.query_vec['_'.join([attr, lit_dir])] = lit  # lit_dir is lb or ub
def filter_identifier_list(tkn_list: TokenList, token: Token):
    """Return True when *token* sits directly between ``SELECT`` and ``FROM``.

    The token is rejected (False) when its nearest preceding token exists but
    is not the ``SELECT`` DML keyword, or when its nearest following token
    exists but is not the ``FROM`` keyword.  A missing neighbour (token at
    either end of the list) does not disqualify it.
    """
    idx = tkn_list.token_index(token)
    # token_prev/token_next return (index, token); token is None at the ends.
    before = tkn_list.token_prev(idx)[1]
    if before is not None and not before.match(DML, 'SELECT'):
        return False
    after = tkn_list.token_next(idx)[1]
    if after is not None and not after.match(Keyword, 'FROM'):
        return False
    return True
def extract_from_column(self):
    '''
    Collect the column lists of every SELECT clause in the statement and
    strip them out of the token stream.

    columns_group can collect all tokens between 'DML SELECT' and 'Keyword FROM'
    [<DML 'SELECT' at 0x3655A08>,
     <Whitespace ' ' at 0x3655A68>,
     <IdentifierList 'me.Sap...' at 0x366E228>,
     <Newline ' ' at 0x3665948>,
     <Keyword 'FROM' at 0x36659A8>,
     <Whitespace ' ' at 0x3665A08>,
     <IdentifierList 'SODS2....' at 0x366E390>,
     <Whitespace ' ' at 0x3667228>,
     <IdentifierList 't,SHAR...' at 0x366E480>,
     <Newline ' ' at 0x3667528>]

    Returns a list with one joined column-list string per SELECT group
    (several groups exist when the SQL contains UNION / UNION ALL).
    Side effects: self.tokens is replaced by the tokens that remain after
    the column spans are removed, and self.tokens_val is set to the string
    values of the tokens.
    '''
    tokens = self.getTokens()
    tokenlist = TokenList(tokens)
    # cols_idx holds alternating start/end indices of each column span;
    # cols_item accumulates the tokens of the span currently being read.
    cols_idx,cols_item = [] , []
    cols_group = []
    '''
    cols_item only keep the columns between select and from.
    Notic : exists many groups if sql have union/union all token ,
    so need use cols_group to collect it.
    '''
    # fetch_col_flag is True while we are inside a SELECT ... FROM span.
    fetch_col_flag = False
    for idx, item in enumerate(tokens):
        before_idx,before_item = tokenlist.token_prev(idx,skip_ws=True)
        next_idx,next_item = tokenlist.token_next(idx,skip_ws=True)
        if not next_item :
            break  # last meaningful token reached; nothing can follow
        #capture up first column index
        # A span starts at an Identifier(List) preceded by SELECT or DISTINCT.
        if (isinstance(item,IdentifierList) or isinstance(item,Identifier)) and \
           (before_item.ttype == Keyword.DML or before_item.value.upper() == 'DISTINCT'):
            cols_idx.append(idx)
            fetch_col_flag = True
            cols_item = []
        if fetch_col_flag == True:
            cols_item.append(item)
        #capture up last column index
        # A span ends at an Identifier(List) directly followed by FROM.
        if (isinstance(item,IdentifierList) or isinstance(item,Identifier)) and \
           next_item.ttype is Keyword and next_item.value.upper() == 'FROM':
            cols_idx.append(idx)
            fetch_col_flag = False
            cols_group.append (''.join([ item.value for item in cols_item]))
    '''
    the cols_idx like [[10,12],[24,26]],it's two-dimnsn list ,
    --> flatten to [10,11,12,24,25,26]
    '''
    # cols_idx is actually flat ([10, 12, 24, 26]); consecutive pairs are
    # (start, end) of a span, expanded here to every index in between.
    cols_idxes = sum([list(range(cols_idx[2*i],cols_idx[2*i+1]+1)) for i in range(int(len(cols_idx)/2))],[])
    # Keep only the tokens outside the column spans.
    keep_tokens = [ item for idx,item in enumerate(tokens) if idx not in cols_idxes ]
    self.tokens = keep_tokens
    # NOTE(review): tokens_val is built from the ORIGINAL token list, not
    # keep_tokens -- confirm whether it should mirror self.tokens instead.
    self.tokens_val = [item.value for item in tokens]
    return cols_group
def extract_from_column(self):
    '''
    pick up all tokens between 'DML SELECT' and 'Keyword FROM'
    [<DML 'SELECT' at 0x3655A08>,
     <Whitespace ' ' at 0x3655A68>,
     <IdentifierList 'me.Sap...' at 0x366E228>,
     <Newline ' ' at 0x3665948>,
     <Keyword 'FROM' at 0x36659A8>,
     <Whitespace ' ' at 0x3665A08>,
     <IdentifierList 'SODS2....' at 0x366E390>,
     <Whitespace ' ' at 0x3667228>,
     <IdentifierList 't,SHAR...' at 0x366E480>,
     <Newline ' ' at 0x3667528>]

    NOTE(review): this is a second definition of extract_from_column in the
    same file and appears to be a superseded draft of the other version --
    it collects raw token lists instead of joined strings, computes
    left_tokens without storing it, and returns None.  Confirm whether it
    can be removed.
    '''
    tokens = self.getTokens()
    tokenlist = TokenList(tokens)
    # cols_idx holds alternating start/end indices of each column span;
    # cols_item accumulates the tokens of the span currently being read.
    cols_idx,cols_item = [] , []
    cols_group = []
    # fetch_col_flag is True while we are inside a SELECT ... FROM span.
    fetch_col_flag = False
    for idx, item in enumerate(tokens):
        before_idx,before_item = tokenlist.token_prev(idx,skip_ws=True)
        next_idx,next_item = tokenlist.token_next(idx,skip_ws=True)
        if not next_item :
            break  # last meaningful token reached; nothing can follow
        #capture up first column index
        # A span starts at an Identifier(List) preceded by SELECT or DISTINCT.
        if (isinstance(item,IdentifierList) or isinstance(item,Identifier)) and \
           (before_item.ttype == Keyword.DML or before_item.value.upper() == 'DISTINCT'):
            cols_idx.append(idx)
            fetch_col_flag = True
            cols_item = []
        if fetch_col_flag == True:
            cols_item.append(item)
        #capture up last column index
        # A span ends at an Identifier(List) directly followed by FROM.
        if (isinstance(item,IdentifierList) or isinstance(item,Identifier)) and \
           next_item.ttype is Keyword and next_item.value.upper() == 'FROM':
            cols_idx.append(idx)
            fetch_col_flag = False
            # Unlike the sibling version, the raw token list (not a joined
            # string) is appended here.
            cols_group.append (cols_item)
    # cols_idx is flat ([start, end, start, end, ...]); expand each pair to
    # the full range of indices it covers.
    cols_idxes = sum([list(range(cols_idx[2*i],cols_idx[2*i+1]+1)) for i in range(int(len(cols_idx)/2))],[])
    # NOTE(review): left_tokens is computed but never stored or returned, so
    # this function currently has no observable effect beyond getTokens().
    left_tokens = [ item for idx,item in enumerate(tokens) if idx not in cols_idxes ]