Beispiel #1
0
 def __vectorize(self, tokenlist):
     """Record numeric comparison bounds from ``tokenlist`` in ``self.query_vec``.

     The token tree is flattened and every comparison-operator token is
     inspected.  The nearest token to its left (ignoring whitespace and
     comments) supplies the attribute name; the nearest token to its right
     supplies the literal, which must be parseable by ``float``.

     * ``<`` / ``<=`` store the literal under ``"<attr>_ub"``.
     * ``>`` / ``>=`` store the literal under ``"<attr>_lb"``.
     * Any other comparison operator stores it under both keys.

     Comparisons whose right-hand side is not numeric are skipped with a
     printed notice; comparisons missing an operand on either side are
     skipped silently instead of raising ``AttributeError``.

     Args:
         tokenlist: Object exposing ``flatten()`` that yields the leaf
             tokens of a parsed statement.
     """
     token_list = TokenList(list(tokenlist.flatten()))
     # enumerate() hands us each operator's index directly, so no linear
     # token_index() search is needed per comparison.
     for idx_comp_op, comp_op in enumerate(token_list):
         if comp_op.ttype is not Comparison:
             continue

         attr_token = token_list.token_prev(
             idx_comp_op, skip_ws=True, skip_cm=True)[1]
         lit_token = token_list.token_next(
             idx_comp_op, skip_ws=True, skip_cm=True)[1]
         if attr_token is None or lit_token is None:
             # Operator at the very start/end of the statement: there is
             # nothing to pair it with.
             continue

         attr = attr_token.value  # Name of the attribute
         print(attr)

         try:
             lit = float(lit_token.value)  # Literal value
         except ValueError:
             print("Possible join, skipping")
             continue

         if comp_op.value in ('<', '<='):
             bounds = ('ub',)
         elif comp_op.value in ('>', '>='):
             bounds = ('lb',)
         else:
             # Presumably '='. NOTE(review): any other comparison operator
             # (e.g. '!=') also lands here and pins both bounds -- confirm
             # that is intended.
             bounds = ('lb', 'ub')

         for bound in bounds:
             self.query_vec['_'.join([attr, bound])] = lit
Beispiel #2
0
 def filter_identifier_list(tkn_list: TokenList, token: Token):
     """Tell whether ``token`` sits between a SELECT and a FROM in ``tkn_list``.

     The neighbour before ``token`` must match ``DML 'SELECT'`` and the
     neighbour after it must match ``Keyword 'FROM'``.  A missing neighbour
     (``token`` is at the start or end of the list) is not treated as a
     mismatch.

     Returns:
         ``False`` as soon as an existing neighbour fails its match,
         otherwise ``True``.
     """
     position = tkn_list.token_index(token)
     # (lookup, token type, keyword) for the left and the right neighbour.
     expectations = (
         (tkn_list.token_prev, DML, 'SELECT'),
         (tkn_list.token_next, Keyword, 'FROM'),
     )
     for lookup, ttype, keyword in expectations:
         # The lookup returns an (index, token) pair; token is None when
         # there is no neighbour on that side.
         neighbour = lookup(position)[1]
         if neighbour is not None and not neighbour.match(ttype, keyword):
             return False
     return True
Beispiel #3
0
    def extract_from_column(self):
        '''
        Cut the column sections out of the statement and return them as text.

        Walks the tokens from ``self.getTokens()``.  A column section starts
        at an Identifier/IdentifierList whose previous non-whitespace token is
        a DML keyword or ``DISTINCT``, and ends at an Identifier/IdentifierList
        whose next non-whitespace token is the keyword ``FROM``.  A statement
        can hold several sections (e.g. with UNION / UNION ALL), so one
        string is produced per section.

        Example token stream; the section here is the IdentifierList that
        follows SELECT:

        [<DML 'SELECT' at 0x3655A08>, <Whitespace ' ' at 0x3655A68>, <IdentifierList 'me.Sap...' at 0x366E228>,
         <Newline ' ' at 0x3665948>, <Keyword 'FROM' at 0x36659A8>, <Whitespace ' ' at 0x3665A08>,
         <IdentifierList 'SODS2....' at 0x366E390>,
         <Whitespace ' ' at 0x3667228>, <IdentifierList 't,SHAR...' at 0x366E480>, <Newline ' ' at 0x3667528>]

        Side effects:
            ``self.tokens`` is replaced by the tokens lying outside every
            column section; ``self.tokens_val`` is set to the ``value`` of
            every token of the original stream.

        Returns:
            A list of strings, one per section, each the concatenated
            ``value`` of the tokens in that section.
        '''
        tokens = self.getTokens()
        tokenlist = TokenList(tokens)
        column_types = (IdentifierList, Identifier)

        # Alternating start/end indices of the sections: [s0, e0, s1, e1, ...]
        cols_idx = []
        # Tokens of the section currently being collected.
        cols_item = []
        # One joined string per completed section.
        cols_group = []
        fetch_col_flag = False

        for idx, item in enumerate(tokens):
            _, before_item = tokenlist.token_prev(idx, skip_ws=True)
            _, next_item = tokenlist.token_next(idx, skip_ws=True)
            if not next_item:
                break
            is_column = isinstance(item, column_types)

            # First token of a section.  before_item is None when item is the
            # first significant token; that cannot open a section.
            if is_column and before_item is not None and (
                    before_item.ttype == Keyword.DML
                    or before_item.value.upper() == 'DISTINCT'):
                cols_idx.append(idx)
                fetch_col_flag = True
                cols_item = []

            if fetch_col_flag:
                cols_item.append(item)

            # Last token of a section.
            if is_column and next_item.ttype is Keyword \
                    and next_item.value.upper() == 'FROM':
                cols_idx.append(idx)
                fetch_col_flag = False
                cols_group.append(''.join(tok.value for tok in cols_item))

        # Expand the (start, end) pairs into the set of covered indices, e.g.
        # [10, 12, 24, 26] -> {10, 11, 12, 24, 25, 26}.  zip() drops a
        # trailing unpaired start index.
        removed = set()
        for first, last in zip(cols_idx[0::2], cols_idx[1::2]):
            removed.update(range(first, last + 1))

        keep_tokens = [tok for pos, tok in enumerate(tokens)
                       if pos not in removed]
        self.tokens = keep_tokens
        # NOTE(review): this is built from the unfiltered token stream, not
        # from keep_tokens -- confirm that is intended.
        self.tokens_val = [tok.value for tok in tokens]
        return cols_group
Beispiel #4
0
    def extract_from_column(self):

        '''
        pick up all tokens between 'DML SELECT' and 'Keyword FROM'
        
        [<DML 'SELECT' at 0x3655A08>, <Whitespace ' ' at 0x3655A68>, <IdentifierList 'me.Sap...' at 0x366E228>,
         <Newline ' ' at 0x3665948>, <Keyword 'FROM' at 0x36659A8>, <Whitespace ' ' at 0x3665A08>,
         <IdentifierList 'SODS2....' at 0x366E390>,
         <Whitespace ' ' at 0x3667228>, <IdentifierList 't,SHAR...' at 0x366E480>, <Newline ' ' at 0x3667528>]
        '''
        
        tokens = self.getTokens()
        tokenlist = TokenList(tokens)
        cols_idx,cols_item = [] , []
        cols_group = []
        fetch_col_flag = False
        for idx, item in enumerate(tokens):
            before_idx,before_item = tokenlist.token_prev(idx,skip_ws=True)
            next_idx,next_item = tokenlist.token_next(idx,skip_ws=True)
            if not next_item :
                break
            #capture up first column index
            if (isinstance(item,IdentifierList) or isinstance(item,Identifier)) and \
                (before_item.ttype == Keyword.DML or before_item.value.upper() == 'DISTINCT'):
                cols_idx.append(idx)
                fetch_col_flag = True
                cols_item = []                
            if fetch_col_flag == True:
                cols_item.append(item)
            #capture up last column index
            if (isinstance(item,IdentifierList) or isinstance(item,Identifier)) and \
                next_item.ttype is Keyword and next_item.value.upper() == 'FROM':
                cols_idx.append(idx)
                fetch_col_flag = False
                cols_group.append (cols_item)
        
        cols_idxes = sum([list(range(cols_idx[2*i],cols_idx[2*i+1]+1)) for i in range(int(len(cols_idx)/2))],[]) 
        
        left_tokens = [ item for idx,item in enumerate(tokens) if idx not in cols_idxes ]