def _process_select_clause(self, stmt, token_group, token, tokengroup_set):
    """Handle a SELECT keyword: open a new SelectClause token group and
    attach it at the correct level of the token-group tree.

    Arguments:
        stmt -- the root Statement token group being parsed.
        token_group -- the token group currently being filled.
        token -- the SELECT keyword token just consumed.
        tokengroup_set -- clause group types that must be exited before the
            new SelectClause is attached.

    Returns the newly created SelectClause TokenGroup.
    """
    select_clause_grp = TokenGroup([
        token,
    ], ST.SelectClause)
    if token_group.ttype in (ST.RoundBracket, ST.CollectionSet):
        # A SELECT directly inside brackets turns that bracket group into a sub-query.
        # TODO: Should we check the preceding token to make sure SELECT is the first keyword in Brackets
        token_group.ttype = ST.SubQuery
    elif (
        token_group.ttype in (ST.Select, ST.SelectInto, ST.InsertIntoSelect, ST.SubQuery)
        and (token_group.last_token().match_type_value(
            Token(T.Keyword, 'UNION'))
            or (token_group.last_token().match_type_value(
                Token(T.Keyword, 'ALL'))
                and token_group.token_before(
                    next_token_index=token_group.last_token_index()).
                match_type_value(Token(T.Keyword, 'UNION'))))
    ):
        # SELECT following UNION (or UNION ALL) stays inside the current
        # select-like group; nothing to restructure.
        # token_group.has_token_as_immediate_child(Token(T.Keyword, 'UNION')):
        pass
    else:
        while token_group.ttype is not None and token_group.ttype not in tokengroup_set:
            # If Select Clause is preceded by Insert Into Clause, get out of that Clause Token Group
            token_group = self._switch_to_parent(token_group)
        if token_group.ttype in tokengroup_set:
            token_group = self._switch_to_parent(token_group)
    token_group.append(select_clause_grp)
    # If we are at the Statement Token Group, set the Statement Type
    if token_group == stmt:
        if token_group.ttype is None:
            token_group.ttype = ST.Select
        elif token_group.ttype is ST.Insert:
            token_group.ttype = ST.InsertIntoSelect
    return select_clause_grp
def _process_name(self, stmt, token_group, token, tokengroup_set):
    """Classify a Name token (column, alias, WITH-query name, ...) based on
    the group it appears in, and append it to the appropriate token group.

    Arguments:
        stmt -- the root Statement token group being parsed.
        token -- the Name token just consumed.

    Returns the token group that should receive subsequent tokens.
    """
    if token_group.last_token().match_type_value(Token(T.Keyword, 'AS')):
        # Alias follows AS Keyword
        token.ttype = ST.AliasName
        token_group.append(token)
    elif (token_group.parent.ttype in (ST.SelectClause, ST.FromClause)
          and ((token_group.ttype == ST.Identifier
                and token_group.last_token().ttype == T.Name)
               or (token_group.ttype == ST.Function
                   and token_group.last_token().ttype == ST.ArgumentList)
               or (token_group.ttype == ST.SubQuery
                   and token_group.last_token().match_type_value(
                       Token(T.Punctuation, ')')))
               or (token_group.ttype == ST.ComputedIdentifier
                   and token_group.last_token().ttype != T.Operator)
               or token_group.ttype == ST.SelectConstantIdentifier)):
        # Alias without AS Keyword. Here TokenGroup has to be direct child of Select/From clause
        # and must satisfy other appropriate conditions
        token.ttype = ST.AliasName
        token_group.append(token)
    elif (token_group.parent.ttype == ST.ComputedIdentifier
          and token_group.ttype == ST.Identifier
          and token_group.last_token().ttype == T.Name):
        # Case: SELECT A.x+B.y SomeAlias FROM ...
        # here Computed Identifier => A.x+B.y and Identifier => B.y followed by Alias;
        # retry at the parent so the alias attaches to the computed identifier.
        return self._process_name(stmt, self._switch_to_parent(token_group), token, tokengroup_set)
    elif (token_group.ttype == ST.SelectClause
          and token_group.last_token().ttype not in (T.Keyword, T.Punctuation)):
        # Case: SELECT CASE ... END AS case_alias
        token_group = token_group.merge_into_token_group(
            ST.ComputedIdentifier,
            token_list_start_index_included=token_group.last_token_index())
        token.ttype = ST.AliasName
        token_group.append(token)
    elif token_group.ttype == ST.Identifier:
        # Continuation of a (possibly qualified) identifier.
        token_group.append(token)
    elif token_group.ttype == PT.WithClause:
        # Eg. WITH aliasname AS (SQL)
        token.ttype = PT.WithQueryAliasName
        with_identifier_grp = TokenGroup([
            token,
        ], PT.WithIdentifier)
        token_group.append(with_identifier_grp)
        token_group = with_identifier_grp
    else:
        # Start of a brand-new identifier group.
        identifier_grp = TokenGroup([
            token,
        ], ST.Identifier)
        token_group.append(identifier_grp)
        token_group = identifier_grp
    return token_group
def __init__(self, lexer_object=SqlLexer, end_marker_token=Token(T.Punctuation, ';')): self._lexer = lexer_object(stripall=True) # self._lexer.add_filter('keywordcase', case='upper') self._lexer.add_filter(text_to_whitespace_token()) self._end_marker_token = end_marker_token self._parse_rules = self._set_rules_()
def _process_in(self, stmt, token_group, token, tokengroup_set):
    """Handle an IN keyword: climb to the nearest condition-holding group and
    either retag it (NOT IN) or fold the preceding operand into an In group.

    Returns the token group that should receive subsequent tokens.
    """
    # NOTE(review): unlike _process_select_clause, this loop has no
    # `ttype is not None` guard — it relies on always finding one of the
    # listed clause types on the way up; confirm against the grammar rules.
    while token_group.ttype not in (ST.JoinOnClause, ST.WhereClause,
                                    ST.HavingClause, ST.Condition,
                                    ST.RoundBracket, ST.ConditionGroup,
                                    ST.Not, ST.CaseExpression,
                                    ST.WhenExpression, ST.ThenExpression,
                                    ST.ElseExpression):
        # Get out of the Token Group until you find Condition Clause
        token_group = self._switch_to_parent(token_group)
    if token_group.ttype == ST.RoundBracket:
        # A bracket group holding a condition is really a condition group.
        token_group.ttype = ST.ConditionGroup
    if token_group.ttype == ST.Condition:
        # This will mostly be NOT IN Condition
        token_group.ttype = ST.NotIn if token_group.last_token(
        ).match_type_value(Token(ST.LogicalOperator, 'NOT')) else ST.In
    else:
        # Fold the operand preceding IN (the last token) into a new In group.
        token_group = token_group.merge_into_token_group(
            ST.In,
            token_list_start_index_included=token_group.last_token_index())
    token_group.append(token)
    return token_group
def _process_exists(self, stmt, token_group, token, tokengroup_set):
    """Handle an EXISTS keyword: climb to the nearest condition-holding group
    and wrap the keyword in an Exists (or NotExists) token group.

    Returns the group that should receive subsequent tokens (the Exists group
    itself, or the retagged NotExists group).
    """
    while token_group.ttype not in (
            ST.WhereClause,  # ST.JoinOnClause,
            ST.HavingClause, ST.RoundBracket, ST.ConditionGroup, ST.Not,
    ):  # ST.Condition, # ST.CaseExpression, ST.WhenExpression, ST.ThenExpression, ST.ElseExpression):
        # Get out of the Token Group until you find Condition Clause
        token_group = self._switch_to_parent(token_group)
    if token_group.ttype == ST.RoundBracket:
        # A bracket group holding a condition is really a condition group.
        token_group.ttype = ST.ConditionGroup
    if token_group.ttype == ST.Not and token_group.last_token(
    ).match_type_value(Token(ST.LogicalOperator, 'NOT')):
        # NOT EXISTS: retag the existing Not group and keep filling it.
        token_group.ttype = ST.NotExists
        token_group.append(token)
        return token_group
    exists_grp = TokenGroup([
        token,
    ], ST.Exists)
    token_group.append(exists_grp)
    return exists_grp
def datafields_involved(self):
    """
    Returns a list of datafields involved in the Postgres SQL Statement.

    Datafield information is generated from the parsed token list when the
    function is called for the first time and stored. On subsequent calls,
    the stored information is returned.

    Returns:
        [List] -- List of Datafields involved in the Postgres SQL Statement.
            Each entry is a dict with keys
            {type, datafield, datafield_alias, dataset, schema, catalog,
             dataset_type, dataset_alias, rw_ind, defined_at}
    """
    if self._datafields is None:
        # Datafields info needs to be generated; dataset info must exist
        # first so aliases can be resolved below.
        self.datasets_involved()
        self._datafields = []
        for token in self.get_identifiers():
            # Identifiers under a FROM clause are datasets, not datafields.
            if token.parent.ttype not in (ST.FromClause, ):
                _datafield = {
                    'type': 'NotKnown',
                    'datafield': '',
                    'datafield_alias': None,
                    'dataset': None,
                    'schema': None,
                    'catalog': None,
                    'dataset_type': None,
                    'dataset_alias': None,
                    'rw_ind': 'r',
                    'defined_at': token,
                }
                if token.ttype == ST.Identifier:
                    _datafield['type'] = 'Datafield'
                    for sub_token in token.token_list:
                        if sub_token.ttype in (T.Name, ST.AllColumnsIdentifier):
                            _datafield['datafield'] = sub_token.value()
                        elif sub_token.ttype == ST.AliasName:
                            _datafield['datafield_alias'] = sub_token.value()
                        elif sub_token.ttype == ST.QualifierName:
                            _datafield['dataset_alias'] = sub_token.value()
                    # Get Dataset info
                    if _datafield['dataset_alias']:
                        dataset_alias_found = False
                        for dset in self._datasets:
                            # Using Alias
                            if _datafield['dataset_alias'] == dset['alias']:
                                _datafield['dataset'] = dset['dataset']
                                _datafield['dataset_type'] = dset['type']
                                _datafield['schema'] = dset['schema']
                                _datafield['catalog'] = dset['catalog']
                                dataset_alias_found = True
                                break
                        if not dataset_alias_found:
                            for dset in self._datasets:
                                # Using Dataset Name
                                if _datafield['dataset_alias'] == dset['dataset']:
                                    _datafield['dataset'] = dset['dataset']
                                    _datafield['dataset_type'] = dset['type']
                                    _datafield['schema'] = dset['schema']
                                    _datafield['catalog'] = dset['catalog']
                                    break
                elif token.ttype in (ST.ComputedIdentifier,
                                     ST.SelectConstantIdentifier, ST.Function):
                    _datafield['type'] = 'Computed Field' if token.ttype == ST.ComputedIdentifier else (
                        'Function Field' if token.ttype == ST.Function else 'Constant Field')
                    # Accumulate the expression text until an alias (or AS) is seen.
                    not_an_alias_ind = True
                    for sub_token in token.token_list:
                        if sub_token.ttype == ST.AliasName:
                            _datafield['datafield_alias'] = sub_token.value()
                            not_an_alias_ind = False
                        elif sub_token.match_type_value(Token(T.Keyword, 'AS')):
                            not_an_alias_ind = False
                        if not_an_alias_ind:
                            _datafield['datafield'] = _datafield['datafield'] + \
                                sub_token.value(True)
                else:
                    # No need to process
                    continue
                self._datafields.append(_datafield)
    return self._datafields
def datasets_involved(self):
    """
    Returns a list of datasets involved in the Postgres SQL Statement.

    Dataset information is generated from the parsed token list when the
    function is called for the first time and stored. On subsequent calls,
    the stored information is returned.

    Returns:
        [List] -- List of Datasets involved in the Postgres SQL Statement.
            Each entry is a dict with keys
            {type, dataset, schema, catalog, alias, rw_ind, defined_at}
    """
    if self._datasets is None:
        # Datasets info needs to be generated.
        self._datasets = []
        for token in self.get_identifiers():
            # Only identifiers directly under a FROM or WITH clause name datasets.
            if token.parent.ttype in (ST.FromClause, PT.WithClause):
                _dataset = {
                    'type': 'NotKnown',
                    'dataset': '',
                    'schema': self._default_schema,
                    'catalog': self._default_catalog,
                    'alias': None,
                    'rw_ind': 'r',
                    'defined_at': token,
                }
                if token.ttype == ST.SubQuery:
                    _dataset['type'] = 'Sub Query'
                    # Accumulate the sub-query text until an alias (or AS) is seen.
                    subquery_ind = True
                    for subquery_token in token.token_list:
                        if subquery_token.ttype == ST.AliasName:
                            _dataset['alias'] = subquery_token.value()
                            subquery_ind = False
                        elif subquery_token.match_type_value(Token(T.Keyword, 'AS')):
                            subquery_ind = False
                        if subquery_ind:
                            _dataset['dataset'] = _dataset['dataset'] + \
                                subquery_token.value(True)
                elif token.ttype == PT.WithIdentifier:
                    _dataset['type'] = 'With Query'
                    for subtoken in token.token_list:
                        if subtoken.ttype == PT.WithQueryAliasName:
                            _dataset['alias'] = subtoken.value()
                        elif subtoken.ttype == PT.WithQueryAliasIdentifier:
                            for subsubtoken in subtoken.token_list:
                                if subsubtoken.ttype == PT.WithQueryAliasName:
                                    # BUG FIX: the alias is the nested name token's
                                    # value, not the whole identifier group's text
                                    # (was: subtoken.value()).
                                    _dataset['alias'] = subsubtoken.value()
                        elif subtoken.ttype == ST.SubQuery:
                            _dataset['dataset'] = subtoken.value()
                else:
                    _dataset['type'] = 'Dataset'
                    qualifier = []
                    for sub_token in token.token_list:
                        if sub_token.ttype == T.Name:
                            _dataset['dataset'] = sub_token.value()
                        elif sub_token.ttype == ST.AliasName:
                            _dataset['alias'] = sub_token.value()
                        elif sub_token.ttype == ST.QualifierName:
                            qualifier.append(sub_token.value())
                    # Qualifiers read right-to-left: [catalog.]schema.table
                    if len(qualifier) >= 1:
                        _dataset['schema'] = qualifier[-1]
                    if len(qualifier) == 2:
                        _dataset['catalog'] = qualifier[-2]
                self._datasets.append(_dataset)
    return self._datasets
def _switch_to_parent(self, token_group):
    """Leave *token_group* and return its parent group.

    Any trailing whitespace token on the group being left is popped off and
    re-appended to the parent, so the whitespace stays between groups rather
    than inside the one we just closed.
    """
    parent = token_group.parent
    # Relocate a trailing whitespace token (if any) to the enclosing group.
    if token_group.pop_whitespace_token():
        parent.append(Token(T.Whitespace, ' '))
    return parent
def token_stream(sql_text):
    # Generator: lex the SQL text and wrap each (ttype, value) pair in a Token.
    # NOTE(review): `self` is not a parameter here — this function appears to be
    # nested inside a method and closes over `self` from the enclosing scope;
    # confirm against the enclosing definition (not visible in this chunk).
    for ttype, value in self._lexer.get_tokens(sql_text):
        yield Token(ttype, value)