def crawl(
    cls,
    segment: BaseSegment,
    queries: Dict[str, List["SelectCrawler"]],
    dialect: Dialect,
    recurse_into=True,
) -> Generator[Union[str, List["SelectCrawler"]], None, None]:
    """Find SELECTs, table refs, or value table function calls in segment.

    For each SELECT, yield a list of SelectCrawlers. As we find table
    references or function call strings, yield those.

    NOTE: CTE lookups are *popped* from ``queries`` (see below), so this
    generator mutates the caller's dict as it runs.
    """
    # Accumulates SelectCrawlers for any nested select statements found.
    buff = []
    # recurse_into controls whether recursive_crawl() descends into matched
    # segments or stops at the first layer of matches.
    for seg in segment.recursive_crawl(
        "table_reference", "select_statement", recurse_into=recurse_into
    ):
        if seg is segment:
            # If we are starting with a select_statement, recursive_crawl()
            # returns the statement itself. Skip that.
            continue
        if seg.type == "table_reference":
            if not seg.is_qualified() and seg.raw in queries:
                # It's a CTE.
                # :TRICKY: Pop the CTE from "queries" to help callers avoid
                # infinite recursion. We could make this behavior optional
                # someday, if necessary.
                yield queries.pop(seg.raw)
            else:
                # It's an external table.
                yield seg.raw
        else:
            # Only two segment types are crawled for, so anything that is
            # not a table_reference must be a select_statement.
            assert seg.type == "select_statement"
            buff.append(SelectCrawler(seg, dialect))
    if not buff:
        # If we reach here, the SELECT may be querying from a value table
        # function, e.g. UNNEST(). For our purposes, this is basically the
        # same as an external table. Return the "table" part as a string.
        table_expr = segment.get_child("table_expression")
        if table_expr:
            yield table_expr.raw
    # Finally yield the accumulated crawlers (possibly an empty list) —
    # mirrors the trailing `return buff` of the sibling get() method.
    yield buff
def _get_name_if_cte(select_statement: BaseSegment, ancestor_segment: BaseSegment) -> Optional[str]: """Return name if CTE. If top-level, return None.""" cte = None path_to = ancestor_segment.path_to(select_statement) for seg in path_to: if seg.is_type("common_table_expression"): cte = seg break select_name = cte.segments[0].raw if cte else None return select_name
def gather(
    cls, segment: BaseSegment, dialect: Dialect
) -> Dict[Optional[str], List["SelectCrawler"]]:
    """Find top-level SELECTs and CTEs, return info."""
    # We specify recurse_into=False because we only want top-level select
    # statements and CTEs. We'll deal with nested selects later as needed,
    # when processing their top-level parent.
    found: Dict[Optional[str], List["SelectCrawler"]] = {}
    top_level_selects = segment.recursive_crawl(
        "select_statement", recurse_into=False
    )
    for stmt in top_level_selects:
        # CTEs are keyed by their name; the top-level query is keyed None.
        name = cls._get_name_if_cte(stmt, segment)
        found.setdefault(name, []).append(SelectCrawler(stmt, dialect))
    return found
def get(
    cls,
    segment: BaseSegment,
    queries: Dict[str, List["SelectCrawler"]],
    dialect: Dialect,
) -> Union[str, List["SelectCrawler"]]:
    """Find SELECTs, table refs, or value table function calls in segment.

    If we find a SELECT, return info list. Otherwise, return table name
    or function call string.
    """
    buff = []
    for seg in segment.recursive_crawl(
        "table_reference", "select_statement", recurse_into=False
    ):
        if seg is segment:
            # If we are starting with a select_statement, recursive_crawl()
            # returns the statement itself. Skip that.
            continue
        if seg.type == "table_reference":
            if not seg.is_qualified() and seg.raw in queries:
                # It's a CTE.
                return queries[seg.raw]
            else:
                # It's an external table.
                return seg.raw
        else:
            assert seg.type == "select_statement"
            buff.append(SelectCrawler(seg, dialect))
    if not buff:
        # If we reach here, the SELECT may be querying from a value table
        # function, e.g. UNNEST(). For our purposes, this is basically the
        # same as an external table. Return the "table" part as a string.
        # BUG FIX: this previously asked for "main_table_expression", which
        # never matched (the sibling crawl() method uses "table_expression"),
        # so the fallback silently returned an empty list instead of the
        # value table function string.
        table_expr = segment.get_child("table_expression")
        if table_expr:
            return table_expr.raw
    return buff
def extract_ignore_mask(
    cls, tree: BaseSegment
) -> Tuple[List[NoQaDirective], List[SQLBaseError]]:
    """Look for inline ignore comments and return NoQaDirectives."""
    directives: List[NoQaDirective] = []
    problems: List[SQLBaseError] = []
    # Only inline comments can carry noqa directives.
    inline_comments = (
        c for c in tree.recursive_crawl("comment") if c.name == "inline_comment"
    )
    for comment in inline_comments:
        entry = cls.extract_ignore_from_comment(comment)
        if isinstance(entry, SQLParseError):
            # A malformed directive is reported as a violation rather than
            # becoming part of the ignore mask.
            problems.append(entry)
        elif entry:
            directives.append(entry)
    if directives:
        linter_logger.info("Parsed noqa directives from file: %r", directives)
    return directives, problems
def lint_fix_parsed(
    cls,
    tree: BaseSegment,
    config: FluffConfig,
    rule_set: List[BaseRule],
    fix: bool = False,
    fname: Optional[str] = None,
    templated_file: Optional[TemplatedFile] = None,
    formatter: Any = None,
) -> Tuple[BaseSegment, List[SQLBaseError], List[NoQaDirective]]:
    """Lint and optionally fix a tree object.

    Args:
        tree: The parsed tree to lint (and, if ``fix``, to apply fixes to).
        config: Effective configuration for this run.
        rule_set: The rules to run.
        fix: If True, loop applying fixes up to ``runaway_limit`` times.
        fname: Filename, used for logging/formatting output only.
        templated_file: Templated source mapping, if any.
        formatter: Optional formatter for lint-header output.

    Returns:
        Tuple of (possibly-fixed tree, initial linting errors, noqa
        directives found in the file).
    """
    # Keep track of the linting errors on the very first linter pass. The
    # list of issues output by "lint" and "fix" only includes issues present
    # in the initial SQL code, EXCLUDING any issues that may be created by
    # the fixes themselves.
    initial_linting_errors = []
    # A placeholder for the fixes we had on the previous loop
    last_fixes = None
    # Keep a set of previous versions to catch infinite loops.
    previous_versions: Set[Tuple[str, Tuple[SourceFix, ...]]] = {(tree.raw, ())}
    # If we are fixing then we want to loop up to the runaway_limit, otherwise just
    # once for linting.
    loop_limit = config.get("runaway_limit") if fix else 1
    # Dispatch the output for the lint header
    if formatter:
        formatter.dispatch_lint_header(fname)
    # Look for comment segments which might indicate lines to ignore.
    if not config.get("disable_noqa"):
        rule_codes = [r.code for r in rule_set]
        ignore_buff, ivs = cls.extract_ignore_mask_tree(tree, rule_codes)
        initial_linting_errors += ivs
    else:
        ignore_buff = []
    save_tree = tree
    # There are two phases of rule running.
    # 1. The main loop is for most rules. These rules are assumed to
    # interact and cause a cascade of fixes requiring multiple passes.
    # These are run the `runaway_limit` number of times (default 10).
    # 2. The post loop is for post-processing rules, not expected to trigger
    # any downstream rules, e.g. capitalization fixes. They are run on the
    # first loop and then twice at the end (once to fix, and once again to
    # check result of fixes), but not in the intervening loops.
    phases = ["main"]
    if fix:
        phases.append("post")
    for phase in phases:
        if len(phases) > 1:
            rules_this_phase = [
                rule for rule in rule_set if rule.lint_phase == phase
            ]
        else:
            rules_this_phase = rule_set
        for loop in range(loop_limit if phase == "main" else 2):

            def is_first_linter_pass():
                return phase == phases[0] and loop == 0

            # Additional newlines are to assist in scanning linting loops
            # during debugging.
            linter_logger.info(
                f"\n\nEntering linter phase {phase}, loop {loop+1}/{loop_limit}\n"
            )
            changed = False
            if is_first_linter_pass():
                # In order to compute initial_linting_errors correctly, need
                # to run all rules on the first loop of the main phase.
                rules_this_phase = rule_set
            progress_bar_crawler = tqdm(
                rules_this_phase,
                desc="lint by rules",
                leave=False,
                disable=progress_bar_configuration.disable_progress_bar,
            )
            for crawler in progress_bar_crawler:
                # Performance: After first loop pass, skip rules that don't
                # do fixes. Any results returned won't be seen by the user
                # anyway (linting errors ADDED by rules changing SQL, are
                # not reported back to the user - only initial linting errors),
                # so there's absolutely no reason to run them.
                if (
                    fix
                    and not is_first_linter_pass()
                    and not is_fix_compatible(crawler)
                ):
                    continue
                progress_bar_crawler.set_description(f"rule {crawler.code}")
                # fixes should be a dict {} with keys edit, delete, create
                # delete is just a list of segments to delete
                # edit and create are list of tuples. The first element is
                # the "anchor", the segment to look for either to edit or to
                # insert BEFORE. The second is the element to insert or create.
                linting_errors, _, fixes, _ = crawler.crawl(
                    tree,
                    dialect=config.get("dialect_obj"),
                    fix=fix,
                    templated_file=templated_file,
                    ignore_mask=ignore_buff,
                    fname=fname,
                )
                if is_first_linter_pass():
                    initial_linting_errors += linting_errors
                if fix and fixes:
                    linter_logger.info(f"Applying Fixes [{crawler.code}]: {fixes}")
                    # Do some sanity checks on the fixes before applying.
                    anchor_info = BaseSegment.compute_anchor_edit_info(fixes)
                    if any(
                        not info.is_valid for info in anchor_info.values()
                    ):  # pragma: no cover
                        # BUG FIX: the final fragment of this message was
                        # missing its "f" prefix, so the literal text
                        # "{fixes!r}" appeared in the message instead of
                        # the repr of the fixes.
                        message = (
                            f"Rule {crawler.code} returned conflicting "
                            "fixes with the same anchor. This is only "
                            "supported for create_before+create_after, so "
                            f"the fixes will not be applied. {fixes!r}"
                        )
                        cls._report_conflicting_fixes_same_anchor(message)
                        for lint_result in linting_errors:
                            lint_result.fixes = []
                    elif fixes == last_fixes:  # pragma: no cover
                        # If we generate the same fixes two times in a row,
                        # that means we're in a loop, and we want to stop.
                        # (Fixes should address issues, hence different
                        # and/or fewer fixes next time.)
                        cls._warn_unfixable(crawler.code)
                    else:
                        # This is the happy path. We have fixes, now we want to
                        # apply them.
                        last_fixes = fixes
                        new_tree, _, _ = tree.apply_fixes(
                            config.get("dialect_obj"), crawler.code, anchor_info
                        )
                        # Check for infinite loops. We use a combination of the
                        # fixed templated file and the list of source fixes to
                        # apply.
                        loop_check_tuple = (
                            new_tree.raw,
                            tuple(new_tree.source_fixes),
                        )
                        if loop_check_tuple not in previous_versions:
                            # We've not seen this version of the file so
                            # far. Continue.
                            tree = new_tree
                            previous_versions.add(loop_check_tuple)
                            changed = True
                            continue
                        else:
                            # Applying these fixes took us back to a state
                            # which we've seen before. We're in a loop, so
                            # we want to stop.
                            cls._warn_unfixable(crawler.code)
            if fix and not changed:
                # We did not change the file. Either the file is clean (no
                # fixes), or any fixes which are present will take us back
                # to a previous state.
                linter_logger.info(
                    f"Fix loop complete for {phase} phase. Stability "
                    f"achieved after {loop}/{loop_limit} loops."
                )
                break
        else:
            # for/else: only runs if the fix loop exhausted without `break`.
            if fix:
                # The linter loop hit the limit before reaching a stable point
                # (i.e. free of lint errors). If this happens, it's usually
                # because one or more rules produced fixes which did not address
                # the original issue **or** created new issues.
                linter_logger.warning(
                    f"Loop limit on fixes reached [{loop_limit}].")
                # Discard any fixes for the linting errors, since they caused a
                # loop. IMPORTANT: By doing this, we are telling SQLFluff that
                # these linting errors are "unfixable". This is important,
                # because when "sqlfluff fix" encounters unfixable lint errors,
                # it exits with a "failure" exit code, which is exactly what we
                # want in this situation. (Reason: Although this is more of an
                # internal SQLFluff issue, users deserve to know about it,
                # because it means their file(s) weren't fixed.
                for violation in initial_linting_errors:
                    if isinstance(violation, SQLLintError):
                        violation.fixes = []
                # Return the original parse tree, before any fixes were applied.
                # Reason: When the linter hits the loop limit, the file is often
                # messy, e.g. some of the fixes were applied repeatedly, possibly
                # other weird things. We don't want the user to see this junk!
                return save_tree, initial_linting_errors, ignore_buff
    if config.get("ignore_templated_areas", default=True):
        initial_linting_errors = cls.remove_templated_errors(
            initial_linting_errors)
    return tree, initial_linting_errors, ignore_buff
def get_select_statement_info(
    segment: BaseSegment,
    dialect: Optional[Dialect],
    early_exit: bool = True,
) -> Optional[SelectStatementColumnsAndTables]:
    """Analyze a select statement: targets, aliases, etc. Return info.

    Args:
        segment: A segment of type ``select_statement``.
        dialect: The dialect in use (may be None for alias extraction).
        early_exit: If True, return None immediately when the statement
            has no table or value-table-function aliases.

    Returns:
        A SelectStatementColumnsAndTables summary, or None on early exit.
    """
    assert segment.is_type("select_statement")
    table_aliases, value_table_function_aliases = get_aliases_from_select(
        segment, dialect)
    if early_exit and not table_aliases and not value_table_function_aliases:
        return None

    # Iterate through all the references, both in the select clause, but also
    # potential others.
    sc = segment.get_child("select_clause")
    reference_buffer = list(sc.recursive_crawl("object_reference"))
    # Add any wildcard references
    reference_buffer += list(sc.recursive_crawl("wildcard_identifier"))
    for potential_clause in (
        "where_clause",
        "groupby_clause",
        "having_clause",
        "orderby_clause",
    ):
        clause = segment.get_child(potential_clause)
        if clause:
            reference_buffer += list(
                clause.recursive_crawl("object_reference"))

    # PURGE any references which are in nested select statements
    for ref in reference_buffer.copy():
        ref_path = segment.path_to(ref)
        # is it in a subselect? i.e. a select which isn't this one.
        if any(
            seg.is_type("select_statement") and seg is not segment
            for seg in ref_path
        ):
            reference_buffer.remove(ref)

    # Get all select targets. Reuse the select_clause child fetched above
    # rather than fetching it a second time.
    select_targets = sc.get_children("select_clause_element")

    # Get all column aliases
    col_aliases = []
    for col_seg in list(sc.recursive_crawl("alias_expression")):
        for seg in col_seg.segments:
            if seg.is_type("identifier"):
                col_aliases.append(seg.raw)

    # Get any columns referred to in a using clause, and extract anything
    # from ON clauses.
    using_cols = []
    fc = segment.get_child("from_clause")
    if fc:
        for join_clause in fc.recursive_crawl("join_clause"):
            in_using_brackets = False
            seen_using = False
            for seg in join_clause.segments:
                if seg.is_type("keyword") and seg.name == "USING":
                    seen_using = True
                elif seg.is_type("join_on_condition"):
                    for on_seg in seg.segments:
                        if on_seg.is_type("expression"):
                            # Deal with expressions.
                            # BUG FIX: crawl the matched expression child
                            # (``on_seg``) rather than the whole ON clause
                            # (``seg``) — crawling ``seg`` once per matching
                            # child duplicated every reference when an ON
                            # clause contained multiple expression children.
                            reference_buffer += list(
                                on_seg.recursive_crawl("object_reference"))
                elif seen_using and seg.is_type("start_bracket"):
                    in_using_brackets = True
                elif seen_using and seg.is_type("end_bracket"):
                    in_using_brackets = False
                    seen_using = False
                elif in_using_brackets and seg.is_type("identifier"):
                    using_cols.append(seg.raw)

    return SelectStatementColumnsAndTables(
        select_statement=segment,
        table_aliases=table_aliases or [],
        value_table_function_aliases=value_table_function_aliases or [],
        reference_buffer=reference_buffer,
        select_targets=select_targets,
        col_aliases=col_aliases,
        using_cols=using_cols,
    )
def lint_fix(
    self,
    tree: BaseSegment,
    config: Optional[FluffConfig] = None,
    fix: bool = False,
    fname: Optional[str] = None,
    templated_file: Optional[TemplatedFile] = None,
) -> Tuple[BaseSegment, List[SQLLintError]]:
    """Lint and optionally fix a tree object.

    Runs the full ruleset over ``tree``. When ``fix`` is True, fixes are
    applied in a loop (up to the configured ``runaway_limit``) until the
    tree stabilizes; otherwise a single linting pass is made.

    Returns:
        Tuple of (possibly fixed tree, linting errors from the first loop).
    """
    config = config or self.config
    # Keep track of the linting errors
    all_linting_errors = []
    # A placeholder for the fixes we had on the previous loop
    last_fixes = None
    # Keep a set of previous versions to catch infinite loops.
    previous_versions = {tree.raw}
    # If we are fixing then we want to loop up to the runaway_limit, otherwise just once for linting.
    # NOTE(review): this assumes runaway_limit >= 1; otherwise the loop
    # below never runs and initial_linting_errors is unbound — confirm the
    # config enforces a positive value.
    loop_limit = config.get("runaway_limit") if fix else 1
    # Dispatch the output for the lint header
    if self.formatter:
        self.formatter.dispatch_lint_header(fname)
    for loop in range(loop_limit):
        changed = False
        for crawler in self.get_ruleset(config=config):
            # fixes should be a dict {} with keys edit, delete, create
            # delete is just a list of segments to delete
            # edit and create are list of tuples. The first element is the
            # "anchor", the segment to look for either to edit or to insert BEFORE.
            # The second is the element to insert or create.
            linting_errors, _, fixes, _ = crawler.crawl(
                tree,
                dialect=config.get("dialect_obj"),
                fname=fname,
                templated_file=templated_file,
            )
            all_linting_errors += linting_errors
            if fix and fixes:
                linter_logger.info(f"Applying Fixes: {fixes}")
                # Do some sanity checks on the fixes before applying.
                if fixes == last_fixes:
                    # Identical fixes twice in a row means we're looping.
                    self._warn_unfixable(crawler.code)
                else:
                    last_fixes = fixes
                    new_tree, _ = tree.apply_fixes(fixes)
                    # Check for infinite loops
                    if new_tree.raw not in previous_versions:
                        # We've not seen this version of the file so far. Continue.
                        tree = new_tree
                        previous_versions.add(tree.raw)
                        changed = True
                        continue
                    else:
                        # Applying these fixes took us back to a state which we've
                        # seen before. Abort.
                        self._warn_unfixable(crawler.code)
        if loop == 0:
            # Keep track of initial errors for reporting.
            initial_linting_errors = all_linting_errors.copy()
        if fix and not changed:
            # We did not change the file. Either the file is clean (no fixes), or
            # any fixes which are present will take us back to a previous state.
            linter_logger.info(
                f"Fix loop complete. Stability achieved after {loop}/{loop_limit} loops."
            )
            break
    # `loop` intentionally leaks out of the for-loop above: if we ran the
    # full loop_limit without stabilizing, warn that fixes may be incomplete.
    if fix and loop + 1 == loop_limit:
        linter_logger.warning(
            f"Loop limit on fixes reached [{loop_limit}].")
    if config.get("ignore_templated_areas", default=True):
        initial_linting_errors = self.remove_templated_errors(
            initial_linting_errors)
    return tree, initial_linting_errors
def _validate_one_reference(
    single_table_references: str,
    ref: BaseSegment,
    this_ref_type: str,
    standalone_aliases: List[str],
    table_ref_str: str,
    table_ref_str_source: Optional[BaseSegment],
    col_alias_names: List[str],
    seen_ref_types: Set[str],
    fixable: bool,
) -> Optional[LintResult]:
    """Check one column reference against the single-table-reference policy.

    ``single_table_references`` is the configured policy ("consistent",
    "qualified" or "unqualified"); ``this_ref_type`` is the style of the
    reference under test. Returns a LintResult when the reference violates
    the policy, or None when it is acceptable or exempt.
    """
    # We skip any unqualified wildcard references (i.e. *). They shouldn't
    # count.
    if not ref.is_qualified() and ref.is_type(
            "wildcard_identifier"):  # type: ignore
        return None
    # Oddball case: Column aliases provided via function calls in by
    # FROM or JOIN. References to these don't need to be qualified.
    # Note there could be a table with a column by the same name as
    # this alias, so avoid bogus warnings by just skipping them
    # entirely rather than trying to enforce anything.
    if ref.raw in standalone_aliases:
        return None
    # Oddball case: tsql table variables can't be used to qualify references.
    # This appears here as an empty string for table_ref_str.
    if not table_ref_str:
        return None
    # Certain dialects allow use of SELECT alias in WHERE clauses
    if ref.raw in col_alias_names:
        return None
    if single_table_references == "consistent":
        # "consistent" policy: any style is fine as long as it matches the
        # styles already seen in this select.
        if seen_ref_types and this_ref_type not in seen_ref_types:
            return LintResult(
                anchor=ref,
                description=f"{this_ref_type.capitalize()} reference "
                f"{ref.raw!r} found in single table select which is "
                "inconsistent with previous references.",
            )
        return None
    if single_table_references != this_ref_type:
        if single_table_references == "unqualified":
            # If this is qualified we must have a "table" and "." segment
            # at least, so deleting the first two segments strips the
            # qualifier (only when a fix is permitted).
            fixes = [LintFix.delete(el) for el in ref.segments[:2]] if fixable else None
            return LintResult(
                anchor=ref,
                fixes=fixes,
                description="{} reference {!r} found in single table "
                "select.".format(this_ref_type.capitalize(), ref.raw),
            )
        # Policy is "qualified" but the reference is not: prepend the
        # table name and a "." symbol before the reference, when fixable.
        fixes = None
        if fixable:
            fixes = [
                LintFix.create_before(
                    ref.segments[0] if len(ref.segments) else ref,
                    source=[table_ref_str_source]
                    if table_ref_str_source
                    else None,
                    edit_segments=[
                        IdentifierSegment(
                            raw=table_ref_str,
                            type="naked_identifier",
                        ),
                        SymbolSegment(raw=".", type="symbol"),
                    ],
                )
            ]
        return LintResult(
            anchor=ref,
            fixes=fixes,
            description="{} reference {!r} found in single table "
            "select.".format(this_ref_type.capitalize(), ref.raw),
        )
    # Reference style matches the configured policy: no issue.
    return None
def lint_fix_parsed(
    cls,
    tree: BaseSegment,
    config: FluffConfig,
    rule_set: List[BaseRule],
    fix: bool = False,
    fname: Optional[str] = None,
    templated_file: Optional[TemplatedFile] = None,
    formatter: Any = None,
) -> Tuple[BaseSegment, List[SQLBaseError], List[NoQaDirective]]:
    """Lint and optionally fix a tree object.

    Runs ``rule_set`` over ``tree``, optionally looping to apply fixes up
    to the configured ``runaway_limit``. Returns a tuple of (possibly
    fixed tree, initial linting errors, noqa directives from the file).
    """
    # Keep track of the linting errors
    all_linting_errors = []
    # A placeholder for the fixes we had on the previous loop
    last_fixes = None
    # Keep a set of previous versions to catch infinite loops.
    previous_versions = {tree.raw}
    # If we are fixing then we want to loop up to the runaway_limit, otherwise just
    # once for linting.
    loop_limit = config.get("runaway_limit") if fix else 1
    # Dispatch the output for the lint header
    if formatter:
        formatter.dispatch_lint_header(fname)
    # Look for comment segments which might indicate lines to ignore.
    if not config.get("disable_noqa"):
        rule_codes = [r.code for r in rule_set]
        ignore_buff, ivs = cls.extract_ignore_mask_tree(tree, rule_codes)
        all_linting_errors += ivs
    else:
        ignore_buff = []
    save_tree = tree
    for loop in range(loop_limit):
        changed = False
        progress_bar_crawler = tqdm(
            rule_set,
            desc="lint by rules",
            leave=False,
            disable=progress_bar_configuration.disable_progress_bar,
        )
        for crawler in progress_bar_crawler:
            progress_bar_crawler.set_description(f"rule {crawler.code}")
            # fixes should be a dict {} with keys edit, delete, create
            # delete is just a list of segments to delete
            # edit and create are list of tuples. The first element is the
            # "anchor", the segment to look for either to edit or to insert BEFORE.
            # The second is the element to insert or create.
            linting_errors, _, fixes, _ = crawler.crawl(
                tree,
                ignore_mask=ignore_buff,
                dialect=config.get("dialect_obj"),
                fname=fname,
                templated_file=templated_file,
            )
            all_linting_errors += linting_errors
            if fix and fixes:
                linter_logger.info(
                    f"Applying Fixes [{crawler.code}]: {fixes}")
                # Do some sanity checks on the fixes before applying.
                if fixes == last_fixes:  # pragma: no cover
                    cls._warn_unfixable(crawler.code)
                else:
                    last_fixes = fixes
                    new_tree, _ = tree.apply_fixes(
                        config.get("dialect_obj"), fixes)
                    # Check for infinite loops
                    if new_tree.raw not in previous_versions:
                        # We've not seen this version of the file so far. Continue.
                        tree = new_tree
                        previous_versions.add(tree.raw)
                        changed = True
                        continue
                    else:
                        # Applying these fixes took us back to a state which we've
                        # seen before. Abort.
                        cls._warn_unfixable(crawler.code)
        if loop == 0:
            # Keep track of initial errors for reporting.
            initial_linting_errors = all_linting_errors.copy()
        if fix and not changed:
            # We did not change the file. Either the file is clean (no fixes), or
            # any fixes which are present will take us back to a previous state.
            linter_logger.info(
                f"Fix loop complete. Stability achieved after {loop}/{loop_limit} "
                "loops.")
            break
    else:
        if fix:
            # The linter loop hit the limit before reaching a stable point
            # (i.e. free of lint errors). If this happens, it's usually
            # because one or more rules produced fixes which did not address
            # the original issue **or** created new issues.
            linter_logger.warning(
                f"Loop limit on fixes reached [{loop_limit}].")
            # Discard any fixes for the linting errors, since they caused a
            # loop. IMPORTANT: By doing this, we are telling SQLFluff that
            # these linting errors are "unfixable". This is important,
            # because when "sqlfluff fix" encounters unfixable lint errors,
            # it exits with a "failure" exit code, which is exactly what we
            # want in this situation. (Reason: Although this is more of an
            # internal SQLFluff issue, users deserve to know about it,
            # because it means their file(s) weren't fixed.
            for violation in initial_linting_errors:
                if isinstance(violation, SQLLintError):
                    violation.fixes = []
            # Return the original parse tree, before any fixes were applied.
            # Reason: When the linter hits the loop limit, the file is often
            # messy, e.g. some of the fixes were applied repeatedly, possibly
            # other weird things. We don't want the user to see this junk!
            # (BUG FIX: the previous line of this comment had lost its "#"
            # marker, which made the file a SyntaxError.)
            return save_tree, initial_linting_errors, ignore_buff
    if config.get("ignore_templated_areas", default=True):
        initial_linting_errors = cls.remove_templated_errors(
            initial_linting_errors)
    return tree, initial_linting_errors, ignore_buff
def is_self_match(self, segment: BaseSegment) -> bool:
    """Does this segment match the relevant criteria.

    Delegates to the segment's own type check, testing against every
    type this matcher was configured with (``self.types``).
    """
    target_types = tuple(self.types)
    return segment.is_type(*target_types)
def passes_filter(self, segment: BaseSegment):
    """Returns true if this segment considered at all.

    Unparsable segments are only considered when this object explicitly
    opts in via ``works_on_unparsable``.
    """
    if self.works_on_unparsable:
        return True
    return not segment.is_type("unparsable")