def get_re_matches(patterns_re: List[Tuple], path: Path) -> List[PatternMatch]:
    """
    Run every regex in ``patterns_re`` over the text of ``path``.

    Each entry of ``patterns_re`` is unpacked as ``(pattern_id, pattern)`` and
    ``pattern`` is handed to ``re.finditer``. One ``PatternMatch`` is built per
    regex hit, carrying the start/end offset, line and column of the hit and
    the matched text itself (as a one-element list under ``extra.lines``).

    A file that cannot be decoded as text is skipped and yields an empty list.
    """
    try:
        contents = path.read_text()
    except UnicodeDecodeError:
        debug_print(f"regex matcher skipping binary file at {path}")
        return []

    def locate(offset):
        # Translate a character offset into the offset/line/col triple.
        return {
            "offset": offset,
            "line": _offset_to_line_no(offset, contents),
            "col": _offset_to_col_no(offset, contents),
        }

    found = []
    for pattern_id, pattern in patterns_re:
        for hit in re.finditer(pattern, contents):
            begin = hit.start()
            finish = hit.end()
            found.append(
                PatternMatch({
                    "check_id": pattern_id,
                    "path": str(path),
                    "start": locate(begin),
                    "end": locate(finish),
                    "extra": {"lines": [contents[begin:finish]]},
                })
            )
    return found
def generate_config() -> None:
    """
    Write a starter rule config to ``DEFAULT_CONFIG_FILE``.

    The template is downloaded from ``TEMPLATE_YAML_URL``. If the download
    fails for any reason, a small built-in template is written instead.

    Raises:
        SemgrepError: if ``DEFAULT_CONFIG_FILE`` already exists, or if the
            file cannot be created or written.
    """
    import requests  # here for faster startup times

    # defensive coding: fail fast, before spending time on the download
    if Path(DEFAULT_CONFIG_FILE).exists():
        raise SemgrepError(
            f"{DEFAULT_CONFIG_FILE} already exists. Please remove and try again"
        )
    try:
        r = requests.get(TEMPLATE_YAML_URL, timeout=10)
        r.raise_for_status()
        template_str = r.text
    except Exception as e:
        # Deliberately broad: any download problem falls back to the
        # built-in template rather than aborting.
        debug_print(str(e))
        print_stderr(
            "There was a problem downloading the latest template config. Using fallback template"
        )
        # Built line by line so the YAML structure (one rule under `rules:`)
        # does not depend on how this source file is indented.
        template_str = "\n".join([
            "rules:",
            "  - id: eqeq-is-bad",
            "    pattern: $X == $X",
            '    message: "$X == $X is a useless equality check"',
            "    languages: [python]",
            "    severity: ERROR",
        ])
    try:
        # "x" (exclusive creation) refuses to clobber a file that appeared
        # between the existence check above and this write.
        with open(DEFAULT_CONFIG_FILE, "x") as template:
            template.write(template_str)
            print_stderr(
                f"Template config successfully written to {DEFAULT_CONFIG_FILE}"
            )
    except Exception as e:
        raise SemgrepError(str(e)) from e
def _evaluate_expression(
    expression: BooleanRuleExpression,
    pattern_ids_to_pattern_matches: Dict[PatternId, List[PatternMatch]],
    ranges_left: Set[Range],
    steps_for_debugging: List[Dict[str, Any]],
    flags: Optional[Dict[str, Any]] = None,
) -> Set[Range]:
    """
    Evaluate one node of a rule's boolean expression tree.

    Leaf expressions (anything other than AND_EITHER / AND_ALL) are delegated
    to ``_evaluate_single_expression``. For AND_EITHER / AND_ALL the children
    are evaluated recursively, each on a copy of ``ranges_left``, and
    ``ranges_left`` is narrowed in place:

    * AND_EITHER: every child sees the same starting ranges; the result keeps
      only ranges produced by at least one child.
    * AND_ALL: children are applied one after another, each seeing the ranges
      that survived the previous child.

    A debugging step is appended to ``steps_for_debugging`` after a composite
    operator has been applied. Returns the surviving ranges.
    """
    operator = expression.operator
    is_composite = operator == OPERATORS.AND_EITHER or operator == OPERATORS.AND_ALL

    if not is_composite:
        assert (
            expression.children is None
        ), f"only `{pattern_names_for_operator(OPERATORS.AND_EITHER)}` or `{pattern_names_for_operator(OPERATORS.AND_ALL)}` expressions can have multiple subpatterns"
        return _evaluate_single_expression(
            expression,
            pattern_ids_to_pattern_matches,
            ranges_left,
            steps_for_debugging,
            flags=flags,
        )

    assert (
        expression.children is not None
    ), f"{pattern_names_for_operator(OPERATORS.AND_EITHER)} or {pattern_names_for_operator(OPERATORS.AND_ALL)} must have a list of subpatterns"

    def evaluate_child(child):
        # Children always work on a copy so they cannot disturb our set.
        return _evaluate_expression(
            child,
            pattern_ids_to_pattern_matches,
            ranges_left.copy(),
            steps_for_debugging,
            flags=flags,
        )

    if operator == OPERATORS.AND_EITHER:
        # Evaluate all alternatives first, then keep whatever any of them kept.
        per_child_ranges = []
        for child in expression.children:
            per_child_ranges.append(evaluate_child(child))
        ranges_left.intersection_update(flatten(per_child_ranges))
    else:
        # AND_ALL: narrow eagerly after each child.
        for child in expression.children:
            ranges_left.intersection_update(evaluate_child(child))

    debug_print(f"after filter `{operator}`: {ranges_left}")
    step = {
        "filter": f"{pattern_name_for_operator(expression.operator)}",
        "pattern_id": None,
        "ranges": list(ranges_left),
    }
    steps_for_debugging.append(step)
    return ranges_left
def post_output(cls, output_url: str, output: str) -> None:
    """
    POST ``output`` as the request body to ``output_url`` (10 second timeout).

    The response status code is only logged; it is not checked. A request
    timeout is reported as a ``SemgrepError``; other request failures are
    not handled here.

    NOTE(review): takes ``cls`` as first parameter, so presumably decorated
    as a classmethod outside this view.
    """
    import requests  # here for faster startup times

    print_stderr(f"posting to {output_url}...")
    try:
        response = requests.post(output_url, data=output, timeout=10)
    except requests.exceptions.Timeout:
        raise SemgrepError(f"posting output to {output_url} timed out")
    debug_print(
        f"posted to {output_url} and got status_code:{response.status_code}")
def resolve_config(config_str: Optional[str]) -> Dict[str, YamlTree]:
    """
    Load the configuration that ``config_str`` refers to.

    Resolution order: ``None`` loads from the default local path; a key of
    ``RULES_REGISTRY`` downloads the registered URL; a URL is downloaded
    directly; anything else is treated as a local path.
    """
    started_at = time.time()

    def load():
        if config_str is None:
            return load_config_from_local_path()
        if config_str in RULES_REGISTRY:
            return download_config(RULES_REGISTRY[config_str])
        if is_url(config_str):
            return download_config(config_str)
        return load_config_from_local_path(config_str)

    config = load()
    if not config:
        return config
    debug_print(f"loaded {len(config)} configs in {time.time() - started_at}")
    return config
def invoke_semgrep(
    self, target_manager: TargetManager, rules: List[Rule]
) -> Tuple[Dict[Rule, List[RuleMatch]], Dict[Rule, List[Dict[str, Any]]],
           List[CoreException], ]:
    """
    Run ``rules`` via ``self._run_rules`` and return its three results
    (findings by rule, debugging steps by rule, errors), logging how long
    the run took.
    """
    began = datetime.now()
    run_result = self._run_rules(rules, target_manager)
    findings_by_rule, debug_steps_by_rule, errors = run_result
    elapsed = datetime.now() - began
    debug_print(f"semgrep ran in {elapsed}")
    return findings_by_rule, debug_steps_by_rule, errors
def is_running_latest(version_cache_path: Path = VERSION_CACHE_PATH) -> bool:
    """
    Report whether the running version is at least the latest known version.

    Returns False when the latest version is unknown, when either version
    string cannot be parsed, or when the current version is older.
    """
    latest_version_str = _get_latest_version(version_cache_path)
    if latest_version_str is None:
        return False

    try:
        newest = Version(latest_version_str)
        running = Version(constants.__VERSION__)
    except InvalidVersion as err:
        util.debug_print(f"Invalid version string: {err}")
        return False

    return not running < newest
def download_config(config_url: str) -> Dict[str, YamlTree]:
    """
    Download a rule config from ``config_url`` and parse it.

    The response must have an OK status and a ``Content-Type`` header that
    contains one of the accepted YAML/plain-text types; its body is decoded
    as UTF-8 and passed to ``parse_config_string``.

    Raises:
        SemgrepError: for any failure (network error, bad status, unexpected
            content type, decoding or parsing problem). The original
            exception is attached as the cause.
    """
    import requests  # here for faster startup times

    DOWNLOADING_MESSAGE = "downloading config..."
    yaml_types = (
        "text/plain",
        "application/x-yaml",
        "text/x-yaml",
        "text/yaml",
        "text/vnd.yaml",
    )

    debug_print(f"trying to download from {config_url}")
    print_stderr(
        f"using config from {nice_semgrep_url(config_url)}. Visit https://semgrep.live/registry to see all public rules."
    )
    print_stderr(DOWNLOADING_MESSAGE, end="\r")
    headers = {"User-Agent": SEMGREP_USER_AGENT}

    try:
        r = requests.get(config_url, stream=True, headers=headers, timeout=10)
        try:
            if r.status_code != requests.codes.ok:
                raise SemgrepError(
                    f"bad status code: {r.status_code} returned by config url: {config_url}"
                )
            content_type = r.headers.get("Content-Type")
            if not content_type or not any(ct in content_type
                                           for ct in yaml_types):
                raise SemgrepError(
                    f"unknown content-type: {content_type} returned by config url: {config_url}. Can not parse"
                )
            return parse_config_string(
                "remote-url",
                r.content.decode("utf-8"),
                filename=f"{config_url[:20]}...",
            )
        finally:
            # With stream=True the connection is only released once the body
            # is consumed or the response is closed; the error paths above
            # never read the body, so close explicitly.
            r.close()
    except Exception as e:
        # The SemgrepErrors raised above are intentionally re-wrapped too, so
        # every failure message keeps the "Failed to download config" prefix.
        raise SemgrepError(
            f"Failed to download config from {config_url}: {str(e)}") from e
def _fetch_latest_version(
    url: str = VERSION_CHECK_URL, timeout: int = VERSION_CHECK_TIMEOUT
) -> Optional[str]:
    """
    Ask the version-check endpoint for the latest released version.

    Returns the value of the ``latest`` field of the JSON response as a
    string, or None when the request fails, the status is not OK, the body
    is not valid JSON, or the JSON has no ``latest`` field. Failures are
    logged via ``util.debug_print`` and never raised.
    """
    try:
        import requests

        resp = requests.get(
            url,
            headers={"User-Agent": f"Semgrep/{constants.__VERSION__}"},
            timeout=timeout,
        )
    except Exception as e:
        util.debug_print(f"Fetching latest version failed to connect: {e}")
        return None

    if resp.status_code != requests.codes.OK:
        util.debug_print(
            f"Fetching latest version received HTTP error code: {resp.status_code}"
        )
        return None

    try:
        resp_json = resp.json()
    except ValueError:
        util.debug_print("Fetching latest version received invalid JSON")
        return None

    try:
        latest = resp_json["latest"]
    except (KeyError, TypeError):
        # KeyError: JSON object without the field.
        # TypeError: valid JSON that is not an object (list, number, null...).
        util.debug_print(
            "Fetching latest version received JSON without a `latest` field")
        return None
    return str(latest)
def _get_version_from_cache(version_cache_path: Path) -> Optional[str]:
    """
    Read the cached latest-version string, if the cache is present and fresh.

    The cache file holds an integer timestamp (seconds) on its first line and
    the version string on its second. Returns None when the file does not
    exist, the timestamp is not an integer, or the entry is more than one day
    old; otherwise returns the stripped second line.
    """
    now = time.time()

    if not version_cache_path.is_file():
        util.debug_print("Version cache does not exist")
        return None

    with version_cache_path.open() as cache_file:
        timestamp_str = cache_file.readline().strip()
        latest_version_str = cache_file.readline().strip()

    # Timestamps are whole seconds, so no float parsing is needed.
    try:
        cached_at = int(timestamp_str)
    except ValueError:
        util.debug_print(
            f"Version cache invalid timestamp: {timestamp_str}")
        return None

    seconds_per_day = 86400
    age = now - cached_at
    if age > seconds_per_day:
        util.debug_print(
            f"Version cache expired: {timestamp_str}:{now}")
        return None

    return latest_version_str
def evaluate(rule: Rule, pattern_matches: List[PatternMatch],
             allow_exec: bool) -> Tuple[List[RuleMatch], List[Dict[str, Any]]]:
    """
    Evaluate a rule's boolean expression against the pattern matches of a
    single file.

    Returns a pair: the ``RuleMatch`` objects for every pattern match whose
    range survived the expression, and the list of debugging steps recorded
    while evaluating.
    """
    pattern_ids_to_pattern_matches = group_by_pattern_id(pattern_matches)

    # First debugging step: the distinct ranges each pattern started with.
    initial_ranges = {}
    for pattern_id, matches in pattern_ids_to_pattern_matches.items():
        initial_ranges[pattern_id] = list(set(m.range for m in matches))
    steps_for_debugging = [{
        "filter": "initial",
        "pattern_id": None,
        "ranges": initial_ranges,
    }]
    debug_print(str(pattern_ids_to_pattern_matches))

    valid_ranges_to_output = evaluate_expression(
        rule.expression,
        pattern_ids_to_pattern_matches,
        flags={RCE_RULE_FLAG: allow_exec},
        steps_for_debugging=steps_for_debugging,
    )
    debug_print(f"compiled result {valid_ranges_to_output}")
    debug_print("-" * 80)

    def as_rule_match(pattern_match):
        message = interpolate_message_metavariables(rule, pattern_match)
        fix = interpolate_fix_metavariables(rule, pattern_match)
        return RuleMatch(
            rule.id,
            pattern_match,
            message=message,
            metadata=rule.metadata,
            severity=rule.severity,
            fix=fix,
            fix_regex=rule.fix_regex,
        )

    # Only matches whose range survived the expression are reported.
    output = [
        as_rule_match(pattern_match) for pattern_match in pattern_matches
        if pattern_match.range in valid_ranges_to_output
    ]
    return output, steps_for_debugging
def rule_match_nosem(rule_match: RuleMatch, strict: bool) -> bool:
    """
    Decide whether a finding is suppressed by an inline 'nosem' comment.

    Only the first line of the match is inspected, so it stays unambiguous
    which finding a comment refers to. A bare 'nosem' suppresses the finding.
    A 'nosem' with a list of ids suppresses it only if the finding's rule id
    is listed; every listed id that does not equal this rule id is logged,
    or raises ``SemgrepError`` when ``strict`` is set.
    """
    if not rule_match.lines:
        return False

    first_line = rule_match.lines[0]
    re_match = NOSEM_INLINE_RE.search(first_line)
    if re_match is None:
        return False

    ids_str = re_match.groupdict()["ids"]
    if ids_str is None:
        debug_print(
            f"found 'nosem' comment, skipping rule '{rule_match.id}' on line {rule_match.start['line']}"
        )
        return True

    listed_ids = set()
    for raw_id in COMMA_SEPARATED_LIST_RE.split(ids_str):
        if raw_id.strip():
            listed_ids.add(raw_id.strip())

    suppressed = False
    for pattern_id in listed_ids:
        if rule_match.id != pattern_id:
            message = f"found 'nosem' comment with id '{pattern_id}', but no corresponding rule trying '{rule_match.id}'"
            if strict:
                raise SemgrepError(message)
            debug_print(message)
            continue
        debug_print(
            f"found 'nosem' comment with id '{pattern_id}', skipping rule '{rule_match.id}' on line {rule_match.start['line']}"
        )
        suppressed = True
    return suppressed
def score_output_json(
    json_out: Dict[str, Any], test_files: List[Path], ignore_todo: bool
) -> Tuple[Dict[str, List[int]], Dict[str, Dict[str, Any]], int]:
    """
    Compare the lines semgrep reported with the lines annotated in test files.

    Every test file is scanned for rule / todo annotations; an annotation on
    line ``i`` (0-based) marks line ``i + 2`` as expected (1-based numbering
    plus skipping the annotation line itself). With ``ignore_todo`` set, todo
    annotations are not expected and lines marked todo-ok are ignored.

    Returns a tuple of:
      * per check id, the element-wise sum of the confusion matrices over
        all files,
      * per check id and file path, the ``(expected, reported)`` line sets,
      * the number of lines carrying a todo annotation.
    """
    comment_lines: Dict[str, Dict[str, List[int]]] = collections.defaultdict(
        lambda: collections.defaultdict(list))
    reported_lines: Dict[str, Dict[str, List[int]]] = collections.defaultdict(
        lambda: collections.defaultdict(list))
    ignore_lines: Dict[str, List[int]] = collections.defaultdict(list)
    score_by_checkid: Dict[str, List[int]] = collections.defaultdict(
        lambda: [0, 0, 0, 0])
    expected_reported_by_check_id: Dict[str, Dict[
        str, Any]] = collections.defaultdict(dict)
    num_todo = 0

    # Pass 1: collect expected / ignored lines from the annotations.
    for test_file in test_files:
        test_file_resolved = str(test_file.resolve())
        with open(test_file_resolved) as fin:
            # start=2: semgrep lines are 1-based, and the annotation refers
            # to the line after the comment.
            for effective_line_num, text in enumerate(fin.readlines(), start=2):
                has_todo = line_has_todo_rule(text)
                has_todo_ok = line_has_todo_ok(text)
                if has_todo:
                    num_todo += 1
                if (has_todo and not ignore_todo) or line_has_rule(text):
                    comment_lines[test_file_resolved][normalize_rule_id(
                        text)].append(effective_line_num)
                if ignore_todo and has_todo_ok:
                    ignore_lines[test_file_resolved].append(effective_line_num)

    # Pass 2: collect the lines semgrep actually reported.
    for result in json_out["results"]:
        reported_path = str(Path(result["path"]).resolve())
        reported_lines[reported_path][result["check_id"]].append(
            int(result["start"]["line"]))

    def join_keys(a: Dict[str, Any], b: Dict[str, Any]) -> Set[str]:
        return set(a.keys()).union(set(b.keys()))

    # Pass 3: score every (file, check id) pair seen on either side.
    for file_path in join_keys(comment_lines, reported_lines):
        expected_by_check = comment_lines[file_path]
        reported_by_check = reported_lines[file_path]
        for check_id in join_keys(expected_by_check, reported_by_check):
            all_reported = set(reported_by_check[check_id])
            expected = set(expected_by_check[check_id])
            ignored = set(ignore_lines[file_path])
            reported = all_reported - ignored

            new_cm = compute_confusion_matrix(reported, expected)
            debug_print(
                f"reported lines for check {check_id}: {sorted(reported)}, expected lines: {sorted(expected)} (ignored: {sorted(ignored)}, confusion matrix: {new_cm}"
            )
            expected_reported_by_check_id[check_id][file_path] = (expected,
                                                                  reported)
            # TODO: re-enable a check that a rule does not fire more than
            # once on the same line of a test file.
            old_cm = score_by_checkid[check_id]
            score_by_checkid[check_id] = [
                old_cm[idx] + count for idx, count in enumerate(new_cm)
            ]

    return (score_by_checkid, expected_reported_by_check_id, num_todo)
def main(
    output_handler: OutputHandler,
    target: List[str],
    pattern: str,
    lang: str,
    config: str,
    no_rewrite_rule_ids: bool = False,
    jobs: int = 1,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
    strict: bool = False,
    autofix: bool = False,
    dryrun: bool = False,
    disable_nosem: bool = False,
    dangerously_allow_arbitrary_code_execution_from_rules: bool = False,
    no_git_ignore: bool = False,
) -> None:
    """
    Load configs, run their rules over the targets and report the results.

    Steps: load configs from ``pattern`` / ``lang`` / ``config``; report
    config errors through ``output_handler``; optionally rewrite rule ids;
    run all rules through ``CoreRunner``; drop findings suppressed by
    'nosem' comments (unless ``disable_nosem``); hand the findings to
    ``output_handler``; and apply fixes when ``autofix`` is set.

    Parameters read directly by this function:
        include / exclude: forwarded to ``TargetManager``; None means [].
        strict: makes config errors fatal and is forwarded to
            ``rule_match_nosem``.
        jobs: forwarded to ``CoreRunner``.
        dryrun: forwarded to ``apply_fixes``.
        dangerously_allow_arbitrary_code_execution_from_rules: forwarded to
            ``CoreRunner`` as ``allow_exec``.
        no_git_ignore: inverted into ``respect_git_ignore`` for
            ``TargetManager``.

    Raises:
        SemgrepError: with ``MISSING_CONFIG_EXIT_CODE`` when ``strict`` is
            set and configs failed to load, or when no valid config exists.
    """
    # Normalise the optional lists (None is used instead of a mutable default).
    if include is None:
        include = []
    if exclude is None:
        exclude = []
    valid_configs, config_errors = get_config(pattern, lang, config)
    output_handler.handle_semgrep_errors(config_errors)
    if config_errors and strict:
        raise SemgrepError(
            f"run with --strict and there were {len(config_errors)} errors loading configs",
            code=MISSING_CONFIG_EXIT_CODE,
        )
    if not no_rewrite_rule_ids:
        # re-write the configs to have the hierarchical rule ids
        valid_configs = rename_rule_ids(valid_configs)
    # extract just the rules from valid configs
    all_rules = flatten_configs(valid_configs)
    if not pattern:
        # Pieces of the summary line logged below.
        plural = "s" if len(valid_configs) > 1 else ""
        config_id_if_single = (list(valid_configs.keys())[0]
                               if len(valid_configs) == 1 else "")
        invalid_msg = (f"({len(config_errors)} config files were invalid)"
                       if len(config_errors) else "")
        debug_print(
            f"running {len(all_rules)} rules from {len(valid_configs)} config{plural} {config_id_if_single} {invalid_msg}"
        )
        # NOTE(review): the next two statements are assumed to belong to the
        # `if not pattern` branch; the nesting could not be confirmed from
        # the available source -- verify.
        notify_user_of_work(all_rules, include, exclude)
        if len(valid_configs) == 0:
            raise SemgrepError(
                f"no valid configuration file found ({len(config_errors)} configs were invalid)",
                code=MISSING_CONFIG_EXIT_CODE,
            )
    respect_git_ignore = not no_git_ignore
    target_manager = TargetManager(
        includes=include,
        excludes=exclude,
        targets=target,
        respect_git_ignore=respect_git_ignore,
    )
    # actually invoke semgrep
    rule_matches_by_rule, debug_steps_by_rule, semgrep_core_errors = CoreRunner(
        allow_exec=dangerously_allow_arbitrary_code_execution_from_rules,
        jobs=jobs,
    ).invoke_semgrep(target_manager, all_rules)
    # Convert core errors before reporting them through the output handler.
    semgrep_errors = [e.into_semgrep_error() for e in semgrep_core_errors]
    output_handler.handle_semgrep_errors(semgrep_errors)
    if not disable_nosem:
        # Keep only the findings that are not suppressed by a 'nosem' comment.
        rule_matches_by_rule = {
            rule: [
                rule_match for rule_match in rule_matches
                if not rule_match_nosem(rule_match, strict)
            ]
            for rule, rule_matches in rule_matches_by_rule.items()
        }
    output_handler.handle_semgrep_core_output(rule_matches_by_rule,
                                              debug_steps_by_rule)
    if autofix:
        apply_fixes(rule_matches_by_rule, dryrun)
def _run_rule(
    self, rule: Rule, target_manager: TargetManager, cache_dir: str
) -> Tuple[List[RuleMatch], List[Dict[str, Any]], List[CoreException]]:
    """
    Run a single rule on its targets and return the findings, the debugging
    steps and the errors reported by semgrep-core.

    For every language the rule's patterns are grouped under:
      1. the target files are looked up through ``target_manager``;
      2. regex patterns are matched in Python with a multiprocessing pool;
      3. the remaining patterns are written to a temporary rules file and
         semgrep-core (``SEMGREP_PATH``) is run on the targets;
      4. the JSON output's ``errors`` and ``matches`` are collected.
    Afterwards the pattern matches are grouped by file path, the rule's
    boolean expression is evaluated per file, and the findings are passed
    through ``dedup_output``.

    Raises:
        UnknownLanguageError: if the target manager rejects the language.
        SemgrepError: for an invalid regex, or when semgrep-core exits
            non-zero with output that is not JSON or has no "error" key.
    """
    outputs: List[PatternMatch] = []  # multiple invocations per language
    errors: List[CoreException] = []
    equivalences = rule.equivalences
    for language, all_patterns_for_language in self._group_patterns_by_language(
            [rule]).items():
        try:
            targets = target_manager.get_files(language, rule.includes,
                                               rule.excludes)
        except _UnknownLanguageError as ex:
            # Re-raise as the richer error that points at the rule's
            # `languages` span.
            raise UnknownLanguageError(
                short_msg="invalid language",
                long_msg=f"unsupported language {language}",
                spans=[
                    rule.languages_span.with_context(before=1, after=1)
                ],
            ) from ex
        # Nothing to scan for this language.
        if targets == []:
            continue
        # semgrep-core doesn't know about OPERATORS.REGEX - this is
        # strictly a semgrep Python feature. Regex filtering is
        # performed purely in Python code then compared against
        # semgrep-core's results for other patterns.
        patterns_regex, patterns = partition(
            lambda p: p.expression.operator == OPERATORS.REGEX,
            all_patterns_for_language,
        )
        if patterns_regex:
            patterns_json = [
                pattern.to_json() for pattern in patterns_regex
            ]
            try:
                patterns_re = [(pattern["id"], re.compile(pattern["pattern"]))
                               for pattern in patterns_json]
            except re.error as err:
                raise SemgrepError(
                    f"invalid regular expression specified: {err}")
            # One get_re_matches call per target file, spread over the pool.
            re_fn = functools.partial(get_re_matches, patterns_re)
            with multiprocessing.Pool(self._jobs) as pool:
                matches = pool.map(re_fn, targets)
            # Flatten the per-file result lists.
            outputs.extend(single_match for file_matches in matches
                           for single_match in file_matches)
        # From here on patterns_json holds the non-regex patterns only.
        patterns_json = [p.to_json() for p in patterns]
        # The temporary files are removed when the `with` block exits, so
        # semgrep-core has to run inside it.
        with tempfile.NamedTemporaryFile(
                "w") as pattern_file, tempfile.NamedTemporaryFile(
                    "w") as target_file, tempfile.NamedTemporaryFile(
                        "w") as equiv_file:
            yaml = YAML()
            yaml.dump({"rules": patterns_json}, pattern_file)
            # Flush so the subprocess sees the content on disk.
            pattern_file.flush()
            target_file.write("\n".join(str(t) for t in targets))
            target_file.flush()
            cmd = [SEMGREP_PATH] + [
                "-lang",
                language,
                "-rules_file",
                pattern_file.name,
                "-j",
                str(self._jobs),
                "-target_file",
                target_file.name,
                "-use_parsing_cache",
                cache_dir,
            ]
            if equivalences:
                self._write_equivalences_file(equiv_file, equivalences)
                cmd += ["-equivalences", equiv_file.name]
            core_run = sub_run(cmd,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
            debug_print(core_run.stderr.decode("utf-8", "replace"))
            if core_run.returncode != 0:
                # see if semgrep output a JSON error that we can decode
                semgrep_output = core_run.stdout.decode("utf-8", "replace")
                try:
                    output_json = json.loads(semgrep_output)
                except ValueError:
                    raise SemgrepError(
                        f"unexpected non-json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                    )
                if "error" in output_json:
                    # NOTE(review): presumably this helper always raises; if
                    # it returned, execution would continue below -- confirm.
                    self._raise_semgrep_error_from_json(
                        output_json, patterns)
                else:
                    raise SemgrepError(
                        f"unexpected json output while invoking semgrep-core:\n{PLEASE_FILE_ISSUE_TEXT}"
                    )
            output_json = json.loads(
                (core_run.stdout.decode("utf-8", "replace")))
            errors.extend(
                CoreException.from_json(e, language)
                for e in output_json["errors"])
            outputs.extend(PatternMatch(m) for m in output_json["matches"])
    # group output; we want to see all of the same rule ids on the same file path
    by_rule_index: Dict[Rule, Dict[
        Path, List[PatternMatch]]] = collections.defaultdict(
            lambda: collections.defaultdict(list))
    for pattern_match in outputs:
        by_rule_index[rule][pattern_match.path].append(pattern_match)
    findings = []
    debugging_steps: List[Any] = []
    # `rule` is rebound by this loop; by_rule_index is only ever keyed by the
    # `rule` parameter (see the loop above), so it keeps the same value.
    for rule, paths in by_rule_index.items():
        for filepath, pattern_matches in paths.items():
            debug_print(
                f"----- rule ({rule.id}) ----- filepath: {filepath}")
            findings_for_rule, debugging_steps = evaluate(
                rule, pattern_matches, self._allow_exec)
            findings.extend(findings_for_rule)
    findings = dedup_output(findings)
    # debugging steps are only tracked for a single file, just overwrite
    return findings, debugging_steps, errors
def generate_file_pairs(location: Path, ignore_todo: bool, strict: bool,
                        semgrep_verbose: bool, unsafe: bool) -> None:
    """
    Run every YAML rule file under ``location`` against its test files,
    print a per-check report and exit the process.

    A rule file is paired with the non-YAML regular files that share its
    path once the suffix is removed. Rule files without such test files are
    counted as "missing tests".

    This function always terminates the process: exit code 0 when all checks
    have no false positives and no false negatives, otherwise 1. It also
    exits with 1 when ``strict`` is set and semgrep failed on any rule file.

    ``semgrep_verbose`` is not used in the function body.
    """
    filenames = list(location.rglob("*"))
    no_tests = []
    tested = []
    semgrep_error = []
    print("starting tests...")
    for filename in filenames:
        # Only YAML files, skipping hidden files and files in hidden folders.
        if (filename.suffix in YML_EXTENSIONS
                and not filename.name.startswith(".")
                and not filename.parent.name.startswith(".")):
            # every path that equals the yaml file's path once the suffix is
            # stripped (this includes the yaml file itself)
            yaml_file_name_without_ext = filename.with_suffix("")
            children_test_files = [
                p for p in filenames
                if str(p.with_suffix("")) == (str(yaml_file_name_without_ext))
            ]
            # remove yaml files (and anything that is not a regular file)
            # from the test lists
            test_files = [
                path for path in children_test_files
                if path.suffix not in YML_EXTENSIONS and path.is_file()
            ]
            if not len(test_files):
                no_tests.append(filename)
                continue
            # invoke semgrep
            try:
                output_json = invoke_semgrep(
                    filename,
                    test_files,
                    no_git_ignore=True,
                    no_rewrite_rule_ids=True,
                    strict=strict,
                    dangerously_allow_arbitrary_code_execution_from_rules=
                    unsafe,
                )
                tested.append((filename,
                               score_output_json(output_json, test_files,
                                                 ignore_todo)))
            except Exception as ex:
                # A failing rule file is recorded and the run continues.
                print(
                    f"semgrep error running with config {filename} on {test_files}:\n{ex}"
                )
                semgrep_error.append(filename)
    # NOTE(review): assumed to run once after the loop rather than on the
    # first error; the nesting could not be confirmed -- verify.
    if len(semgrep_error) and strict:
        print("exiting due to semgrep/config errors and strict flag")
        sys.exit(1)
    print(f"{len(no_tests)} yaml files missing tests")
    debug_print(f"missing tests: {no_tests}")
    print(f"{len(tested)} yaml files tested")
    print("check id scoring:")
    print("=" * 80)
    failed_tests = []
    total_confusion = [0, 0, 0, 0]
    for (filename, (output, expected_reported_by_check_id,
                    num_todo)) in tested:
        print(filename)
        if not len(output.items()):
            print(f" no checks fired (TODOs: {num_todo})")
        for check_id, (tp, tn, fp, fn) in output.items():
            # A check passes only with no false positives and no false
            # negatives.
            good = (fp == 0) and (fn == 0)
            if not good:
                failed_tests.append((filename, check_id,
                                     expected_reported_by_check_id[check_id]))
            status = "✔" if good else "✖"
            todo_text = f"(TODOs: {num_todo})" if num_todo > 0 else ""
            confusion = [tp, tn, fp, fn]
            # add to the total confusion matrix
            total_confusion = [
                total_confusion[i] + confusion[i]
                for i in range(len(confusion))
            ]
            print(
                f" {status} - {check_id.ljust(60)}{confusion_matrix_to_string(confusion)} {todo_text}"
            )
    print("=" * 80)
    print(
        f"final confusion matrix: {confusion_matrix_to_string(total_confusion)}"
    )
    print("=" * 80)
    if len(failed_tests) > 0:
        print(f"failing rule files: ")
        for (filename, check_id, failed_test_files) in failed_tests:
            print(f" ✖ FAILED rule file: {filename} check: {check_id}")
            for test_file_path, (expected,
                                 reported) in failed_test_files.items():
                print(
                    f" in test: {test_file_path}, expected lines: {sorted(expected)} != reported: {sorted(reported)}"
                )
        print(
            f"{len(failed_tests)} checks failed tests (run with verbose flag for more details)"
        )
        sys.exit(1)
    else:
        print("all tests passed")
        sys.exit(0)
def _evaluate_single_expression(
    expression: BooleanRuleExpression,
    pattern_ids_to_pattern_matches: Dict[PatternId, List[PatternMatch]],
    ranges_left: Set[Range],
    steps_for_debugging: List[Dict[str, Any]],
    flags: Optional[Dict[str, Any]] = None,
) -> Set[Range]:
    """
    Apply one leaf expression (a single pattern) to ``ranges_left``.

    Behaviour per operator:
      * AND / REGEX: keep ranges that equal a range of this pattern.
      * AND_NOT: drop ranges that equal a range of this pattern.
      * AND_INSIDE: keep ranges enclosed by (or equal to) a pattern range.
      * AND_NOT_INSIDE: drop ranges enclosed by (or equal to) a pattern range.
      * WHERE_PYTHON: keep ranges of matches whose metavariables satisfy the
        python clause; requires ``flags[RCE_RULE_FLAG]`` to be truthy.

    Every operator except AND also logs the result and appends a step to
    ``steps_for_debugging``. ``ranges_left`` itself is not modified; a new
    set is returned.

    Raises:
        SemgrepError: WHERE_PYTHON without the code-execution flag.
        UnknownOperatorError: for any other operator.
    """
    assert expression.pattern_id, f"<internal error: expected pattern id: {expression}>"

    results_for_pattern = [
        x.range
        for x in pattern_ids_to_pattern_matches.get(expression.pattern_id, [])
    ]
    operator = expression.operator

    def record(output_ranges):
        # Shared tail of every filtering branch: log, remember, return.
        debug_print(f"after filter `{expression.operator}`: {output_ranges}")
        steps_for_debugging.append({
            "filter": pattern_name_for_operator(expression.operator),
            "pattern_id": expression.pattern_id,
            "ranges": list(output_ranges),
        })
        return output_ranges

    def is_inside_pattern(candidate):
        return any(
            enclosing.is_enclosing_or_eq(candidate)
            for enclosing in results_for_pattern)

    if operator == OPERATORS.AND:
        # Plain intersection; this branch records no debugging step.
        return ranges_left.intersection(results_for_pattern)

    if operator == OPERATORS.AND_NOT:
        return record(ranges_left.difference(results_for_pattern))

    if operator == OPERATORS.AND_INSIDE:
        kept = set()
        for candidate in ranges_left:
            if is_inside_pattern(candidate):
                kept.add(candidate)
        return record(kept)

    if operator == OPERATORS.AND_NOT_INSIDE:
        survivors = ranges_left.copy()
        for candidate in ranges_left:
            if is_inside_pattern(candidate):
                survivors.remove(candidate)
        return record(survivors)

    if operator == OPERATORS.WHERE_PYTHON:
        if not flags or not flags[RCE_RULE_FLAG]:
            raise SemgrepError(
                f"at least one rule needs to execute arbitrary code; this is dangerous! if you want to continue, enable the flag: {RCE_RULE_FLAG}",
                code=NEED_ARBITRARY_CODE_EXEC_EXIT_CODE,
            )
        assert expression.operand, "must have operand for this operator type"

        accepted = set()
        for pattern_match in list(
                flatten(pattern_ids_to_pattern_matches.values())):
            # Ranges that were already filtered out need no evaluation.
            if pattern_match.range not in ranges_left:
                continue
            debug_print(
                f"WHERE is {expression.operand}, metavars: {pattern_match.metavars}"
            )
            if _where_python_statement_matches(expression.operand,
                                               pattern_match.metavars):
                accepted.add(pattern_match.range)
        return record(accepted)

    if operator == OPERATORS.REGEX:
        return record(ranges_left.intersection(results_for_pattern))

    raise UnknownOperatorError(f"unknown operator {expression.operator}")