def construct_object(self, node: Node, deep: bool = False) -> YamlTree:
    """
    Construct the value for ``node`` with the parent constructor and wrap it
    in a ``YamlTree`` that carries the node's source span.

    Two schema rules are enforced on the way:

    * a mapping node may not contain the same key name twice;
    * the constructed value may not be ``None``.

    A constructed ``dict`` is converted to ``YamlMap`` before wrapping.

    ``source_hash`` and ``filename`` are not parameters of this method; they
    are resolved from the enclosing scope (not visible in this block).

    :param node: the YAML node to construct
    :param deep: forwarded unchanged to the parent ``construct_object``
    :raises InvalidRuleSchemaError: on a duplicate mapping key or a null value
    """
    r = super().construct_object(node, deep)

    # Check for duplicate mapping keys.
    # This -should- be caught and raised by ruamel.yaml. However, resetting
    # the constructor (the line that reads
    # yaml.Constructor = SpanPreservingRuamelConstructor) causes ruamel's
    # DuplicateKeyError not to be raised, so MappingNodes are checked here.
    if isinstance(node, MappingNode):
        # node.value holds the (key node, value node) pairs of the mapping.
        uniq_key_names: Set[str] = {pair[0].value for pair in node.value}
        # Fewer unique key names than pairs means at least one name repeats.
        if len(uniq_key_names) < len(node.value):
            # Imported lazily, only on the error path.
            from semgrep.error import InvalidRuleSchemaError

            raise InvalidRuleSchemaError(
                short_msg="Detected duplicate key",
                long_msg=f"Detected duplicate key name, one of {sorted(uniq_key_names)}.",
                spans=[
                    Span.from_node(
                        node, source_hash=source_hash, filename=filename
                    ).with_context(before=1, after=1)
                ],
            )

    if r is None:
        from semgrep.error import InvalidRuleSchemaError

        raise InvalidRuleSchemaError(
            short_msg="null values prohibited",
            long_msg="In semgrep YAML configuration, null values are prohibited",
            spans=[
                Span.from_node(
                    node, source_hash=source_hash, filename=filename
                ).with_context(before=1, after=1)
            ],
        )

    if isinstance(r, dict):
        r = YamlMap(r)
    return YamlTree(
        r, Span.from_node(node, source_hash=source_hash, filename=filename)
    )
def __init__(self, raw: YamlTree[YamlMap]) -> None:
    """
    Initialise the rule from its parsed YAML tree.

    Stores the tree and its unrolled dict, reads the optional ``paths``
    mapping into include/exclude lists, converts the ``languages`` entries
    to ``Language`` values, and finally builds the expression and mode via
    ``_build_search_patterns_for_mode``.

    :param raw: the YAML tree of a single rule
    :raises InvalidRuleSchemaError: if ``paths`` is present but is not a mapping
    """
    self._yaml = raw
    self._raw: Dict[str, Any] = raw.unroll_dict()

    # For tracking errors from semgrep-core
    self._pattern_spans: Dict[PatternId, Span] = {}

    path_dict = {}
    paths_tree: Optional[YamlTree] = self._yaml.value.get("paths")
    if paths_tree is not None:
        paths = paths_tree.value
        paths_span = paths_tree.span
        if not isinstance(paths, YamlMap):
            path_key = self._yaml.value.key_tree("paths").span
            # A list here usually means a stray `-` in front of include/exclude.
            help_str: Optional[str] = (
                "remove the `-` to convert the list into a mapping"
                if isinstance(paths, list)
                else None
            )
            raise InvalidRuleSchemaError(
                short_msg="invalid paths",
                long_msg=f"the `paths:` targeting rules must be an object with at least one of {ALLOWED_GLOB_TYPES}",
                spans=[path_key.extend_to(paths_span)],
                help=help_str,
            )
        path_dict = paths_tree.unroll_dict()

    self._includes = path_dict.get("include", [])
    self._excludes = path_dict.get("exclude", [])
    self._languages = [Language(name) for name in self._raw["languages"]]

    # check taint/search mode
    built = self._build_search_patterns_for_mode(self._yaml)
    self._expression, self._mode = built
def _validate(expression: BooleanRuleExpression, span_key: str = "") -> None:
    """
    Recursively validate expressions

    Checks ``expression`` first and then each of its children in order; the
    first node whose operator is not REGEX, AND_EITHER, AND_ALL or NOT_REGEX
    raises InvalidRuleSchemaError. ``self`` is not a parameter: it is taken
    from the enclosing scope.
    """
    allowed_operators = {
        OPERATORS.REGEX,
        OPERATORS.AND_EITHER,
        OPERATORS.AND_ALL,
        OPERATORS.NOT_REGEX,
    }

    if expression.operator not in allowed_operators:
        # First YAML key name registered for this operator, or "" if unknown.
        pattern_names = OPERATOR_PATTERN_NAMES_MAP.get(expression.operator, [""])
        operator_key = pattern_names[0]
        # Point at the `span_key` entry when the rule has one, else the rule.
        doc = self._yaml.value.get(span_key)
        span = self._yaml.span if not doc else doc.span
        raise InvalidRuleSchemaError(
            short_msg="invalid pattern clause",
            long_msg=f"invalid pattern clause '{operator_key}' with regex-only rules",
            spans=[span],
            help="use only patterns, pattern-either, pattern-regex, or pattern-not-regex with regex-only rules",
        )

    for child in expression.children or ():
        _validate(child, span_key)
def construct_object(self, node: Node, deep: bool = False) -> YamlTree:
    """
    Construct the value for ``node`` with the parent constructor and wrap it
    in a ``YamlTree`` that carries the node's source span.

    A constructed value of ``None`` is rejected, and a constructed ``dict``
    is converted to ``YamlMap`` before wrapping.

    ``source_hash`` and ``filename`` are not parameters of this method; they
    are resolved from the enclosing scope (not visible in this block).

    :param node: the YAML node to construct
    :param deep: forwarded unchanged to the parent ``construct_object``
    :raises InvalidRuleSchemaError: if the node constructs to ``None``
    """
    r = super().construct_object(node, deep)

    if r is None:
        # Imported lazily, only on the error path.
        from semgrep.error import InvalidRuleSchemaError

        raise InvalidRuleSchemaError(
            short_msg="null values prohibited",
            long_msg="In semgrep YAML configuration, null values are prohibited",
            spans=[
                Span.from_node(
                    node, source_hash=source_hash, filename=filename
                ).with_context(before=1, after=1)
            ],
        )

    if isinstance(r, dict):
        r = YamlMap(r)
    return YamlTree(
        r, Span.from_node(node, source_hash=source_hash, filename=filename)
    )
def validate_single_rule(
    config_id: str,
    rule_yaml: YamlTree[YamlMap],
) -> Optional[Rule]:
    """
    Validate that a rule dictionary contains all necessary keys
    and can be correctly parsed.

    Returns the Rule built by ``Rule.from_yamltree`` when every key in
    YAML_MUST_HAVE_KEYS is present; otherwise raises InvalidRuleSchemaError.
    Keys outside YAML_ALL_VALID_RULE_KEYS are only reported (as extra spans
    and a typo hint) when a required key is missing.
    """
    rule: YamlMap = rule_yaml.value
    rule_keys = {key.value for key in rule.keys()}

    if rule_keys.issuperset(YAML_MUST_HAVE_KEYS):
        # Raises InvalidRuleSchemaError if fails to parse
        return Rule.from_yamltree(rule_yaml)

    missing_keys = YAML_MUST_HAVE_KEYS - rule_keys
    extra_keys: Set[str] = rule_keys - YAML_ALL_VALID_RULE_KEYS
    extra_key_spans = sorted(rule.key_tree(name) for name in extra_keys)

    # An unknown key next to a missing required one is probably a typo.
    help_msg = (
        f"Unexpected keys {extra_keys} found. Is one of these a typo of {missing_keys}?"
        if extra_keys
        else None
    )
    error_spans = [rule_yaml.span.truncate(lines=5)]
    error_spans += [tree.span for tree in extra_key_spans]
    raise InvalidRuleSchemaError(
        short_msg="missing keys",
        long_msg=f"{config_id} is missing required keys {missing_keys}",
        spans=error_spans,
        help=help_msg,
    )
def _validate_none_language_rule(self) -> None:
    """
    Reject a regex-only rule that uses a `pattern` key.

    Searches the unrolled rule (``self._raw``) at any nesting depth for a
    dict key equal to "pattern" and raises InvalidRuleSchemaError if one
    is found; otherwise returns None.
    """

    def _recursive_contains(
        obj: Union[Dict[str, Any], List[Any], str], search_key: str
    ) -> bool:
        """
        Returns true if object contains any object that contains search_key as key
        """
        if isinstance(obj, dict):
            return any(
                key == search_key or _recursive_contains(value, search_key)
                for key, value in obj.items()
            )
        if isinstance(obj, list):
            return any(_recursive_contains(elem, search_key) for elem in obj)
        # Scalars cannot contain keys.
        return False

    if not _recursive_contains(self._raw, "pattern"):
        return

    raise InvalidRuleSchemaError(
        short_msg="invalid pattern clause",
        long_msg=f"invalid pattern clause 'pattern' with regex-only rules in rule: {self.id}",
        spans=[],
        help="use only patterns, pattern-either, pattern-regex, or pattern-not-regex with regex-only rules",
    )
def _validate_operand(operand: YamlTree) -> str:  # type: ignore
    """
    Return the operand's value if it is a string.

    :param operand: tree node holding the value of a `pattern` key
    :raises InvalidRuleSchemaError: if the value is not a ``str``; the message
        names the type of the unrolled value
    """
    text = operand.value
    if isinstance(text, str):
        return text

    actual_type = type(operand.unroll()).__name__
    raise InvalidRuleSchemaError(
        short_msg="invalid operand",
        long_msg=f"type of `pattern` must be a string, but it was a {actual_type}",
        spans=[operand.span.with_context(before=1, after=1)],
    )
def validate_single_rule(
    config_id: str, rule_yaml: YamlTree[YamlMap]
) -> Optional[Rule]:
    """
    Validate that a rule dictionary contains all necessary keys
    and can be correctly parsed.

    Raises InvalidRuleSchemaError when a key from YAML_MUST_HAVE_KEYS is
    missing, or when a key outside YAML_ALL_VALID_RULE_KEYS is present.
    Otherwise returns the Rule built by ``Rule.from_yamltree``.
    """
    rule: YamlMap = rule_yaml.value
    rule_keys = {key.value for key in rule.keys()}

    extra_keys = rule_keys - YAML_ALL_VALID_RULE_KEYS
    extra_key_spans = sorted(rule.key_tree(name) for name in extra_keys)
    missing_keys = YAML_MUST_HAVE_KEYS - rule_keys

    if missing_keys:
        # Missing required keys: an unknown key alongside suggests a typo,
        # otherwise the user simply has to add the keys.
        if extra_keys:
            short_msg = "incorrect keys"
            help_msg = f"Unexpected keys {extra_keys} found. Is one of these a typo of {missing_keys}?"
        else:
            short_msg = "missing keys"
            help_msg = f"Add {missing_keys} to your config file."
        raise InvalidRuleSchemaError(
            short_msg=short_msg,
            long_msg=f"{config_id} is missing required keys {missing_keys}",
            spans=[rule_yaml.span.truncate(lines=5)]
            + [tree.span for tree in extra_key_spans],
            help=help_msg,
        )

    if extra_keys:
        help_msg = f"Unexpected keys {extra_keys} found. Were you looking for any of these unused, valid keys?\n {sorted(YAML_ALL_VALID_RULE_KEYS - rule_keys)}"
        raise InvalidRuleSchemaError(
            short_msg="invalid keys",
            long_msg=f"{config_id} has extra, un-interpretable keys: {extra_keys}",
            spans=[tree.span for tree in extra_key_spans],
            help=help_msg,
        )

    # Defaults to search mode if mode is not specified
    return Rule.from_yamltree(rule_yaml)
def __init__(self, raw: YamlTree[YamlMap]) -> None:
    """
    Initialise the rule from its parsed YAML tree.

    Stores the tree and its unrolled dict, validates the optional ``paths``
    mapping (it must be a mapping whose keys are in ALLOWED_GLOB_TYPES and
    whose values are lists), reads include/exclude lists from it, converts
    the ``languages`` entries to ``Language`` values and builds the boolean
    expression.

    :param raw: the YAML tree of a single rule
    :raises InvalidRuleSchemaError: if ``paths`` is malformed
    """
    self._yaml = raw
    self._raw: Dict[str, Any] = raw.unroll_dict()

    # For tracking errors from semgrep-core
    self._pattern_spans: Dict[PatternId, Span] = {}

    path_dict = {}
    paths_tree: Optional[YamlTree] = self._yaml.value.get("paths")
    if paths_tree is not None:
        paths = paths_tree.value
        paths_span = paths_tree.span
        if not isinstance(paths, YamlMap):
            path_key = self._yaml.value.key_tree("paths").span
            # A list here usually means a stray `-` in front of include/exclude.
            help_str: Optional[str] = (
                "remove the `-` to convert the list into a mapping"
                if isinstance(paths, list)
                else None
            )
            raise InvalidRuleSchemaError(
                short_msg="invalid paths",
                long_msg=f"the `paths:` targeting rules must be an object with at least one of {ALLOWED_GLOB_TYPES}",
                spans=[path_key.extend_to(paths_span)],
                help=help_str,
            )

        # Each entry is checked in turn: key name first, then value type.
        for glob_key, glob_value in paths.items():
            if glob_key.value not in ALLOWED_GLOB_TYPES:
                raise InvalidRuleSchemaError(
                    short_msg="invalid targeting rules",
                    long_msg=f"the `paths:` targeting rules must each be one of {ALLOWED_GLOB_TYPES}",
                    spans=[glob_key.span.with_context(before=1, after=1)],
                )
            if not isinstance(glob_value.value, list):
                raise InvalidRuleSchemaError(
                    short_msg="invalid target value",
                    long_msg="the `paths:` targeting rule values must be lists",
                    spans=[glob_value.span],
                )
        path_dict = paths_tree.unroll_dict()

    self._includes = path_dict.get("include", [])
    self._excludes = path_dict.get("exclude", [])
    self._languages = [Language(name) for name in self._raw["languages"]]
    self._expression = self._build_boolean_expression(self._yaml)
def _validate_list_operand(field: str, operand: YamlTree) -> list:  # type: ignore
    """
    Return the operand's value if it is a list.

    :param field: name of the key being validated, used in the error message
    :param operand: tree node holding the value of that key
    :raises InvalidRuleSchemaError: if the value is not a ``list``; the message
        names the type of the unrolled value
    """
    items = operand.value
    if isinstance(items, list):
        return items

    actual_type = type(operand.unroll()).__name__
    raise InvalidRuleSchemaError(
        short_msg="invalid operand",
        long_msg=f"type of {field} must be a list, but it was a {actual_type}",
        spans=[operand.span.with_context(before=1, after=1)],
    )
def _build_taint_expression(
    self, rule: YamlTree[YamlMap]
) -> BooleanRuleExpression:
    """
    Build an expression from the yml lines in the rule

    Checks, in order: the rule id is a string, the rule has no truthy
    `metadata` entry, and every key in YAML_TAINT_MUST_HAVE_KEYS is present
    with a list value. The span of each such key's value is stored in
    ``self._pattern_spans`` under the rule id, so the entry for the last key
    processed is the one that remains.

    :raises InvalidRuleSchemaError: if any of the checks above fails
    :return: ``BooleanRuleExpression(OPERATORS.AND, rule_id, None, None)``
    """
    rule_raw = rule.value

    id_tree = rule_raw["id"]
    raw_id = id_tree.unroll()
    if not isinstance(raw_id, str):
        raise InvalidRuleSchemaError(
            short_msg="invalid id",
            long_msg=f"rule id must be a string, but was {type(raw_id).__name__}",
            spans=[rule_raw["id"].span],
        )

    if rule_raw.get("metadata"):
        raise InvalidRuleSchemaError(
            short_msg="invalid key",
            long_msg=f"metadata is not supported in {TAINT_MODE} mode",
            spans=[rule_raw.key_tree("metadata").span],
        )

    rule_id = PatternId(raw_id)

    for required_key in YAML_TAINT_MUST_HAVE_KEYS:
        pattern = rule_raw.get(required_key)
        if not pattern:
            raise InvalidRuleSchemaError(
                short_msg=f"missing {required_key} key",
                long_msg=f"In {TAINT_MODE} mode, 'pattern-sources' and 'pattern-sinks' are both required",
                spans=[rule.span.truncate(10)],
            )
        self._validate_list_operand(required_key, pattern)
        self._pattern_spans[rule_id] = pattern.span

    return BooleanRuleExpression(OPERATORS.AND, rule_id, None, None)
def validate_yaml(data: YamlTree) -> None:
    """
    Validate the unrolled ``data`` against the rule JSON schema.

    On a schema violation, the offending element is located in ``data`` so
    the raised error carries that element's span.

    :param data: the parsed YAML tree to validate
    :raises InvalidRuleSchemaError: if ``jsonschema`` rejects the document
    """
    from semgrep.error import InvalidRuleSchemaError

    try:
        jsonschema.validate(data.unroll(), RuleSchema.get())
    except jsonschema.ValidationError as ve:
        message = _validation_error_message(ve)

        # Report at the parent error's element when there is one. Its
        # `absolute_path` is measured from the document root, which is where
        # the walk below starts; `relative_path` is only relative to that
        # error's own parent and would mis-index `data` for deeper nesting.
        located_error = ve.parent or ve
        item = data
        for el in located_error.absolute_path:
            item = item.value[el]

        raise InvalidRuleSchemaError(
            short_msg="Invalid rule schema",
            long_msg=message,
            spans=[item.span],
        )
def __init__(self, raw: YamlTree[YamlMap]) -> None:
    """
    Initialise the rule from its parsed YAML tree.

    Stores the tree, its unrolled dict and the rule id, reads the optional
    ``paths`` mapping into include/exclude lists, resolves the rule's
    languages (adding TypeScript whenever JavaScript is listed), picks join
    or search mode, and runs the regex-only validation when the REGEX
    language is present.

    :param raw: the YAML tree of a single rule
    :raises InvalidRuleSchemaError: if ``paths`` is present but is not a mapping
    """
    self._yaml = raw
    self._raw: Dict[str, Any] = raw.unroll_dict()
    self._id = str(self._raw["id"])

    path_dict = {}
    paths_tree: Optional[YamlTree] = self._yaml.value.get("paths")
    if paths_tree is not None:
        paths = paths_tree.value
        paths_span = paths_tree.span
        if not isinstance(paths, YamlMap):
            path_key = self._yaml.value.key_tree("paths").span
            # A list here usually means a stray `-` in front of include/exclude.
            help_str: Optional[str] = (
                "remove the `-` to convert the list into a mapping"
                if isinstance(paths, list)
                else None
            )
            raise InvalidRuleSchemaError(
                short_msg="invalid paths",
                long_msg=f"the `paths:` targeting rules must be an object with at least one of {ALLOWED_GLOB_TYPES}",
                spans=[path_key.extend_to(paths_span)],
                help=help_str,
            )
        path_dict = paths_tree.unroll_dict()

    self._includes = path_dict.get("include", [])
    self._excludes = path_dict.get("exclude", [])

    rule_languages = {
        Language_util.resolve(lang_name, self.languages_span)
        for lang_name in self._raw.get("languages", [])
    }

    # add typescript to languages if the rule supports javascript.
    if any(language == Language.JAVASCRIPT for language in rule_languages):
        rule_languages.add(Language.TYPESCRIPT)
        # The raw dict is rewritten so it lists the added language as well.
        self._raw["languages"] = [resolved.value for resolved in rule_languages]

    self._languages = sorted(rule_languages, key=lambda lang: lang.value)  # type: ignore

    # check taint/search mode
    self._mode = JOIN_MODE if self._raw.get("mode") == JOIN_MODE else SEARCH_MODE

    if any(language == Language.REGEX for language in self._languages):
        self._validate_none_language_rule()
def _taint_or_search_patterns_validation(
    self, rule: YamlTree[YamlMap]
) -> Tuple[BooleanRuleExpression, Mode]:
    """
    Build the rule's expression with the builder that matches its mode.

    The mode is read from the rule's `mode` key, falling back to
    DEFAULT_MODE when the key is absent or falsy.

    :return: ``(expression, mode)``
    :raises InvalidRuleSchemaError: if the mode is neither TAINT_MODE nor
        SEARCH_MODE, or if the selected builder rejects the rule
    """
    rule_raw = rule.value

    if rule_raw.get("mode"):
        mode = Mode(str(rule_raw["mode"].unroll()))
    else:
        mode = DEFAULT_MODE

    if mode == TAINT_MODE:
        # Raises InvalidRuleSchemaError if fails to parse in taint mode
        return self._build_taint_expression(rule), mode

    if mode == SEARCH_MODE:
        # Raises InvalidRuleSchemaError if fails to parse in search mode
        return self._build_boolean_expression(rule), mode

    raise InvalidRuleSchemaError(
        short_msg="invalid mode",
        long_msg=f"The only supported modes are {SUPPORTED_MODES}",
        spans=[rule_raw["mode"].span],
    )
def validate_yaml(data: YamlTree) -> None:
    """
    Validate the unrolled ``data`` against the rule JSON schema using the
    Draft 7 validator.

    On a schema violation, the chain of ``parent`` errors is followed to its
    top, and that top-level error's absolute path is walked through ``data``
    to find the span reported in the raised error.

    :param data: the parsed YAML tree to validate
    :raises InvalidRuleSchemaError: if ``jsonschema`` rejects the document
    """
    from semgrep.error import InvalidRuleSchemaError

    try:
        jsonschema.validate(data.unroll(), RuleSchema.get(), cls=Draft7Validator)
    except jsonschema.ValidationError as ve:
        message = _validation_error_message(ve)

        # Climb to the outermost error in the parent chain.
        top_error = ve
        while top_error.parent is not None:
            top_error = top_error.parent

        # Follow its path from the document root down to the offending node.
        located = data
        for step in top_error.absolute_path:
            located = located.value[step]

        raise InvalidRuleSchemaError(
            short_msg="Invalid rule schema",
            long_msg=message,
            spans=[located.span],
        )
def _validate(  # type: ignore[misc]
    config_dict: Dict[str, YamlTree]
) -> Tuple[Dict[str, List[Rule]], List[SemgrepError]]:
    """
    Take configs and separate into valid and list of errors parsing the invalid ones

    A config that is not a mapping, or that lacks the RULES_KEY entry,
    contributes one error and no rules. Otherwise each rule is validated on
    its own: failures are collected as errors, successes are grouped under
    the config id. A config with no valid rule is left out of the result.

    :return: ``(valid rules by config id, collected errors)``
    """
    errors: List[SemgrepError] = []
    valid: Dict[str, Any] = {}

    for config_id, config_tree in config_dict.items():
        config = config_tree.value
        if not isinstance(config, YamlMap):
            errors.append(SemgrepError(f"{config_id} was not a mapping"))
            continue

        rules = config.get(RULES_KEY)
        if rules is None:
            missing_rules_error = InvalidRuleSchemaError(
                short_msg="missing keys",
                long_msg=f"{config_id} is missing `{RULES_KEY}` as top-level key",
                spans=[config_tree.span.truncate(lines=5)],
            )
            errors.append(missing_rules_error)
            continue

        accepted = []
        for rule_tree in rules.value:
            try:
                rule = validate_single_rule(config_id, rule_tree)
            except InvalidRuleSchemaError as ex:
                # One bad rule does not stop the others from being checked.
                errors.append(ex)
                continue
            accepted.append(rule)

        if accepted:
            valid[config_id] = accepted

    return valid, errors
def _build_boolean_expression(
    self, rule: YamlTree[YamlMap]
) -> BooleanRuleExpression:
    """
    Build a boolean expression from the yml lines in the rule

    The rule's keys are tried in a fixed order: the key names for
    OPERATORS.AND, then OPERATORS.REGEX, then OPERATORS.AND_ALL, then
    OPERATORS.AND_EITHER. The first key with a truthy value decides the
    result. For AND and REGEX the value's span is recorded under the rule id
    and the value must be a string; for AND_ALL and AND_EITHER the value is
    parsed into child expressions.

    :raises InvalidRuleSchemaError: if the id is not a string, if an operand
        or child list is invalid, or if none of the keys is present
    """
    rule_raw = rule.value

    raw_id = rule_raw["id"].unroll()
    if not isinstance(raw_id, str):
        raise InvalidRuleSchemaError(
            short_msg="invalid id",
            long_msg=f"rule id must be a string, but was {type(raw_id).__name__}",
            spans=[rule_raw["id"].span],
        )
    rule_id = PatternId(raw_id)

    # Operators that take a single string operand.
    for leaf_operator in (OPERATORS.AND, OPERATORS.REGEX):
        for pattern_name in pattern_names_for_operator(leaf_operator):
            pattern = rule_raw.get(pattern_name)
            if not pattern:
                continue
            # Record the span before validating the operand.
            self._pattern_spans[rule_id] = pattern.span
            operand = self._validate_operand(pattern)
            return BooleanRuleExpression(leaf_operator, rule_id, None, operand)

    # Operators that take a list of sub-patterns.
    for compound_operator in (OPERATORS.AND_ALL, OPERATORS.AND_EITHER):
        for pattern_name in pattern_names_for_operator(compound_operator):
            patterns = rule_raw.get(pattern_name)
            if not patterns:
                continue
            children = list(self._parse_boolean_expression(patterns))
            return BooleanRuleExpression(
                operator=compound_operator,
                pattern_id=None,
                children=children,
                operand=None,
            )

    required_operator = [
        OPERATORS.AND_ALL,
        OPERATORS.AND_EITHER,
        OPERATORS.REGEX,
        OPERATORS.AND,
    ]
    raise InvalidRuleSchemaError(
        short_msg="missing key",
        long_msg=f"missing a pattern type in rule, expected one of {pattern_names_for_operators(required_operator)}",
        spans=[rule.span.truncate(10)],
    )
def _parse_boolean_expression(
    self,
    rule_patterns: YamlTree[List[YamlTree]],
    pattern_id_idx: int = 0,
    prefix: str = "",
) -> Iterator[BooleanRuleExpression]:
    """
    Move through the expression from the YML, yielding one
    BooleanRuleExpression per operator entry.

    Operators in OPERATORS_WITH_CHILDREN are parsed recursively into an
    expression with ``children``. Every other operator must have a string
    operand; it gets a pattern id of the form ``{prefix}.{index}``, its span
    is stored in ``self._pattern_spans``, and the index then advances.

    This is a generator, so validation errors surface during iteration.

    :param rule_patterns: tree node whose value must be a list of mappings
    :param pattern_id_idx: first index to use for pattern ids at this level
    :param prefix: id prefix for patterns at this level
    :raises InvalidRuleSchemaError: on a non-list value, a non-mapping entry,
        a child-bearing operator without a list, or a non-string operand
    """
    if not isinstance(rule_patterns.value, list):
        raise InvalidRuleSchemaError(
            short_msg="invalid patterns",
            long_msg=f"invalid type for patterns; expected a list, but found {type(rule_patterns.unroll()).__name__}",
            spans=[rule_patterns.span.with_context(before=1).truncate(5)],
            help=f"perhaps your YAML is missing a `-` on line {rule_patterns.span.start.line}?",
        )

    for rule_index, pattern_tree in enumerate(rule_patterns.value):
        pattern = pattern_tree.value
        if not isinstance(pattern, YamlMap):
            raise InvalidRuleSchemaError(
                short_msg="invalid pattern",
                long_msg=f"invalid type for pattern expected dict but found {type(pattern).__name__}",
                spans=[pattern_tree.span],
                help=f"Did you mean `pattern: {pattern}`?",
            )

        for operator_key, sub_pattern in pattern.items():
            boolean_operator: str = operator_key.value
            operator = operator_for_pattern_name(operator_key)

            if operator in set(OPERATORS_WITH_CHILDREN):
                if not isinstance(sub_pattern.value, list):
                    raise InvalidRuleSchemaError(
                        short_msg="missing children",
                        long_msg=f"operator {boolean_operator} must have children",
                        spans=[operator_key.span.extend_to(sub_pattern.span)],
                    )
                # Nested levels restart their index at 0 under a longer prefix.
                nested = self._parse_boolean_expression(
                    sub_pattern, 0, f"{prefix}.{rule_index}.{pattern_id_idx}"
                )
                yield BooleanRuleExpression(
                    operator=operator,
                    pattern_id=None,
                    children=list(nested),
                    operand=None,
                )
                continue

            pattern_text = sub_pattern.value
            pattern_span = sub_pattern.span
            if not isinstance(pattern_text, str):
                raise InvalidRuleSchemaError(
                    short_msg="invalid operand",
                    long_msg=f"operand for {boolean_operator} must be a string, but instead was {type(sub_pattern.unroll()).__name__}",
                    spans=[operator_key.span.extend_to(pattern_span).truncate(5)],
                )

            pattern_id = PatternId(f"{prefix}.{pattern_id_idx}")
            self._pattern_spans[pattern_id] = pattern_span
            yield BooleanRuleExpression(
                operator=operator,
                pattern_id=pattern_id,
                children=None,
                operand=pattern_text,
            )
            pattern_id_idx += 1