Ejemplo n.º 1
0
        def construct_object(self, node: Node, deep: bool = False) -> YamlTree:
            """
            Construct the value for ``node`` and wrap it, with its span, in a YamlTree.

            The value itself is built by the parent class's ``construct_object``.
            Two inputs are rejected with InvalidRuleSchemaError: a mapping node
            whose key names are not all distinct, and any node whose constructed
            value is None. A constructed dict is wrapped in YamlMap before it is
            returned.

            ``source_hash`` and ``filename`` are not parameters; they are read
            from the enclosing scope.
            """
            r = super().construct_object(node, deep)

            # Built once and shared by both error paths and the normal return.
            span = Span.from_node(node,
                                  source_hash=source_hash,
                                  filename=filename)

            # Check for duplicate mapping keys.
            # This -should- be caught and raised by ruamel.yaml.
            # However, resetting the constructor below, where the line
            # reads yaml.Constructor = SpanPreservingRuamelConstructor,
            # causes ruamel's DuplicateKeyError not to be raised.
            # This is a quick implementation that will check MappingNodes
            if isinstance(node, MappingNode):
                from semgrep.error import InvalidRuleSchemaError

                kv_pairs: List[Tuple[Node, Node]] = list(node.value)
                uniq_key_names: Set[str] = {pair[0].value for pair in kv_pairs}
                # If the number of unique key names is less than the number
                # of key-value nodes, then there's a duplicate key
                if len(uniq_key_names) < len(kv_pairs):
                    raise InvalidRuleSchemaError(
                        short_msg="Detected duplicate key",
                        long_msg=
                        f"Detected duplicate key name, one of {sorted(uniq_key_names)}.",
                        spans=[span.with_context(before=1, after=1)],
                    )

            if r is None:
                from semgrep.error import InvalidRuleSchemaError

                raise InvalidRuleSchemaError(
                    short_msg="null values prohibited",
                    long_msg=
                    "In semgrep YAML configuration, null values are prohibited",
                    spans=[span.with_context(before=1, after=1)],
                )

            if isinstance(r, dict):
                r = YamlMap(r)
            return YamlTree(r, span)
Ejemplo n.º 2
0
    def __init__(self, raw: YamlTree[YamlMap]) -> None:
        """
        Initialise the rule from its YAML tree.

        Stores the tree and its unrolled dict, reads the optional ``paths``
        entry into the include/exclude lists, converts each entry of
        ``languages`` to a Language, and builds the expression and mode.

        Raises InvalidRuleSchemaError when ``paths`` is present but is not a
        mapping.
        """
        self._yaml = raw
        self._raw: Dict[str, Any] = raw.unroll_dict()

        # For tracking errors from semgrep-core
        self._pattern_spans: Dict[PatternId, Span] = {}

        path_dict = {}
        paths_tree: Optional[YamlTree] = self._yaml.value.get("paths")
        if paths_tree is not None:
            paths = paths_tree.value
            paths_span = paths_tree.span
            if not isinstance(paths, YamlMap):
                path_key = self._yaml.value.key_tree("paths").span
                # Only a list gets a hint: it is the shape produced by a stray `-`.
                help_str: Optional[str] = (
                    "remove the `-` to convert the list into a mapping"
                    if isinstance(paths, list) else None)
                raise InvalidRuleSchemaError(
                    short_msg="invalid paths",
                    long_msg=f"the `paths:` targeting rules must be an object with at least one of {ALLOWED_GLOB_TYPES}",
                    spans=[path_key.extend_to(paths_span)],
                    help=help_str,
                )
            path_dict = paths_tree.unroll_dict()

        self._includes = path_dict.get("include", [])
        self._excludes = path_dict.get("exclude", [])
        self._languages = [
            Language(language_name) for language_name in self._raw["languages"]
        ]

        # check taint/search mode
        expression, mode = self._build_search_patterns_for_mode(self._yaml)
        self._expression = expression
        self._mode = mode
Ejemplo n.º 3
0
 def _validate(expression: BooleanRuleExpression, span_key: str = "") -> None:
     """
     Check ``expression`` and then, recursively, each of its children.

     Raises InvalidRuleSchemaError at the first expression whose operator is
     not REGEX, AND_EITHER, AND_ALL or NOT_REGEX. The reported span is that
     of the rule's ``span_key`` entry when it exists, otherwise the span of
     the whole rule.
     """
     allowed_operators = {
         OPERATORS.REGEX,
         OPERATORS.AND_EITHER,
         OPERATORS.AND_ALL,
         OPERATORS.NOT_REGEX,
     }
     if expression.operator in allowed_operators:
         # This node is fine; descend into its children, if it has any.
         for child in expression.children or ():
             _validate(child, span_key)
         return

     pattern_names = OPERATOR_PATTERN_NAMES_MAP.get(expression.operator, [""])
     operator_key = pattern_names[0]
     doc = self._yaml.value.get(span_key)
     span = doc.span if doc else self._yaml.span
     raise InvalidRuleSchemaError(
         short_msg="invalid pattern clause",
         long_msg=f"invalid pattern clause '{operator_key}' with regex-only rules",
         spans=[span],
         help="use only patterns, pattern-either, pattern-regex, or pattern-not-regex with regex-only rules",
     )
Ejemplo n.º 4
0
        def construct_object(self, node: Node, deep: bool = False) -> YamlTree:
            """
            Construct the value for ``node`` and wrap it, with its span, in a YamlTree.

            The value itself is built by the parent class's ``construct_object``.
            A node whose constructed value is None is rejected with
            InvalidRuleSchemaError. A constructed dict is wrapped in YamlMap
            before it is returned.

            ``source_hash`` and ``filename`` are not parameters; they are read
            from the enclosing scope.
            """
            r = super().construct_object(node, deep)

            # Built once and shared by the error path and the normal return.
            span = Span.from_node(node,
                                  source_hash=source_hash,
                                  filename=filename)

            if r is None:
                from semgrep.error import InvalidRuleSchemaError

                raise InvalidRuleSchemaError(
                    short_msg="null values prohibited",
                    long_msg=
                    "In semgrep YAML configuration, null values are prohibited",
                    spans=[span.with_context(before=1, after=1)],
                )

            if isinstance(r, dict):
                r = YamlMap(r)
            return YamlTree(r, span)
Ejemplo n.º 5
0
def validate_single_rule(
    config_id: str,
    rule_yaml: YamlTree[YamlMap],
) -> Optional[Rule]:
    """
        Check that a rule mapping has every required key, then parse it.

        Returns the Rule built by Rule.from_yamltree. Raises
        InvalidRuleSchemaError when a required key is missing; if the rule
        also has keys outside the valid set, they are reported as possible
        typos and their spans are attached to the error.
    """
    rule: YamlMap = rule_yaml.value
    rule_keys = {key.value for key in rule.keys()}

    if rule_keys.issuperset(YAML_MUST_HAVE_KEYS):
        # Raises InvalidRuleSchemaError if fails to parse
        return Rule.from_yamltree(rule_yaml)

    missing_keys = YAML_MUST_HAVE_KEYS - rule_keys
    extra_keys: Set[str] = rule_keys - YAML_ALL_VALID_RULE_KEYS
    extra_key_spans = sorted(rule.key_tree(name) for name in extra_keys)

    help_msg = (
        f"Unexpected keys {extra_keys} found. Is one of these a typo of {missing_keys}?"
        if extra_keys else None)

    # The head of the rule comes first, followed by each unexpected key.
    error_spans = [rule_yaml.span.truncate(lines=5)]
    error_spans.extend(key_tree.span for key_tree in extra_key_spans)
    raise InvalidRuleSchemaError(
        short_msg="missing keys",
        long_msg=f"{config_id} is missing required keys {missing_keys}",
        spans=error_spans,
        help=help_msg,
    )
Ejemplo n.º 6
0
    def _validate_none_language_rule(self) -> None:
        """
        For regex-only rules, only patterns, pattern-either, and pattern-regex is valid.
        """
        def _recursive_contains(obj: Union[Dict[str, Any], List[Any], str],
                                search_key: str) -> bool:
            """
            Returns true if object contains any object that contains search_key as key
            """
            if isinstance(obj, dict):
                for key in obj:
                    if key == search_key:
                        return True

                    if _recursive_contains(obj[key], search_key):
                        return True

            if isinstance(obj, list):
                for elem in obj:
                    if _recursive_contains(elem, search_key):
                        return True

            return False

        if _recursive_contains(self._raw, "pattern"):
            raise InvalidRuleSchemaError(
                short_msg=f"invalid pattern clause",
                long_msg=
                f"invalid pattern clause 'pattern' with regex-only rules in rule: {self.id}",
                spans=[],
                help=
                f"use only patterns, pattern-either, pattern-regex, or pattern-not-regex with regex-only rules",
            )
Ejemplo n.º 7
0
 def _validate_operand(operand: YamlTree) -> str:  # type: ignore
     """
     Return the operand's value, which must be a string.

     Raises InvalidRuleSchemaError, naming the type of the unrolled operand,
     when the value is not a string.
     """
     if isinstance(operand.value, str):
         return operand.value

     found_type = type(operand.unroll()).__name__
     raise InvalidRuleSchemaError(
         short_msg="invalid operand",
         long_msg=
         f"type of `pattern` must be a string, but it was a {found_type}",
         spans=[operand.span.with_context(before=1, after=1)],
     )
Ejemplo n.º 8
0
def validate_single_rule(config_id: str,
                         rule_yaml: YamlTree[YamlMap]) -> Optional[Rule]:
    """
        Check a rule mapping's keys against the required and valid sets,
        then parse it.

        Returns the Rule built by Rule.from_yamltree. Raises
        InvalidRuleSchemaError when required keys are missing, when keys
        outside the valid set are present, or both.
    """
    rule: YamlMap = rule_yaml.value

    rule_keys = {key.value for key in rule.keys()}
    extra_keys = rule_keys - YAML_ALL_VALID_RULE_KEYS
    extra_key_spans = sorted(rule.key_tree(name) for name in extra_keys)
    missing_keys = YAML_MUST_HAVE_KEYS - rule_keys

    if not missing_keys and not extra_keys:
        # Defaults to search mode if mode is not specified
        return Rule.from_yamltree(rule_yaml)

    if missing_keys:
        # With and without extra keys, only the short message and the help
        # text differ; the long message and the spans are built the same way.
        if extra_keys:
            short_msg = "incorrect keys"
            help_msg = f"Unexpected keys {extra_keys} found. Is one of these a typo of {missing_keys}?"
        else:
            short_msg = "missing keys"
            help_msg = f"Add {missing_keys} to your config file."
        raise InvalidRuleSchemaError(
            short_msg=short_msg,
            long_msg=f"{config_id} is missing required keys {missing_keys}",
            spans=[rule_yaml.span.truncate(lines=5)] +
            [key_tree.span for key_tree in extra_key_spans],
            help=help_msg,
        )

    unused_valid_keys = sorted(YAML_ALL_VALID_RULE_KEYS - rule_keys)
    help_msg = f"Unexpected keys {extra_keys} found. Were you looking for any of these unused, valid keys?\n {unused_valid_keys}"
    raise InvalidRuleSchemaError(
        short_msg="invalid keys",
        long_msg=
        f"{config_id} has extra, un-interpretable keys: {extra_keys}",
        spans=[key_tree.span for key_tree in extra_key_spans],
        help=help_msg,
    )
Ejemplo n.º 9
0
    def __init__(self, raw: YamlTree[YamlMap]) -> None:
        """
        Initialise the rule from its YAML tree.

        Stores the tree and its unrolled dict, validates the optional
        ``paths`` entry and reads it into the include/exclude lists, converts
        each entry of ``languages`` to a Language, and builds the boolean
        expression.

        Raises InvalidRuleSchemaError when ``paths`` is not a mapping, when
        one of its keys is not in ALLOWED_GLOB_TYPES, or when one of its
        values is not a list.
        """
        self._yaml = raw
        self._raw: Dict[str, Any] = raw.unroll_dict()

        # For tracking errors from semgrep-core
        self._pattern_spans: Dict[PatternId, Span] = {}

        path_dict = {}
        paths_tree: Optional[YamlTree] = self._yaml.value.get("paths")
        if paths_tree is not None:
            paths = paths_tree.value
            paths_span = paths_tree.span
            if not isinstance(paths, YamlMap):
                path_key = self._yaml.value.key_tree("paths").span
                # Only a list gets a hint: it is the shape produced by a stray `-`.
                help_str: Optional[str] = (
                    "remove the `-` to convert the list into a mapping"
                    if isinstance(paths, list) else None)
                raise InvalidRuleSchemaError(
                    short_msg="invalid paths",
                    long_msg=
                    f"the `paths:` targeting rules must be an object with at least one of {ALLOWED_GLOB_TYPES}",
                    spans=[path_key.extend_to(paths_span)],
                    help=help_str,
                )

            # Each entry is checked in turn: first its key, then its value.
            for glob_key, glob_value in paths.items():
                if glob_key.value not in ALLOWED_GLOB_TYPES:
                    raise InvalidRuleSchemaError(
                        short_msg="invalid targeting rules",
                        long_msg=
                        f"the `paths:` targeting rules must each be one of {ALLOWED_GLOB_TYPES}",
                        spans=[glob_key.span.with_context(before=1, after=1)],
                    )
                if not isinstance(glob_value.value, list):
                    raise InvalidRuleSchemaError(
                        short_msg="invalid target value",
                        long_msg=
                        "the `paths:` targeting rule values must be lists",
                        spans=[glob_value.span],
                    )
            path_dict = paths_tree.unroll_dict()

        self._includes = path_dict.get("include", [])
        self._excludes = path_dict.get("exclude", [])
        self._languages = [
            Language(language_name) for language_name in self._raw["languages"]
        ]
        self._expression = self._build_boolean_expression(self._yaml)
Ejemplo n.º 10
0
 def _validate_list_operand(field: str,
                            operand: YamlTree) -> list:  # type: ignore
     """
     Return the operand's value, which must be a list.

     ``field`` is used only in the error message. Raises
     InvalidRuleSchemaError, naming the type of the unrolled operand, when
     the value is not a list.
     """
     if isinstance(operand.value, list):
         return operand.value

     found_type = type(operand.unroll()).__name__
     raise InvalidRuleSchemaError(
         short_msg="invalid operand",
         long_msg=
         f"type of {field} must be a list, but it was a {found_type}",
         spans=[operand.span.with_context(before=1, after=1)],
     )
Ejemplo n.º 11
0
 def _build_taint_expression(
         self, rule: YamlTree[YamlMap]) -> BooleanRuleExpression:
     """
     Validate a taint-mode rule and return its expression.

     Raises InvalidRuleSchemaError when the id is not a string, when the rule
     has a truthy ``metadata`` entry, or when one of the keys in
     YAML_TAINT_MUST_HAVE_KEYS is missing or empty. Each required entry must
     also pass ``_validate_list_operand``. The returned expression carries
     the AND operator, the rule id, and None for its last two arguments.
     """
     rule_raw = rule.value

     id_tree = rule_raw["id"]
     _rule_id = id_tree.unroll()
     if not isinstance(_rule_id, str):
         raise InvalidRuleSchemaError(
             short_msg="invalid id",
             long_msg=
             f"rule id must be a string, but was {type(_rule_id).__name__}",
             spans=[id_tree.span],
         )

     if rule_raw.get("metadata"):
         raise InvalidRuleSchemaError(
             short_msg="invalid key",
             long_msg=f"metadata is not supported in {TAINT_MODE} mode",
             spans=[rule_raw.key_tree("metadata").span],
         )

     rule_id = PatternId(_rule_id)
     for required_key in YAML_TAINT_MUST_HAVE_KEYS:
         pattern = rule_raw.get(required_key)
         if not pattern:
             raise InvalidRuleSchemaError(
                 short_msg=f"missing {required_key} key",
                 long_msg=
                 f"In {TAINT_MODE} mode, 'pattern-sources' and 'pattern-sinks' are both required",
                 spans=[rule.span.truncate(10)],
             )
         self._validate_list_operand(required_key, pattern)
         # Every required key writes to the same slot, so the span recorded
         # for the rule id is that of the last key processed.
         self._pattern_spans[rule_id] = pattern.span

     return BooleanRuleExpression(OPERATORS.AND, rule_id, None, None)
Ejemplo n.º 12
0
def validate_yaml(data: YamlTree) -> None:
    """
    Validate the unrolled ``data`` against the schema from RuleSchema.get().

    On a jsonschema.ValidationError, walks ``data`` along the relative path
    of the error's parent (or of the error itself when it has no parent) and
    raises InvalidRuleSchemaError with the span of the item reached.
    """
    from semgrep.error import InvalidRuleSchemaError

    try:
        jsonschema.validate(data.unroll(), RuleSchema.get())
    except jsonschema.ValidationError as ve:
        message = _validation_error_message(ve)

        # Walk from the document root down to the offending item.
        located_error = ve.parent or ve
        offending_item = data
        for path_element in located_error.relative_path:
            offending_item = offending_item.value[path_element]

        raise InvalidRuleSchemaError(
            short_msg="Invalid rule schema",
            long_msg=message,
            spans=[offending_item.span],
        )
Ejemplo n.º 13
0
    def __init__(self, raw: YamlTree[YamlMap]) -> None:
        """
        Initialise the rule from its YAML tree.

        Stores the tree and its unrolled dict, records the id as a string,
        reads the optional ``paths`` entry into the include/exclude lists,
        resolves the languages, and sets the mode. When JavaScript is among
        the languages, TypeScript is added and the raw ``languages`` list is
        rewritten in the same sorted order as ``self._languages``. When REGEX
        is among the languages, ``_validate_none_language_rule`` is run.

        Raises InvalidRuleSchemaError when ``paths`` is present but is not a
        mapping.
        """
        self._yaml = raw
        self._raw: Dict[str, Any] = raw.unroll_dict()

        self._id = str(self._raw["id"])

        paths_tree: Optional[YamlTree] = self._yaml.value.get("paths")
        if paths_tree is None:
            path_dict = {}
        else:
            paths, paths_span = paths_tree.value, paths_tree.span
            if not isinstance(paths, YamlMap):
                path_key = self._yaml.value.key_tree("paths").span
                help_str: Optional[str] = None
                if isinstance(paths, list):
                    help_str = "remove the `-` to convert the list into a mapping"
                raise InvalidRuleSchemaError(
                    short_msg="invalid paths",
                    long_msg=
                    f"the `paths:` targeting rules must be an object with at least one of {ALLOWED_GLOB_TYPES}",
                    spans=[path_key.extend_to(paths_span)],
                    help=help_str,
                )
            path_dict = paths_tree.unroll_dict()
        self._includes = path_dict.get("include", [])
        self._excludes = path_dict.get("exclude", [])
        rule_languages = {
            Language_util.resolve(language_name, self.languages_span)
            for language_name in self._raw.get("languages", [])
        }

        # add typescript to languages if the rule supports javascript.
        supports_javascript = any(language == Language.JAVASCRIPT
                                  for language in rule_languages)
        if supports_javascript:
            rule_languages.add(Language.TYPESCRIPT)

        self._languages = sorted(rule_languages,
                                 key=lambda lang: lang.value)  # type: ignore

        if supports_javascript:
            # Build the raw list from the sorted languages, not from the set:
            # set iteration order is not stable, so the raw rule could
            # otherwise differ from one run to the next.
            self._raw["languages"] = [lang.value for lang in self._languages]

        # check taint/search mode
        if self._raw.get("mode") == JOIN_MODE:
            self._mode = JOIN_MODE
        else:
            self._mode = SEARCH_MODE

        if any(language == Language.REGEX for language in self._languages):
            self._validate_none_language_rule()
Ejemplo n.º 14
0
    def _taint_or_search_patterns_validation(
            self,
            rule: YamlTree[YamlMap]) -> Tuple[BooleanRuleExpression, Mode]:
        """
        Build the rule's expression according to its mode.

        The mode is read from the rule's ``mode`` entry, or is DEFAULT_MODE
        when that entry is absent or falsy. Returns the expression together
        with the mode. Raises InvalidRuleSchemaError when the mode is neither
        TAINT_MODE nor SEARCH_MODE.
        """
        rule_raw = rule.value
        if rule_raw.get("mode"):
            mode = Mode(str(rule_raw["mode"].unroll()))
        else:
            mode = DEFAULT_MODE

        if mode == TAINT_MODE:
            # Raises InvalidRuleSchemaError if fails to parse in taint mode
            return self._build_taint_expression(rule), mode

        if mode == SEARCH_MODE:
            # Raises InvalidRuleSchemaError if fails to parse in search mode
            return self._build_boolean_expression(rule), mode

        raise InvalidRuleSchemaError(
            short_msg="invalid mode",
            long_msg=f"The only supported modes are {SUPPORTED_MODES}",
            spans=[rule_raw["mode"].span],
        )
Ejemplo n.º 15
0
def validate_yaml(data: YamlTree) -> None:
    """
    Validate the unrolled ``data`` against the schema from RuleSchema.get(),
    using Draft7Validator.

    On a jsonschema.ValidationError, follows the error's ``parent`` links to
    the topmost error, walks ``data`` along that error's absolute path, and
    raises InvalidRuleSchemaError with the span of the item reached.
    """
    from semgrep.error import InvalidRuleSchemaError

    try:
        jsonschema.validate(data.unroll(),
                            RuleSchema.get(),
                            cls=Draft7Validator)
    except jsonschema.ValidationError as ve:
        message = _validation_error_message(ve)

        # Climb to the error that has no parent.
        topmost_error = ve
        while topmost_error.parent is not None:
            topmost_error = topmost_error.parent

        # Walk from the document root down to the offending item.
        offending_item = data
        for path_element in topmost_error.absolute_path:
            offending_item = offending_item.value[path_element]

        raise InvalidRuleSchemaError(
            short_msg="Invalid rule schema",
            long_msg=message,
            spans=[offending_item.span],
        )
Ejemplo n.º 16
0
    def _validate(  # type: ignore[misc]
        config_dict: Dict[str, YamlTree]
    ) -> Tuple[Dict[str, List[Rule]], List[SemgrepError]]:
        """
        Split configs into their valid rules and the errors found while parsing.

        Returns a pair: a dict from config id to its list of valid rules
        (configs with no valid rule are left out), and the list of errors
        collected. A config that is not a mapping, or that lacks the
        RULES_KEY entry, contributes one error and no rules.
        """
        errors: List[SemgrepError] = []
        valid: Dict[str, Any] = {}

        for config_id, config_tree in config_dict.items():
            config = config_tree.value
            if not isinstance(config, YamlMap):
                errors.append(SemgrepError(f"{config_id} was not a mapping"))
                continue

            rules = config.get(RULES_KEY)
            if rules is None:
                missing_rules_error = InvalidRuleSchemaError(
                    short_msg="missing keys",
                    long_msg=
                    f"{config_id} is missing `{RULES_KEY}` as top-level key",
                    spans=[config_tree.span.truncate(lines=5)],
                )
                errors.append(missing_rules_error)
                continue

            # An invalid rule is recorded as an error; it does not stop
            # validation of the remaining rules in the same config.
            parsed_rules = []
            for rule_tree in rules.value:
                try:
                    parsed_rules.append(
                        validate_single_rule(config_id, rule_tree))
                except InvalidRuleSchemaError as ex:
                    errors.append(ex)

            if parsed_rules:
                valid[config_id] = parsed_rules

        return valid, errors
Ejemplo n.º 17
0
    def _build_boolean_expression(
            self, rule: YamlTree[YamlMap]) -> BooleanRuleExpression:
        """
        Build the rule's top-level boolean expression.

        The rule's keys are tried in a fixed order: the pattern names of
        AND, then REGEX, then AND_ALL, then AND_EITHER. The first truthy
        entry found decides the expression returned. For AND and REGEX the
        entry's span is recorded under the rule id and the operand is checked
        with ``_validate_operand``. For AND_ALL and AND_EITHER the children
        come from ``_parse_boolean_expression``.

        Raises InvalidRuleSchemaError when the id is not a string or when
        none of those keys has a truthy entry.
        """
        rule_raw = rule.value

        id_tree = rule_raw["id"]
        _rule_id = id_tree.unroll()
        if not isinstance(_rule_id, str):
            raise InvalidRuleSchemaError(
                short_msg="invalid id",
                long_msg=
                f"rule id must be a string, but was {type(_rule_id).__name__}",
                spans=[id_tree.span],
            )
        rule_id = PatternId(_rule_id)

        # Operators with a single operand and no children.
        for leaf_operator in (OPERATORS.AND, OPERATORS.REGEX):
            for pattern_name in pattern_names_for_operator(leaf_operator):
                pattern = rule_raw.get(pattern_name)
                if not pattern:
                    continue
                self._pattern_spans[rule_id] = pattern.span
                return BooleanRuleExpression(
                    leaf_operator,
                    rule_id,
                    None,
                    self._validate_operand(pattern),
                )

        # Operators whose entry is a list of sub-patterns.
        for parent_operator in (OPERATORS.AND_ALL, OPERATORS.AND_EITHER):
            for pattern_name in pattern_names_for_operator(parent_operator):
                patterns = rule_raw.get(pattern_name)
                if not patterns:
                    continue
                return BooleanRuleExpression(
                    operator=parent_operator,
                    pattern_id=None,
                    children=list(self._parse_boolean_expression(patterns)),
                    operand=None,
                )

        required_operator = [
            OPERATORS.AND_ALL,
            OPERATORS.AND_EITHER,
            OPERATORS.REGEX,
            OPERATORS.AND,
        ]
        expected_names = pattern_names_for_operators(required_operator)
        raise InvalidRuleSchemaError(
            short_msg="missing key",
            long_msg=
            f"missing a pattern type in rule, expected one of {expected_names}",
            spans=[rule.span.truncate(10)],
        )
Ejemplo n.º 18
0
 def _parse_boolean_expression(
     self,
     rule_patterns: YamlTree[List[YamlTree]],
     pattern_id_idx: int = 0,
     prefix: str = "",
 ) -> Iterator[BooleanRuleExpression]:
     """
     Walk a YAML list of patterns, yielding one BooleanRuleExpression per entry key.

     An operator in OPERATORS_WITH_CHILDREN yields an expression whose
     children are parsed recursively from its list value. Any other operator
     yields an expression with a string operand and a fresh pattern id
     ``{prefix}.{pattern_id_idx}``; the operand's span is recorded under that
     id and the index is then incremented.

     Raises InvalidRuleSchemaError when ``rule_patterns`` is not a list, when
     an entry is not a mapping, when an operator with children has a non-list
     value, or when another operator has a non-string value. This is a
     generator, so nothing is checked until it is iterated.
     """
     if not isinstance(rule_patterns.value, list):
         raise InvalidRuleSchemaError(
             short_msg="invalid patterns",
             long_msg=
             f"invalid type for patterns; expected a list, but found {type(rule_patterns.unroll()).__name__}",
             spans=[rule_patterns.span.with_context(before=1).truncate(5)],
             help=
             f"perhaps your YAML is missing a `-` on line {rule_patterns.span.start.line}?",
         )

     for rule_index, pattern_tree in enumerate(rule_patterns.value):
         pattern = pattern_tree.value
         if not isinstance(pattern, YamlMap):
             raise InvalidRuleSchemaError(
                 short_msg="invalid pattern",
                 long_msg=
                 f"invalid type for pattern expected dict but found {type(pattern).__name__}",
                 spans=[pattern_tree.span],
                 help=f"Did you mean `pattern: {pattern}`?",
             )

         for operator_tree, sub_pattern in pattern.items():
             boolean_operator: str = operator_tree.value
             operator = operator_for_pattern_name(operator_tree)

             if operator in set(OPERATORS_WITH_CHILDREN):
                 if not isinstance(sub_pattern.value, list):
                     raise InvalidRuleSchemaError(
                         short_msg="missing children",
                         long_msg=
                         f"operator {boolean_operator} must have children",
                         spans=[
                             operator_tree.span.extend_to(sub_pattern.span)
                         ],
                     )
                 # The nested list restarts its pattern index at 0, under a
                 # longer prefix.
                 nested_prefix = f"{prefix}.{rule_index}.{pattern_id_idx}"
                 sub_expression = self._parse_boolean_expression(
                     sub_pattern, 0, nested_prefix)
                 yield BooleanRuleExpression(
                     operator=operator,
                     pattern_id=None,
                     children=list(sub_expression),
                     operand=None,
                 )
                 continue

             pattern_text, pattern_span = sub_pattern.value, sub_pattern.span
             if not isinstance(pattern_text, str):
                 raise InvalidRuleSchemaError(
                     short_msg="invalid operand",
                     long_msg=
                     f"operand for {boolean_operator} must be a string, but instead was {type(sub_pattern.unroll()).__name__}",
                     spans=[
                         operator_tree.span.extend_to(pattern_span).truncate(5)
                     ],
                 )

             pattern_id = PatternId(f"{prefix}.{pattern_id_idx}")
             self._pattern_spans[pattern_id] = pattern_span
             yield BooleanRuleExpression(
                 operator=operator,
                 pattern_id=pattern_id,
                 children=None,
                 operand=pattern_text,
             )
             pattern_id_idx += 1