Exemple #1
0
 def test_echo_issues_outputs_proper_json_when_requested(self, mock_json):
     # Two issues are built from the same chunk data; echo_issues is expected
     # to hand a single dictionary describing both of them to json.dumps.
     entropy_issue = scanner.Issue(
         types.IssueType.Entropy, "foo", types.Chunk("foo", "/bar")
     )
     regex_issue = scanner.Issue(
         types.IssueType.RegEx, "bar", types.Chunk("foo", "/bar")
     )
     expected_entropy = {
         "issue_type": "High Entropy",
         "issue_detail": None,
         "diff": "foo",
         "matched_string": "foo",
         "signature": "4db0024275a64ac2bf5e7d061e130e283b0b37a44167b605643e06e33177f74e",
         "file_path": "/bar",
     }
     expected_regex = {
         "issue_type": "Regular Expression Match",
         "issue_detail": None,
         "diff": "foo",
         "matched_string": "bar",
         "signature": "1516f2c3395943be40811573bb63ed1e2b8fe3a0e6dcc8dbb43351ca90ba6822",
         "file_path": "/bar",
     }
     expected_payload = {
         "project_path": "/repo",
         "output_dir": "/output",
         "found_issues": [expected_entropy, expected_regex],
     }

     util.echo_issues([entropy_issue, regex_issue], True, "/repo", "/output")

     mock_json.dumps.assert_called_once_with(expected_payload)
Exemple #2
0
    def test_all_files_are_yielded_as_chunks(
        self,
        mock_extract: mock.MagicMock,
    ):
        # Arrange: a single branch whose walk returns a child commit followed
        # by a root commit that has no parents.
        self.mock_repo.return_value.branches = {"foo": mock.MagicMock()}
        repo_scanner = scanner.GitRepoScanner(
            self.global_options, self.git_options, "."
        )
        root_commit = mock.MagicMock()
        root_commit.parents = None
        child_commit = mock.MagicMock()
        child_commit.parents = [root_commit]
        self.mock_repo.return_value.walk.return_value = [child_commit, root_commit]
        self.mock_iter_diff.return_value = [("foo", "bar.py"), ("baz", "blah.py")]
        metadata = mock_extract.return_value

        # Act
        found = list(repo_scanner.chunks)

        # Assert: every entry shows up twice because `_iter_diff` runs once
        # during the regular branch/commit iteration and once more afterward
        # to capture the first commit on the branch.
        expected = [
            types.Chunk("foo", "bar.py", metadata),
            types.Chunk("baz", "blah.py", metadata),
            types.Chunk("foo", "bar.py", metadata),
            types.Chunk("baz", "blah.py", metadata),
        ]
        self.assertEqual(found, expected)
Exemple #3
0
    def test_echo_result_outputs_proper_json_when_requested(
        self,
        mock_time,
        mock_scanner,
    ):
        # Arrange: fixed timestamp, two issues returned by the scanner, and
        # options selecting the JSON output format with no exclusions.
        mock_time.now.return_value.isoformat.return_value = "now:now:now"
        entropy_issue = scanner.Issue(
            types.IssueType.Entropy, "foo", types.Chunk("foo", "/bar", {})
        )
        regex_issue = scanner.Issue(
            types.IssueType.RegEx, "bar", types.Chunk("foo", "/bar", {})
        )
        mock_scanner.scan.return_value = (entropy_issue, regex_issue)
        mock_scanner.excluded_paths = []
        options = generate_options(
            GlobalOptions,
            output_format=types.OutputFormat.Json.value,
            exclude_signatures=[],
            exclude_entropy_patterns=[],
        )
        expected_entropy = {
            "issue_type": "High Entropy",
            "issue_detail": None,
            "diff": "foo",
            "matched_string": "foo",
            "signature": "4db0024275a64ac2bf5e7d061e130e283b0b37a44167b605643e06e33177f74e",
            "file_path": "/bar",
        }
        expected_regex = {
            "issue_type": "Regular Expression Match",
            "issue_detail": None,
            "diff": "foo",
            "matched_string": "bar",
            "signature": "1516f2c3395943be40811573bb63ed1e2b8fe3a0e6dcc8dbb43351ca90ba6822",
            "file_path": "/bar",
        }
        expected_document = {
            "scan_time": "now:now:now",
            "project_path": "/repo",
            "output_dir": "/output",
            "excluded_paths": [],
            "excluded_signatures": [],
            "exclude_entropy_patterns": [],
            "found_issues": [expected_entropy, expected_regex],
        }

        # The JSON is produced piecemeal, so capture everything written to
        # stdout and parse it as one document; a successful parse also proves
        # the emitted text is syntactically valid.
        with mock.patch("sys.stdout", new=StringIO()) as captured_stdout:
            util.echo_result(options, mock_scanner, "/repo", "/output")
            raw_output = captured_stdout.getvalue()

        self.assertEqual(json.loads(raw_output), expected_document)
 def test_all_files_are_yielded_as_chunks(
     self,
     mock_repo: mock.MagicMock,
     mock_extract: mock.MagicMock,
     mock_iter_diff_index: mock.MagicMock,
     mock_iter_commits: mock.MagicMock,
 ):
     # Arrange: one commit pair on the branch and two diff entries per diff.
     mock_repo.return_value.remotes.origin.fetch.return_value = ["foo"]
     repo_scanner = scanner.GitRepoScanner(
         self.global_options, self.git_options, "."
     )
     first_commit = mock.MagicMock()
     second_commit = mock.MagicMock()
     mock_iter_commits.return_value = [(first_commit, second_commit)]
     mock_iter_diff_index.return_value = [("foo", "bar.py"), ("baz", "blah.py")]
     metadata = mock_extract.return_value

     # Act
     found = list(repo_scanner.chunks)

     # Assert: every entry shows up twice because `_iter_diff_index` runs once
     # during the regular branch/commit iteration and once more afterward
     # to capture the first commit on the branch.
     expected = [
         types.Chunk("foo", "bar.py", metadata),
         types.Chunk("baz", "blah.py", metadata),
         types.Chunk("foo", "bar.py", metadata),
         types.Chunk("baz", "blah.py", metadata),
     ]
     self.assertEqual(found, expected)
Exemple #5
0
    def chunks(self) -> Generator[types.Chunk, None, None]:
        """Yield individual diffs from the repository's history.

        Branches are selected first (only ``git_options.branch`` when it is
        set, otherwise every branch of the repository), optionally fetching
        from ``origin`` beforehand. Each commit pair produced by
        ``_iter_branch_commits`` is then diffed once, and every entry of that
        diff is yielded as a chunk. After a branch has been walked, the last
        ``curr_commit`` seen is diffed against ``git.NULL_TREE`` so the first
        commit on the branch is covered as well.

        :rtype: Generator[Chunk, None, None]
        :raises types.GitRemoteException: If there was an error fetching branches
        """
        already_searched: Set[bytes] = set()
        requested_branch = self.git_options.branch

        try:
            if requested_branch:
                # Single branch only
                if self.git_options.fetch:
                    self._repo.remotes.origin.fetch(requested_branch)
                # GitPython references do not compare equal to a plain string,
                # so also match on the reference's ``name``; the direct
                # comparison is kept for callers that supply comparable objects.
                branches = [
                    ref for ref in self._repo.branches
                    if ref == requested_branch
                    or getattr(ref, "name", None) == requested_branch
                ]
            else:
                # Everything
                if self.git_options.fetch:
                    self._repo.remotes.origin.fetch()
                branches = list(self._repo.branches)
        except git.GitCommandError as exc:
            raise types.GitRemoteException(exc.stderr.strip()) from exc

        for branch in branches:
            # Initialised so the post-loop check works when no pair is yielded.
            curr_commit: git.Commit = None
            prev_commit: git.Commit = None
            for curr_commit, prev_commit in self._iter_branch_commits(
                    self._repo, branch):
                # Identify the pair before diffing so that commit pairs shared
                # between branches do not pay for a second diff computation.
                diff_hash = hashlib.md5(
                    (str(prev_commit) +
                     str(curr_commit)).encode("utf-8")).digest()
                if diff_hash in already_searched:
                    continue
                already_searched.add(diff_hash)
                diff_index = curr_commit.diff(prev_commit, create_patch=True)
                for blob, file_path in self._iter_diff_index(diff_index):
                    yield types.Chunk(
                        blob,
                        file_path,
                        util.extract_commit_metadata(prev_commit, branch),
                    )

            # Finally, yield the first commit to the branch
            if curr_commit:
                diff = curr_commit.diff(git.NULL_TREE, create_patch=True)
                # NOTE(review): the metadata comes from prev_commit although the
                # diff belongs to curr_commit — confirm this is intended.
                for blob, file_path in self._iter_diff_index(diff):
                    yield types.Chunk(
                        blob,
                        file_path,
                        util.extract_commit_metadata(prev_commit, branch),
                    )
 def test_echo_result_outputs_proper_json_when_requested_pathtype(
         self, mock_time, mock_json, mock_scanner):
     # Same scenario as the plain JSON test, except the output directory is
     # passed as a Path and must appear in the payload as its string form.
     mock_time.now.return_value.isoformat.return_value = "now:now:now"
     entropy_issue = scanner.Issue(
         types.IssueType.Entropy, "foo", types.Chunk("foo", "/bar", {})
     )
     regex_issue = scanner.Issue(
         types.IssueType.RegEx, "bar", types.Chunk("foo", "/bar", {})
     )
     mock_scanner.issues = [entropy_issue, regex_issue]
     mock_scanner.excluded_paths = [
         re.compile("package-lock.json"),
         re.compile("poetry.lock"),
     ]
     exclude_signatures = ["fffffffffffff", "ooooooooooooo"]
     options = generate_options(
         GlobalOptions,
         json=True,
         exclude_signatures=exclude_signatures,
     )
     expected_entropy = {
         "issue_type": "High Entropy",
         "issue_detail": None,
         "diff": "foo",
         "matched_string": "foo",
         "signature": "4db0024275a64ac2bf5e7d061e130e283b0b37a44167b605643e06e33177f74e",
         "file_path": "/bar",
     }
     expected_regex = {
         "issue_type": "Regular Expression Match",
         "issue_detail": None,
         "diff": "foo",
         "matched_string": "bar",
         "signature": "1516f2c3395943be40811573bb63ed1e2b8fe3a0e6dcc8dbb43351ca90ba6822",
         "file_path": "/bar",
     }

     util.echo_result(options, mock_scanner, "/repo", Path("/tmp"))

     mock_json.dumps.assert_called_once_with({
         "scan_time": "now:now:now",
         "project_path": "/repo",
         "output_dir": str(Path("/tmp")),
         "excluded_paths": ["package-lock.json", "poetry.lock"],
         "excluded_signatures": ["fffffffffffff", "ooooooooooooo"],
         "found_issues": [expected_entropy, expected_regex],
     })
 def test_populated_issues_list_does_not_rescan(self, mock_scan: mock.MagicMock):
     # When `_issues` already holds results, reading `issues` must not
     # trigger another scan.
     cached_issue = scanner.Issue(
         types.IssueType.RegEx, "foo", types.Chunk("foo", "bar")
     )
     scanner_under_test = TestScanner(self.options)
     scanner_under_test._issues = [cached_issue]  # pylint: disable=protected-access
     _ = scanner_under_test.issues
     mock_scan.assert_not_called()
Exemple #8
0
    def chunks(self):
        """Yield one chunk per entry of the index-versus-HEAD diff.

        The index is diffed against the current head commit (with patches
        created and ``R=True``), and each ``(blob, file_path)`` pair produced
        by ``_iter_diff_index`` is wrapped in a chunk carrying empty metadata.

        :rtype: Generator[Chunk, None, None]
        """
        staged_diff = self._repo.index.diff(
            self._repo.head.commit, create_patch=True, R=True
        )
        yield from (
            types.Chunk(blob, file_path, {})
            for blob, file_path in self._iter_diff_index(staged_diff)
        )
Exemple #9
0
 def test_command_exits_with_positive_return_code_when_issues_are_found(
         self, mock_scanner: mock.MagicMock):
     # A scan that reports at least one issue must end with a non-zero exit code.
     found_issue = scanner.Issue(
         types.IssueType.Entropy, "foo", types.Chunk("foo", "/bar", {})
     )
     mock_scanner.return_value.scan.return_value = [found_issue]
     cli_runner = CliRunner()
     with cli_runner.isolated_filesystem():
         outcome = cli_runner.invoke(cli.main, ["scan-local-repo", "."])
     self.assertGreater(outcome.exit_code, 0)
 def test_issue_is_not_created_if_signature_is_excluded(
         self, mock_signature: mock.MagicMock):
     # With the signature check answering True, a matching rule must not
     # produce any issue.
     mock_signature.return_value = True
     regex_scanner = TestScanner(self.options)
     rules = {"foo": re.compile("foo")}
     regex_scanner._rules_regexes = rules  # pylint: disable=protected-access
     found = regex_scanner.scan_regex(types.Chunk("foo", "bar"))
     mock_signature.assert_called_once_with("foo", "bar")
     self.assertEqual(found, [])
 def setUp(self) -> None:
     # Extend the base fixture: switch entropy on and prepare a small
     # multi-line chunk plus a scanner instance for the tests to share.
     super().setUp()
     self.options.entropy = True
     contents = """
     foo bar
     asdfqwer
     """
     self.chunk = types.Chunk(contents, "foo.py")
     self.scanner = TestScanner(self.options)
Exemple #12
0
 def test_output_dir_is_created_if_it_does_not_exist(
         self, mock_scanner: mock.MagicMock):
     # Invoking the CLI with --output-dir set to a directory that does not
     # exist yet should leave that directory on disk.
     found_issue = scanner.Issue(
         types.IssueType.Entropy, "foo", types.Chunk("foo", "/bar", {})
     )
     mock_scanner.return_value.scan.return_value = [found_issue]
     cli_args = ["--output-dir", "./foo", "--json", "scan-local-repo", "."]
     cli_runner = CliRunner()
     with cli_runner.isolated_filesystem():
         cli_runner.invoke(cli.main, cli_args)
         output_dir_exists = Path("./foo").exists()
         self.assertTrue(output_dir_exists)
Exemple #13
0
    def test_echo_result_outputs_compact_format(self, mock_click,
                                                mock_scanner):
        # In "compact" output format each issue is echoed as a single line
        # containing type, path, matched string, signature and detail.
        options = generate_options(
            GlobalOptions, verbose=0, output_format="compact"
        )
        entropy_issue = scanner.Issue(
            types.IssueType.Entropy,
            "foo",
            types.Chunk("fullfoobar", "/what/foo", {}),
        )
        regex_issue = scanner.Issue(
            types.IssueType.RegEx,
            "bar",
            types.Chunk("fullfoobar", "/what/bar", {}),
        )
        regex_issue.issue_detail = "Meets the bar"
        mock_scanner.scan.return_value = (entropy_issue, regex_issue)

        util.echo_result(options, mock_scanner, "", "")

        expected_calls = [
            mock.call(
                "[High Entropy] /what/foo: foo (ea29b8c0f8a478f260689899393107cca188fbbff1c5a5bd4ff32c102cb60226, None)"
            ),
            mock.call(
                "[Regular Expression Match] /what/bar: bar (fa692eebc3d60e67a9f22b4b877d5939cb2ec96c0c26c7e5168b3b8b660c573c, Meets the bar)"
            ),
        ]
        mock_click.echo.assert_has_calls(expected_calls)
Exemple #14
0
 def test_output_dir_is_not_called_out_when_outputting_json(
         self, mock_scanner: mock.MagicMock):
     found_issue = scanner.Issue(
         types.IssueType.Entropy, "foo", types.Chunk("foo", "/bar", {})
     )
     mock_scanner.return_value.scan.return_value = [found_issue]
     cli_args = ["--output-dir", "./foo", "--json", "scan-local-repo", "."]
     cli_runner = CliRunner()
     with cli_runner.isolated_filesystem():
         outcome = cli_runner.invoke(cli.main, cli_args)
     # Every other output is mocked, so an empty result proves that the
     #   "Results have been saved in ..." message was not printed.
     self.assertEqual(outcome.output, "")
 def test_all_regex_rules_are_checked(self):
     # Every configured pattern must be run against the chunk contents,
     # even when none of them finds a match.
     patterns = {"foo": mock.MagicMock(), "bar": mock.MagicMock()}
     for pattern in patterns.values():
         pattern.findall.return_value = []
     regex_scanner = TestScanner(self.options)
     regex_scanner._rules_regexes = patterns  # pylint: disable=protected-access
     regex_scanner.scan_regex(types.Chunk("foo", "bar"))
     for pattern in patterns.values():
         pattern.findall.assert_called_once_with("foo")
 def test_issue_is_returned_if_signature_is_not_excluded(
         self, mock_signature: mock.MagicMock):
     # With the signature check answering False, the matching rule must
     # produce exactly one regex issue describing the match.
     mock_signature.return_value = False
     regex_scanner = TestScanner(self.options)
     rules = {"foo": re.compile("foo")}
     regex_scanner._rules_regexes = rules  # pylint: disable=protected-access
     found = regex_scanner.scan_regex(types.Chunk("foo", "bar"))
     mock_signature.assert_called_once_with("foo", "bar")
     self.assertEqual(len(found), 1)
     only_issue = found[0]
     self.assertEqual(only_issue.issue_detail, "foo")
     self.assertEqual(only_issue.issue_type, types.IssueType.RegEx)
     self.assertEqual(only_issue.matched_string, "foo")
Exemple #17
0
 def test_output_dir_is_called_out(self, mock_scanner: mock.MagicMock,
                                   mock_dt: mock.MagicMock):
     # Without --json, the CLI is expected to announce where the results were
     # written; the directory name embeds the (mocked) timestamp.
     mock_scanner.return_value.scan.return_value = [
         scanner.Issue(types.IssueType.Entropy, "foo",
                       types.Chunk("foo", "/bar"))
     ]
     mock_dt.now.return_value.isoformat.return_value = "nownownow"
     runner = CliRunner()
     with runner.isolated_filesystem() as dirname:
         result = runner.invoke(
             cli.main, ["--output-dir", "./foo", "scan-local-repo", "."])
     # Join the expected path with Path operators rather than literal "/" so
     # the text uses the platform's separator (a hard-coded "/" cannot match
     # on Windows).
     result_dir = (
         Path(dirname).resolve() / "foo" / "tartufo-scan-results-nownownow"
     )
     self.assertEqual(
         result.output,
         f"Results have been saved in {result_dir}\n",
     )
Exemple #18
0
 def test_issue_is_not_created_if_signature_is_excluded(
         self, mock_signature: mock.MagicMock):
     # With the signature check answering True, a matching rule must not
     # produce any issue.
     mock_signature.return_value = True
     regex_scanner = TestScanner(self.options)
     foo_rule = Rule(
         name="foo",
         pattern=re.compile("foo"),
         path_pattern=None,
         re_match_type=MatchType.Match,
         re_match_scope=None,
     )
     regex_scanner._rules_regexes = {foo_rule}  # pylint: disable=protected-access
     found = list(regex_scanner.scan_regex(types.Chunk("foo", "bar", {})))
     mock_signature.assert_called_once_with("foo", "bar")
     self.assertEqual(found, [])
Exemple #19
0
 def test_output_dir_is_valid_name_in_windows(self,
                                              mock_scanner: mock.MagicMock,
                                              mock_dt: mock.MagicMock):
     # The mocked timestamp contains colons, while the directory name expected
     # in the output ("...-nownownow") contains none.
     found_issue = scanner.Issue(
         types.IssueType.Entropy, "foo", types.Chunk("foo", "/bar", {})
     )
     mock_scanner.return_value.scan.return_value = [found_issue]
     mock_dt.now.return_value.isoformat.return_value = "now:now:now"
     cli_runner = CliRunner()
     with cli_runner.isolated_filesystem() as dirname:
         output_dir = (Path(dirname) / "foo").resolve()
         cli_args = ["--output-dir", str(output_dir), "scan-local-repo", "."]
         outcome = cli_runner.invoke(cli.main, cli_args)
     expected_dir = output_dir / "tartufo-scan-results-nownownow"
     self.assertEqual(
         outcome.output,
         f"Results have been saved in {expected_dir}\n",
     )
Exemple #20
0
 def test_as_dict_returns_compact_dictionary(self):
     # as_dict(compact=True) is expected to return only these five keys.
     chunk = types.Chunk(
         "test-contents", "test-file", {"test-meta1": "test-meta-value"}
     )
     issue = Issue(types.IssueType.Entropy, "test-string", chunk)
     issue.issue_detail = "issue-detail"
     expected = {
         "file_path": "test-file",
         "issue_detail": "issue-detail",
         "issue_type": "High Entropy",
         "matched_string": "test-string",
         "signature": "bf09b8c7e62db27c45e618f4aa9d8b13bf91cf3de593b11c1fb515e8b1003ca8",
     }
     self.assertEqual(expected, issue.as_dict(compact=True))
Exemple #21
0
 def test_all_regex_rules_are_checked(self):
     # Three rules are configured:
     #   - "foo" has no path pattern, so its pattern must be applied;
     #   - "bar" has a path pattern that matches, so its pattern must be applied;
     #   - "not-found" has a path pattern that does not match, so its pattern
     #     must never be consulted.
     rule_1 = mock.MagicMock()
     rule_1.findall.return_value = []
     rule_2 = mock.MagicMock()
     rule_2.findall.return_value = []
     rule_2_path = mock.MagicMock()
     rule_2_path.match = mock.MagicMock(return_value=["/file/path"])
     rule_3 = mock.MagicMock()
     rule_3_path = mock.MagicMock()
     rule_3_path.match = mock.MagicMock(return_value=[])
     test_scanner = TestScanner(self.options)
     test_scanner._rules_regexes = {  # pylint: disable=protected-access
         Rule(
             name="foo",
             pattern=rule_1,
             path_pattern=None,
             re_match_type=MatchType.Match,
             re_match_scope=None,
         ),
         Rule(
             name="bar",
             pattern=rule_2,
             path_pattern=rule_2_path,
             re_match_type=MatchType.Match,
             re_match_scope=None,
         ),
         Rule(
             name="not-found",
             pattern=rule_3,
             path_pattern=rule_3_path,
             re_match_type=MatchType.Match,
             re_match_scope=None,
         ),
     }
     chunk = types.Chunk("foo", "/file/path", {})
     # scan_regex is consumed with list() so that all rules are processed.
     list(test_scanner.scan_regex(chunk))
     rule_1.findall.assert_called_once_with("foo")
     rule_2.findall.assert_called_once_with("foo")
     rule_2_path.match.assert_called_once_with("/file/path")
     rule_3_path.match.assert_called_once_with("/file/path")
     # Calls to a child attribute do not mark the parent mock as called, so
     # the meaningful check is on `findall`, the method the other rules are
     # exercised through.
     rule_3.findall.assert_not_called()
     rule_3.assert_not_called()
Exemple #22
0
    def chunks(self) -> Generator[types.Chunk, None, None]:
        """Yield individual diffs from the repository's history.

        Branches are selected first: only ``git_options.branch`` when it is
        set, ``HEAD`` alone for a shallow clone, otherwise every entry of
        ``self._repo.branches``. Each commit walked on a branch is diffed
        against its first parent, and every entry of that diff is yielded as
        a chunk; a given (parent, commit) pair is only diffed once across all
        branches. After a branch has been walked, the tree of the last commit
        visited is diffed via ``diff_to_tree(swap=True)`` so the first commit
        on the branch is covered as well.

        :raises BranchNotFoundException: If ``git_options.branch`` is set but
            the branch cannot be looked up
        :raises types.GitRemoteException: If there was an error fetching branches
        """
        already_searched: Set[bytes] = set()

        try:
            if self.git_options.branch:
                # Single branch only
                branch = self._repo.branches.get(self.git_options.branch)
                if not branch:
                    raise BranchNotFoundException(
                        f"Branch {self.git_options.branch} was not found.")
                branches = [self.git_options.branch]
            else:
                # Everything
                if util.is_shallow_clone(self._repo):
                    # If this is a shallow clone, examine the repo head as a single
                    # commit to scan all files at once
                    branches = ["HEAD"]
                else:
                    # We use `self._repo.branches` here so that we make sure to
                    # scan not only the locally checked out branches (as provided
                    # by self._repo.listall_branches()), but to also scan all
                    # available remote refs
                    branches = list(self._repo.branches)
        except pygit2.GitError as exc:
            raise types.GitRemoteException(str(exc)) from exc

        self.logger.debug(
            "Branches to be scanned: %s",
            ", ".join(str(branch) for branch in branches),
        )

        for branch_name in branches:
            self.logger.info("Scanning branch: %s", branch_name)
            if branch_name == "HEAD":
                commits = [self._repo.get(self._repo.head.target)]
            else:
                branch = self._repo.branches.get(branch_name)
                try:
                    commits = self._repo.walk(branch.resolve().target,
                                              pygit2.GIT_SORT_TOPOLOGICAL)
                except AttributeError:
                    # e.g. the lookup above returned None
                    self.logger.debug(
                        "Skipping branch %s because it cannot be resolved.",
                        branch_name)
                    continue
            # Initialised so the post-loop check works when nothing is walked.
            curr_commit: pygit2.Commit = None
            prev_commit: pygit2.Commit = None
            for curr_commit in commits:
                try:
                    prev_commit = curr_commit.parents[0]
                except (IndexError, KeyError, TypeError):
                    # IndexError: current commit has no parents
                    # KeyError: current commit has parents which are not local
                    # TypeError: `parents` is not subscriptable (e.g. None)
                    # If a commit doesn't have a parent skip diff generation since it is the first commit
                    self.logger.debug(
                        "Skipping commit %s because it has no parents",
                        curr_commit.hex)
                    continue
                # Identify the pair before diffing so that history shared
                # between branches does not pay for a second diff computation.
                diff_hash = hashlib.md5(
                    (str(prev_commit) +
                     str(curr_commit)).encode("utf-8")).digest()
                if diff_hash in already_searched:
                    continue
                already_searched.add(diff_hash)
                diff: pygit2.Diff = self._repo.diff(prev_commit, curr_commit)
                diff.find_similar()
                for blob, file_path in self._iter_diff_index(diff):
                    yield types.Chunk(
                        blob,
                        file_path,
                        util.extract_commit_metadata(curr_commit, branch_name),
                    )

            # Finally, yield the first commit to the branch
            if curr_commit:
                tree: pygit2.Tree = self._repo.revparse_single(
                    curr_commit.hex).tree
                tree_diff: pygit2.Diff = tree.diff_to_tree(swap=True)
                iter_diff = self._iter_diff_index(tree_diff)
                for blob, file_path in iter_diff:
                    yield types.Chunk(
                        blob,
                        file_path,
                        util.extract_commit_metadata(curr_commit, branch_name),
                    )
Exemple #23
0
 def chunks(self):
     """Yield a chunk, with empty metadata, per entry of the diff against ``HEAD``."""
     # NOTE(review): meant to cover the changes staged for commit — confirm
     # that diffing against "HEAD" is limited to what is staged.
     head_diff = self._repo.diff("HEAD")
     yield from (
         types.Chunk(blob, file_path, {})
         for blob, file_path in self._iter_diff_index(head_diff)
     )
Exemple #24
0
    def chunks(self) -> Generator[types.Chunk, None, None]:
        """Yield a chunk, with empty metadata, for each file found by ``_iter_folder``."""
        yield from (
            types.Chunk(blob, file_path, {})
            for blob, file_path in self._iter_folder()
        )