def test_echo_issues_outputs_proper_json_when_requested(self, mock_json):
    issue_1 = scanner.Issue(
        types.IssueType.Entropy, "foo", types.Chunk("foo", "/bar")
    )
    issue_2 = scanner.Issue(
        types.IssueType.RegEx, "bar", types.Chunk("foo", "/bar")
    )
    util.echo_issues([issue_1, issue_2], True, "/repo", "/output")
    mock_json.dumps.assert_called_once_with(
        {
            "project_path": "/repo",
            "output_dir": "/output",
            "found_issues": [
                {
                    "issue_type": "High Entropy",
                    "issue_detail": None,
                    "diff": "foo",
                    "matched_string": "foo",
                    "signature": "4db0024275a64ac2bf5e7d061e130e283b0b37a44167b605643e06e33177f74e",
                    "file_path": "/bar",
                },
                {
                    "issue_type": "Regular Expression Match",
                    "issue_detail": None,
                    "diff": "foo",
                    "matched_string": "bar",
                    "signature": "1516f2c3395943be40811573bb63ed1e2b8fe3a0e6dcc8dbb43351ca90ba6822",
                    "file_path": "/bar",
                },
            ],
        }
    )
def test_all_files_are_yielded_as_chunks(
    self,
    mock_extract: mock.MagicMock,
):
    self.mock_repo.return_value.branches = {"foo": mock.MagicMock()}
    test_scanner = scanner.GitRepoScanner(
        self.global_options, self.git_options, "."
    )
    mock_commit_1 = mock.MagicMock()
    mock_commit_1.parents = None
    mock_commit_2 = mock.MagicMock()
    mock_commit_2.parents = [mock_commit_1]
    self.mock_repo.return_value.walk.return_value = [
        mock_commit_2,
        mock_commit_1,
    ]
    self.mock_iter_diff.return_value = [("foo", "bar.py"), ("baz", "blah.py")]
    chunks = list(test_scanner.chunks)
    # These get duplicated in this test, because `_iter_diff` is called both
    # in the normal branch/commit iteration, and then once more afterward to
    # capture the first commit on the branch
    self.assertEqual(
        chunks,
        [
            types.Chunk("foo", "bar.py", mock_extract.return_value),
            types.Chunk("baz", "blah.py", mock_extract.return_value),
            types.Chunk("foo", "bar.py", mock_extract.return_value),
            types.Chunk("baz", "blah.py", mock_extract.return_value),
        ],
    )
def test_echo_result_outputs_proper_json_when_requested(
    self,
    mock_time,
    mock_scanner,
):
    mock_time.now.return_value.isoformat.return_value = "now:now:now"
    issue_1 = scanner.Issue(
        types.IssueType.Entropy, "foo", types.Chunk("foo", "/bar", {})
    )
    issue_2 = scanner.Issue(
        types.IssueType.RegEx, "bar", types.Chunk("foo", "/bar", {})
    )
    mock_scanner.scan.return_value = (issue_1, issue_2)
    mock_scanner.excluded_paths = []
    options = generate_options(
        GlobalOptions,
        output_format=types.OutputFormat.Json.value,
        exclude_signatures=[],
        exclude_entropy_patterns=[],
    )
    # The JSON is generated piecemeal, so to be safe we capture the entire
    # output, deserialize it (to confirm it's valid syntax), and compare the
    # result to the expected dictionary.
    with mock.patch("sys.stdout", new=StringIO()) as mock_stdout:
        util.echo_result(options, mock_scanner, "/repo", "/output")
    actual_output = mock_stdout.getvalue()
    self.assertEqual(
        json.loads(actual_output),
        {
            "scan_time": "now:now:now",
            "project_path": "/repo",
            "output_dir": "/output",
            "excluded_paths": [],
            "excluded_signatures": [],
            "exclude_entropy_patterns": [],
            "found_issues": [
                {
                    "issue_type": "High Entropy",
                    "issue_detail": None,
                    "diff": "foo",
                    "matched_string": "foo",
                    "signature": "4db0024275a64ac2bf5e7d061e130e283b0b37a44167b605643e06e33177f74e",
                    "file_path": "/bar",
                },
                {
                    "issue_type": "Regular Expression Match",
                    "issue_detail": None,
                    "diff": "foo",
                    "matched_string": "bar",
                    "signature": "1516f2c3395943be40811573bb63ed1e2b8fe3a0e6dcc8dbb43351ca90ba6822",
                    "file_path": "/bar",
                },
            ],
        },
    )
def test_all_files_are_yielded_as_chunks(
    self,
    mock_repo: mock.MagicMock,
    mock_extract: mock.MagicMock,
    mock_iter_diff_index: mock.MagicMock,
    mock_iter_commits: mock.MagicMock,
):
    mock_repo.return_value.remotes.origin.fetch.return_value = ["foo"]
    test_scanner = scanner.GitRepoScanner(
        self.global_options, self.git_options, "."
    )
    mock_commit_1 = mock.MagicMock()
    mock_commit_2 = mock.MagicMock()
    mock_iter_commits.return_value = [(mock_commit_1, mock_commit_2)]
    mock_iter_diff_index.return_value = [("foo", "bar.py"), ("baz", "blah.py")]
    chunks = list(test_scanner.chunks)
    # These get duplicated in this test, because `_iter_diff_index` is called
    # both in the normal branch/commit iteration, and then once more afterward
    # to capture the first commit on the branch
    self.assertEqual(
        chunks,
        [
            types.Chunk("foo", "bar.py", mock_extract.return_value),
            types.Chunk("baz", "blah.py", mock_extract.return_value),
            types.Chunk("foo", "bar.py", mock_extract.return_value),
            types.Chunk("baz", "blah.py", mock_extract.return_value),
        ],
    )
def chunks(self) -> Generator[types.Chunk, None, None]:
    """Yield individual diffs from the repository's history.

    :rtype: Generator[Chunk, None, None]
    :raises types.GitRemoteException: If there was an error fetching branches
    """
    already_searched: Set[bytes] = set()

    try:
        if self.git_options.branch:
            # Single branch only
            if self.git_options.fetch:
                self._repo.remotes.origin.fetch(self.git_options.branch)
            unfiltered_branches = list(self._repo.branches)
            branches = [
                x for x in unfiltered_branches if x.name == self.git_options.branch
            ]
        else:
            # Everything
            if self.git_options.fetch:
                self._repo.remotes.origin.fetch()
            branches = list(self._repo.branches)
    except git.GitCommandError as exc:
        raise types.GitRemoteException(exc.stderr.strip()) from exc

    for branch in branches:
        diff_index: git.DiffIndex = None
        diff_hash: bytes
        curr_commit: git.Commit = None
        prev_commit: git.Commit = None
        for curr_commit, prev_commit in self._iter_branch_commits(
            self._repo, branch
        ):
            diff_index = curr_commit.diff(prev_commit, create_patch=True)
            diff_hash = hashlib.md5(
                (str(prev_commit) + str(curr_commit)).encode("utf-8")
            ).digest()
            if diff_hash in already_searched:
                continue
            already_searched.add(diff_hash)
            for blob, file_path in self._iter_diff_index(diff_index):
                yield types.Chunk(
                    blob,
                    file_path,
                    util.extract_commit_metadata(prev_commit, branch),
                )
        # Finally, yield the first commit to the branch
        if curr_commit:
            diff = curr_commit.diff(git.NULL_TREE, create_patch=True)
            for blob, file_path in self._iter_diff_index(diff):
                yield types.Chunk(
                    blob,
                    file_path,
                    util.extract_commit_metadata(prev_commit, branch),
                )
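# A minimal, standalone sketch of the diff de-duplication used in `chunks`
# above: each (previous, current) commit pair is hashed, and any diff whose
# hash has already been seen is skipped, so commits shared between branches
# are only scanned once. The commit ids below are made-up placeholders.
import hashlib
from typing import Set

_already_searched: Set[bytes] = set()


def should_scan_pair(prev_commit: str, curr_commit: str) -> bool:
    """Return True only the first time a given commit pair is seen."""
    diff_hash = hashlib.md5((prev_commit + curr_commit).encode("utf-8")).digest()
    if diff_hash in _already_searched:
        return False
    _already_searched.add(diff_hash)
    return True


assert should_scan_pair("aaa111", "bbb222") is True
assert should_scan_pair("aaa111", "bbb222") is False  # same diff, other branch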
def test_echo_result_outputs_proper_json_when_requested_pathtype(
    self, mock_time, mock_json, mock_scanner
):
    mock_time.now.return_value.isoformat.return_value = "now:now:now"
    issue_1 = scanner.Issue(
        types.IssueType.Entropy, "foo", types.Chunk("foo", "/bar", {})
    )
    issue_2 = scanner.Issue(
        types.IssueType.RegEx, "bar", types.Chunk("foo", "/bar", {})
    )
    mock_scanner.issues = [issue_1, issue_2]
    mock_scanner.excluded_paths = [
        re.compile("package-lock.json"),
        re.compile("poetry.lock"),
    ]
    exclude_signatures = [
        "fffffffffffff",
        "ooooooooooooo",
    ]
    options = generate_options(
        GlobalOptions, json=True, exclude_signatures=exclude_signatures
    )
    util.echo_result(options, mock_scanner, "/repo", Path("/tmp"))
    mock_json.dumps.assert_called_once_with(
        {
            "scan_time": "now:now:now",
            "project_path": "/repo",
            "output_dir": str(Path("/tmp")),
            "excluded_paths": ["package-lock.json", "poetry.lock"],
            "excluded_signatures": [
                "fffffffffffff",
                "ooooooooooooo",
            ],
            "found_issues": [
                {
                    "issue_type": "High Entropy",
                    "issue_detail": None,
                    "diff": "foo",
                    "matched_string": "foo",
                    "signature": "4db0024275a64ac2bf5e7d061e130e283b0b37a44167b605643e06e33177f74e",
                    "file_path": "/bar",
                },
                {
                    "issue_type": "Regular Expression Match",
                    "issue_detail": None,
                    "diff": "foo",
                    "matched_string": "bar",
                    "signature": "1516f2c3395943be40811573bb63ed1e2b8fe3a0e6dcc8dbb43351ca90ba6822",
                    "file_path": "/bar",
                },
            ],
        }
    )
def test_populated_issues_list_does_not_rescan(self, mock_scan: mock.MagicMock):
    test_scanner = TestScanner(self.options)
    test_scanner._issues = [  # pylint: disable=protected-access
        scanner.Issue(types.IssueType.RegEx, "foo", types.Chunk("foo", "bar"))
    ]
    test_scanner.issues  # pylint: disable=pointless-statement
    mock_scan.assert_not_called()
def chunks(self):
    """Yield the individual file changes currently staged for commit.

    :rtype: Generator[Chunk, None, None]
    """
    diff_index = self._repo.index.diff(
        self._repo.head.commit, create_patch=True, R=True
    )
    for blob, file_path in self._iter_diff_index(diff_index):
        yield types.Chunk(blob, file_path, {})
def test_command_exits_with_positive_return_code_when_issues_are_found(
    self, mock_scanner: mock.MagicMock
):
    mock_scanner.return_value.scan.return_value = [
        scanner.Issue(types.IssueType.Entropy, "foo", types.Chunk("foo", "/bar", {}))
    ]
    runner = CliRunner()
    with runner.isolated_filesystem():
        result = runner.invoke(cli.main, ["scan-local-repo", "."])
    self.assertGreater(result.exit_code, 0)
def test_issue_is_not_created_if_signature_is_excluded(
    self, mock_signature: mock.MagicMock
):
    mock_signature.return_value = True
    test_scanner = TestScanner(self.options)
    test_scanner._rules_regexes = {  # pylint: disable=protected-access
        "foo": re.compile("foo")
    }
    chunk = types.Chunk("foo", "bar")
    issues = test_scanner.scan_regex(chunk)
    mock_signature.assert_called_once_with("foo", "bar")
    self.assertEqual(issues, [])
def setUp(self) -> None:
    super().setUp()
    self.options.entropy = True
    self.chunk = types.Chunk(
        """
        foo bar
        asdfqwer
        """,
        "foo.py",
    )
    self.scanner = TestScanner(self.options)
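# For context on what the entropy scanner tests above exercise: high-entropy
# detection rests on the Shannon entropy of candidate strings. This is a
# generic sketch of that calculation, not tartufo's exact implementation,
# which applies additional character-set and threshold logic.
import math


def shannon_entropy(data: str) -> float:
    """Return the Shannon entropy of ``data`` in bits per character."""
    if not data:
        return 0.0
    entropy = 0.0
    for char in set(data):
        p_x = data.count(char) / len(data)
        entropy -= p_x * math.log2(p_x)
    return entropy


# Ordinary prose scores low; random-looking tokens score noticeably higher.
print(shannon_entropy("foo bar asdfqwer"))
print(shannon_entropy("ZWVlYTZkN2E5YzgxMWQ4ZQ"))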
def test_output_dir_is_created_if_it_does_not_exist(
    self, mock_scanner: mock.MagicMock
):
    mock_scanner.return_value.scan.return_value = [
        scanner.Issue(types.IssueType.Entropy, "foo", types.Chunk("foo", "/bar", {}))
    ]
    runner = CliRunner()
    with runner.isolated_filesystem():
        runner.invoke(
            cli.main, ["--output-dir", "./foo", "--json", "scan-local-repo", "."]
        )
        self.assertTrue(Path("./foo").exists())
def test_echo_result_outputs_compact_format(self, mock_click, mock_scanner):
    options = generate_options(GlobalOptions, verbose=0, output_format="compact")
    issue1 = scanner.Issue(
        types.IssueType.Entropy, "foo", types.Chunk("fullfoobar", "/what/foo", {})
    )
    issue2 = scanner.Issue(
        types.IssueType.RegEx, "bar", types.Chunk("fullfoobar", "/what/bar", {})
    )
    issue2.issue_detail = "Meets the bar"
    mock_scanner.scan.return_value = (issue1, issue2)
    util.echo_result(options, mock_scanner, "", "")
    mock_click.echo.assert_has_calls(
        [
            mock.call(
                "[High Entropy] /what/foo: foo (ea29b8c0f8a478f260689899393107cca188fbbff1c5a5bd4ff32c102cb60226, None)"
            ),
            mock.call(
                "[Regular Expression Match] /what/bar: bar (fa692eebc3d60e67a9f22b4b877d5939cb2ec96c0c26c7e5168b3b8b660c573c, Meets the bar)"
            ),
        ]
    )
def test_output_dir_is_not_called_out_when_outputting_json(
    self, mock_scanner: mock.MagicMock
):
    mock_scanner.return_value.scan.return_value = [
        scanner.Issue(types.IssueType.Entropy, "foo", types.Chunk("foo", "/bar", {}))
    ]
    runner = CliRunner()
    with runner.isolated_filesystem():
        result = runner.invoke(
            cli.main, ["--output-dir", "./foo", "--json", "scan-local-repo", "."]
        )
    # All other outputs are mocked, so this is ensuring that the
    # "Results have been saved in ..." message is not output.
    self.assertEqual(result.output, "")
def test_all_regex_rules_are_checked(self):
    rule_1 = mock.MagicMock()
    rule_1.findall.return_value = []
    rule_2 = mock.MagicMock()
    rule_2.findall.return_value = []
    test_scanner = TestScanner(self.options)
    test_scanner._rules_regexes = {  # pylint: disable=protected-access
        "foo": rule_1,
        "bar": rule_2,
    }
    chunk = types.Chunk("foo", "bar")
    test_scanner.scan_regex(chunk)
    rule_1.findall.assert_called_once_with("foo")
    rule_2.findall.assert_called_once_with("foo")
def test_issue_is_returned_if_signature_is_not_excluded(
    self, mock_signature: mock.MagicMock
):
    mock_signature.return_value = False
    test_scanner = TestScanner(self.options)
    test_scanner._rules_regexes = {  # pylint: disable=protected-access
        "foo": re.compile("foo")
    }
    chunk = types.Chunk("foo", "bar")
    issues = test_scanner.scan_regex(chunk)
    mock_signature.assert_called_once_with("foo", "bar")
    self.assertEqual(len(issues), 1)
    self.assertEqual(issues[0].issue_detail, "foo")
    self.assertEqual(issues[0].issue_type, types.IssueType.RegEx)
    self.assertEqual(issues[0].matched_string, "foo")
def test_output_dir_is_called_out(
    self, mock_scanner: mock.MagicMock, mock_dt: mock.MagicMock
):
    mock_scanner.return_value.scan.return_value = [
        scanner.Issue(types.IssueType.Entropy, "foo", types.Chunk("foo", "/bar"))
    ]
    mock_dt.now.return_value.isoformat.return_value = "nownownow"
    runner = CliRunner()
    with runner.isolated_filesystem() as dirname:
        result = runner.invoke(
            cli.main, ["--output-dir", "./foo", "scan-local-repo", "."]
        )
    self.assertEqual(
        result.output,
        f"Results have been saved in {Path(dirname).resolve()}/foo/tartufo-scan-results-nownownow\n",
    )
def test_issue_is_not_created_if_signature_is_excluded(
    self, mock_signature: mock.MagicMock
):
    mock_signature.return_value = True
    test_scanner = TestScanner(self.options)
    test_scanner._rules_regexes = {  # pylint: disable=protected-access
        Rule(
            name="foo",
            pattern=re.compile("foo"),
            path_pattern=None,
            re_match_type=MatchType.Match,
            re_match_scope=None,
        )
    }
    chunk = types.Chunk("foo", "bar", {})
    issues = list(test_scanner.scan_regex(chunk))
    mock_signature.assert_called_once_with("foo", "bar")
    self.assertEqual(issues, [])
def test_output_dir_is_valid_name_in_windows(
    self, mock_scanner: mock.MagicMock, mock_dt: mock.MagicMock
):
    mock_scanner.return_value.scan.return_value = [
        scanner.Issue(types.IssueType.Entropy, "foo", types.Chunk("foo", "/bar", {}))
    ]
    mock_dt.now.return_value.isoformat.return_value = "now:now:now"
    runner = CliRunner()
    with runner.isolated_filesystem() as dirname:
        output_dir = (Path(dirname) / "foo").resolve()
        result = runner.invoke(
            cli.main, ["--output-dir", str(output_dir), "scan-local-repo", "."]
        )
        # The colons in the mocked ISO timestamp are stripped so the results
        # directory name is valid on Windows.
        result_dir = output_dir / "tartufo-scan-results-nownownow"
        self.assertEqual(
            result.output,
            f"Results have been saved in {result_dir}\n",
        )
def test_as_dict_returns_compact_dictionary(self):
    issue = Issue(
        types.IssueType.Entropy,
        "test-string",
        types.Chunk(
            "test-contents", "test-file", {"test-meta1": "test-meta-value"}
        ),
    )
    issue.issue_detail = "issue-detail"
    actual = issue.as_dict(compact=True)
    self.assertEqual(
        {
            "file_path": "test-file",
            "issue_detail": "issue-detail",
            "issue_type": "High Entropy",
            "matched_string": "test-string",
            "signature": "bf09b8c7e62db27c45e618f4aa9d8b13bf91cf3de593b11c1fb515e8b1003ca8",
        },
        actual,
    )
def test_all_regex_rules_are_checked(self):
    rule_1 = mock.MagicMock()
    rule_1.findall.return_value = []
    rule_2 = mock.MagicMock()
    rule_2.findall.return_value = []
    rule_2_path = mock.MagicMock()
    rule_2_path.match = mock.MagicMock(return_value=["/file/path"])
    rule_3 = mock.MagicMock()
    rule_3_path = mock.MagicMock()
    rule_3_path.match = mock.MagicMock(return_value=[])
    test_scanner = TestScanner(self.options)
    test_scanner._rules_regexes = {  # pylint: disable=protected-access
        Rule(
            name="foo",
            pattern=rule_1,
            path_pattern=None,
            re_match_type=MatchType.Match,
            re_match_scope=None,
        ),
        Rule(
            name="bar",
            pattern=rule_2,
            path_pattern=rule_2_path,
            re_match_type=MatchType.Match,
            re_match_scope=None,
        ),
        Rule(
            name="not-found",
            pattern=rule_3,
            path_pattern=rule_3_path,
            re_match_type=MatchType.Match,
            re_match_scope=None,
        ),
    }
    chunk = types.Chunk("foo", "/file/path", {})
    list(test_scanner.scan_regex(chunk))
    rule_1.findall.assert_called_once_with("foo")
    rule_2.findall.assert_called_once_with("foo")
    rule_2_path.match.assert_called_once_with("/file/path")
    rule_3_path.match.assert_called_once_with("/file/path")
    rule_3.assert_not_called()
def chunks(self) -> Generator[types.Chunk, None, None]:
    """Yield individual diffs from the repository's history.

    :raises types.GitRemoteException: If there was an error fetching branches
    """
    already_searched: Set[bytes] = set()

    try:
        if self.git_options.branch:
            # Single branch only
            branch = self._repo.branches.get(self.git_options.branch)
            if not branch:
                raise BranchNotFoundException(
                    f"Branch {self.git_options.branch} was not found."
                )
            branches = [self.git_options.branch]
        else:
            # Everything
            if util.is_shallow_clone(self._repo):
                # If this is a shallow clone, examine the repo head as a single
                # commit to scan all files at once
                branches = ["HEAD"]
            else:
                # We use `self._repo.branches` here so that we make sure to
                # scan not only the locally checked out branches (as provided
                # by self._repo.listall_branches()), but to also scan all
                # available remote refs
                branches = list(self._repo.branches)
    except pygit2.GitError as exc:
        raise types.GitRemoteException(str(exc)) from exc

    self.logger.debug(
        "Branches to be scanned: %s",
        ", ".join([str(branch) for branch in branches]),
    )

    for branch_name in branches:
        self.logger.info("Scanning branch: %s", branch_name)
        if branch_name == "HEAD":
            commits = [self._repo.get(self._repo.head.target)]
        else:
            branch = self._repo.branches.get(branch_name)
            try:
                commits = self._repo.walk(
                    branch.resolve().target, pygit2.GIT_SORT_TOPOLOGICAL
                )
            except AttributeError:
                self.logger.debug(
                    "Skipping branch %s because it cannot be resolved.", branch_name
                )
                continue

        diff_hash: bytes
        curr_commit: pygit2.Commit = None
        prev_commit: pygit2.Commit = None

        for curr_commit in commits:
            try:
                prev_commit = curr_commit.parents[0]
            except (IndexError, KeyError, TypeError):
                # IndexError: current commit has no parents
                # KeyError: current commit has parents which are not local
                # If a commit doesn't have a parent, skip diff generation since
                # it is the first commit
                self.logger.debug(
                    "Skipping commit %s because it has no parents", curr_commit.hex
                )
                continue
            diff: pygit2.Diff = self._repo.diff(prev_commit, curr_commit)
            diff_hash = hashlib.md5(
                (str(prev_commit) + str(curr_commit)).encode("utf-8")
            ).digest()
            if diff_hash in already_searched:
                continue
            already_searched.add(diff_hash)
            diff.find_similar()
            for blob, file_path in self._iter_diff_index(diff):
                yield types.Chunk(
                    blob,
                    file_path,
                    util.extract_commit_metadata(curr_commit, branch_name),
                )

        # Finally, yield the first commit to the branch
        if curr_commit:
            tree: pygit2.Tree = self._repo.revparse_single(curr_commit.hex).tree
            tree_diff: pygit2.Diff = tree.diff_to_tree(swap=True)
            iter_diff = self._iter_diff_index(tree_diff)
            for blob, file_path in iter_diff:
                yield types.Chunk(
                    blob,
                    file_path,
                    util.extract_commit_metadata(curr_commit, branch_name),
                )
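# A small sketch of the first-commit handling at the end of `chunks` above,
# assuming an existing local repository at ./my-repo with a single root commit
# (the path and the linear history are assumptions for illustration only).
# Diffing a commit's tree against the empty tree with swap=True reports every
# file in that commit as an addition, which is how the root commit still gets
# scanned even though it has no parent to diff against.
import pygit2

repo = pygit2.Repository("./my-repo")
# With a topological walk, parents come after their children, so the last
# commit yielded is a root commit.
root_commit = list(repo.walk(repo.head.target, pygit2.GIT_SORT_TOPOLOGICAL))[-1]
tree_diff = root_commit.tree.diff_to_tree(swap=True)
for patch in tree_diff:
    print(patch.delta.new_file.path)  # each file in the root commit, as an addition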
def chunks(self):
    """Yield the individual file changes currently staged for commit."""
    diff_index = self._repo.diff("HEAD")
    for blob, file_path in self._iter_diff_index(diff_index):
        yield types.Chunk(blob, file_path, {})
def chunks(self) -> Generator[types.Chunk, None, None]:
    """Yield the individual files in the target directory."""
    for blob, file_path in self._iter_folder():
        yield types.Chunk(blob, file_path, {})