def create_same_files(pth: Path, fixtures):
    filename = "wheel-0.34.2.tar.gz"
    file_pth = fixtures.path(f"mirror/{filename}")
    assert os.path.exists(file_pth)

    struct = []

    # Same file in the same directory (no FS changes)
    dest1 = pth / "same_files/same_directory" / filename
    dest1.parent.mkdir(parents=True)
    os.symlink(src=file_pth, dst=dest1)
    struct.append((ScanLocation(dest1.parent), ScanLocation(dest1.parent), dest1))

    # File has been moved to a subdirectory
    dest2 = pth / "same_files/sub_directory/sub_directory" / filename
    dest2.parent.mkdir(parents=True)
    os.symlink(src=file_pth, dst=dest2)
    struct.append((
        ScanLocation(dest1.parent),
        ScanLocation(dest2.parent.parent),
        {
            "operation": "R",
            "a_ref": dest1.name,
            "b_ref": f"sub_directory/{filename}"
        }
    ))

    return struct
def create_del_file(pth: Path, fixtures):
    filename = "wheel-0.34.2.tar.gz"
    file_pth = fixtures.path(f"mirror/{filename}")
    assert os.path.exists(file_pth)

    struct = []

    dest1 = pth / "del_file/file_location/subdirectory" / filename
    dest1.parent.mkdir(parents=True)
    dest2 = pth / "del_file/empty_directory"
    dest2.mkdir(parents=True)
    os.symlink(src=file_pth, dst=dest1)

    struct.append((ScanLocation(dest1.parent), ScanLocation(dest2), {
        "operation": "D",
        "a_ref": filename,
        "b_ref": None,
        "a_size": 58330,
        "b_size": 0
    }))
    struct.append((ScanLocation(dest1.parent.parent), ScanLocation(dest2), {
        "operation": "D",
        "a_ref": f"subdirectory/{filename}",
        "b_ref": None,
        "a_size": 58330,
        "b_size": 0
    }))

    return struct
def create_similar_file(pth: Path, fixtures):
    struct = []
    filename = "misc.py"
    file_pth = Path(fixtures.path(filename))
    assert os.path.exists(file_pth)

    with file_pth.open("r") as fd:
        file_content = fd.readlines()
    del file_pth

    dest1 = pth / "similar_file/file" / filename
    dest1.parent.mkdir(parents=True)
    with dest1.open("w") as fd:
        fd.writelines(file_content)

    dest2 = pth / "similar_file/directory1/subdirectory" / filename
    dest2.parent.mkdir(parents=True)
    with dest2.open("w") as fd:
        fd.writelines(file_content[:5])

    struct.append((ScanLocation(dest1.parent), ScanLocation(dest2.parent), {
        "operation": "M",
        "a_ref": filename,
        "b_ref": filename,
        "a_mime": "text/x-python",
        "b_mime": "text/x-python"
    }))

    return struct
def create_add_file(pth: Path, fixtures):
    filename = "wheel-0.34.2.tar.gz"
    file_pth = fixtures.path(f"mirror/{filename}")
    assert os.path.exists(file_pth)

    struct = []

    # Add a new file to the same directory
    dest1 = pth / "add_file/empty_directory"
    dest1.mkdir(parents=True)
    dest2 = pth / "add_file/directory1" / filename
    dest2.parent.mkdir(parents=True)
    os.symlink(src=file_pth, dst=dest2)
    struct.append((ScanLocation(dest1), ScanLocation(dest2.parent), {
        "operation": "A",
        "a_ref": None,
        "b_ref": filename,
        "a_size": 0,
        "b_size": 58330
    }))

    # Add a new file into a subdirectory
    dest2 = pth / "add_file/directory2/subdirectory" / filename
    dest2.parent.mkdir(parents=True)
    os.symlink(src=file_pth, dst=dest2)
    struct.append((ScanLocation(dest1), ScanLocation(dest2.parent.parent), {
        "operation": "A",
        "a_ref": None,
        "b_ref": f"subdirectory/{filename}",
        "a_size": 0,
        "b_size": 58330
    }))

    return struct
def test_file_matcher_closure(fixtures, tmp_path):
    arch1 = fixtures.path("mirror/wheel-0.33.0-py2.py3-none-any.whl")
    arch2 = fixtures.path("mirror/wheel-0.34.2-py2.py3-none-any.whl")

    loc1 = ScanLocation(arch1, strip_path=os.fspath(fixtures.BASE_PATH))
    loc2 = ScanLocation(arch2, strip_path=os.fspath(fixtures.BASE_PATH))

    apath1 = tmp_path / "arch1"
    apath2 = tmp_path / "arch2"

    list(extract(loc1, destination=str(apath1)))
    list(extract(loc2, destination=str(apath2)))

    left_content = list(get_directory_content(apath1))
    right_content = list(get_directory_content(apath2))

    fm = diff.FileMatcher(left_content, right_content)
    closure = fm.get_closure()

    added = [str(x) for x in closure["added"]]
    modified = {(str(x[0]), str(x[1])): x[2] for x in closure["modified"]}
    removed = [str(x) for x in closure["removed"]]

    for a in WHEEL_CLOSURE["added"]:
        assert a in added

    for r in WHEEL_CLOSURE["removed"]:
        assert r in removed

    for left_name, right_name, ratio in WHEEL_CLOSURE["modified"]:
        assert (left_name, right_name) in modified
        assert modified[(left_name, right_name)] == pytest.approx(ratio)

    # Check that there are no duplicate elements on the left or right side of the modified file matches
    assert len(set(x[0] for x in modified)) == len(modified)
    assert len(set(x[1] for x in modified)) == len(modified)
def test_closure_archive_files(fixtures):
    arch1 = fixtures.path("mirror/wheel-0.34.2-py2.py3-none-any.whl")
    arch2 = fixtures.path("mirror/wheel-0.34.2.tar.gz")

    loc1 = ScanLocation(arch1, strip_path=str(fixtures.BASE_PATH))
    loc2 = ScanLocation(arch2, strip_path=str(fixtures.BASE_PATH))

    closure = diff.FileMatcher(left_files=[loc1], right_files=[loc2]).get_closure()

    assert len(closure["modified"]) == 1
    x = closure["modified"][0]
    assert x[0] == loc1
    assert x[1] == loc2
    assert x[2] > 0.0  # Similarity ratio

    assert len(closure["added"]) == 0
    assert len(closure["removed"]) == 0
def process_taint(src: str, pattern: str, taint: str = "tainted"):
    tree = collect(dedent(src), minimal=True)
    loc = ScanLocation(location="<unknown>")
    p = ASTPattern({
        "pattern": pattern,
        "taint": taint
    })

    with patch.object(config, "get_ast_patterns", return_value=[p]) as mock:
        v = Visitor.run_stages(location=loc, ast_tree=tree)
        return v.tree[-1]
def process_source_code(src: str, single=True) -> NodeType:
    tree = collect(dedent(src), minimal=True)
    loc = ScanLocation(location="<unknown>")
    v = Visitor.run_stages(location=loc, stages=("convert", "rewrite"), ast_tree=tree)

    if single:
        return v.tree[-1]
    else:
        return v.tree
def test_diff_same_renamed(fixtures, tmp_path):
    orig_pth = fixtures.path("diffs/1_a/src.py")
    a_pth = tmp_path / "a.py"
    os.symlink(src=orig_pth, dst=a_pth)
    b_pth = tmp_path / "b.py"
    os.symlink(src=orig_pth, dst=b_pth)

    a_loc = ScanLocation(a_pth, strip_path=str(tmp_path))
    b_loc = ScanLocation(b_pth, strip_path=str(tmp_path))

    d = diff.DiffAnalyzer()
    d.compare(a_loc, b_loc)

    assert len(d.diffs) == 1
    x = d.diffs[0]
    assert x.operation == "M"
    assert x.a_scan is a_loc
    assert x.b_scan is b_loc
    assert x.similarity == 1.0
    assert x.diff is None
def process_taint(src: str, pattern: str, cache_mock, taint: str = "tainted"):
    tree = collect(dedent(src), minimal=True)
    loc = ScanLocation(location="<unknown>")
    p = ASTPattern({
        "pattern": pattern,
        "taint": taint
    })
    cache_mock.return_value = [p]

    v = Visitor.run_stages(location=loc, ast_tree=tree)
    return v.tree[-1]
def test_diff_file_similar(fixtures):
    a_loc = ScanLocation(fixtures.path("diffs/1_a/src.py"))
    b_loc = ScanLocation(fixtures.path("diffs/1_b/src.py"))

    d = diff.DiffAnalyzer()
    d.compare(a_loc, b_loc)

    assert len(d.diffs) == 1
    x = d.diffs[0]
    assert x.operation == "M"
    assert x.a_scan is a_loc
    assert x.b_scan is b_loc
    assert 0.8 < x.similarity < 1.0
    assert x.diff is not None
    assert "+import b_import" in x.diff
    assert "-import a_import" in x.diff
    assert " import unchanged" in x.diff
    assert '-eval("a")' in x.diff
    assert '+eval("b")' in x.diff
    assert ' eval("same")' in x.diff
def test_diff_file_removed(fixtures):
    a_loc = ScanLocation(fixtures.path("diffs/1_a/src.py"))

    d = diff.DiffAnalyzer()
    d.compare([a_loc], [])

    assert len(d.diffs) == 1
    x = d.diffs[0]
    assert x.operation == "D"
    assert x.a_scan is a_loc
    assert x.b_scan is None
    assert x.similarity == 0.0
    assert x.diff is None
def test_archive_diff_hook(mock1, fixtures):
    from aura.analyzers.archive import diff_archive

    extract_loc1 = ScanLocation("blabla_location1")
    extract_loc2 = ScanLocation("blabla_location2")
    mock1.side_effect = [[extract_loc1], [extract_loc2]]
    assert extract_loc1.metadata.get("b_scan_location") != extract_loc2

    arch1 = fixtures.path("mirror/wheel-0.33.0-py2.py3-none-any.whl")
    arch2 = fixtures.path("mirror/wheel-0.34.2-py2.py3-none-any.whl")
    loc1 = ScanLocation(arch1)
    loc2 = ScanLocation(arch2)

    d = diff.Diff(operation="M", a_scan=loc1, b_scan=loc2)
    result = list(diff_archive(d))

    mock1.assert_any_call(location=d.a_scan)
    mock1.assert_any_call(location=d.b_scan)

    assert len(result) == 1
    assert result[0] == extract_loc1
    assert result[0].metadata["b_scan_location"] == extract_loc2
def get_full_ast(self, src):
    """
    Get the full AST tree after all stages have been applied, e.g. rewrite & taint analysis
    """
    from aura.analyzers.python.visitor import Visitor
    from aura.uri_handlers.base import ScanLocation

    with tempfile.NamedTemporaryFile() as fd:
        fd.write(bytes(src, "utf-8"))
        fd.flush()  # Make sure the source is on disk before the visitor reads the file
        loc = ScanLocation(location=Path(fd.name), metadata={"source": "cli"})
        visitor = Visitor.run_stages(location=loc)
        return visitor.tree["ast_tree"]
def test_diff_archives(fixtures, fuzzy_rule_match):
    arch1 = fixtures.path("mirror/wheel-0.34.2-py2.py3-none-any.whl")
    arch2 = fixtures.path("mirror/wheel-0.34.2.tar.gz")

    matches = [{
        "a_md5": "8a2e3b6aca9665a0c6abecc4f4ea7090",
        "a_mime": "application/zip",
        "a_ref": "mirror/wheel-0.34.2-py2.py3-none-any.whl",
        "b_md5": "ce2a27f99c130a927237b5da1ff5ceaf",
        "b_mime": "application/gzip",
        "b_ref": "mirror/wheel-0.34.2.tar.gz",
        "diff": None,
        "operation": "M"
    }]

    d = diff.DiffAnalyzer()
    d.compare(
        ScanLocation(arch1, strip_path=os.fspath(fixtures.BASE_PATH)),
        ScanLocation(arch2, strip_path=os.fspath(fixtures.BASE_PATH)),
    )

    diffs = [x.as_dict() for x in d.diffs]

    for match in matches:
        assert any(fuzzy_rule_match(x, match) for x in diffs), (match, diffs)
def disabled_test_is_rename_ratios(random_text):
    original = random_text(1000)
    orig_location = ScanLocation(location="pytest_orig_location.txt", size=len(original))
    orig_location._lzset = lzset(original)

    for x in range(0, 100, 20):
        similarity = x / 100.0
        modified = derive_similar(original, similarity)
        modified_location = ScanLocation(location="pytest_modified_location.txt", size=len(modified))
        modified_location._lzset = lzset(modified)

        ratio = orig_location.is_renamed_file(modified_location)
        # FIXME: The +/- 40% diff in result is way too high
        threshold = 0.4
        assert ratio >= similarity - threshold, similarity
        assert ratio <= similarity + threshold, similarity
}, "line": "import a_import", "location": "src.py" }, { "type": "FunctionCall", "extra": { "function": "eval" }, "line": 'eval("a")', "location": "src.py" }] }] DIFFS = (diff.Diff(operation="A", a_scan=None, b_scan=ScanLocation("added_file.py", )), diff.Diff(operation="D", a_scan=ScanLocation("removed_file.py"), b_scan=None), diff.Diff(operation="M", a_scan=ScanLocation("modified_file.py"), b_scan=ScanLocation("modified_file.py"), similarity=0.8, diff="This is a diff of the modified file")) @pytest.mark.e2e def test_text_scan_output_e2e(fixtures): """ Test different output formats """
def test_suspicious_file_trigger(metadata, expected):
    loc = ScanLocation("does_not_exists", metadata=metadata)
    assert fs_struct.enable_suspicious_files(location=loc) is expected
def test_file_patterns(pattern: str, path: str, should_match: bool):
    p = FilePatternMatcher(pattern)
    loc = ScanLocation(location=Path(path))
    assert p.match(loc) is should_match
    # Scan locations should be normalized so that the str repr outputs only the last part (the filename) of the full path
    assert str(paths[0][0]) == arch1
    assert str(paths[0][1]) == arch2


def empty_generator(location):
    yield from []


@pytest.mark.parametrize(
    "op,left,right",
    (
        ("A", None, ScanLocation("nonexistent/location1.txt", metadata={
            "md5": "loc1",
            "mime": "text/x-python"
        })),
        ("R", ScanLocation("nonexistent/location2.txt", metadata={
            "md5": "loc2",
            "mime": "text/x-python"
        }), None),
        # Different MD5s, both should be scanned
        ("M", ScanLocation("nonexistent/location3.txt", metadata={
            "md5": "loc3",
            "mime": "text/x-python"
        }),
def get_directory_content(pth):  # TODO: use ScanLocation.list_recursive instead
    for f in walk(pth):
        yield ScanLocation(f, strip_path=str(pth))
def test_same_scan_location_is_rename():
    sc = ScanLocation(location=f"{uuid.uuid4()}.txt", size=666)
    sc._lzset = {"a"}
    assert sc.is_renamed_file(sc) == 1.0
def test_is_rename_different_depths(l_name, r_name, expected):
    sc1 = ScanLocation(location=l_name, size=8)
    sc1._lzset = {"a"}
    sc2 = ScanLocation(location=r_name, size=8)
    sc2._lzset = {"a"}
    assert sc1.is_renamed_file(sc2) == expected