def fake_cook(backend, bundle_type, result_content, sticky=False): swhid = Content.from_data(result_content).swhid() content, obj_id = hash_content(result_content) with mock_cooking(backend): backend.create_task(bundle_type, swhid, sticky) backend.cache.add(bundle_type, swhid, b"content") backend.set_status(bundle_type, swhid, "done") return swhid, content
def empty_content(): """ Hypothesis strategy returning the empty content ingested into the test archive. """ empty_content = Content.from_data(data=b"").to_dict() for algo in DEFAULT_ALGORITHMS: empty_content[algo] = hash_to_hex(empty_content[algo]) return just(empty_content)
def test_client_batch_size( kafka_prefix: str, kafka_consumer_group: str, kafka_server: str, batch_size: int, ): num_objects = 2 * batch_size + 1 assert num_objects < 256, "Too many objects, generation will fail" producer = Producer( { "bootstrap.servers": kafka_server, "client.id": "test producer", "acks": "all", } ) contents = [Content.from_data(bytes([i])) for i in range(num_objects)] # Fill Kafka for content in contents: producer.produce( topic=kafka_prefix + ".content", key=key_to_kafka(content.sha1), value=value_to_kafka(content.to_dict()), ) producer.flush() client = JournalClient( brokers=[kafka_server], group_id=kafka_consumer_group, prefix=kafka_prefix, stop_on_eof=True, batch_size=batch_size, ) collected_output: List[Dict] = [] def worker_fn(objects): received = objects["content"] assert len(received) <= batch_size collected_output.extend(received) client.process(worker_fn) expected_output = [content.to_dict() for content in contents] assert len(collected_output) == len(expected_output) for output in collected_output: assert output in expected_output
def test_content_for_storage_data(tmpdir): # given data = b"temp file for testing content storage conversion" obj = from_disk.Content.from_bytes(data=data, mode=0o100644).get_data() del obj["perms"] expected_content = obj.copy() expected_content["status"] = "visible" expected_content = Content.from_dict(expected_content) # when content = converters.content_for_storage(obj) # then assert content == expected_content
def _add_extra_contents(storage, contents): pbm_image_data = b"""P1 # PBM example 24 7 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 1 1 1 0 0 0 1 1 1 0 0 0 1 1 1 0 0 0 1 1 1 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 0 1 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0""" # add file with mimetype image/x-portable-bitmap in the archive content pbm_content = Content.from_data(pbm_image_data) storage.content_add([pbm_content]) contents.add(pbm_content.sha1)
def test_get_filtered_files_content(swh_storage): content = Content.from_data(b"foo bar") skipped_content = SkippedContent( sha1=None, sha1_git=b"c" * 20, sha256=None, blake2s256=None, length=42, status="absent", reason="for some reason", ) swh_storage.content_add([content]) swh_storage.skipped_content_add([skipped_content]) files_data = [ { "status": "visible", "sha1": content.sha1, "sha1_git": content.sha1_git, "target": content.sha1_git, }, { "status": "absent", "target": skipped_content.sha1_git, }, ] res = list(get_filtered_files_content(swh_storage, files_data)) assert res == [ { "content": content.data, "status": "visible", "sha1": content.sha1, "sha1_git": content.sha1_git, "target": content.sha1_git, }, { "content": (b"This content has not been retrieved in the " b"Software Heritage archive due to its size."), "status": "absent", "target": skipped_content.sha1_git, }, ]
def test_blob_to_content(self): content_id = b"28c6f4023d65f74e3b59a2dea3c4277ed9ee07b0" content = converters.dulwich_blob_to_content(self.repo[content_id]) expected_content = Content( sha1_git=bytehex_to_hash(content_id), sha1=hash_to_bytes("4850a3420a2262ff061cb296fb915430fa92301c"), sha256=hash_to_bytes("fee7c8a485a10321ad94b64135073cb5" "5f22cb9f57fa2417d2adfb09d310adef"), blake2s256=hash_to_bytes("5d71873f42a137f6d89286e43677721e574" "1fa05ce4cd5e3c7ea7c44d4c2d10b"), data=(b'[submodule "example-dependency"]\n' b"\tpath = example-dependency\n" b"\turl = https://github.com/githubtraining/" b"example-dependency.git\n"), length=124, status="visible", ) assert content == expected_content
def test_content_for_storage_path(tmpdir): # given data = b"temp file for testing content storage conversion" tmpfile = tmpfile_with_content(tmpdir, data) obj = from_disk.Content.from_file(path=os.fsdecode(tmpfile)).get_data() expected_content = obj.copy() expected_content["data"] = data expected_content["status"] = "visible" del expected_content["path"] del expected_content["perms"] expected_content = Content.from_dict(expected_content) # when content = converters.content_for_storage(obj) # then assert content == expected_content
def test_get_filtered_files_content__unknown_status(swh_storage): content = Content.from_data(b"foo bar") swh_storage.content_add([content]) files_data = [ { "status": "visible", "sha1": content.sha1, "sha1_git": content.sha1_git, "target": content.sha1_git, }, { "status": None, "target": b"c" * 20, }, ] with pytest.raises(AssertionError, match="unexpected status None"): list(get_filtered_files_content(swh_storage, files_data))
def dulwich_blob_to_content(obj: ShaFile, max_content_size=None) -> BaseContent: """Convert a dulwich blob to a Software Heritage content""" if obj.type_name != b"blob": raise ValueError("Argument is not a blob.") blob = cast(Blob, obj) hashes = dulwich_blob_to_content_id(blob) if max_content_size is not None and hashes["length"] >= max_content_size: return SkippedContent( status="absent", reason="Content too large", **hashes, ) else: return Content( data=blob.as_raw_string(), status="visible", **hashes, )
def content_for_storage( content: Dict, max_content_size: Optional[int] = None, origin_url: Optional[str] = None, ) -> BaseContent: """Prepare content to be ready for storage Note: - 'data' is returned only if max_content_size is not reached. Returns: content with added data (or reason for being missing) """ ret = content.copy() ret.pop("perms", None) if max_content_size and ret["length"] > max_content_size: logger.info("Skipping content %s, too large (%s > %s)" % (hash_to_hex( content["sha1_git"]), ret["length"], max_content_size)) ret.pop("data", None) ret.update({ "status": "absent", "reason": "Content too large", "origin": origin_url }) return SkippedContent.from_dict(ret) if "data" not in ret: with open(ret["path"], "rb") as f: ret["data"] = f.read() # Extra keys added by swh.model.from_disk, that are not accepted # by swh-storage ret.pop("path", None) ret["status"] = "visible" return Content.from_dict(ret)
def _init_content_tests_data(data_path, data_dict, ext_key): """ Helper function to read the content of a directory, store it into a test archive and add some files metadata (sha1 and/or expected programming language) in a dict. Args: data_path (str): path to a directory relative to the tests folder of swh-web data_dict (dict): the dict that will store files metadata ext_key (bool): whether to use file extensions or filenames as dict keys """ test_contents_dir = os.path.join(os.path.dirname(__file__), data_path).encode("utf-8") directory = from_disk.Directory.from_disk(path=test_contents_dir) contents = [] for name, obj_ in directory.items(): obj = obj_.to_model() if obj.object_type in [ Content.object_type, DiskBackedContent.object_type ]: c = obj.with_data().to_dict() c["status"] = "visible" sha1 = hash_to_hex(c["sha1"]) if ext_key: key = name.decode("utf-8").split(".")[-1] filename = "test." + key else: filename = name.decode("utf-8").split("/")[-1] key = filename language = get_hljs_language_from_filename(filename) data_dict[key] = {"sha1": sha1, "language": language} contents.append(Content.from_dict(c)) storage = get_tests_data()["storage"] storage.content_add(contents)
def fill_storage(storage): storage.origin_add(ORIGINS) storage.directory_add([DIRECTORY, DIRECTORY2]) storage.revision_add(REVISIONS) storage.snapshot_add(SNAPSHOTS) for visit, snapshot in zip(ORIGIN_VISITS, SNAPSHOTS): assert snapshot.id is not None visit = storage.origin_visit_add( [OriginVisit(origin=visit["origin"], date=now(), type=visit["type"])] )[0] visit_status = OriginVisitStatus( origin=visit.origin, visit=visit.visit, date=now(), status="full", snapshot=snapshot.id, ) storage.origin_visit_status_add([visit_status]) contents = [] for (obj_id, content) in OBJ_STORAGE_DATA.items(): content_hashes = hashutil.MultiHash.from_data(content).digest() contents.append( Content( data=content, length=len(content), status="visible", sha1=hash_to_bytes(obj_id), sha1_git=hash_to_bytes(obj_id), sha256=content_hashes["sha256"], blake2s256=content_hashes["blake2s256"], ) ) storage.content_add(contents)
def test_content_identifier(self): self.assertEqual( Content.from_data(content_example["data"]).hashes(), self.content_id)
def test_ignore_displayname(swh_storage, use_graph): """Tests the original authorship information is used instead of configured display names; otherwise objects would not match their hash, and git-fsck/git-clone would fail. This tests both with and without swh-graph, as both configurations use different code paths to fetch revisions. """ date = TimestampWithTimezone.from_numeric_offset(Timestamp(1643882820, 0), 0, False) legacy_person = Person.from_fullname(b"old me <*****@*****.**>") current_person = Person.from_fullname(b"me <*****@*****.**>") content = Content.from_data(b"foo") swh_storage.content_add([content]) directory = Directory( entries=(DirectoryEntry(name=b"file1", type="file", perms=0o100644, target=content.sha1_git), ), ) swh_storage.directory_add([directory]) revision = Revision( message=b"rev", author=legacy_person, date=date, committer=legacy_person, committer_date=date, parents=(), type=RevisionType.GIT, directory=directory.id, synthetic=True, ) swh_storage.revision_add([revision]) release = Release( name=b"v1.1.0", message=None, author=legacy_person, date=date, target=revision.id, target_type=ObjectType.REVISION, synthetic=True, ) swh_storage.release_add([release]) snapshot = Snapshot( branches={ b"refs/tags/v1.1.0": SnapshotBranch(target=release.id, target_type=TargetType.RELEASE), b"HEAD": SnapshotBranch(target=revision.id, target_type=TargetType.REVISION), }) swh_storage.snapshot_add([snapshot]) # Add all objects to graph if use_graph: from swh.graph.naive_client import NaiveClient as GraphClient nodes = [ str(x.swhid()) for x in [content, directory, revision, release, snapshot] ] edges = [(str(x.swhid()), str(y.swhid())) for (x, y) in [ (directory, content), (revision, directory), (release, revision), (snapshot, release), (snapshot, revision), ]] swh_graph = unittest.mock.Mock( wraps=GraphClient(nodes=nodes, edges=edges)) else: swh_graph = None # Set a display name with swh_storage.db() as db: with db.transaction() as cur: cur.execute( "UPDATE person set displayname = %s where fullname = %s", (current_person.fullname, legacy_person.fullname), ) # Check the display name did apply in the storage assert swh_storage.revision_get([revision.id])[0] == attr.evolve( revision, author=current_person, committer=current_person, ) # Cook cooked_swhid = snapshot.swhid() backend = InMemoryVaultBackend() cooker = GitBareCooker( cooked_swhid, backend=backend, storage=swh_storage, graph=swh_graph, ) cooker.cook() # Get bundle bundle = backend.fetch("git_bare", cooked_swhid) # Extract bundle and make sure both revisions are in it with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir: with tarfile.open(fileobj=io.BytesIO(bundle)) as tf: tf.extractall(tempdir) # If we are here, it means git-fsck succeeded when called by cooker.cook(), # so we already know the original person was used. Let's double-check. repo = dulwich.repo.Repo(f"{tempdir}/{cooked_swhid}.git") tag = repo[b"refs/tags/v1.1.0"] assert tag.tagger == legacy_person.fullname commit = repo[tag.object[1]] assert commit.author == legacy_person.fullname
ORIGIN_VISIT_STATUS = OriginVisitStatus( origin="some-url", visit=1, type="archive", date=datetime.datetime.now(tz=datetime.timezone.utc), status="full", snapshot=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), metadata=None, ) CONTENT = Content( data=b"42\n", length=3, sha1=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"), sha1_git=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), sha256=hash_to_bytes( "673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a"), blake2s256=hash_to_bytes( "d5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d"), status="visible", ) DIRECTORY = Directory( id=hash_to_bytes("34f335a750111ca0a8b64d8034faec9eedc396be"), entries=tuple([ DirectoryEntry( name=b"foo", type="file", target=CONTENT.sha1_git, perms=DentryPerms.content, )
def test_checksum_mismatch(swh_storage, mismatch_on): date = TimestampWithTimezone.from_datetime( datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc)) author = Person.from_fullname(b"Foo <*****@*****.**>") wrong_hash = b"\x12\x34" * 10 cnt1 = Content.from_data(b"Tr0ub4dor&3") if mismatch_on == "content": cnt1 = attr.evolve(cnt1, sha1_git=wrong_hash) dir1 = Directory(entries=(DirectoryEntry( name=b"file1", type="file", perms=DentryPerms.content, target=cnt1.sha1_git, ), )) if mismatch_on == "directory": dir1 = attr.evolve(dir1, id=wrong_hash) rev1 = Revision( message=b"msg1", date=date, committer_date=date, author=author, committer=author, directory=dir1.id, type=RevisionType.GIT, synthetic=True, ) if mismatch_on == "revision1": rev1 = attr.evolve(rev1, id=wrong_hash) rev2 = Revision( message=b"msg2", date=date, committer_date=date, author=author, committer=author, directory=dir1.id, parents=(rev1.id, ), type=RevisionType.GIT, synthetic=True, ) if mismatch_on == "revision2": rev2 = attr.evolve(rev2, id=wrong_hash) cooked_swhid = rev2.swhid() swh_storage.content_add([cnt1]) swh_storage.directory_add([dir1]) swh_storage.revision_add([rev1, rev2]) backend = InMemoryVaultBackend() cooker = GitBareCooker( cooked_swhid, backend=backend, storage=swh_storage, graph=None, ) cooker.cook() # Get bundle bundle = backend.fetch("git_bare", cooked_swhid) # Extract bundle and make sure both revisions are in it with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir: with tarfile.open(fileobj=io.BytesIO(bundle)) as tf: tf.extractall(tempdir) if mismatch_on != "revision2": # git-log fails if the head revision is corrupted # TODO: we need to find a way to make this somewhat usable output = subprocess.check_output([ "git", "-C", f"{tempdir}/{cooked_swhid}.git", "log", "--format=oneline", "--decorate=", ]) assert output.decode( ) == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n"
def test_graph_revisions(swh_storage, up_to_date_graph, root_object, tag, weird_branches): r""" Build objects:: snp /|||\ / ||| \ rel2 <----° /|\ \----> rel4 | / | \ | v / v \ v rev1 <------ rev2 <----° dir4 \ rel3 | | | \ | v v v \ | dir1 dir2 dir3 | | | / | | | | v / v v v v cnt1 <----° cnt2 cnt3 cnt4 cnt5 If up_to_date_graph is true, then swh-graph contains all objects. Else, cnt4, cnt5, dir4, rev2, rel2, rel3, and snp are missing from the graph. If tag is False, rel2 is excluded. If weird_branches is False, dir4, cnt4, rel3, rel4, and cnt5 are excluded. """ from swh.graph.naive_client import NaiveClient as GraphClient # Create objects: date = TimestampWithTimezone.from_datetime( datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc)) author = Person.from_fullname(b"Foo <*****@*****.**>") cnt1 = Content.from_data(b"correct") cnt2 = Content.from_data(b"horse") cnt3 = Content.from_data(b"battery") cnt4 = Content.from_data(b"staple") cnt5 = Content.from_data(b"Tr0ub4dor&3") dir1 = Directory(entries=(DirectoryEntry( name=b"file1", type="file", perms=DentryPerms.content, target=cnt1.sha1_git, ), )) dir2 = Directory(entries=( DirectoryEntry( name=b"file1", type="file", perms=DentryPerms.content, target=cnt1.sha1_git, ), DirectoryEntry( name=b"file2", type="file", perms=DentryPerms.content, target=cnt2.sha1_git, ), )) dir3 = Directory(entries=(DirectoryEntry( name=b"file3", type="file", perms=DentryPerms.content, target=cnt3.sha1_git, ), )) dir4 = Directory(entries=(DirectoryEntry( name=b"directory3", type="dir", perms=DentryPerms.directory, target=dir3.id, ), )) rev1 = Revision( message=b"msg1", date=date, committer_date=date, author=author, committer=author, directory=dir1.id, type=RevisionType.GIT, synthetic=True, ) rev2 = Revision( message=b"msg2", date=date, committer_date=date, author=author, committer=author, directory=dir2.id, parents=(rev1.id, ), type=RevisionType.GIT, synthetic=True, ) rel2 = Release( name=b"1.0.0", message=b"tag2", target_type=ObjectType.REVISION, target=rev2.id, synthetic=True, ) rel3 = Release( name=b"1.0.0-blob", message=b"tagged-blob", target_type=ObjectType.CONTENT, target=cnt5.sha1_git, synthetic=True, ) rel4 = Release( name=b"1.0.0-weird", message=b"weird release", target_type=ObjectType.RELEASE, target=rel3.id, synthetic=True, ) rel5 = Release( name=b"1.0.0:weirdname", message=b"weird release", target_type=ObjectType.RELEASE, target=rel2.id, synthetic=True, ) # Create snapshot: branches = { b"refs/heads/master": SnapshotBranch(target=rev2.id, target_type=TargetType.REVISION), } if tag: branches[b"refs/tags/1.0.0"] = SnapshotBranch( target=rel2.id, target_type=TargetType.RELEASE) if weird_branches: branches[b"refs/heads/tree-ref"] = SnapshotBranch( target=dir4.id, target_type=TargetType.DIRECTORY) branches[b"refs/heads/blob-ref"] = SnapshotBranch( target=cnt4.sha1_git, target_type=TargetType.CONTENT) branches[b"refs/tags/1.0.0-weird"] = SnapshotBranch( target=rel4.id, target_type=TargetType.RELEASE) snp = Snapshot(branches=branches) # "Fill" swh-graph if up_to_date_graph: nodes = [cnt1, cnt2, dir1, dir2, rev1, rev2, snp] edges = [ (dir1, cnt1), (dir2, cnt1), (dir2, cnt2), (rev1, dir1), (rev2, dir2), (rev2, rev1), (snp, rev2), ] if tag: nodes.append(rel2) edges.append((rel2, rev2)) edges.append((snp, rel2)) if weird_branches: nodes.extend([cnt3, cnt4, cnt5, dir3, dir4, rel3, rel4, rel5]) edges.extend([ (dir3, cnt3), (dir4, dir3), (snp, dir4), (snp, cnt4), (snp, rel4), (rel4, rel3), (rel3, cnt5), (rel5, rev2), ]) else: nodes = [cnt1, cnt2, cnt3, dir1, dir2, dir3, rev1] edges = [ (dir1, cnt1), (dir2, cnt1), (dir2, cnt2), (dir3, cnt3), (rev1, dir1), ] if tag: nodes.append(rel2) if weird_branches: nodes.extend([cnt3, dir3]) edges.extend([(dir3, cnt3)]) nodes = [str(n.swhid()) for n in nodes] edges = [(str(s.swhid()), str(d.swhid())) for (s, d) in edges] # Add all objects to storage swh_storage.content_add([cnt1, cnt2, cnt3, cnt4, cnt5]) swh_storage.directory_add([dir1, dir2, dir3, dir4]) swh_storage.revision_add([rev1, rev2]) swh_storage.release_add([rel2, rel3, rel4, rel5]) swh_storage.snapshot_add([snp]) # Add spy on swh_storage, to make sure revision_log is not called # (the graph must be used instead) swh_storage = unittest.mock.MagicMock(wraps=swh_storage) # Add all objects to graph swh_graph = unittest.mock.Mock(wraps=GraphClient(nodes=nodes, edges=edges)) # Cook backend = InMemoryVaultBackend() cooked_swhid = { RootObjects.SNAPSHOT: snp.swhid(), RootObjects.REVISION: rev2.swhid(), RootObjects.RELEASE: rel2.swhid(), RootObjects.WEIRD_RELEASE: rel5.swhid(), }[root_object] cooker = GitBareCooker( cooked_swhid, backend=backend, storage=swh_storage, graph=swh_graph, ) if weird_branches: # git-fsck now rejects refs pointing to trees and blobs, # but some old git repos have them. cooker.use_fsck = False cooker.cook() # Get bundle bundle = backend.fetch("git_bare", cooked_swhid) # Extract bundle and make sure both revisions are in it with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir: with tarfile.open(fileobj=io.BytesIO(bundle)) as tf: tf.extractall(tempdir) if root_object in (RootObjects.SNAPSHOT, RootObjects.REVISION): log_head = "master" elif root_object == RootObjects.RELEASE: log_head = "1.0.0" elif root_object == RootObjects.WEIRD_RELEASE: log_head = "release" else: assert False, root_object output = subprocess.check_output([ "git", "-C", f"{tempdir}/{cooked_swhid}.git", "log", "--format=oneline", "--decorate=", log_head, ]) assert output.decode( ) == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n" # Make sure the graph was used instead of swh_storage.revision_log if root_object == RootObjects.SNAPSHOT: if up_to_date_graph: # The graph has everything, so the first call succeeds and returns # all objects transitively pointed by the snapshot swh_graph.visit_nodes.assert_has_calls([ unittest.mock.call(str(snp.swhid()), edges="snp:*,rel:*,rev:rev"), ]) else: # The graph does not have everything, so the first call returns nothing. # However, the second call (on the top rev) succeeds and returns # all objects but the rev and the rel swh_graph.visit_nodes.assert_has_calls([ unittest.mock.call(str(snp.swhid()), edges="snp:*,rel:*,rev:rev"), unittest.mock.call(str(rev2.swhid()), edges="rev:rev"), ]) elif root_object in ( RootObjects.REVISION, RootObjects.RELEASE, RootObjects.WEIRD_RELEASE, ): swh_graph.visit_nodes.assert_has_calls( [unittest.mock.call(str(rev2.swhid()), edges="rev:rev")]) else: assert False, root_object if up_to_date_graph: swh_storage.revision_log.assert_not_called() swh_storage.revision_shortlog.assert_not_called() else: swh_storage.revision_log.assert_called()
import functools from hypothesis import given, settings from hypothesis.strategies import sets from swh.journal.client import JournalClient from swh.journal.writer import get_journal_writer from swh.model.hypothesis_strategies import sha1 from swh.model.model import Content from swh.objstorage.factory import get_objstorage from swh.objstorage.replayer.replay import ( is_hash_in_bytearray, process_replay_objects_content, ) CONTENTS = [Content.from_data(f"foo{i}".encode()) for i in range(10)] + [ Content.from_data(f"forbidden foo{i}".encode(), status="hidden") for i in range(10) ] @settings(max_examples=500) @given( sets(sha1(), min_size=0, max_size=500), sets(sha1(), min_size=10), ) def test_is_hash_in_bytearray(haystack, needles): array = b"".join(sorted(haystack)) needles |= haystack # Exhaustively test for all objects in the array for needle in needles: assert is_hash_in_bytearray(needle, array,
RevisionType, SkippedContent, Snapshot, SnapshotBranch, TargetType, Timestamp, TimestampWithTimezone, ) from swh.model.swhids import ExtendedSWHID UTC = datetime.timezone.utc CONTENTS = [ Content( length=4, data=f"foo{i}".encode(), status="visible", **MultiHash.from_data(f"foo{i}".encode()).digest(), ) for i in range(10) ] + [ Content( length=14, data=f"forbidden foo{i}".encode(), status="hidden", **MultiHash.from_data(f"forbidden foo{i}".encode()).digest(), ) for i in range(10) ] SKIPPED_CONTENTS = [ SkippedContent( length=4, status="absent",
class StorageData: """Data model objects to use within tests.""" content = Content( data=b"42\n", length=3, sha1=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"), sha1_git=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"), sha256=hash_to_bytes( "084c799cd551dd1d8d5c5f9a5d593b2e931f5e36122ee5c793c1d08a19839cc0" ), blake2s256=hash_to_bytes( "d5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d" ), status="visible", ) content2 = Content( data=b"4242\n", length=5, sha1=hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7"), sha1_git=hash_to_bytes("36fade77193cb6d2bd826161a0979d64c28ab4fa"), sha256=hash_to_bytes( "859f0b154fdb2d630f45e1ecae4a862915435e663248bb8461d914696fc047cd" ), blake2s256=hash_to_bytes( "849c20fad132b7c2d62c15de310adfe87be94a379941bed295e8141c6219810d" ), status="visible", ) content3 = Content( data=b"424242\n", length=7, sha1=hash_to_bytes("3e21cc4942a4234c9e5edd8a9cacd1670fe59f13"), sha1_git=hash_to_bytes("c932c7649c6dfa4b82327d121215116909eb3bea"), sha256=hash_to_bytes( "92fb72daf8c6818288a35137b72155f507e5de8d892712ab96277aaed8cf8a36" ), blake2s256=hash_to_bytes( "76d0346f44e5a27f6bafdd9c2befd304aff83780f93121d801ab6a1d4769db11" ), status="visible", ctime=datetime.datetime(2019, 12, 1, tzinfo=datetime.timezone.utc), ) contents: Tuple[Content, ...] = (content, content2, content3) skipped_content = SkippedContent( length=1024 * 1024 * 200, sha1_git=hash_to_bytes("33e45d56f88993aae6a0198013efa80716fd8920"), sha1=hash_to_bytes("43e45d56f88993aae6a0198013efa80716fd8920"), sha256=hash_to_bytes( "7bbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a" ), blake2s256=hash_to_bytes( "ade18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b" ), reason="Content too long", status="absent", origin="file:///dev/zero", ) skipped_content2 = SkippedContent( length=1024 * 1024 * 300, sha1_git=hash_to_bytes("44e45d56f88993aae6a0198013efa80716fd8921"), sha1=hash_to_bytes("54e45d56f88993aae6a0198013efa80716fd8920"), sha256=hash_to_bytes( "8cbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a" ), blake2s256=hash_to_bytes( "9ce18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b" ), reason="Content too long", status="absent", ) skipped_contents: Tuple[SkippedContent, ...] = (skipped_content, skipped_content2) directory5 = Directory( id=hash_to_bytes("4b825dc642cb6eb9a060e54bf8d69288fbee4904"), entries=(), ) directory = Directory( id=hash_to_bytes("5256e856a0a0898966d6ba14feb4388b8b82d302"), entries=tuple([ DirectoryEntry( name=b"foo", type="file", target=content.sha1_git, perms=from_disk.DentryPerms.content, ), DirectoryEntry( name=b"bar\xc3", type="dir", target=directory5.id, perms=from_disk.DentryPerms.directory, ), ], ), ) directory2 = Directory( id=hash_to_bytes("8505808532953da7d2581741f01b29c04b1cb9ab"), entries=tuple([ DirectoryEntry( name=b"oof", type="file", target=content2.sha1_git, perms=from_disk.DentryPerms.content, ) ], ), ) directory3 = Directory( id=hash_to_bytes("13089e6e544f78df7c9a40a3059050d10dee686a"), entries=tuple([ DirectoryEntry( name=b"foo", type="file", target=content.sha1_git, perms=from_disk.DentryPerms.content, ), DirectoryEntry( name=b"subdir", type="dir", target=directory.id, perms=from_disk.DentryPerms.directory, ), DirectoryEntry( name=b"hello", type="file", target=content2.sha1_git, perms=from_disk.DentryPerms.content, ), ], ), ) directory4 = Directory( id=hash_to_bytes("cd5dfd9c09d9e99ed123bc7937a0d5fddc3cd531"), entries=tuple([ DirectoryEntry( name=b"subdir1", type="dir", target=directory3.id, perms=from_disk.DentryPerms.directory, ) ], ), ) directory6 = Directory( id=hash_to_bytes("afa0105cfcaa14fdbacee344e96659170bb1bda5"), entries=tuple([ DirectoryEntry( name=b"foo", type="file", target=b"\x00" * 20, perms=from_disk.DentryPerms.content, ), DirectoryEntry( name=b"bar", type="dir", target=b"\x01" * 20, perms=from_disk.DentryPerms.directory, ), ], ), raw_manifest=( b"tree 61\x00" b"100644 foo\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" # noqa b"40000 bar\x00\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01" # noqa ), ) directories: Tuple[Directory, ...] = ( directory2, directory, directory3, directory4, directory5, directory6, ) revision = Revision( id=hash_to_bytes("01a7114f36fddd5ef2511b2cadda237a68adbb12"), message=b"hello", author=Person( name=b"Nicolas Dandrimont", email=b"*****@*****.**", fullname=b"Nicolas Dandrimont <*****@*****.**> ", ), date=TimestampWithTimezone( timestamp=Timestamp(seconds=1234567890, microseconds=0), offset_bytes=b"+0200", ), committer=Person( name=b"St\xc3fano Zacchiroli", email=b"*****@*****.**", fullname=b"St\xc3fano Zacchiroli <*****@*****.**>", ), committer_date=TimestampWithTimezone( timestamp=Timestamp(seconds=1123456789, microseconds=0), offset_bytes=b"+0200", ), parents=(), type=RevisionType.GIT, directory=directory.id, metadata={ "checksums": { "sha1": "tarball-sha1", "sha256": "tarball-sha256", }, "signed-off-by": "some-dude", }, extra_headers=( (b"gpgsig", b"test123"), (b"mergetag", b"foo\\bar"), (b"mergetag", b"\x22\xaf\x89\x80\x01\x00"), ), synthetic=True, ) revision2 = Revision( id=hash_to_bytes("a646dd94c912829659b22a1e7e143d2fa5ebde1b"), message=b"hello again", author=Person( name=b"Roberto Dicosmo", email=b"*****@*****.**", fullname=b"Roberto Dicosmo <*****@*****.**>", ), date=TimestampWithTimezone( timestamp=Timestamp( seconds=1234567843, microseconds=220000, ), offset_bytes=b"-1200", ), committer=Person( name=b"tony", email=b"*****@*****.**", fullname=b"tony <*****@*****.**>", ), committer_date=TimestampWithTimezone( timestamp=Timestamp( seconds=1123456789, microseconds=220000, ), offset_bytes=b"+0000", ), parents=tuple([revision.id]), type=RevisionType.GIT, directory=directory2.id, metadata=None, extra_headers=(), synthetic=False, ) revision3 = Revision( id=hash_to_bytes("beb2844dff30658e27573cb46eb55980e974b391"), message=b"a simple revision with no parents this time", author=Person( name=b"Roberto Dicosmo", email=b"*****@*****.**", fullname=b"Roberto Dicosmo <*****@*****.**>", ), date=TimestampWithTimezone( timestamp=Timestamp( seconds=1234567843, microseconds=220000, ), offset_bytes=b"-1200", ), committer=Person( name=b"tony", email=b"*****@*****.**", fullname=b"tony <*****@*****.**>", ), committer_date=TimestampWithTimezone( timestamp=Timestamp( seconds=1127351742, microseconds=220000, ), offset_bytes=b"+0000", ), parents=tuple([revision.id, revision2.id]), type=RevisionType.GIT, directory=directory2.id, metadata=None, extra_headers=(), synthetic=True, ) revision4 = Revision( id=hash_to_bytes("ae860aec43700c7f5a295e2ef47e2ae41b535dfe"), message=b"parent of self.revision2", author=Person( name=b"me", email=b"*****@*****.**", fullname=b"me <*****@*****.**>", ), date=TimestampWithTimezone( timestamp=Timestamp( seconds=1234567843, microseconds=220000, ), offset_bytes=b"-1200", ), committer=Person( name=b"committer-dude", email=b"*****@*****.**", fullname=b"committer-dude <*****@*****.**>", ), committer_date=TimestampWithTimezone( timestamp=Timestamp( seconds=1244567843, microseconds=220000, ), offset_bytes=b"-1200", ), parents=tuple([revision3.id]), type=RevisionType.GIT, directory=directory.id, metadata=None, extra_headers=(), synthetic=False, ) git_revisions: Tuple[Revision, ...] = (revision, revision2, revision3, revision4) hg_revision = Revision( id=hash_to_bytes("951c9503541e7beaf002d7aebf2abd1629084c68"), message=b"hello", author=Person( name=b"Nicolas Dandrimont", email=b"*****@*****.**", fullname=b"Nicolas Dandrimont <*****@*****.**> ", ), date=TimestampWithTimezone( timestamp=Timestamp(seconds=1234567890, microseconds=0), offset_bytes=b"+0200", ), committer=Person( name=b"St\xc3fano Zacchiroli", email=b"*****@*****.**", fullname=b"St\xc3fano Zacchiroli <*****@*****.**>", ), committer_date=TimestampWithTimezone( timestamp=Timestamp(seconds=1123456789, microseconds=0), offset_bytes=b"+0200", ), parents=(), type=RevisionType.MERCURIAL, directory=directory.id, metadata={ "checksums": { "sha1": "tarball-sha1", "sha256": "tarball-sha256", }, "signed-off-by": "some-dude", "node": "a316dfb434af2b451c1f393496b7eaeda343f543", }, extra_headers=(), synthetic=True, ) hg_revision2 = Revision( id=hash_to_bytes("df4afb063236300eb13b96a0d7fff03f7b7cbbaf"), message=b"hello again", author=Person( name=b"Roberto Dicosmo", email=b"*****@*****.**", fullname=b"Roberto Dicosmo <*****@*****.**>", ), date=TimestampWithTimezone( timestamp=Timestamp( seconds=1234567843, microseconds=220000, ), offset_bytes=b"-1200", ), committer=Person( name=b"tony", email=b"*****@*****.**", fullname=b"tony <*****@*****.**>", ), committer_date=TimestampWithTimezone( timestamp=Timestamp( seconds=1123456789, microseconds=220000, ), offset_bytes=b"+0000", ), parents=tuple([hg_revision.id]), type=RevisionType.MERCURIAL, directory=directory2.id, metadata=None, extra_headers=( (b"node", hash_to_bytes("fa1b7c84a9b40605b67653700f268349a6d6aca1")), ), synthetic=False, ) hg_revision3 = Revision( id=hash_to_bytes("84d8e7081b47ebb88cad9fa1f25de5f330872a37"), message=b"a simple revision with no parents this time", author=Person( name=b"Roberto Dicosmo", email=b"*****@*****.**", fullname=b"Roberto Dicosmo <*****@*****.**>", ), date=TimestampWithTimezone( timestamp=Timestamp( seconds=1234567843, microseconds=220000, ), offset_bytes=b"-1200", ), committer=Person( name=b"tony", email=b"*****@*****.**", fullname=b"tony <*****@*****.**>", ), committer_date=TimestampWithTimezone( timestamp=Timestamp( seconds=1127351742, microseconds=220000, ), offset_bytes=b"+0000", ), parents=tuple([hg_revision.id, hg_revision2.id]), type=RevisionType.MERCURIAL, directory=directory2.id, metadata=None, extra_headers=( (b"node", hash_to_bytes("7f294a01c49065a90b3fe8b4ad49f08ce9656ef6")), ), synthetic=True, ) hg_revision4 = Revision( id=hash_to_bytes("4683324ba26dfe941a72cc7552e86eaaf7c27fe3"), message=b"parent of self.revision2", author=Person( name=b"me", email=b"*****@*****.**", fullname=b"me <*****@*****.**>", ), date=TimestampWithTimezone( timestamp=Timestamp( seconds=1234567843, microseconds=220000, ), offset_bytes=b"-1200", ), committer=Person( name=b"committer-dude", email=b"*****@*****.**", fullname=b"committer-dude <*****@*****.**>", ), committer_date=TimestampWithTimezone( timestamp=Timestamp( seconds=1244567843, microseconds=220000, ), offset_bytes=b"-1200", ), parents=tuple([hg_revision3.id]), type=RevisionType.MERCURIAL, directory=directory.id, metadata=None, extra_headers=( (b"node", hash_to_bytes("f4160af0485c85823d9e829bae2c00b00a2e6297")), ), synthetic=False, ) hg_revisions: Tuple[Revision, ...] = ( hg_revision, hg_revision2, hg_revision3, hg_revision4, ) revisions: Tuple[Revision, ...] = git_revisions + hg_revisions origins: Tuple[Origin, ...] = ( Origin(url="https://github.com/user1/repo1"), Origin(url="https://github.com/user2/repo1"), Origin(url="https://github.com/user3/repo1"), Origin(url="https://gitlab.com/user1/repo1"), Origin(url="https://gitlab.com/user2/repo1"), Origin(url="https://forge.softwareheritage.org/source/repo1"), Origin(url="https://example.рф/🏛️.txt"), ) origin, origin2 = origins[:2] metadata_authority = MetadataAuthority( type=MetadataAuthorityType.DEPOSIT_CLIENT, url="http://hal.inria.example.com/", ) metadata_authority2 = MetadataAuthority( type=MetadataAuthorityType.REGISTRY, url="http://wikidata.example.com/", ) authorities: Tuple[MetadataAuthority, ...] = ( metadata_authority, metadata_authority2, ) metadata_fetcher = MetadataFetcher( name="swh-deposit", version="0.0.1", ) metadata_fetcher2 = MetadataFetcher( name="swh-example", version="0.0.1", ) fetchers: Tuple[MetadataFetcher, ...] = (metadata_fetcher, metadata_fetcher2) date_visit1 = datetime.datetime(2015, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc) date_visit2 = datetime.datetime(2017, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc) date_visit3 = datetime.datetime(2018, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc) type_visit1 = "git" type_visit2 = "hg" type_visit3 = "deb" origin_visit = OriginVisit( origin=origin.url, visit=1, date=date_visit1, type=type_visit1, ) origin_visit2 = OriginVisit( origin=origin.url, visit=2, date=date_visit2, type=type_visit1, ) origin_visit3 = OriginVisit( origin=origin2.url, visit=1, date=date_visit1, type=type_visit2, ) origin_visits: Tuple[OriginVisit, ...] = ( origin_visit, origin_visit2, origin_visit3, ) release = Release( id=hash_to_bytes("f7f222093a18ec60d781070abec4a630c850b837"), name=b"v0.0.1", author=Person( name=b"olasd", email=b"*****@*****.**", fullname=b"olasd <*****@*****.**>", ), date=TimestampWithTimezone( timestamp=Timestamp(seconds=1234567890, microseconds=0), offset_bytes=b"+0042", ), target=revision.id, target_type=ObjectType.REVISION, message=b"synthetic release", synthetic=True, ) release2 = Release( id=hash_to_bytes("db81a26783a3f4a9db07b4759ffc37621f159bb2"), name=b"v0.0.2", author=Person( name=b"tony", email=b"*****@*****.**", fullname=b"tony <*****@*****.**>", ), date=TimestampWithTimezone( timestamp=Timestamp(seconds=1634366813, microseconds=0), offset_bytes=b"-0200", ), target=revision2.id, target_type=ObjectType.REVISION, message=b"v0.0.2\nMisc performance improvements + bug fixes", synthetic=False, ) release3 = Release( id=hash_to_bytes("1c5d42e603ce2eea44917fadca76c78bad76aeb9"), name=b"v0.0.2", author=Person( name=b"tony", email=b"*****@*****.**", fullname=b"tony <*****@*****.**>", ), date=TimestampWithTimezone( timestamp=Timestamp(seconds=1634366813, microseconds=0), offset_bytes=b"-0200", ), target=revision3.id, target_type=ObjectType.REVISION, message=b"yet another synthetic release", synthetic=True, ) releases: Tuple[Release, ...] = (release, release2, release3) snapshot = Snapshot( id=hash_to_bytes("9b922e6d8d5b803c1582aabe5525b7b91150788e"), branches={ b"master": SnapshotBranch( target=revision.id, target_type=TargetType.REVISION, ), }, ) empty_snapshot = Snapshot( id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"), branches={}, ) complete_snapshot = Snapshot( id=hash_to_bytes("db99fda25b43dc5cd90625ee4b0744751799c917"), branches={ b"directory": SnapshotBranch( target=directory.id, target_type=TargetType.DIRECTORY, ), b"directory2": SnapshotBranch( target=directory2.id, target_type=TargetType.DIRECTORY, ), b"content": SnapshotBranch( target=content.sha1_git, target_type=TargetType.CONTENT, ), b"alias": SnapshotBranch( target=b"revision", target_type=TargetType.ALIAS, ), b"revision": SnapshotBranch( target=revision.id, target_type=TargetType.REVISION, ), b"release": SnapshotBranch( target=release.id, target_type=TargetType.RELEASE, ), b"snapshot": SnapshotBranch( target=empty_snapshot.id, target_type=TargetType.SNAPSHOT, ), b"dangling": None, }, ) snapshots: Tuple[Snapshot, ...] = (snapshot, empty_snapshot, complete_snapshot) content_metadata1 = RawExtrinsicMetadata( target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT, object_id=content.sha1_git), origin=origin.url, discovery_date=datetime.datetime(2015, 1, 1, 21, 0, 0, tzinfo=datetime.timezone.utc), authority=metadata_authority, fetcher=metadata_fetcher, format="json", metadata=b'{"foo": "bar"}', ) content_metadata2 = RawExtrinsicMetadata( target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT, object_id=content.sha1_git), origin=origin2.url, discovery_date=datetime.datetime(2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc), authority=metadata_authority, fetcher=metadata_fetcher, format="yaml", metadata=b"foo: bar", ) content_metadata3 = RawExtrinsicMetadata( target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT, object_id=content.sha1_git), discovery_date=datetime.datetime(2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc), authority=attr.evolve(metadata_authority2, metadata=None), fetcher=attr.evolve(metadata_fetcher2, metadata=None), format="yaml", metadata=b"foo: bar", origin=origin.url, visit=42, snapshot=snapshot.swhid(), release=release.swhid(), revision=revision.swhid(), directory=directory.swhid(), path=b"/foo/bar", ) content_metadata: Tuple[RawExtrinsicMetadata, ...] = ( content_metadata1, content_metadata2, content_metadata3, ) origin_metadata1 = RawExtrinsicMetadata( target=Origin(origin.url).swhid(), discovery_date=datetime.datetime(2015, 1, 1, 21, 0, 0, tzinfo=datetime.timezone.utc), authority=attr.evolve(metadata_authority, metadata=None), fetcher=attr.evolve(metadata_fetcher, metadata=None), format="json", metadata=b'{"foo": "bar"}', ) origin_metadata2 = RawExtrinsicMetadata( target=Origin(origin.url).swhid(), discovery_date=datetime.datetime(2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc), authority=attr.evolve(metadata_authority, metadata=None), fetcher=attr.evolve(metadata_fetcher, metadata=None), format="yaml", metadata=b"foo: bar", ) origin_metadata3 = RawExtrinsicMetadata( target=Origin(origin.url).swhid(), discovery_date=datetime.datetime(2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc), authority=attr.evolve(metadata_authority2, metadata=None), fetcher=attr.evolve(metadata_fetcher2, metadata=None), format="yaml", metadata=b"foo: bar", ) origin_metadata: Tuple[RawExtrinsicMetadata, ...] = ( origin_metadata1, origin_metadata2, origin_metadata3, ) extid1 = ExtID( target=CoreSWHID(object_type=SwhidObjectType.REVISION, object_id=revision.id), extid_type="git", extid=revision.id, ) extid2 = ExtID( target=CoreSWHID(object_type=SwhidObjectType.REVISION, object_id=hg_revision.id), extid_type="mercurial", extid=hash_to_bytes("a316dfb434af2b451c1f393496b7eaeda343f543"), ) extid3 = ExtID( target=CoreSWHID(object_type=SwhidObjectType.DIRECTORY, object_id=directory.id), extid_type="directory", extid=b"something", ) extid4 = ExtID( target=CoreSWHID(object_type=SwhidObjectType.DIRECTORY, object_id=directory2.id), extid_type="directory", extid=b"something", extid_version=2, ) extids: Tuple[ExtID, ...] = ( extid1, extid2, extid3, extid4, )
def test_replay_statsd(kafka_server, kafka_prefix, kafka_consumer_group, statsd): objstorage1 = get_objstorage(cls="memory") objstorage2 = get_objstorage(cls="memory") writer = get_journal_writer( cls="kafka", brokers=[kafka_server], client_id="kafka_writer", prefix=kafka_prefix, anonymize=False, ) # Fill the source objstorage with a bunch of content object. In the end, # there should be 2 content objects for each possible replaying decision # (aka. skipped, excluded, in_dst, not_in_src, failed and copied): # contents[0:2] are properly copied # contents[2:4] are excluded # contents[4:6] are in dst # contents[6:8] are hidden contents = [ Content.from_data(f"foo{i}".encode(), status="hidden" if 6 <= i < 8 else "visible") for i in range(8) ] for content in contents: objstorage1.add(content.data) writer.write_addition("content", content) excluded = [c.sha1 for c in contents[2:4]] def exclude_fn(cnt_d): return cnt_d["sha1"] in excluded for content in contents[4:6]: objstorage2.add(content.data) replayer = JournalClient( brokers=kafka_server, group_id=kafka_consumer_group, prefix=kafka_prefix, stop_on_eof=True, # stop_after_objects=len(objects), ) worker_fn = functools.partial( process_replay_objects_content, src=objstorage1, dst=objstorage2, exclude_fn=exclude_fn, ) replayer.process(worker_fn) # We cannot expect any order from replayed objects, so statsd reports won't # be sorted according to contents, so we just count the expected occurrence # of each statsd message. prefix = "swh_content_replayer" expected_reports = { # 4 because 2 for the copied objects + 2 for the in_dst ones f"^{prefix}_retries_total:1[|]c[|]#attempt:1,operation:obj_in_objstorage$": 4, f"^{prefix}_retries_total:1[|]c[|]#attempt:1,operation:get_object$": 2, f"^{prefix}_retries_total:1[|]c[|]#attempt:1,operation:put_object$": 2, f"^{prefix}_duration_seconds:[0-9]+[.][0-9]+[|]ms[|]#request:get$": 2, f"^{prefix}_duration_seconds:[0-9]+[.][0-9]+[|]ms[|]#request:put$": 2, f"^{prefix}_bytes:4[|]c$": 2, } decisions = ("copied", "skipped", "excluded", "in_dst", "not_in_src", "failed") decision_re = ( "^swh_content_replayer_operations_total:1[|]c[|]#decision:(?P<decision>" + "|".join(decisions) + ")(?P<extras>,.+)?$") operations = dict.fromkeys(decisions, 0) reports = dict.fromkeys(expected_reports, 0) for report in (r.decode() for r in statsd.socket.payloads): m = re.match(decision_re, report) if m: operations[m.group("decision")] += 1 else: for expected in expected_reports: m = re.match(expected, report) if m: reports[expected] += 1 assert reports == expected_reports assert operations["skipped"] == 2 assert operations["excluded"] == 2 assert operations["in_dst"] == 2 assert operations["copied"] == 2 # TODO: assert operations["not_in_src"] == 0 assert operations["failed"] == 0
def content_index_add_one(self, algo: str, content: Content, token: int) -> None: self._content_indexes[algo][content.get_hash(algo)].add(token)
def _init_tests_data(): # To hold reference to the memory storage storage = get_storage("memory") # Create search instance search = get_search("memory") search.initialize() search.origin_update({"url": origin["url"]} for origin in _TEST_ORIGINS) # Create indexer storage instance that will be shared by indexers idx_storage = get_indexer_storage("memory") # Declare a test tool for origin intrinsic metadata tests idx_tool = idx_storage.indexer_configuration_add([INDEXER_TOOL])[0] INDEXER_TOOL["id"] = idx_tool["id"] # Load git repositories from archives for origin in _TEST_ORIGINS: for i, archive_ in enumerate(origin["archives"]): if i > 0: # ensure visit dates will be different when simulating # multiple visits of an origin time.sleep(1) origin_repo_archive = os.path.join(os.path.dirname(__file__), "resources/repos/%s" % archive_) loader = GitLoaderFromArchive( storage, origin["url"], archive_path=origin_repo_archive, ) result = loader.load() assert result["status"] == "eventful" ori = storage.origin_get([origin["url"]])[0] origin.update(ori.to_dict()) # add an 'id' key if enabled search.origin_update([{ "url": origin["url"], "has_visits": True, "visit_types": ["git"] }]) for i in range(250): _add_origin(storage, search, origin_url=f"https://many.origins/{i+1}", visit_type="tar") sha1s: Set[Sha1] = set() directories = set() revisions = set() releases = set() snapshots = set() content_path = {} # Get all objects loaded into the test archive common_metadata = {ORIGIN_METADATA_KEY: ORIGIN_METADATA_VALUE} for origin in _TEST_ORIGINS: snp = snapshot_get_latest(storage, origin["url"]) snapshots.add(hash_to_hex(snp.id)) for branch_name, branch_data in snp.branches.items(): target_type = branch_data.target_type.value if target_type == "revision": revisions.add(branch_data.target) if b"master" in branch_name: # Add some origin intrinsic metadata for tests metadata = common_metadata metadata.update(origin.get("metadata", {})) origin_metadata = OriginIntrinsicMetadataRow( id=origin["url"], from_revision=branch_data.target, indexer_configuration_id=idx_tool["id"], metadata=metadata, mappings=[], ) idx_storage.origin_intrinsic_metadata_add( [origin_metadata]) search.origin_update([{ "url": origin["url"], "intrinsic_metadata": metadata }]) ORIGIN_MASTER_REVISION[origin["url"]] = hash_to_hex( branch_data.target) elif target_type == "release": release = storage.release_get([branch_data.target])[0] revisions.add(release.target) releases.add(hash_to_hex(branch_data.target)) for rev_log in storage.revision_shortlog(set(revisions)): rev_id = rev_log[0] revisions.add(rev_id) for rev in storage.revision_get(revisions): if rev is None: continue dir_id = rev.directory directories.add(hash_to_hex(dir_id)) for entry in dir_iterator(storage, dir_id): if entry["type"] == "file": sha1s.add(entry["sha1"]) content_path[entry["sha1"]] = "/".join( [hash_to_hex(dir_id), entry["path"].decode("utf-8")]) elif entry["type"] == "dir": directories.add(hash_to_hex(entry["target"])) _add_extra_contents(storage, sha1s) # Get all checksums for each content result: List[Optional[Content]] = storage.content_get(list(sha1s)) contents: List[Dict] = [] for content in result: assert content is not None sha1 = hash_to_hex(content.sha1) content_metadata = { algo: hash_to_hex(getattr(content, algo)) for algo in DEFAULT_ALGORITHMS } path = "" if content.sha1 in content_path: path = content_path[content.sha1] cnt_data = storage.content_get_data(content.sha1) assert cnt_data is not None mimetype, encoding = get_mimetype_and_encoding_for_content(cnt_data) _, _, cnt_data = _re_encode_content(mimetype, encoding, cnt_data) content_display_data = prepare_content_for_display( cnt_data, mimetype, path) content_metadata.update({ "path": path, "mimetype": mimetype, "encoding": encoding, "hljs_language": content_display_data["language"], "data": content_display_data["content_data"], }) _contents[sha1] = content_metadata contents.append(content_metadata) # Add the empty directory to the test archive storage.directory_add([Directory(entries=())]) # Add empty content to the test archive storage.content_add([Content.from_data(data=b"")]) # Add fake git origin with pull request branches _add_origin( storage, search, origin_url="https://git.example.org/project", snapshot_branches={ b"refs/heads/master": { "target_type": "revision", "target": next(iter(revisions)), }, **{ f"refs/pull/{i}".encode(): { "target_type": "revision", "target": next(iter(revisions)), } for i in range(300) }, }, ) # Return tests data return { "search": search, "storage": storage, "idx_storage": idx_storage, "origins": _TEST_ORIGINS, "contents": contents, "directories": list(directories), "releases": list(releases), "revisions": list(map(hash_to_hex, revisions)), "snapshots": list(snapshots), "generated_checksums": set(), }
def test_original_malformed_objects(self, swh_storage, cook_extract_snapshot): """Tests that objects that were originally malformed: * are still interpreted somewhat correctly (if the loader could make sense of them), especially that they still have links to children * have their original manifest in the bundle """ date = TimestampWithTimezone.from_numeric_offset( Timestamp(1643819927, 0), 0, False) content = Content.from_data(b"foo") swh_storage.content_add([content]) # disordered # fmt: off malformed_dir_manifest = (b"" + b"100644 file2\x00" + content.sha1_git + b"100644 file1\x00" + content.sha1_git) # fmt: on directory = Directory( entries=( DirectoryEntry(name=b"file1", type="file", perms=0o100644, target=content.sha1_git), DirectoryEntry(name=b"file2", type="file", perms=0o100644, target=content.sha1_git), ), raw_manifest=f"tree {len(malformed_dir_manifest)}\x00".encode() + malformed_dir_manifest, ) swh_storage.directory_add([directory]) # 'committer' and 'author' swapped # fmt: off malformed_rev_manifest = ( b"tree " + hashutil.hash_to_bytehex(directory.id) + b"\n" + b"committer me <*****@*****.**> 1643819927 +0000\n" + b"author me <*****@*****.**> 1643819927 +0000\n" + b"\n" + b"rev") # fmt: on revision = Revision( message=b"rev", author=Person.from_fullname(b"me <*****@*****.**>"), date=date, committer=Person.from_fullname(b"me <*****@*****.**>"), committer_date=date, parents=(), type=RevisionType.GIT, directory=directory.id, synthetic=True, raw_manifest=f"commit {len(malformed_rev_manifest)}\x00".encode() + malformed_rev_manifest, ) swh_storage.revision_add([revision]) # 'tag' and 'tagger' swapped # fmt: off malformed_rel_manifest = ( b"object " + hashutil.hash_to_bytehex(revision.id) + b"\n" + b"type commit\n" + b"tagger me <*****@*****.**> 1643819927 +0000\n" + b"tag v1.1.0\n") # fmt: on release = Release( name=b"v1.1.0", message=None, author=Person.from_fullname(b"me <*****@*****.**>"), date=date, target=revision.id, target_type=ModelObjectType.REVISION, synthetic=True, raw_manifest=f"tag {len(malformed_rel_manifest)}\x00".encode() + malformed_rel_manifest, ) swh_storage.release_add([release]) snapshot = Snapshot( branches={ b"refs/tags/v1.1.0": SnapshotBranch(target=release.id, target_type=TargetType.RELEASE), b"HEAD": SnapshotBranch(target=revision.id, target_type=TargetType.REVISION), }) swh_storage.snapshot_add([snapshot]) with cook_extract_snapshot(swh_storage, snapshot.swhid()) as (ert, p): tag = ert.repo[b"refs/tags/v1.1.0"] assert tag.as_raw_string() == malformed_rel_manifest commit = ert.repo[tag.object[1]] assert commit.as_raw_string() == malformed_rev_manifest tree = ert.repo[commit.tree] assert tree.as_raw_string() == malformed_dir_manifest