def test_commit_without_manifest(self):
    """Tests a Release can still be produced when the manifest is not
    understood by the custom parser in dulwich_commit_to_revision."""
    tree_id = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
    msg = b"some commit message"
    person = Person(
        fullname=b"Foo <*****@*****.**>", name=b"Foo", email=b"*****@*****.**"
    )
    # Build a bare dulwich commit by hand; author and committer share the
    # same identity, time and timezone.
    commit = dulwich.objects.Commit()
    commit.tree = tree_id
    commit.message = msg
    commit.author = commit.committer = b"Foo <*****@*****.**>"
    commit.author_time = commit.commit_time = 1641980946
    commit.author_timezone = commit.commit_timezone = 3600
    # 3600s east of UTC == +0100
    tstz = TimestampWithTimezone(
        timestamp=Timestamp(seconds=1641980946, microseconds=0),
        offset_bytes=b"+0100",
    )
    assert converters.dulwich_commit_to_revision(commit) == Revision(
        message=b"some commit message",
        author=person,
        committer=person,
        date=tstz,
        committer_date=tstz,
        type=RevisionType.GIT,
        directory=hash_to_bytes(tree_id.decode()),
        synthetic=False,
        metadata=None,
        parents=(),
    )
def test_svn_date_to_swh_date():
    """The timestamp should not be tampered with and include the decimals."""
    # Same whole-second instant, two different microsecond parts.
    cases = [
        (b"2011-05-31T06:04:39.500900Z", 500900),
        (b"2011-05-31T06:04:39.800722Z", 800722),
    ]
    for raw_date, micros in cases:
        assert converters.svn_date_to_swh_date(raw_date) == TimestampWithTimezone(
            timestamp=Timestamp(seconds=1306821879, microseconds=micros),
            offset_bytes=b"+0000",
        )
def db_to_date(
    date: Optional[datetime.datetime],
    offset_bytes: bytes,
) -> Optional[TimestampWithTimezone]:
    """Convert the DB representation of a date to a swh-model compatible date.

    Args:
        date: a date pulled out of the database
        offset_bytes: a byte representation of the latter two, usually as
          "+HHMM" or "-HHMM"

    Returns:
        a TimestampWithTimezone, or None if the date is None.
    """
    if date is None:
        return None
    # floor() rather than int(): int() truncates toward zero, which would
    # round pre-epoch (negative) timestamps the wrong way.
    whole_seconds = math.floor(date.timestamp())
    return TimestampWithTimezone(
        timestamp=Timestamp(seconds=whole_seconds, microseconds=date.microsecond),
        offset_bytes=offset_bytes,
    )
def test_build_swh_revision_default():
    """This should build the swh revision with the swh revision's extra
    headers about the repository.
    """
    tree_hash = hash_to_bytes("d6e08e19159f77983242877c373c75222d5ae9dd")
    commit_date = TimestampWithTimezone(
        timestamp=Timestamp(seconds=1088108379, microseconds=0),
        offset_bytes=b"+0000",
    )
    # Author and committer are expected to be the same person.
    person_dict = {
        "name": b"theo",
        "email": b"theo@uuid",
        "fullname": b"theo <theo@uuid>",
    }
    actual_rev = converters.build_swh_revision(
        repo_uuid=b"uuid",
        dir_id=tree_hash,
        commit={
            "author_name": Person(**person_dict),
            "message": b"commit message",
            "author_date": commit_date,
        },
        rev=10,
        parents=(),
    )
    expected_rev = Revision.from_dict({
        "date": commit_date.to_dict(),
        "committer_date": commit_date.to_dict(),
        "type": "svn",
        "directory": tree_hash,
        "message": b"commit message",
        "author": person_dict,
        "committer": person_dict,
        "synthetic": True,
        # The svn repo uuid and revision number end up as extra headers.
        "extra_headers": (
            (b"svn_repo_uuid", b"uuid"),
            (b"svn_revision", b"10"),
        ),
        "parents": (),
    })
    assert actual_rev == expected_rev
def test_svn_date_to_swh_date_epoch():
    """Empty date should be EPOCH (timestamp and offset at 0)."""
    epoch = TimestampWithTimezone(
        timestamp=Timestamp(seconds=0, microseconds=0), offset_bytes=b"+0000"
    )
    # Both the empty string and None fall back to the epoch default.
    for empty_date in ("", None):
        assert converters.svn_date_to_swh_date(empty_date) == epoch
def test_commit_to_revision_with_extra_headers(self):
    """Converting a commit carrying an encoding and a gpgsig must surface
    both as extra_headers on the resulting Revision."""
    sha1 = b"322f5bc915e50fc25e85226b5a182bded0e98e4b"
    actual = converters.dulwich_commit_to_revision(self.repo[sha1])
    # Author and committer are the same person in this fixture commit.
    douard = Person(
        name=b"David Douard",
        fullname=b"David Douard <*****@*****.**>",
        email=b"*****@*****.**",
    )
    expected = Revision(
        id=hash_to_bytes(sha1.decode()),
        directory=bytes.fromhex("f8ec06e4ed7b9fff4918a0241a48023143f30000"),
        type=RevisionType.GIT,
        committer=douard,
        author=douard,
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1594137902, microseconds=0),
            offset_bytes=b"+0200",
        ),
        # Latin-1-ish bytes: the message is ISO-8859-15 encoded, as
        # advertised by the "encoding" extra header below.
        message=b"Am\xe9lioration du fichier READM\xa4\n",
        metadata=None,
        extra_headers=((b"encoding", b"ISO-8859-15"), (b"gpgsig", GPGSIG)),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1594136900, microseconds=0),
            offset_bytes=b"+0200",
        ),
        parents=(bytes.fromhex("c730509025c6e81947102b2d77bc4dc1cade9489"),),
        synthetic=False,
    )
    assert actual == expected
def test_commit_to_revision(self):
    """A plain fixture commit converts to the expected Revision."""
    sha1 = b"9768d0b576dbaaecd80abedad6dfd0d72f1476da"
    actual = converters.dulwich_commit_to_revision(self.repo[sha1])
    # Author and committer identity and dates coincide in this commit.
    zack = Person(
        name=b"Stefano Zacchiroli",
        fullname=b"Stefano Zacchiroli <*****@*****.**>",
        email=b"*****@*****.**",
    )
    when = TimestampWithTimezone(
        timestamp=Timestamp(seconds=1443083765, microseconds=0),
        offset_bytes=b"+0200",
    )
    expected = Revision(
        id=hash_to_bytes("9768d0b576dbaaecd80abedad6dfd0d72f1476da"),
        directory=b"\xf0i\\./\xa7\xce\x9dW@#\xc3A7a\xa4s\xe5\x00\xca",
        type=RevisionType.GIT,
        committer=zack,
        author=zack,
        committer_date=when,
        message=b"add submodule dependency\n",
        metadata=None,
        extra_headers=(),
        date=when,
        parents=(b"\xc3\xc5\x88q23`\x9f[\xbb\xb2\xd9\xe7\xf3\xfbJf\x0f?r",),
        synthetic=False,
    )
    assert actual == expected
def test_from_release():
    """Convert release model object to a dict should be ok.

    Checks that from_release serializes bytes ids to hex strings, the date
    to an ISO8601 string, and person fields to plain strings.
    """
    ts = int(
        datetime.datetime(
            2015, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
        ).timestamp())
    release_input = Release(
        id=hashutil.hash_to_bytes("aad23fa492a0c5fed0708a6703be875448c86884"),
        target=hashutil.hash_to_bytes(
            "5e46d564378afc44b31bb89f99d5675195fbdf67"),
        target_type=ObjectType.REVISION,
        # NOTE(review): legacy offset/negative_utc kwargs kept as-is; other
        # tests in this file use offset_bytes — confirm both are supported.
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=ts, microseconds=0),
            offset=0,
            negative_utc=False,
        ),
        author=Person(
            name=b"author name",
            fullname=b"Author Name author@email",
            email=b"author@email",
        ),
        name=b"v0.0.1",
        message=b"some comment on release",
        synthetic=True,
    )
    expected_release = {
        "id": "aad23fa492a0c5fed0708a6703be875448c86884",
        "target": "5e46d564378afc44b31bb89f99d5675195fbdf67",
        # Fixed: "target_type" appeared twice in this literal; the second
        # occurrence silently overwrote the first. Kept a single entry.
        "target_type": "revision",
        "date": "2015-01-01T22:00:00+00:00",
        "author": {
            "name": "author name",
            "fullname": "Author Name author@email",
            "email": "author@email",
        },
        "name": "v0.0.1",
        "message": "some comment on release",
        "synthetic": True,
    }
    actual_release = converters.from_release(release_input)
    assert actual_release == expected_release
def dulwich_tsinfo_to_timestamp(
    timestamp,
    timezone: int,
    timezone_neg_utc: bool,
    timezone_bytes: Optional[bytes],
) -> TimestampWithTimezone:
    """Convert the dulwich timestamp information to a structure compatible with
    Software Heritage.

    Args:
        timestamp: seconds since the epoch (coerced to int)
        timezone: offset from UTC, in seconds
        timezone_neg_utc: whether the offset is a negative zero ("-0000")
        timezone_bytes: the raw offset bytes from the manifest, or None when
          they could not be parsed

    Returns:
        a TimestampWithTimezone built from the raw bytes when available,
        otherwise from the numeric offset Dulwich parsed.
    """
    ts = Timestamp(seconds=int(timestamp), microseconds=0)
    if timezone_bytes is not None:
        # Raw offset bytes were parsed from the manifest; keep them verbatim.
        return TimestampWithTimezone(timestamp=ts, offset_bytes=timezone_bytes)
    # Failed to parse from the raw manifest, fallback to what Dulwich
    # managed to parse (a numeric offset in seconds, converted to minutes).
    return TimestampWithTimezone.from_numeric_offset(
        timestamp=ts,
        offset=timezone // 60,
        negative_utc=timezone_neg_utc,
    )
type="file", target=CONTENT.sha1_git, perms=DentryPerms.content, ) ]), ) REVISION = Revision( id=hash_to_bytes("066b1b62dbfa033362092af468bf6cfabec230e7"), message=b"hello", author=Person( name=b"Nicolas Dandrimont", email=b"*****@*****.**", fullname=b"Nicolas Dandrimont <*****@*****.**> ", ), date=TimestampWithTimezone(Timestamp(1234567890, 0), offset_bytes=b"+0200"), committer=Person( name=b"St\xc3fano Zacchiroli", email=b"*****@*****.**", fullname=b"St\xc3fano Zacchiroli <*****@*****.**>", ), committer_date=TimestampWithTimezone(Timestamp(1123456789, 0), offset_bytes=b"-0000"), parents=(), type=RevisionType.GIT, directory=DIRECTORY.id, metadata={ "checksums": { "sha1": "tarball-sha1", "sha256": "tarball-sha256",
def test_ignore_displayname(swh_storage, use_graph):
    """Tests the original authorship information is used instead of
    configured display names; otherwise objects would not match their hash,
    and git-fsck/git-clone would fail.

    This tests both with and without swh-graph, as both configurations use
    different code paths to fetch revisions.
    """
    date = TimestampWithTimezone.from_numeric_offset(Timestamp(1643882820, 0), 0, False)
    legacy_person = Person.from_fullname(b"old me <*****@*****.**>")
    current_person = Person.from_fullname(b"me <*****@*****.**>")
    # Build a minimal content -> directory -> revision -> release -> snapshot
    # chain, all authored by the legacy identity.
    content = Content.from_data(b"foo")
    swh_storage.content_add([content])
    directory = Directory(
        entries=(
            DirectoryEntry(name=b"file1", type="file", perms=0o100644, target=content.sha1_git),
        ),
    )
    swh_storage.directory_add([directory])
    revision = Revision(
        message=b"rev",
        author=legacy_person,
        date=date,
        committer=legacy_person,
        committer_date=date,
        parents=(),
        type=RevisionType.GIT,
        directory=directory.id,
        synthetic=True,
    )
    swh_storage.revision_add([revision])
    release = Release(
        name=b"v1.1.0",
        message=None,
        author=legacy_person,
        date=date,
        target=revision.id,
        target_type=ObjectType.REVISION,
        synthetic=True,
    )
    swh_storage.release_add([release])
    snapshot = Snapshot(
        branches={
            b"refs/tags/v1.1.0": SnapshotBranch(target=release.id, target_type=TargetType.RELEASE),
            b"HEAD": SnapshotBranch(target=revision.id, target_type=TargetType.REVISION),
        })
    swh_storage.snapshot_add([snapshot])
    # Add all objects to graph
    if use_graph:
        from swh.graph.naive_client import NaiveClient as GraphClient
        nodes = [
            str(x.swhid())
            for x in [content, directory, revision, release, snapshot]
        ]
        edges = [(str(x.swhid()), str(y.swhid())) for (x, y) in [
            (directory, content),
            (revision, directory),
            (release, revision),
            (snapshot, release),
            (snapshot, revision),
        ]]
        # Mock-wrapped so calls can be inspected while delegating to the
        # in-memory graph client.
        swh_graph = unittest.mock.Mock(
            wraps=GraphClient(nodes=nodes, edges=edges))
    else:
        swh_graph = None
    # Set a display name
    with swh_storage.db() as db:
        with db.transaction() as cur:
            cur.execute(
                "UPDATE person set displayname = %s where fullname = %s",
                (current_person.fullname, legacy_person.fullname),
            )
    # Check the display name did apply in the storage
    assert swh_storage.revision_get([revision.id])[0] == attr.evolve(
        revision,
        author=current_person,
        committer=current_person,
    )
    # Cook
    cooked_swhid = snapshot.swhid()
    backend = InMemoryVaultBackend()
    cooker = GitBareCooker(
        cooked_swhid,
        backend=backend,
        storage=swh_storage,
        graph=swh_graph,
    )
    cooker.cook()
    # Get bundle
    bundle = backend.fetch("git_bare", cooked_swhid)
    # Extract bundle and make sure both revisions are in it
    with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
            tf.extractall(tempdir)
        # If we are here, it means git-fsck succeeded when called by
        # cooker.cook(), so we already know the original person was used.
        # Let's double-check.
        repo = dulwich.repo.Repo(f"{tempdir}/{cooked_swhid}.git")
        tag = repo[b"refs/tags/v1.1.0"]
        assert tag.tagger == legacy_person.fullname
        commit = repo[tag.object[1]]
        assert commit.author == legacy_person.fullname
def test_debian_origins_from_row__check_revisions():
    """Tests debian_origins_from_row errors when the revision at the head
    of a branch is a DSC and has no parents.

    Sets up an origin with one full visit whose snapshot branch points at a
    DSC revision that *does* have parents, and expects an AssertionError.
    """
    storage = get_storage("memory")
    origin_url = "deb://Debian/packages/kalgebra"
    revision_id = b"21" * 10
    storage.origin_add([Origin(url=origin_url)])
    # Minimal DB-style revision row; only the metadata filename matters for
    # matching the Debian artifact.
    revision_row = {
        "id": b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee",
        "directory": DIRECTORY_ID,
        "metadata": {
            "original_artifact": [
                {
                    "filename": "kalgebra_19.12.1-1.dsc",
                },
            ]
        },
    }
    storage.origin_visit_add([
        OriginVisit(
            origin=origin_url,
            date=datetime.datetime.now(tz=datetime.timezone.utc),
            type="deb",
            visit=280,
        )
    ])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin_url,
            visit=280,
            date=datetime.datetime.now(tz=datetime.timezone.utc),
            status="full",
            snapshot=b"42" * 10,
            metadata=None,
        )
    ])
    storage.snapshot_add([
        Snapshot(
            id=b"42" * 10,
            branches={
                b"foo": SnapshotBranch(target_type=TargetType.REVISION,
                                       target=revision_id)
            },
        )
    ])
    # Fixed: removed an unused `storage_before_revision = copy.deepcopy(storage)`
    # — the copy was never read, and deep-copying the storage is expensive.
    tstz = TimestampWithTimezone(
        timestamp=Timestamp(seconds=1580076204, microseconds=0),
        offset_bytes=b"+0100",
    )
    revision = Revision(
        id=revision_id,
        message=b"foo",
        author=Person.from_fullname(b"foo"),
        committer=Person.from_fullname(b"foo"),
        date=tstz,
        committer_date=tstz,
        type=RevisionType.DSC,
        directory=b"\xd5\x9a\x1f\x9c\x80\x9d\x8c}19P\xf6\xc8\xa2\x0f^%H\xcd\xdb",
        synthetic=True,
        metadata=None,
        # Non-empty parents tuple is what triggers the assertion under test.
        parents=(b"parent " * 2,),
        extra_headers=(),
    )
    storage.revision_add([revision])
    with pytest.raises(AssertionError, match="revision with parents"):
        debian_origins_from_row(revision_row, storage)
sha1_array = bytearray(duplicate_content1.sha1_git) sha1_array[0] += 1 duplicate_content2 = attr.evolve(duplicate_content1, sha1_git=bytes(sha1_array)) DUPLICATE_CONTENTS = [duplicate_content1, duplicate_content2] COMMITTERS = [ Person(fullname=b"foo", name=b"foo", email=b""), Person(fullname=b"bar", name=b"bar", email=b""), ] DATES = [ TimestampWithTimezone( timestamp=Timestamp( seconds=1234567891, microseconds=0, ), offset_bytes=b"+0200", ), TimestampWithTimezone( timestamp=Timestamp( seconds=1234567892, microseconds=0, ), offset_bytes=b"+0200", ), ] REVISIONS = [ Revision( id=hash_to_bytes("66c7c1cd9673275037140f2abff7b7b11fc9439c"),
def test_from_revision_model_object():
    """from_revision should serialize a Revision model object into a dict,
    with bytes ids rendered as hex strings and dates as ISO8601 strings."""
    ts = int(
        datetime.datetime(
            2000, 1, 17, 11, 23, 54, tzinfo=datetime.timezone.utc
        ).timestamp())
    revision_input = Revision(
        directory=hashutil.hash_to_bytes(
            "7834ef7e7c357ce2af928115c6c6a42b7e2a44e6"),
        author=Person(
            name=b"Software Heritage",
            fullname=b"robot [email protected]",
            email=b"*****@*****.**",
        ),
        committer=Person(
            name=b"Software Heritage",
            fullname=b"robot [email protected]",
            email=b"*****@*****.**",
        ),
        message=b"synthetic revision message",
        # NOTE(review): legacy offset/negative_utc kwargs; other tests in
        # this file use offset_bytes — confirm both forms are supported.
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=ts, microseconds=0),
            offset=0,
            negative_utc=False,
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=ts, microseconds=0),
            offset=0,
            negative_utc=False,
        ),
        synthetic=True,
        type=RevisionType.TAR,
        parents=tuple([
            hashutil.hash_to_bytes("29d8be353ed3480476f032475e7c244eff7371d5"),
            hashutil.hash_to_bytes("30d8be353ed3480476f032475e7c244eff7371d5"),
        ]),
        extra_headers=((b"gpgsig", b"some-signature"), ),
        metadata={
            "original_artifact": [{
                "archive_type": "tar",
                "name": "webbase-5.7.0.tar.gz",
                "sha1": "147f73f369733d088b7a6fa9c4e0273dcd3c7ccd",
                "sha1_git": "6a15ea8b881069adedf11feceec35588f2cfe8f1",
                "sha256": "401d0df797110bea805d358b85bcc1ced29549d3d73f"
                "309d36484e7edf7bb912",
            }],
        },
    )
    expected_revision = {
        "id": "a001358278a0d811fe7072463f805da601121c2a",
        "directory": "7834ef7e7c357ce2af928115c6c6a42b7e2a44e6",
        "author": {
            "name": "Software Heritage",
            "fullname": "robot [email protected]",
            "email": "*****@*****.**",
        },
        "committer": {
            "name": "Software Heritage",
            "fullname": "robot [email protected]",
            "email": "*****@*****.**",
        },
        "message": "synthetic revision message",
        "date": "2000-01-17T11:23:54+00:00",
        "committer_date": "2000-01-17T11:23:54+00:00",
        "parents": tuple([
            "29d8be353ed3480476f032475e7c244eff7371d5",
            "30d8be353ed3480476f032475e7c244eff7371d5",
        ]),
        "type": "tar",
        "synthetic": True,
        "extra_headers": (("gpgsig", "some-signature"), ),
        "metadata": {
            "original_artifact": [{
                "archive_type": "tar",
                "name": "webbase-5.7.0.tar.gz",
                "sha1": "147f73f369733d088b7a6fa9c4e0273dcd3c7ccd",
                "sha1_git": "6a15ea8b881069adedf11feceec35588f2cfe8f1",
                "sha256": "401d0df797110bea805d358b85bcc1ced29549d3d73f"
                "309d36484e7edf7bb912",
            }],
        },
        # presumably set because the revision has two parents — confirm
        # against from_revision's implementation.
        "merge": True,
    }
    actual_revision = converters.from_revision(revision_input)
    assert actual_revision == expected_revision
@pytest.mark.parametrize( "model_date,db_date", [ ( None, { "timestamp": None, "offset": 0, "neg_utc_offset": None, "offset_bytes": None, }, ), ( TimestampWithTimezone( timestamp=Timestamp( seconds=1234567890, microseconds=0, ), offset_bytes=b"+0200", ), { "timestamp": "2009-02-13T23:31:30+00:00", "offset": 120, "neg_utc_offset": False, "offset_bytes": b"+0200", }, ), ( TimestampWithTimezone( timestamp=Timestamp( seconds=1123456789, microseconds=0,
def test_original_malformed_objects(self, swh_storage, cook_extract_snapshot):
    """Tests that objects that were originally malformed:

    * are still interpreted somewhat correctly (if the loader could make
      sense of them), especially that they still have links to children
    * have their original manifest in the bundle
    """
    date = TimestampWithTimezone.from_numeric_offset(
        Timestamp(1643819927, 0), 0, False)
    content = Content.from_data(b"foo")
    swh_storage.content_add([content])
    # disordered
    # fmt: off
    malformed_dir_manifest = (
        b""
        + b"100644 file2\x00" + content.sha1_git
        + b"100644 file1\x00" + content.sha1_git
    )
    # fmt: on
    # The Directory entries are canonical (sorted), but raw_manifest keeps
    # the original disordered serialization (with a git object header).
    directory = Directory(
        entries=(
            DirectoryEntry(name=b"file1", type="file", perms=0o100644,
                           target=content.sha1_git),
            DirectoryEntry(name=b"file2", type="file", perms=0o100644,
                           target=content.sha1_git),
        ),
        raw_manifest=f"tree {len(malformed_dir_manifest)}\x00".encode()
        + malformed_dir_manifest,
    )
    swh_storage.directory_add([directory])
    # 'committer' and 'author' swapped
    # fmt: off
    malformed_rev_manifest = (
        b"tree " + hashutil.hash_to_bytehex(directory.id) + b"\n"
        + b"committer me <*****@*****.**> 1643819927 +0000\n"
        + b"author me <*****@*****.**> 1643819927 +0000\n"
        + b"\n"
        + b"rev"
    )
    # fmt: on
    revision = Revision(
        message=b"rev",
        author=Person.from_fullname(b"me <*****@*****.**>"),
        date=date,
        committer=Person.from_fullname(b"me <*****@*****.**>"),
        committer_date=date,
        parents=(),
        type=RevisionType.GIT,
        directory=directory.id,
        synthetic=True,
        raw_manifest=f"commit {len(malformed_rev_manifest)}\x00".encode()
        + malformed_rev_manifest,
    )
    swh_storage.revision_add([revision])
    # 'tag' and 'tagger' swapped
    # fmt: off
    malformed_rel_manifest = (
        b"object " + hashutil.hash_to_bytehex(revision.id) + b"\n"
        + b"type commit\n"
        + b"tagger me <*****@*****.**> 1643819927 +0000\n"
        + b"tag v1.1.0\n"
    )
    # fmt: on
    release = Release(
        name=b"v1.1.0",
        message=None,
        author=Person.from_fullname(b"me <*****@*****.**>"),
        date=date,
        target=revision.id,
        target_type=ModelObjectType.REVISION,
        synthetic=True,
        raw_manifest=f"tag {len(malformed_rel_manifest)}\x00".encode()
        + malformed_rel_manifest,
    )
    swh_storage.release_add([release])
    snapshot = Snapshot(
        branches={
            b"refs/tags/v1.1.0": SnapshotBranch(target=release.id,
                                                target_type=TargetType.RELEASE),
            b"HEAD": SnapshotBranch(target=revision.id,
                                    target_type=TargetType.REVISION),
        })
    swh_storage.snapshot_add([snapshot])
    with cook_extract_snapshot(swh_storage, snapshot.swhid()) as (ert, p):
        # The cooked repository must reproduce the original (malformed)
        # manifests byte for byte, not re-serialized canonical forms.
        tag = ert.repo[b"refs/tags/v1.1.0"]
        assert tag.as_raw_string() == malformed_rel_manifest
        commit = ert.repo[tag.object[1]]
        assert commit.as_raw_string() == malformed_rev_manifest
        tree = ert.repo[commit.tree]
        assert tree.as_raw_string() == malformed_dir_manifest
def test_debian_origins_from_row__no_result():
    """Tests debian_origins_from_row when there's no origin, visit, status,
    snapshot, branch, or matching branch.

    Each stage adds one more piece of state and re-checks that the lookup
    still returns no origin.
    """
    storage = get_storage("memory")
    origin_url = "deb://Debian/packages/kalgebra"
    snapshot_id = b"42424242424242424242"
    revision_id = b"21212121212121212121"
    storage.origin_add([Origin(url=origin_url)])
    revision_row = {
        "id": b"\x00\x00\x03l1\x1e\xf3:(\x1b\x05h\x8fn\xad\xcf\xc0\x94:\xee",
        "directory": DIRECTORY_ID,
        "metadata": {
            "original_artifact": [
                {
                    "filename": "kalgebra_19.12.1-1.dsc",
                },
            ]
        },
    }
    # no visit
    assert debian_origins_from_row(revision_row, storage) == []
    storage.origin_visit_add(
        [OriginVisit(
            origin=origin_url,
            date=now(),
            type="deb",
            visit=280,
        )])
    # no status
    assert debian_origins_from_row(revision_row, storage) == []
    status = OriginVisitStatus(
        origin=origin_url,
        visit=280,
        date=now(),
        status="full",
        snapshot=None,
        metadata=None,
    )
    storage.origin_visit_status_add([status])
    # no snapshot
    assert debian_origins_from_row(revision_row, storage) == []
    status = attr.evolve(status, snapshot=snapshot_id, date=now())
    storage.origin_visit_status_add([status])
    # Snapshot of the storage before any snapshot object is added, so later
    # stages can restart from this point with the same snapshot id.
    storage_before_snapshot = copy.deepcopy(storage)
    snapshot = Snapshot(id=snapshot_id, branches={})
    storage.snapshot_add([snapshot])
    # no branch
    assert debian_origins_from_row(revision_row, storage) == []
    # "remove" the snapshot, so we can add a new one with the same id
    storage = copy.deepcopy(storage_before_snapshot)
    snapshot = attr.evolve(
        snapshot,
        branches={
            b"foo": None,
        },
    )
    storage.snapshot_add([snapshot])
    # dangling branch
    assert debian_origins_from_row(revision_row, storage) == []
    # "remove" the snapshot again
    storage = copy.deepcopy(storage_before_snapshot)
    snapshot = attr.evolve(
        snapshot,
        branches={
            b"foo": SnapshotBranch(
                target_type=TargetType.REVISION,
                target=revision_id,
            )
        },
    )
    storage.snapshot_add([snapshot])
    # branch points to unknown revision
    assert debian_origins_from_row(revision_row, storage) == []
    revision = Revision(
        id=revision_id,
        message=b"foo",
        author=Person.from_fullname(b"foo"),
        committer=Person.from_fullname(b"foo"),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1580076204, microseconds=0),
            offset_bytes=b"+0100",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1580076204, microseconds=0),
            offset_bytes=b"+0100",
        ),
        type=RevisionType.DSC,
        directory=b"\xd5\x9a\x1f\x9c\x80\x9d\x8c}19P\xf6\xc8\xa2\x0f^%H\xcd\xdb",
        synthetic=True,
        metadata=None,
        parents=(),
        extra_headers=(),
    )
    storage.revision_add([revision])
    # no matching branch
    assert debian_origins_from_row(revision_row, storage) == []