Example #1
    def write_refs(self, snapshot=None):
        """Writes all files in :file:`.git/refs/`.

        For non-snapshot objects, this is only ``master``."""
        refs: Dict[bytes, bytes]  # ref name -> target
        if self.obj_type == RootObjectType.DIRECTORY:
            # We need a synthetic revision pointing to the directory
            rev_id = self._make_stub_directory_revision(self.obj_id)

            refs = {b"refs/heads/master": hash_to_bytehex(rev_id)}
        elif self.obj_type == RootObjectType.REVISION:
            refs = {b"refs/heads/master": hash_to_bytehex(self.obj_id)}
        elif self.obj_type == RootObjectType.RELEASE:
            (release, ) = self.storage.release_get([self.obj_id])

            if release.name and re.match(rb"^[a-zA-Z0-9_.-]+$", release.name):
                release_name = release.name
            else:
                release_name = b"release"

            refs = {
                b"refs/tags/" + release_name: hash_to_bytehex(self.obj_id),
            }

            if release.target_type == ModelObjectType.REVISION:
                # Not necessary, but makes it easier to browse
                refs[b"ref/heads/master"] = hash_to_bytehex(release.target)
            # TODO: synthesize a master branch for other target types

        elif self.obj_type == RootObjectType.SNAPSHOT:
            if snapshot is None:
                # refs were already written in a previous step
                return
            branches = []
            for (branch_name, branch) in snapshot.branches.items():
                if branch is None:
                    logging.error("%s has dangling branch: %r",
                                  snapshot.swhid(), branch_name)
                else:
                    branches.append((branch_name, branch))
            refs = {
                branch_name: (
                    b"ref: " + branch.target
                    if branch.target_type == TargetType.ALIAS
                    else hash_to_bytehex(branch.target)
                )
                for (branch_name, branch) in branches
            }
        else:
            assert_never(self.obj_type,
                         f"Unexpected root object type: {self.obj_type}")

        for (ref_name, ref_target) in refs.items():
            path = os.path.join(self.gitdir.encode(), ref_name)
            os.makedirs(os.path.dirname(path), exist_ok=True)
            with open(path, "wb") as fd:
                fd.write(ref_target)
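A quick aside on what hash_to_bytehex contributes above: SWH stores object ids as raw 20-byte sha1 digests, while git ref files contain the 40-character ASCII hex form. A minimal round trip, assuming only swh.model.hashutil:

from swh.model.hashutil import hash_to_bytehex, hash_to_bytes

raw_id = hash_to_bytes("28c6f4023d65f74e3b59a2dea3c4277ed9ee07b0")
assert len(raw_id) == 20  # raw digest, as stored by SWH
# hash_to_bytehex yields the 40-byte hex form that ref files expect
assert hash_to_bytehex(raw_id) == b"28c6f4023d65f74e3b59a2dea3c4277ed9ee07b0"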
Example #2
    def check_snapshot_two_heads(self, ert, p, swhid):
        assert (
            hashutil.hash_to_bytehex(swhid.object_id)
            == ert.repo.refs[b"HEAD"]
            == ert.repo.refs[b"refs/heads/master"]
            == ert.repo.refs[b"refs/remotes/origin/HEAD"]
            == ert.repo.refs[b"refs/remotes/origin/master"]
            == ert.repo.refs[b"refs/remotes/origin/b1"]
        )

        c4_id = hashutil.hash_to_bytehex(swhid.object_id)
        c3_id = ert.repo.refs[b"refs/remotes/origin/b2"]

        assert ert.repo[c3_id].parents == ert.repo[c4_id].parents
Example #3
    def to_dict(self) -> Dict:
        """Convert a HgRevision to a dict for SWHID computation"""
        date = normalize_timestamp(int(self.timestamp))

        extra_headers = [
            (b"time_offset_seconds", str(self.offset).encode("utf-8")),
        ]

        for key, value in self.extras.items():
            if key == b"branch" and value == b"default":
                # branch default is skipped to match historical implementation
                continue
            if key == b"transplant_source":
                # transplant_source is converted to hex
                # to match historical implementation
                value = hash_to_bytehex(escape_decode(value)[0])
            extra_headers.append((key, value))

        author = self.author.to_dict()

        return {
            "author": author,
            "date": date,
            "committer": author,
            "committer_date": date,
            "type": RevisionType.MERCURIAL.value,
            "message": self.description,
            "metadata": {
                "node": self.node_id
            },
            "extra_headers": tuple(extra_headers),
            "synthetic": False,
            "parents": self.parents,
        }
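The transplant_source branch above relies on escape_decode to undo Python-style byte escapes before hex-encoding. A minimal sketch, assuming escape_decode behaves like codecs.escape_decode:

import codecs

from swh.model.hashutil import hash_to_bytehex

# b"\\x01" is four characters on disk; escape_decode turns them back
# into the single raw byte 0x01 (it returns a (bytes, length) pair)
raw, _length = codecs.escape_decode(b"\\x01\\x02abc")
assert raw == b"\x01\x02abc"
assert hash_to_bytehex(raw) == b"0102616263"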
Example #4
def test_multi_hash_file_bytehexdigest_with_md5(hash_test_data):
    fobj = io.BytesIO(hash_test_data.data)
    length = len(hash_test_data.data)
    checksums = MultiHash.from_file(fobj,
                                    hash_names=DEFAULT_ALGORITHMS | {"md5"},
                                    length=length).bytehexdigest()
    md5sum = {
        "md5": hash_to_bytehex(hashlib.md5(hash_test_data.data).digest())
    }
    assert checksums == {**hash_test_data.bytehex_checksums, **md5sum}
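For data already in memory, the same digests can be computed without a file object. A short sketch, assuming MultiHash.from_data is available in swh.model.hashutil alongside from_file:

import hashlib

from swh.model.hashutil import MultiHash, hash_to_bytehex

data = b"1984\n"
# bytehexdigest() returns {algo: bytehex}, as in the test above
digests = MultiHash.from_data(data, hash_names={"md5", "sha1"}).bytehexdigest()
assert digests["md5"] == hash_to_bytehex(hashlib.md5(data).digest())
assert digests["sha1"] == hash_to_bytehex(hashlib.sha1(data).digest())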
Example #5
    def check_snapshot_two_double_fork_merge(self, ert, p, swhid):
        assert (
            hashutil.hash_to_bytehex(swhid.object_id)
            == ert.repo.refs[b"HEAD"]
            == ert.repo.refs[b"refs/heads/master"]
            == ert.repo.refs[b"refs/remotes/origin/HEAD"]
            == ert.repo.refs[b"refs/remotes/origin/master"]
        )

        (c4_id, c5_id) = ert.repo[swhid.object_id.hex().encode()].parents
        assert c5_id == ert.repo.refs[b"refs/remotes/origin/c3"]

        (c2_id, c3_id) = ert.repo[c4_id].parents
        assert c3_id == ert.repo.refs[b"refs/remotes/origin/c1"]
Example #6
    def check_snapshot_triple_merge(self, ert, p, swhid):
        assert (
            hashutil.hash_to_bytehex(swhid.object_id)
            == ert.repo.refs[b"HEAD"]
            == ert.repo.refs[b"refs/heads/master"]
            == ert.repo.refs[b"refs/remotes/origin/HEAD"]
            == ert.repo.refs[b"refs/remotes/origin/master"]
        )

        (c2_id, c3_id,
         c4_id) = ert.repo[swhid.object_id.hex().encode()].parents
        assert c3_id == ert.repo.refs[b"refs/remotes/origin/b1"]
        assert c4_id == ert.repo.refs[b"refs/remotes/origin/b2"]

        assert (ert.repo[c2_id].parents == ert.repo[c3_id].parents ==
                ert.repo[c4_id].parents)
Example #7
    def check_snapshot_tags(self, ert, p, swhid):
        assert (
            hashutil.hash_to_bytehex(swhid.object_id)
            == ert.repo.refs[b"HEAD"]
            == ert.repo.refs[b"refs/heads/master"]
            == ert.repo.refs[b"refs/remotes/origin/HEAD"]
            == ert.repo.refs[b"refs/remotes/origin/master"]
            == ert.repo.refs[b"refs/tags/t5"]
        )

        c2_id = ert.repo.refs[b"refs/tags/t2"]
        c5_id = hashutil.hash_to_bytehex(swhid.object_id)

        assert ert.repo[c5_id].parents == [c2_id]

        t5a = ert.repo[ert.repo.refs[b"refs/tags/t5a"]]
        # TODO: investigate why new dulwich adds \n
        assert t5a.message in (b"tag 5", b"tag 5\n")
        assert t5a.object == (dulwich.objects.Commit, c5_id)

        t4a = ert.repo[ert.repo.refs[b"refs/tags/t4a"]]
        (_, c4_id) = t4a.object
        assert ert.repo[c4_id].message == b"add file4\n"  # TODO: ditto
        (c3_id, ) = ert.repo[c4_id].parents
        assert ert.repo[c3_id].message == b"add file3\n"  # TODO: ditto
        assert ert.repo[c3_id].parents == [c2_id]
Example #8
    def determine_wants(self, refs: Dict[bytes, HexBytes]) -> List[HexBytes]:
        """Get the list of bytehex sha1s that the git loader should fetch.

        This compares the remote refs sent by the server with the base snapshot
        provided by the loader.

        """
        if not refs:
            return []

        # Cache existing heads
        local_heads: Set[HexBytes] = set()
        for base_snapshot in self.base_snapshots:
            for branch_name, branch in base_snapshot.branches.items():
                if not branch or branch.target_type == TargetType.ALIAS:
                    continue
                local_heads.add(HexBytes(hashutil.hash_to_bytehex(branch.target)))

        self.heads = local_heads

        # Get the remote heads that we want to fetch
        remote_heads: Set[HexBytes] = set()
        for ref_name, ref_target in refs.items():
            if utils.ignore_branch_name(ref_name):
                continue
            remote_heads.add(ref_target)

        logger.debug("local_heads_count=%s", len(local_heads))
        logger.debug("remote_heads_count=%s", len(remote_heads))
        wanted_refs = list(remote_heads - local_heads)
        logger.debug("wanted_refs_count=%s", len(wanted_refs))
        if self.statsd is not None:
            self.statsd.histogram(
                "git_ignored_refs_percent",
                len(set(refs.values()) - remote_heads) / len(refs),
                tags={},
            )
            self.statsd.histogram(
                "git_known_refs_percent",
                len(local_heads & remote_heads) / len(remote_heads),
                tags={},
            )
        return wanted_refs
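Stripped of snapshot bookkeeping and statsd reporting, the core of determine_wants is a set difference over bytehex ids. A self-contained sketch (the ref names and ids below are made up for illustration):

from typing import Dict, List, Set

def wanted(refs: Dict[bytes, bytes], local_heads: Set[bytes]) -> List[bytes]:
    # Fetch only the remote targets no local head already covers
    return list(set(refs.values()) - local_heads)

refs = {b"refs/heads/main": b"aa" * 20, b"refs/tags/v1.0": b"bb" * 20}
assert wanted(refs, {b"bb" * 20}) == [b"aa" * 20]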
Example #9
    def test_corrupt_blob(self, mocker):
        # has a signature
        sha1 = hash_to_bytes("28c6f4023d65f74e3b59a2dea3c4277ed9ee07b0")

        blob = copy.deepcopy(self.repo[hash_to_bytehex(sha1)])

        class hasher:
            # Stand-in for dulwich's internal hash object; digest() reads
            # ``sha1`` from the enclosing test scope at call time
            def digest():
                return sha1

        blob._sha = hasher

        converters.dulwich_blob_to_content(blob)
        converters.dulwich_blob_to_content_id(blob)

        sha1 = hash_to_bytes("1234" * 10)

        with pytest.raises(converters.HashMismatch):
            converters.dulwich_blob_to_content(blob)
        with pytest.raises(converters.HashMismatch):
            converters.dulwich_blob_to_content_id(blob)
Example #10
    class HashTestData:

        data = b"1984\n"
        hex_checksums = {
            "sha1": "62be35bf00ff0c624f4a621e2ea5595a049e0731",
            "sha1_git": "568aaf43d83b2c3df8067f3bedbb97d83260be6d",
            "sha256": (
                "26602113b4b9afd9d55466b08580d3c2"
                "4a9b50ee5b5866c0d91fab0e65907311"
            ),
            "blake2s256": (
                "63cfb259e1fdb485bc5c55749697a6b21ef31fb7445f6c78a"
                "c9422f9f2dc8906"
            ),
        }

        checksums = {
            algo: bytes.fromhex(cksum)
            for algo, cksum in hex_checksums.items()
        }

        bytehex_checksums = {
            algo: hashutil.hash_to_bytehex(cksum)
            for algo, cksum in checksums.items()
        }

        git_hex_checksums = {
            "blob": hex_checksums["sha1_git"],
            "tree": "5b2e883aa33d2efab98442693ea4dd5f1b8871b0",
            "commit": "79e4093542e72f0fcb7cbd75cb7d270f9254aa8f",
            "tag": "d6bf62466f287b4d986c545890716ce058bddf67",
        }

        git_checksums = {
            algo: bytes.fromhex(cksum)
            for algo, cksum in git_hex_checksums.items()
        }
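Where sha1_git above comes from: it is the plain sha1 of the data framed as a git blob, i.e. prefixed with a "blob <size>\x00" header. The check below uses only hashlib, with the expected value taken from hex_checksums above:

import hashlib

data = b"1984\n"
# git object framing: type, space, decimal body length, NUL, body
framed = b"blob " + str(len(data)).encode() + b"\x00" + data
assert (hashlib.sha1(framed).hexdigest()
        == "568aaf43d83b2c3df8067f3bedbb97d83260be6d")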
Example #11
    def check_revision_two_roots(self, ert, p, swhid):
        assert ert.repo.refs[b"HEAD"].decode() == swhid.object_id.hex()

        (c3, ) = ert.repo[hashutil.hash_to_bytehex(swhid.object_id)].parents
        assert len(ert.repo[c3].parents) == 2
Example #12
    def test_original_malformed_objects(self, swh_storage,
                                        cook_extract_snapshot):
        """Tests that objects that were originally malformed:

        * are still interpreted somewhat correctly (if the loader could make sense of
          them), especially that they still have links to children
        * have their original manifest in the bundle
        """
        date = TimestampWithTimezone.from_numeric_offset(
            Timestamp(1643819927, 0), 0, False)

        content = Content.from_data(b"foo")
        swh_storage.content_add([content])

        # disordered
        # fmt: off
        malformed_dir_manifest = (b"" + b"100644 file2\x00" +
                                  content.sha1_git + b"100644 file1\x00" +
                                  content.sha1_git)
        # fmt: on
        directory = Directory(
            entries=(
                DirectoryEntry(name=b"file1",
                               type="file",
                               perms=0o100644,
                               target=content.sha1_git),
                DirectoryEntry(name=b"file2",
                               type="file",
                               perms=0o100644,
                               target=content.sha1_git),
            ),
            raw_manifest=f"tree {len(malformed_dir_manifest)}\x00".encode() +
            malformed_dir_manifest,
        )
        swh_storage.directory_add([directory])

        # 'committer' and 'author' swapped
        # fmt: off
        malformed_rev_manifest = (
            b"tree " + hashutil.hash_to_bytehex(directory.id) + b"\n" +
            b"committer me <*****@*****.**> 1643819927 +0000\n" +
            b"author me <*****@*****.**> 1643819927 +0000\n" + b"\n" +
            b"rev")
        # fmt: on
        revision = Revision(
            message=b"rev",
            author=Person.from_fullname(b"me <*****@*****.**>"),
            date=date,
            committer=Person.from_fullname(b"me <*****@*****.**>"),
            committer_date=date,
            parents=(),
            type=RevisionType.GIT,
            directory=directory.id,
            synthetic=True,
            raw_manifest=f"commit {len(malformed_rev_manifest)}\x00".encode() +
            malformed_rev_manifest,
        )
        swh_storage.revision_add([revision])

        # 'tag' and 'tagger' swapped
        # fmt: off
        malformed_rel_manifest = (
            b"object " + hashutil.hash_to_bytehex(revision.id) + b"\n" +
            b"type commit\n" +
            b"tagger me <*****@*****.**> 1643819927 +0000\n" +
            b"tag v1.1.0\n")
        # fmt: on

        release = Release(
            name=b"v1.1.0",
            message=None,
            author=Person.from_fullname(b"me <*****@*****.**>"),
            date=date,
            target=revision.id,
            target_type=ModelObjectType.REVISION,
            synthetic=True,
            raw_manifest=f"tag {len(malformed_rel_manifest)}\x00".encode() +
            malformed_rel_manifest,
        )
        swh_storage.release_add([release])

        snapshot = Snapshot(
            branches={
                b"refs/tags/v1.1.0": SnapshotBranch(
                    target=release.id, target_type=TargetType.RELEASE),
                b"HEAD": SnapshotBranch(
                    target=revision.id, target_type=TargetType.REVISION),
            })
        swh_storage.snapshot_add([snapshot])

        with cook_extract_snapshot(swh_storage, snapshot.swhid()) as (ert, p):
            tag = ert.repo[b"refs/tags/v1.1.0"]
            assert tag.as_raw_string() == malformed_rel_manifest

            commit = ert.repo[tag.object[1]]
            assert commit.as_raw_string() == malformed_rev_manifest

            tree = ert.repo[commit.tree]
            assert tree.as_raw_string() == malformed_dir_manifest
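All three raw_manifest values in this test follow the same git object framing, a "<type> <size>\x00" header prepended to the original (possibly malformed) body. A tiny helper illustrating the pattern (frame is a hypothetical name for this sketch, not SWH API):

def frame(git_type: bytes, body: bytes) -> bytes:
    # e.g. frame(b"tree", b"hello") -> b"tree 5\x00hello"
    return git_type + b" " + str(len(body)).encode() + b"\x00" + body

assert frame(b"tree", b"hello") == b"tree 5\x00hello"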
Example #13
def test_hash_to_bytehex(hash_test_data):
    for algo in hash_test_data.checksums:
        hex_checksum = hash_test_data.hex_checksums[algo].encode("ascii")
        assert hex_checksum == hashutil.hash_to_bytehex(
            hash_test_data.checksums[algo])
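hash_to_bytehex is the bytes counterpart of hash_to_hex, both in swh.model.hashutil. A round-trip sketch using the sha1 from Example #10:

from swh.model.hashutil import hash_to_bytehex, hash_to_hex

digest = bytes.fromhex("62be35bf00ff0c624f4a621e2ea5595a049e0731")
assert hash_to_hex(digest) == "62be35bf00ff0c624f4a621e2ea5595a049e0731"
assert hash_to_bytehex(digest) == hash_to_hex(digest).encode("ascii")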