Example #1
def check_snapshot(expected_snapshot, storage):
    """Check for snapshot match.

    Provide the hashes as hexadecimal, the conversion is done
    within the method.

    Args:
        expected_snapshot (dict): full snapshot with hex ids
        storage (Storage): expected storage

    """
    expected_snapshot_id = expected_snapshot["id"]
    expected_branches = expected_snapshot["branches"]
    snap = snapshot_get_all_branches(storage, hash_to_bytes(expected_snapshot_id))
    if snap is None:
        # display known snapshots instead if possible
        if hasattr(storage, "_snapshots"):  # in-mem storage
            from pprint import pprint

            for snap_id, (_snap, _) in storage._snapshots.items():
                snapd = _snap.to_dict()
                snapd["id"] = hash_to_hex(snapd["id"])
                branches = {
                    branch.decode("utf-8"): decode_target(target)
                    for branch, target in snapd["branches"].items()
                }
                snapd["branches"] = branches
                pprint(snapd)
        raise AssertionError("Snapshot is not found")

    branches = {
        branch.decode("utf-8"): decode_target(target)
        for branch, target in snap["branches"].items()
    }
    assert expected_branches == branches
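
For reference, a minimal call sketch (hedged: `storage` is assumed to be an
already-populated StorageInterface instance, and the hex id is borrowed from
Example #10 purely for illustration):

from swh.model.hashutil import hash_to_bytes
from swh.storage.algos.snapshot import snapshot_get_all_branches

# Storage comes first, then the snapshot id as bytes; recent versions return
# a Snapshot model object, or None when the id is unknown to the storage.
snapshot = snapshot_get_all_branches(
    storage, hash_to_bytes("9efecc835e8f99254934f256b5301b94f348fd17")
)
if snapshot is not None:
    print(snapshot.branches)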
Example #2
    def test_load_dangling_symref(self):
        with open(os.path.join(self.destination_path, ".git/HEAD"), "wb") as f:
            f.write(b"ref: refs/heads/dangling-branch\n")

        res = self.loader.load()
        assert res == {"status": "eventful"}

        visit_status = assert_last_visit_matches(
            self.loader.storage, self.repo_url, status="full", type="git"
        )
        snapshot_id = visit_status.snapshot
        assert snapshot_id is not None

        snapshot = snapshot_get_all_branches(self.loader.storage, snapshot_id)
        branches = snapshot.branches

        assert branches[b"HEAD"] == SnapshotBranch(
            target=b"refs/heads/dangling-branch",
            target_type=TargetType.ALIAS,
        )
        assert branches[b"refs/heads/dangling-branch"] is None

        stats = get_stats(self.loader.storage)
        assert stats == {
            "content": 4,
            "directory": 7,
            "origin": 1,
            "origin_visit": 1,
            "release": 0,
            "revision": 7,
            "skipped_content": 0,
            "snapshot": 1,
        }
Example #3
def test_snapshot_large(swh_storage, branch_name, branch_target):  # noqa
    snapshot = Snapshot(
        branches={
            b"%s%05d" % (branch_name, i): branch_target
            for i in range(10000)
        },
    )

    swh_storage.snapshot_add([snapshot])

    returned_snapshot = snapshot_get_all_branches(swh_storage, snapshot.id)
    assert snapshot == returned_snapshot
Example #4
    def finalize_visit(
        self, status_visit: str, errors: Optional[List[str]] = None, **kwargs
    ) -> Dict[str, Any]:
        r = super().finalize_visit(status_visit=status_visit, **kwargs)
        success = status_visit == "full"

        # Update deposit status
        try:
            if not success:
                self.client.status_update(
                    self.deposit_id,
                    status="failed",
                    errors=errors,
                )
                return r

            snapshot_id = hash_to_bytes(r["snapshot_id"])
            snapshot = snapshot_get_all_branches(self.storage, snapshot_id)
            if not snapshot:
                return r
            branches = snapshot.branches
            logger.debug("branches: %s", branches)
            if not branches:
                return r
            rel_id = branches[b"HEAD"].target

            release = self.storage.release_get([rel_id])[0]
            if not release:
                return r

            # update the deposit's status to success with its
            # release-id and directory-id
            self.client.status_update(
                self.deposit_id,
                status="done",
                release_id=hash_to_hex(rel_id),
                directory_id=hash_to_hex(release.target),
                snapshot_id=r["snapshot_id"],
                origin_url=self.origin.url,
            )
        except Exception:
            logger.exception("Problem when trying to update the deposit's status")
            sentry_sdk.capture_exception()
            return {"status": "failed"}
        return r
Example #5
def test_maven_loader_first_visit(swh_storage,
                                  expected_contents_and_directories,
                                  expected_snapshot, expected_releases):
    """With no prior visit, loading a jar ends up with 1 snapshot"""

    loader = MavenLoader(swh_storage, MVN_ORIGIN_URL, artifacts=MVN_ARTIFACTS)

    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"

    actual_snapshot = snapshot_get_all_branches(
        swh_storage, hash_to_bytes(actual_load_status["snapshot_id"]))

    assert actual_load_status["snapshot_id"] == expected_snapshot.id.hex()
    check_snapshot(expected_snapshot, swh_storage)

    stats = get_stats(swh_storage)
    assert_last_visit_matches(swh_storage,
                              MVN_ORIGIN_URL,
                              status="full",
                              type="maven")

    expected_contents, expected_directories = expected_contents_and_directories
    assert list(swh_storage.content_missing_per_sha1(expected_contents)) == []
    assert list(swh_storage.directory_missing(expected_directories)) == []

    rel_id = actual_snapshot.branches[b"releases/0.1.0"].target
    rel2_id = actual_snapshot.branches[b"releases/0.1.1"].target
    releases = swh_storage.release_get([rel_id, rel2_id])

    assert releases == expected_releases

    assert {
        "content": len(expected_contents),
        "directory": len(expected_directories),
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
Example #6
    def index(self, id: str, data: None = None, **kwargs) -> List[Dict]:
        origin_url = id
        visit_status = origin_get_latest_visit_status(
            self.storage, origin_url, allowed_statuses=["full"], require_snapshot=True
        )
        if not visit_status:
            return []
        assert visit_status.snapshot is not None
        snapshot = snapshot_get_all_branches(self.storage, visit_status.snapshot)
        if snapshot is None:
            return []
        method = getattr(
            self, "_try_get_%s_head" % visit_status.type, self._try_get_head_generic
        )

        rev_id = method(snapshot.branches)  # type: ignore
        if rev_id is not None:
            return [{"origin_url": origin_url, "revision_id": rev_id}]

        # could not find a head revision
        return []
Example #7
def _check_revision_in_origin(storage, origin, revision_id):
    seen_snapshots = set()  # no need to visit them again
    seen_revisions = set()

    for visit in iter_origin_visits(storage, origin):
        for status in iter_origin_visit_statuses(storage, origin, visit.visit):
            if status.snapshot is None:
                continue
            if status.snapshot in seen_snapshots:
                continue
            seen_snapshots.add(status.snapshot)
            snapshot = snapshot_get_all_branches(storage, status.snapshot)
            for (branch_name, branch) in snapshot.branches.items():
                if branch is None:
                    continue

                # If it's the revision passed as argument, then it is indeed in the
                # origin
                if branch.target == revision_id:
                    return True

                # Else, let's make sure the branch doesn't have any other revision

                # Get the revision at the top of the branch.
                if branch.target in seen_revisions:
                    continue
                seen_revisions.add(branch.target)
                revision = storage.revision_get([branch.target])[0]

                if revision is None:
                    # https://forge.softwareheritage.org/T997
                    continue

                # Check it doesn't have parents (else we would have to
                # recurse)
                assert revision.parents == (), "revision with parents"

    return False
Example #8
    def snapshot_get(self, snapshot_id):
        snp = snapshot_get_all_branches(self.storage,
                                        hash_to_bytes(snapshot_id))
        return converters.from_snapshot(snp.to_dict())
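
Note that snp may be None when the id is unknown (see Examples #1 and #12);
a hedged defensive variant of the same method:

    def snapshot_get(self, snapshot_id):
        snp = snapshot_get_all_branches(self.storage,
                                        hash_to_bytes(snapshot_id))
        # Guard against unknown snapshot ids instead of letting
        # snp.to_dict() raise AttributeError on None.
        return converters.from_snapshot(snp.to_dict()) if snp else None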
Example #9
    def test_load_changed(self):
        """Loads a repository, makes some changes by adding files, commits,
        and merges, load it again, and check the storage contains everything
        it should."""
        # Initial load
        res = self.loader.load()
        assert res == {"status": "eventful"}

        stats0 = get_stats(self.loader.storage)
        assert stats0 == {
            "content": 4,
            "directory": 7,
            "origin": 1,
            "origin_visit": 1,
            "release": 0,
            "revision": 7,
            "skipped_content": 0,
            "snapshot": 1,
        }

        # Load with a new file + revision
        with open(os.path.join(self.destination_path, "hello.py"), "a") as fd:
            fd.write("print('Hello world')\n")

        self.repo.stage([b"hello.py"])
        new_revision = self.repo.do_commit(b"Hello world\n").decode()
        new_dir = "85dae072a5aa9923ffa7a7568f819ff21bf49858"

        assert self.repo[new_revision.encode()].tree == new_dir.encode()

        revisions = REVISIONS1.copy()
        assert new_revision not in revisions
        revisions[new_revision] = new_dir

        res = self.loader.load()
        assert res == {"status": "eventful"}

        stats1 = get_stats(self.loader.storage)
        expected_stats = copy.deepcopy(stats0)
        # did one new visit
        expected_stats["origin_visit"] += 1
        # with one more of the following objects
        expected_stats["content"] += 1
        expected_stats["directory"] += 1
        expected_stats["revision"] += 1
        # concluding into 1 new snapshot
        expected_stats["snapshot"] += 1

        assert stats1 == expected_stats

        visit_status = assert_last_visit_matches(
            self.loader.storage, self.repo_url, status="full", type="git"
        )
        assert visit_status.snapshot is not None

        snapshot_id = visit_status.snapshot
        snapshot = snapshot_get_all_branches(self.loader.storage, snapshot_id)
        branches = snapshot.branches
        assert branches[b"HEAD"] == SnapshotBranch(
            target=b"refs/heads/master",
            target_type=TargetType.ALIAS,
        )
        assert branches[b"refs/heads/master"] == SnapshotBranch(
            target=hash_to_bytes(new_revision),
            target_type=TargetType.REVISION,
        )

        # Merge branch1 into HEAD.

        current = self.repo[b"HEAD"]
        branch1 = self.repo[b"refs/heads/branch1"]

        merged_tree = dulwich.objects.Tree()
        for item in self.repo[current.tree].items():
            merged_tree.add(*item)
        for item in self.repo[branch1.tree].items():
            merged_tree.add(*item)

        merged_dir_id = "dab8a37df8db8666d4e277bef9a546f585b5bedd"
        assert merged_tree.id.decode() == merged_dir_id
        self.repo.object_store.add_object(merged_tree)

        merge_commit = self.repo.do_commit(
            b"merge.\n", tree=merged_tree.id, merge_heads=[branch1.id]
        )

        assert merge_commit.decode() not in revisions
        revisions[merge_commit.decode()] = merged_tree.id.decode()

        res = self.loader.load()
        assert res == {"status": "eventful"}

        stats2 = get_stats(self.loader.storage)
        expected_stats = copy.deepcopy(stats1)
        # one more visit
        expected_stats["origin_visit"] += 1
        # with 1 new directory and revision
        expected_stats["directory"] += 1
        expected_stats["revision"] += 1
        # concluding into 1 new snapshot
        expected_stats["snapshot"] += 1

        assert stats2 == expected_stats

        visit_status = assert_last_visit_matches(
            self.loader.storage, self.repo_url, status="full", type="git"
        )
        assert visit_status.snapshot is not None

        merge_snapshot_id = visit_status.snapshot
        assert merge_snapshot_id != snapshot_id

        merge_snapshot = snapshot_get_all_branches(
            self.loader.storage, merge_snapshot_id
        )
        merge_branches = merge_snapshot.branches
        assert merge_branches[b"HEAD"] == SnapshotBranch(
            target=b"refs/heads/master",
            target_type=TargetType.ALIAS,
        )
        assert merge_branches[b"refs/heads/master"] == SnapshotBranch(
            target=hash_to_bytes(merge_commit.decode()),
            target_type=TargetType.REVISION,
        )
Example #10
def test_load_nixguix_one_common_artifact_from_other_loader(
        swh_storage, datadir, requests_mock_datadir_visits, caplog):
    """Misformatted revision should be caught and logged, then loading continues"""
    caplog.set_level(logging.ERROR, "swh.loader.package.nixguix.loader")

    # 1. first ingest with for example the archive loader
    gnu_url = "https://ftp.gnu.org/gnu/8sync/"
    release = "0.1.0"
    artifact_url = f"https://ftp.gnu.org/gnu/8sync/8sync-{release}.tar.gz"
    gnu_artifacts = [{
        "time": 944729610,
        "url": artifact_url,
        "length": 221837,
        "filename": f"8sync-{release}.tar.gz",
        "version": release,
    }]
    archive_loader = ArchiveLoader(swh_storage,
                                   url=gnu_url,
                                   artifacts=gnu_artifacts)
    actual_load_status = archive_loader.load()
    expected_snapshot_id = "9efecc835e8f99254934f256b5301b94f348fd17"
    assert actual_load_status["status"] == "eventful"
    assert actual_load_status["snapshot_id"] == expected_snapshot_id  # noqa

    assert_last_visit_matches(
        archive_loader.storage,
        gnu_url,
        status="full",
        type="tar",
        snapshot=hash_to_bytes(expected_snapshot_id),
    )

    # 2. Then ingest with the nixguix loader which lists the same artifact within its
    # sources.json

    # ensure test setup is ok
    data_sources = os.path.join(datadir, "https_nix-community.github.io",
                                "nixpkgs-swh_sources_special.json")
    with open(data_sources) as f:
        all_sources = json.load(f)
    found = False
    for source in all_sources["sources"]:
        if source["urls"][0] == artifact_url:
            found = True

    assert (
        found is True
    ), f"test setup error: {artifact_url} must be in {data_sources}"

    # first visit with a snapshot, ok
    sources_url = "https://nix-community.github.io/nixpkgs-swh/sources_special.json"
    loader = NixGuixLoader(swh_storage, sources_url)
    actual_load_status2 = loader.load()
    assert actual_load_status2["status"] == "eventful"

    snapshot_id = actual_load_status2["snapshot_id"]

    assert_last_visit_matches(
        swh_storage,
        sources_url,
        status="full",
        type="nixguix",
        snapshot=hash_to_bytes(snapshot_id),
    )

    snapshot = snapshot_get_all_branches(swh_storage,
                                         hash_to_bytes(snapshot_id))
    assert snapshot
Example #11
    def push_snapshot_subgraph(self, obj_id: Sha1Git) -> None:
        """Fetches a snapshot and all its children, excluding directories and contents,
        and pushes them to the todo-lists.

        Also loads revisions if swh-graph is not available, see
        :meth:`push_revision_subgraph`."""
        loaded_from_graph = False

        if self.graph:
            revision_ids = []
            release_ids = []
            directory_ids = []
            content_ids = []

            from swh.graph.client import GraphArgumentException

            # First, try to cook using swh-graph, as it is more efficient than
            # swh-storage for querying the history
            obj_swhid = CoreSWHID(
                object_type=ObjectType.SNAPSHOT,
                object_id=obj_id,
            )
            try:
                swhids: Iterable[CoreSWHID] = map(
                    CoreSWHID.from_string,
                    self.graph.visit_nodes(str(obj_swhid),
                                           edges="snp:*,rel:*,rev:rev"),
                )
                for swhid in swhids:
                    if swhid.object_type is ObjectType.REVISION:
                        revision_ids.append(swhid.object_id)
                    elif swhid.object_type is ObjectType.RELEASE:
                        release_ids.append(swhid.object_id)
                    elif swhid.object_type is ObjectType.DIRECTORY:
                        directory_ids.append(swhid.object_id)
                    elif swhid.object_type is ObjectType.CONTENT:
                        content_ids.append(swhid.object_id)
                    elif swhid.object_type is ObjectType.SNAPSHOT:
                        assert (
                            swhid.object_id == obj_id
                        ), f"Snapshot {obj_id.hex()} references a different snapshot"
                    else:
                        assert_never(swhid.object_type,
                                     f"Unexpected SWHID object type: {swhid}")
            except GraphArgumentException as e:
                logger.info(
                    "Snapshot %s not found in swh-graph, falling back to fetching "
                    "history for each branch. %s",
                    hash_to_hex(obj_id),
                    e.args[0],
                )
            else:
                self._push(self._rev_stack, revision_ids)
                self._push(self._rel_stack, release_ids)
                self._push(self._dir_stack, directory_ids)
                self._push(self._cnt_stack, content_ids)
                loaded_from_graph = True

        # TODO: when self.graph is available and supports edge labels, use it
        # directly to get branch names.
        snapshot: Optional[Snapshot] = snapshot_get_all_branches(
            self.storage, obj_id)
        assert snapshot, "Unknown snapshot"  # should have been caught by check_exists()
        for branch in snapshot.branches.values():
            if not loaded_from_graph:
                if branch is None:
                    logging.warning("Dangling branch: %r", branch)
                    continue
                assert isinstance(branch, SnapshotBranch)  # for mypy
                if branch.target_type is TargetType.REVISION:
                    self.push_revision_subgraph(branch.target)
                elif branch.target_type is TargetType.RELEASE:
                    self.push_releases_subgraphs([branch.target])
                elif branch.target_type is TargetType.ALIAS:
                    # Nothing to do, this for loop also iterates on the target branch
                    # (if it exists)
                    pass
                elif branch.target_type is TargetType.DIRECTORY:
                    self._push(self._dir_stack, [branch.target])
                elif branch.target_type is TargetType.CONTENT:
                    self._push(self._cnt_stack, [branch.target])
                elif branch.target_type is TargetType.SNAPSHOT:
                    if branch.target != obj_id:
                        raise NotImplementedError(
                            f"Snapshot {obj_id.hex()} has a snapshot as a branch.")
                else:
                    assert_never(branch.target_type,
                                 f"Unexpected target type: {branch.target_type}")

        self.write_refs(snapshot=snapshot)
Example #12
def check_snapshot(
    expected_snapshot: Snapshot,
    storage: StorageInterface,
    allowed_empty: Iterable[Tuple[TargetType, bytes]] = [],
) -> Snapshot:
    """Check that:
    - snapshot exists in the storage and match
    - each object reference up to the revision/release targets exists

    Args:
        expected_snapshot: full snapshot to check for existence and consistency
        storage: storage to lookup information into
        allowed_empty: Iterable of branches we allow to be empty (some edge-case
          loaders allow this to happen; nixguix, for example, allows the branch
          "evaluation" to target the nixpkgs git commit reference, which may not
          yet be resolvable at loading time)

    Returns:
        the snapshot stored in the storage, for further test assertions if
        needed.

    """
    __tracebackhide__ = True  # Hide from pytest tracebacks on failure
    if not isinstance(expected_snapshot, Snapshot):
        raise AssertionError(
            f"argument 'expected_snapshot' must be a snapshot: {expected_snapshot!r}"
        )

    snapshot = snapshot_get_all_branches(storage, expected_snapshot.id)
    if snapshot is None:
        raise AssertionError(f"Snapshot {expected_snapshot.id.hex()} is not found")

    assert snapshot == expected_snapshot

    objects_by_target_type = defaultdict(list)
    object_to_branch = {}
    for branch, target in expected_snapshot.branches.items():
        if (target.target_type, branch) in allowed_empty:
            # safe for those elements to not be checked for existence
            continue
        objects_by_target_type[target.target_type].append(target.target)
        object_to_branch[target.target] = branch

    # check that alias references target something that exists, otherwise raise
    aliases: List[bytes] = objects_by_target_type.get(TargetType.ALIAS, [])
    for alias in aliases:
        if alias not in expected_snapshot.branches:
            raise InconsistentAliasBranchError(
                f"Alias branch {alias.decode('utf-8')} "
                f"should be in {list(expected_snapshot.branches)}"
            )

    revs = objects_by_target_type.get(TargetType.REVISION)
    if revs:
        revisions = storage.revision_get(revs)
        not_found = [rev_id for rev_id, rev in zip(revs, revisions) if rev is None]
        if not_found:
            missing_objs = ", ".join(
                str((object_to_branch[rev], rev.hex())) for rev in not_found
            )
            raise InexistentObjectsError(
                f"Branch/Revision(s) {missing_objs} should exist in storage"
            )
        # retrieve information from revision
        for revision in revisions:
            assert revision is not None
            objects_by_target_type[TargetType.DIRECTORY].append(revision.directory)
            object_to_branch[revision.directory] = revision.id

    rels = objects_by_target_type.get(TargetType.RELEASE)
    if rels:
        not_found = list(storage.release_missing(rels))
        if not_found:
            missing_objs = ", ".join(
                str((object_to_branch[rel], rel.hex())) for rel in not_found
            )
            raise InexistentObjectsError(
                f"Branch/Release(s) {missing_objs} should exist in storage"
            )

    # first level dirs exist?
    dirs = objects_by_target_type.get(TargetType.DIRECTORY)
    if dirs:
        not_found = list(storage.directory_missing(dirs))
        if not_found:
            missing_objs = ", ".join(
                str((object_to_branch[dir_].hex(), dir_.hex())) for dir_ in not_found
            )
            raise InexistentObjectsError(
                f"Missing directories {missing_objs}: "
                "(revision exists, directory target does not)"
            )
        for dir_ in dirs:  # retrieve new objects to check for existence
            paths = storage.directory_ls(dir_, recursive=True)
            for path in paths:
                if path["type"] == "dir":
                    target_type = TargetType.DIRECTORY
                else:
                    target_type = TargetType.CONTENT
                target = path["target"]
                objects_by_target_type[target_type].append(target)
                object_to_branch[target] = dir_

    # check nested directories
    dirs = objects_by_target_type.get(TargetType.DIRECTORY)
    if dirs:
        not_found = list(storage.directory_missing(dirs))
        if not_found:
            missing_objs = ", ".join(
                str((object_to_branch[dir_].hex(), dir_.hex())) for dir_ in not_found
            )
            raise InexistentObjectsError(
                f"Missing directories {missing_objs}: "
                "(revision exists, directory target does not)"
            )

    # check that contents exist
    cnts = objects_by_target_type.get(TargetType.CONTENT)
    if cnts:
        not_found = list(storage.content_missing_per_sha1_git(cnts))
        if not_found:
            missing_objs = ", ".join(
                str((object_to_branch[cnt].hex(), cnt.hex())) for cnt in not_found
            )
            raise InexistentObjectsError(f"Missing contents {missing_objs}")

    return snapshot
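
A hedged usage sketch for the allowed_empty parameter, following the nixguix
case mentioned in the docstring (the branch name b"evaluation" and the
TargetType.REVISION pairing are assumptions based on that description):

# Skip the existence check for the "evaluation" branch, whose revision
# target may not be loadable yet.
check_snapshot(
    expected_snapshot,
    swh_storage,
    allowed_empty=[(TargetType.REVISION, b"evaluation")],
)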
Example #13
def test_snapshot_small(swh_storage, snapshot):  # noqa
    swh_storage.snapshot_add([snapshot])

    returned_snapshot = snapshot_get_all_branches(swh_storage, snapshot.id)
    assert snapshot == returned_snapshot