def check_snapshot(expected_snapshot, storage):
    """Check for snapshot match.

    Provide the hashes as hexadecimal, the conversion is done
    within the method.

    Args:
        expected_snapshot (dict): full snapshot with hex ids
        storage (Storage): expected storage

    Raises:
        AssertionError: when the snapshot cannot be found, or when its
            decoded branches differ from the expected ones.

    """
    expected_snapshot_id = expected_snapshot["id"]
    expected_branches = expected_snapshot["branches"]
    snap = snapshot_get_all_branches(storage, hash_to_bytes(expected_snapshot_id))
    if snap is None:
        # display known snapshots instead if possible
        if hasattr(storage, "_snapshots"):  # in-mem storage
            from pprint import pprint

            for _snap, _ in storage._snapshots.values():
                snapd = _snap.to_dict()
                snapd["id"] = hash_to_hex(snapd["id"])
                snapd["branches"] = {
                    branch.decode("utf-8"): decode_target(target)
                    for branch, target in snapd["branches"].items()
                }
                pprint(snapd)
        raise AssertionError("Snapshot is not found")

    # Keys are the (bytes) branch names, values are the targets to decode;
    # mixing the two up would call .decode() on a target.
    branches = {
        branch_name.decode("utf-8"): decode_target(target)
        for branch_name, target in snap["branches"].items()
    }
    assert expected_branches == branches
def test_load_dangling_symref(self):
    """Load a repository whose HEAD is rewritten to ``refs/heads/dangling-branch``.

    The resulting snapshot must expose HEAD as an alias to that branch, and
    the branch itself must be recorded as ``None``.
    """
    head_path = os.path.join(self.destination_path, ".git/HEAD")
    with open(head_path, "wb") as head_file:
        head_file.write(b"ref: refs/heads/dangling-branch\n")

    load_status = self.loader.load()
    assert load_status == {"status": "eventful"}

    last_visit = assert_last_visit_matches(
        self.loader.storage, self.repo_url, status="full", type="git"
    )
    assert last_visit.snapshot is not None

    stored = snapshot_get_all_branches(self.loader.storage, last_visit.snapshot)
    stored_branches = stored.branches

    assert stored_branches[b"HEAD"] == SnapshotBranch(
        target=b"refs/heads/dangling-branch",
        target_type=TargetType.ALIAS,
    )
    assert stored_branches[b"refs/heads/dangling-branch"] is None

    expected_stats = {
        "content": 4,
        "directory": 7,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 7,
        "skipped_content": 0,
        "snapshot": 1,
    }
    assert get_stats(self.loader.storage) == expected_stats
def test_snapshot_large(swh_storage, branch_name, branch_target):  # noqa
    """A snapshot with 10000 branches is read back unchanged from the storage."""
    many_branches = {}
    for index in range(10000):
        many_branches[b"%s%05d" % (branch_name, index)] = branch_target
    big_snapshot = Snapshot(branches=many_branches)

    swh_storage.snapshot_add([big_snapshot])

    fetched = snapshot_get_all_branches(swh_storage, big_snapshot.id)
    assert big_snapshot == fetched
def finalize_visit(
    self, status_visit: str, errors: Optional[List[str]] = None, **kwargs
) -> Dict[str, Any]:
    """Finalize the visit, then report its outcome to the deposit client.

    Args:
        status_visit: final status of the visit; only ``"full"`` is treated
            as a success.
        errors: error messages forwarded to the deposit client on failure.
        **kwargs: passed through to the parent ``finalize_visit``.

    Returns:
        The parent's result, or ``{"status": "failed"}`` when anything raises
        while the deposit status is being updated.
    """
    visit_result = super().finalize_visit(status_visit=status_visit, **kwargs)
    visit_failed = status_visit != "full"

    # Update deposit status
    try:
        if visit_failed:
            self.client.status_update(
                self.deposit_id,
                status="failed",
                errors=errors,
            )
            return visit_result

        snapshot = snapshot_get_all_branches(
            self.storage, hash_to_bytes(visit_result["snapshot_id"])
        )
        if not snapshot:
            return visit_result

        branches = snapshot.branches
        logger.debug("branches: %s", branches)
        if not branches:
            return visit_result

        head_target = branches[b"HEAD"].target
        head_release = self.storage.release_get([head_target])[0]
        if not head_release:
            return visit_result

        # Everything resolved: mark the deposit as done, with its
        # release-id and directory-id
        self.client.status_update(
            self.deposit_id,
            status="done",
            release_id=hash_to_hex(head_target),
            directory_id=hash_to_hex(head_release.target),
            snapshot_id=visit_result["snapshot_id"],
            origin_url=self.origin.url,
        )
    except Exception:
        logger.exception("Problem when trying to update the deposit's status")
        sentry_sdk.capture_exception()
        return {"status": "failed"}

    return visit_result
def test_maven_loader_first_visit(
    swh_storage, expected_contents_and_directories, expected_snapshot, expected_releases
):
    """With no prior visit, loading a jar ends up with 1 snapshot"""
    maven_loader = MavenLoader(swh_storage, MVN_ORIGIN_URL, artifacts=MVN_ARTIFACTS)

    load_status = maven_loader.load()
    assert load_status["status"] == "eventful"

    loaded_snapshot_id = hash_to_bytes(load_status["snapshot_id"])
    loaded_snapshot = snapshot_get_all_branches(swh_storage, loaded_snapshot_id)

    assert load_status["snapshot_id"] == expected_snapshot.id.hex()
    check_snapshot(expected_snapshot, swh_storage)

    stats = get_stats(swh_storage)
    assert_last_visit_matches(
        swh_storage, MVN_ORIGIN_URL, status="full", type="maven"
    )

    # Every expected content and directory must already be in the storage
    expected_contents, expected_directories = expected_contents_and_directories
    assert list(swh_storage.content_missing_per_sha1(expected_contents)) == []
    assert list(swh_storage.directory_missing(expected_directories)) == []

    release_ids = [
        loaded_snapshot.branches[branch].target
        for branch in (b"releases/0.1.0", b"releases/0.1.1")
    ]
    assert swh_storage.release_get(release_ids) == expected_releases

    expected_stats = {
        "content": len(expected_contents),
        "directory": len(expected_directories),
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    }
    assert expected_stats == stats
def index(self, id: str, data: None = None, **kwargs) -> List[Dict]:
    """Look up the head revision of the latest full visit of an origin.

    Args:
        id: URL of the origin.
        data: unused.
        **kwargs: unused.

    Returns:
        A one-element list holding ``origin_url`` and ``revision_id``, or an
        empty list when there is no full visit with a snapshot, the snapshot
        cannot be fetched, or no head revision is found.
    """
    origin_url = id

    latest_status = origin_get_latest_visit_status(
        self.storage, origin_url, allowed_statuses=["full"], require_snapshot=True
    )
    if not latest_status:
        return []
    assert latest_status.snapshot is not None

    snapshot = snapshot_get_all_branches(self.storage, latest_status.snapshot)
    if snapshot is None:
        return []

    # Use the visit-type specific head finder when this class defines one,
    # the generic one otherwise
    finder_name = "_try_get_%s_head" % latest_status.type
    find_head = getattr(self, finder_name, self._try_get_head_generic)
    head_revision = find_head(snapshot.branches)  # type: ignore

    if head_revision is None:
        # could not find a head revision
        return []
    return [{"origin_url": origin_url, "revision_id": head_revision}]
def _check_revision_in_origin(storage, origin, revision_id):
    """Tell whether ``revision_id`` is the target of a branch in any snapshot
    of ``origin``.

    Only branch tips are compared. Every other tip found in the storage is
    asserted to have no parents, since history is not walked.
    """
    visited_snapshots = set()  # no need to visit them again
    visited_targets = set()

    for visit in iter_origin_visits(storage, origin):
        for status in iter_origin_visit_statuses(storage, origin, visit.visit):
            snapshot_id = status.snapshot
            if snapshot_id is None or snapshot_id in visited_snapshots:
                continue
            visited_snapshots.add(snapshot_id)

            snapshot = snapshot_get_all_branches(storage, snapshot_id)
            for branch in snapshot.branches.values():
                if branch is None:
                    continue

                tip = branch.target

                # If it's the revision passed as argument, then it is indeed
                # in the origin
                if tip == revision_id:
                    return True

                # Otherwise make sure the branch holds no other revision, by
                # looking at the revision at its tip (once per target).
                if tip in visited_targets:
                    continue
                visited_targets.add(tip)

                tip_revision = storage.revision_get([tip])[0]
                if tip_revision is None:
                    # https://forge.softwareheritage.org/T997
                    continue

                # Check it doesn't have parents (else we would have to
                # recurse)
                assert tip_revision.parents == (), "revision with parents"

    return False
def snapshot_get(self, snapshot_id):
    """Fetch a snapshot with all its branches and convert it with
    ``converters.from_snapshot``."""
    raw_id = hash_to_bytes(snapshot_id)
    full_snapshot = snapshot_get_all_branches(self.storage, raw_id)
    as_dict = full_snapshot.to_dict()
    return converters.from_snapshot(as_dict)
def test_load_changed(self):
    """Loads a repository, makes some changes by adding files, commits,
    and merges, load it again, and check the storage contains everything
    it should."""
    # Initial load
    first_status = self.loader.load()
    assert first_status == {"status": "eventful"}

    stats0 = get_stats(self.loader.storage)
    assert stats0 == {
        "content": 4,
        "directory": 7,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 7,
        "skipped_content": 0,
        "snapshot": 1,
    }

    # Load with a new file + revision
    hello_path = os.path.join(self.destination_path, "hello.py")
    with open(hello_path, "a") as hello_file:
        hello_file.write("print('Hello world')\n")

    self.repo.stage([b"hello.py"])
    new_revision = self.repo.do_commit(b"Hello world\n").decode()
    new_dir = "85dae072a5aa9923ffa7a7568f819ff21bf49858"

    assert self.repo[new_revision.encode()].tree == new_dir.encode()

    known_revisions = REVISIONS1.copy()
    assert new_revision not in known_revisions
    known_revisions[new_revision] = new_dir

    second_status = self.loader.load()
    assert second_status == {"status": "eventful"}

    stats1 = get_stats(self.loader.storage)
    # One new visit, one more content, directory and revision, concluding
    # into one new snapshot
    expected_stats = copy.deepcopy(stats0)
    for counter in ("origin_visit", "content", "directory", "revision", "snapshot"):
        expected_stats[counter] += 1

    assert stats1 == expected_stats

    last_visit = assert_last_visit_matches(
        self.loader.storage, self.repo_url, status="full", type="git"
    )
    assert last_visit.snapshot is not None

    snapshot_id = last_visit.snapshot
    stored_snapshot = snapshot_get_all_branches(self.loader.storage, snapshot_id)
    stored_branches = stored_snapshot.branches

    assert stored_branches[b"HEAD"] == SnapshotBranch(
        target=b"refs/heads/master",
        target_type=TargetType.ALIAS,
    )
    assert stored_branches[b"refs/heads/master"] == SnapshotBranch(
        target=hash_to_bytes(new_revision),
        target_type=TargetType.REVISION,
    )

    # Merge branch1 into HEAD.
    head_commit = self.repo[b"HEAD"]
    branch1 = self.repo[b"refs/heads/branch1"]

    merged_tree = dulwich.objects.Tree()
    for parent_commit in (head_commit, branch1):
        for entry in self.repo[parent_commit.tree].items():
            merged_tree.add(*entry)

    merged_dir_id = "dab8a37df8db8666d4e277bef9a546f585b5bedd"
    assert merged_tree.id.decode() == merged_dir_id
    self.repo.object_store.add_object(merged_tree)

    merge_commit = self.repo.do_commit(
        b"merge.\n", tree=merged_tree.id, merge_heads=[branch1.id]
    )

    assert merge_commit.decode() not in known_revisions
    known_revisions[merge_commit.decode()] = merged_tree.id.decode()

    third_status = self.loader.load()
    assert third_status == {"status": "eventful"}

    stats2 = get_stats(self.loader.storage)
    # One more visit, with 1 new directory and revision, concluding into
    # 1 new snapshot
    expected_stats = copy.deepcopy(stats1)
    for counter in ("origin_visit", "directory", "revision", "snapshot"):
        expected_stats[counter] += 1

    assert stats2 == expected_stats

    last_visit = assert_last_visit_matches(
        self.loader.storage, self.repo_url, status="full", type="git"
    )
    assert last_visit.snapshot is not None

    merge_snapshot_id = last_visit.snapshot
    assert merge_snapshot_id != snapshot_id

    merge_snapshot = snapshot_get_all_branches(
        self.loader.storage, merge_snapshot_id
    )
    merge_branches = merge_snapshot.branches

    assert merge_branches[b"HEAD"] == SnapshotBranch(
        target=b"refs/heads/master",
        target_type=TargetType.ALIAS,
    )
    assert merge_branches[b"refs/heads/master"] == SnapshotBranch(
        target=hash_to_bytes(merge_commit.decode()),
        target_type=TargetType.REVISION,
    )
def test_load_nixguix_one_common_artifact_from_other_loader(
    swh_storage, datadir, requests_mock_datadir_visits, caplog
):
    """Ingest an artifact with the archive loader, then ingest a nixguix
    sources file listing that same artifact.

    Both loads are expected to be eventful and to end with a full visit
    pointing at the snapshot they report.
    """
    caplog.set_level(logging.ERROR, "swh.loader.package.nixguix.loader")

    # 1. first ingest with for example the archive loader
    gnu_url = "https://ftp.gnu.org/gnu/8sync/"
    release = "0.1.0"
    artifact_url = f"https://ftp.gnu.org/gnu/8sync/8sync-{release}.tar.gz"
    gnu_artifacts = [
        {
            "time": 944729610,
            "url": artifact_url,
            "length": 221837,
            "filename": f"8sync-{release}.tar.gz",
            "version": release,
        }
    ]
    archive_loader = ArchiveLoader(swh_storage, url=gnu_url, artifacts=gnu_artifacts)
    actual_load_status = archive_loader.load()
    expected_snapshot_id = "9efecc835e8f99254934f256b5301b94f348fd17"
    assert actual_load_status["status"] == "eventful"
    assert actual_load_status["snapshot_id"] == expected_snapshot_id  # noqa

    assert_last_visit_matches(
        archive_loader.storage,
        gnu_url,
        status="full",
        type="tar",
        snapshot=hash_to_bytes(expected_snapshot_id),
    )

    # 2. Then ingest with the nixguix loader which lists the same artifact
    # within its sources.json

    # ensure test setup is ok
    data_sources = os.path.join(
        datadir, "https_nix-community.github.io", "nixpkgs-swh_sources_special.json"
    )
    # Context manager so the fixture file is closed deterministically
    with open(data_sources) as sources_file:
        all_sources = json.load(sources_file)
    found = any(
        source["urls"][0] == artifact_url for source in all_sources["sources"]
    )
    assert (
        found is True
    ), f"test setup error: {artifact_url} must be in {data_sources}"

    # first visit with a snapshot, ok
    sources_url = "https://nix-community.github.io/nixpkgs-swh/sources_special.json"
    loader = NixGuixLoader(swh_storage, sources_url)
    actual_load_status2 = loader.load()
    assert actual_load_status2["status"] == "eventful"

    snapshot_id = actual_load_status2["snapshot_id"]

    assert_last_visit_matches(
        swh_storage,
        sources_url,
        status="full",
        type="nixguix",
        snapshot=hash_to_bytes(snapshot_id),
    )

    snapshot = snapshot_get_all_branches(swh_storage, hash_to_bytes(snapshot_id))
    assert snapshot
def push_snapshot_subgraph(self, obj_id: Sha1Git) -> None:
    """Fetches a snapshot and all its children, excluding directories and contents,
    and pushes them to the todo-lists.

    Also loads revisions if swh-graph is not available, see
    :meth:`push_revision_subgraph`.

    Args:
        obj_id: identifier of the snapshot to process.

    Raises:
        NotImplementedError: in the storage fallback, when a branch targets a
            snapshot other than ``obj_id`` itself.
    """
    loaded_from_graph = False

    # Built up front: it is needed both for the graph query and for the
    # error message of the storage fallback below.
    obj_swhid = CoreSWHID(
        object_type=ObjectType.SNAPSHOT,
        object_id=obj_id,
    )

    if self.graph:
        revision_ids = []
        release_ids = []
        directory_ids = []
        content_ids = []

        from swh.graph.client import GraphArgumentException

        # First, try to cook using swh-graph, as it is more efficient than
        # swh-storage for querying the history
        try:
            swhids: Iterable[CoreSWHID] = map(
                CoreSWHID.from_string,
                self.graph.visit_nodes(str(obj_swhid), edges="snp:*,rel:*,rev:rev"),
            )
            for swhid in swhids:
                if swhid.object_type is ObjectType.REVISION:
                    revision_ids.append(swhid.object_id)
                elif swhid.object_type is ObjectType.RELEASE:
                    release_ids.append(swhid.object_id)
                elif swhid.object_type is ObjectType.DIRECTORY:
                    directory_ids.append(swhid.object_id)
                elif swhid.object_type is ObjectType.CONTENT:
                    content_ids.append(swhid.object_id)
                elif swhid.object_type is ObjectType.SNAPSHOT:
                    assert (
                        swhid.object_id == obj_id
                    ), f"Snapshot {obj_id.hex()} references a different snapshot"
                else:
                    assert_never(
                        swhid.object_type, f"Unexpected SWHID object type: {swhid}"
                    )
        except GraphArgumentException as e:
            logger.info(
                "Snapshot %s not found in swh-graph, falling back to fetching "
                "history for each branch. %s",
                hash_to_hex(obj_id),
                e.args[0],
            )
        else:
            self._push(self._rev_stack, revision_ids)
            self._push(self._rel_stack, release_ids)
            self._push(self._dir_stack, directory_ids)
            self._push(self._cnt_stack, content_ids)
            loaded_from_graph = True

    # TODO: when self.graph is available and supports edge labels, use it
    # directly to get branch names.
    snapshot: Optional[Snapshot] = snapshot_get_all_branches(self.storage, obj_id)
    assert snapshot, "Unknown snapshot"  # should have been caught by check_exists()

    if not loaded_from_graph:
        # Storage fallback: walk each branch ourselves. This must not rely on
        # any variable of the graph loop above, which may never have run.
        for branch_name, branch in snapshot.branches.items():
            if branch is None:
                logger.warning("Dangling branch: %r", branch_name)
                continue
            assert isinstance(branch, SnapshotBranch)  # for mypy
            if branch.target_type is TargetType.REVISION:
                self.push_revision_subgraph(branch.target)
            elif branch.target_type is TargetType.RELEASE:
                self.push_releases_subgraphs([branch.target])
            elif branch.target_type is TargetType.ALIAS:
                # Nothing to do, this for loop also iterates on the target branch
                # (if it exists)
                pass
            elif branch.target_type is TargetType.DIRECTORY:
                self._push(self._dir_stack, [branch.target])
            elif branch.target_type is TargetType.CONTENT:
                self._push(self._cnt_stack, [branch.target])
            elif branch.target_type is TargetType.SNAPSHOT:
                if branch.target != obj_id:
                    raise NotImplementedError(
                        f"{obj_swhid} has a snapshot as a branch."
                    )
            else:
                assert_never(
                    branch.target_type,
                    f"Unexpected target type: {branch.target_type}",
                )

    self.write_refs(snapshot=snapshot)
def check_snapshot(
    expected_snapshot: Snapshot,
    storage: StorageInterface,
    allowed_empty: Iterable[Tuple[TargetType, bytes]] = (),
) -> Snapshot:
    """Check that:
    - snapshot exists in the storage and match
    - each object reference up to the revision/release targets exists

    Args:
        expected_snapshot: full snapshot to check for existence and consistency
        storage: storage to lookup information into
        allowed_empty: Iterable of branch we allow to be empty (some edge case loaders
          allows this case to happen, nixguix for example allows the branch
          "evaluation" to target the nixpkgs git commit reference, which may not
          yet be resolvable at loading time)

    Returns:
        the snapshot stored in the storage for further test assertion if any is
        needed.

    Raises:
        AssertionError: if ``expected_snapshot`` is not a Snapshot, is not found
          in the storage, or differs from the stored one.
        InconsistentAliasBranchError: if an alias targets a branch that is not
          part of the snapshot.
        InexistentObjectsError: if a referenced revision, release, directory or
          content is missing from the storage.

    """
    __tracebackhide__ = True  # Hide from pytest tracebacks on failure
    if not isinstance(expected_snapshot, Snapshot):
        raise AssertionError(
            f"argument 'expected_snapshot' must be a snapshot: {expected_snapshot!r}"
        )

    snapshot = snapshot_get_all_branches(storage, expected_snapshot.id)
    if snapshot is None:
        raise AssertionError(f"Snapshot {expected_snapshot.id.hex()} is not found")

    assert snapshot == expected_snapshot

    # Materialize once: a one-shot iterable would otherwise be exhausted by
    # the first membership test below.
    allowed = tuple(allowed_empty)

    objects_by_target_type = defaultdict(list)
    object_to_branch = {}
    for branch, target in expected_snapshot.branches.items():
        if target is None:
            # dangling branch: there is no target to look up
            continue
        if (target.target_type, branch) in allowed:
            # safe for those elements to not be checked for existence
            continue
        objects_by_target_type[target.target_type].append(target.target)
        object_to_branch[target.target] = branch

    # check that alias references target something that exists, otherwise raise
    aliases: List[bytes] = objects_by_target_type.get(TargetType.ALIAS, [])
    for alias in aliases:
        if alias not in expected_snapshot.branches:
            raise InconsistentAliasBranchError(
                f"Alias branch {alias.decode('utf-8')} "
                f"should be in {list(expected_snapshot.branches)}"
            )

    revs = objects_by_target_type.get(TargetType.REVISION)
    if revs:
        revisions = storage.revision_get(revs)
        not_found = [rev_id for rev_id, rev in zip(revs, revisions) if rev is None]
        if not_found:
            missing_objs = ", ".join(
                str((object_to_branch[rev], rev.hex())) for rev in not_found
            )
            raise InexistentObjectsError(
                f"Branch/Revision(s) {missing_objs} should exist in storage"
            )
        # retrieve information from revision
        for revision in revisions:
            assert revision is not None
            objects_by_target_type[TargetType.DIRECTORY].append(revision.directory)
            object_to_branch[revision.directory] = revision.id

    rels = objects_by_target_type.get(TargetType.RELEASE)
    if rels:
        not_found = list(storage.release_missing(rels))
        if not_found:
            missing_objs = ", ".join(
                str((object_to_branch[rel], rel.hex())) for rel in not_found
            )
            raise InexistentObjectsError(
                f"Branch/Release(s) {missing_objs} should exist in storage"
            )

    # first level dirs exist?
    dirs = objects_by_target_type.get(TargetType.DIRECTORY)
    if dirs:
        not_found = list(storage.directory_missing(dirs))
        if not_found:
            missing_objs = ", ".join(
                str((object_to_branch[dir_].hex(), dir_.hex())) for dir_ in not_found
            )
            raise InexistentObjectsError(
                f"Missing directories {missing_objs}: "
                "(revision exists, directory target does not)"
            )
        # Iterate over a copy: the loop appends the nested directories to this
        # very list, and the recursive listing already covers them.
        for dir_ in list(dirs):
            # retrieve new objects to check for existence
            paths = storage.directory_ls(dir_, recursive=True)
            for path in paths:
                # NOTE(review): every entry that is not a "dir" is checked as
                # a content; confirm no other entry type can show up here.
                if path["type"] == "dir":
                    target_type = TargetType.DIRECTORY
                else:
                    target_type = TargetType.CONTENT
                target = path["target"]
                objects_by_target_type[target_type].append(target)
                object_to_branch[target] = dir_

    # check nested directories
    dirs = objects_by_target_type.get(TargetType.DIRECTORY)
    if dirs:
        not_found = list(storage.directory_missing(dirs))
        if not_found:
            missing_objs = ", ".join(
                str((object_to_branch[dir_].hex(), dir_.hex())) for dir_ in not_found
            )
            raise InexistentObjectsError(
                f"Missing directories {missing_objs}: "
                "(revision exists, directory target does not)"
            )

    # check contents directories
    cnts = objects_by_target_type.get(TargetType.CONTENT)
    if cnts:
        not_found = list(storage.content_missing_per_sha1_git(cnts))
        if not_found:
            missing_objs = ", ".join(
                str((object_to_branch[cnt].hex(), cnt.hex())) for cnt in not_found
            )
            raise InexistentObjectsError(f"Missing contents {missing_objs}")

    return snapshot
def test_snapshot_small(swh_storage, snapshot):  # noqa
    """A snapshot added to the storage is read back unchanged."""
    swh_storage.snapshot_add([snapshot])

    fetched = snapshot_get_all_branches(swh_storage, snapshot.id)

    assert snapshot == fetched