def test_origin_visit_stats_upsert_batch(self, swh_scheduler) -> None:
    """Batch upsert is ok"""
    visit_stats = [
        OriginVisitStats(
            url="foo",
            visit_type="git",
            last_eventful=utcnow(),
            last_uneventful=None,
            last_failed=None,
            last_notfound=None,
            last_snapshot=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
        ),
        OriginVisitStats(
            url="bar",
            visit_type="git",
            last_eventful=None,
            last_uneventful=utcnow(),
            last_notfound=None,
            last_failed=None,
            last_snapshot=hash_to_bytes("fffcc0710eb6cf9efd5b920a8453e1e07157bfff"),
        ),
    ]

    swh_scheduler.origin_visit_stats_upsert(visit_stats)

    for visit_stat in swh_scheduler.origin_visit_stats_get(
        [(vs.url, vs.visit_type) for vs in visit_stats]
    ):
        assert visit_stat is not None
def diff_revisions(
    self,
    rev_from,
    rev_to,
    from_dir_model,
    to_dir_model,
    expected_changes,
    mock_get_dir,
    mock_get_rev,
):
    rev_from_bytes = hash_to_bytes(rev_from)
    rev_to_bytes = hash_to_bytes(rev_to)

    def _get_rev(*args, **kwargs):
        if args[1] == rev_from_bytes:
            return {"directory": from_dir_model["target"]}
        else:
            return {"directory": to_dir_model["target"]}

    def _get_dir(*args, **kwargs):
        from_dir = from_dir_model.get_hash_data(args[1])
        to_dir = to_dir_model.get_hash_data(args[1])
        return from_dir if from_dir is not None else to_dir

    mock_get_rev.side_effect = _get_rev
    mock_get_dir.side_effect = _get_dir

    changes = diff.diff_revisions(
        None, rev_from_bytes, rev_to_bytes, track_renaming=True
    )

    self.assertEqual(changes, expected_changes)
def test_hash_collision_exception():
    hex_hash_id = "38762cf7f55934b34d179ae6a4c80cadccbb7f0a"
    hash_id = hashutil.hash_to_bytes(hex_hash_id)

    content = {
        "blake2s256": hashutil.hash_to_bytes(
            "8f677e3214ca8b2acad91884a1571ef3f12b786501f9a6bedfd6239d82095dd2"
        ),
        "sha1_git": hashutil.hash_to_bytes("ba9aaa145ccd24ef760cf31c74d8f7ca1a2e47b0"),
        "sha256": hashutil.hash_to_bytes(
            "2bb787a73e37352f92383abe7e2902936d1059ad9f1ba6daaa9c1e58ee6970d0"
        ),
        "sha1": hash_id,
    }

    exc = HashCollision("sha1", hash_id, [content])

    assert exc.algo == "sha1"
    assert exc.hash_id == hex_hash_id
    assert exc.colliding_contents == [content_hex_hashes(content)]

    assert exc.colliding_content_hashes() == [content]
def test_journal_client_origin_visit_status_from_journal_last_successful(
    swh_scheduler,
):
    visit_statuses = [
        {
            "origin": "bar",
            "visit": 1,
            "status": "partial",
            "date": utcnow(),
            "type": "git",
            "snapshot": hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
        },
        {
            "origin": "foo",
            "visit": 1,
            "status": "full",
            "date": DATE1,
            "type": "git",
            "snapshot": hash_to_bytes("eeecc0710eb6cf9efd5b920a8453e1e07157bfff"),
        },
        {
            "origin": "foo",
            "visit": 2,
            "status": "partial",
            "date": DATE2,
            "type": "git",
            "snapshot": hash_to_bytes("aaacc0710eb6cf9efd5b920a8453e1e07157baaa"),
        },
        {
            "origin": "foo",
            "visit": 3,
            "status": "full",
            "date": DATE3,
            "type": "git",
            "snapshot": hash_to_bytes("dddcc0710eb6cf9efd5b920a8453e1e07157bddd"),
        },
    ]

    process_journal_objects(
        {"origin_visit_status": visit_statuses}, scheduler=swh_scheduler
    )

    actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("foo", "git")])
    assert_visit_stats_ok(
        actual_origin_visit_stats[0],
        OriginVisitStats(
            url="foo",
            visit_type="git",
            last_successful=DATE3,
            last_visit=DATE3,
            last_visit_status=LastVisitStatus.successful,
            last_snapshot=hash_to_bytes("dddcc0710eb6cf9efd5b920a8453e1e07157bddd"),
            next_position_offset=0,
            successive_visits=3,
        ),
    )
def fill_idx_storage(idx_storage: IndexerStorageInterface, nb_rows: int) -> List[int]:
    tools: List[Dict[str, Any]] = [
        {
            "tool_name": "tool %d" % i,
            "tool_version": "0.0.1",
            "tool_configuration": {},
        }
        for i in range(2)
    ]
    tools = idx_storage.indexer_configuration_add(tools)

    origin_metadata = [
        OriginIntrinsicMetadataRow(
            id="file://dev/%04d" % origin_id,
            from_revision=hash_to_bytes("abcd{:0>36}".format(origin_id)),
            indexer_configuration_id=tools[origin_id % 2]["id"],
            metadata={"name": "origin %d" % origin_id},
            mappings=["mapping%d" % (origin_id % 10)],
        )
        for origin_id in range(nb_rows)
    ]
    revision_metadata = [
        RevisionIntrinsicMetadataRow(
            id=hash_to_bytes("abcd{:0>36}".format(origin_id)),
            indexer_configuration_id=tools[origin_id % 2]["id"],
            metadata={"name": "origin %d" % origin_id},
            mappings=["mapping%d" % (origin_id % 10)],
        )
        for origin_id in range(nb_rows)
    ]

    idx_storage.revision_intrinsic_metadata_add(revision_metadata)
    idx_storage.origin_intrinsic_metadata_add(origin_metadata)

    return [tool["id"] for tool in tools]
def complete_deposit(sample_archive, deposit_collection, authenticated_client):
    """Returns a completed deposit (load success)"""
    deposit = create_deposit(
        authenticated_client,
        deposit_collection.name,
        sample_archive,
        external_id="external-id-complete",
        deposit_status=DEPOSIT_STATUS_LOAD_SUCCESS,
    )
    origin = "https://hal.archives-ouvertes.fr/hal-01727745"
    directory_id = "42a13fc721c8716ff695d0d62fc851d641f3a12b"
    release_id = hash_to_bytes("548b3c0a2bb43e1fca191e24b5803ff6b3bc7c10")
    snapshot_id = hash_to_bytes("e5e82d064a9c3df7464223042e0c55d72ccff7f0")

    deposit.swhid = f"swh:1:dir:{directory_id}"
    deposit.swhid_context = str(
        QualifiedSWHID(
            object_type=ObjectType.DIRECTORY,
            object_id=hash_to_bytes(directory_id),
            origin=origin,
            visit=CoreSWHID(object_type=ObjectType.SNAPSHOT, object_id=snapshot_id),
            anchor=CoreSWHID(object_type=ObjectType.RELEASE, object_id=release_id),
            path=b"/",
        )
    )
    deposit.save()
    return deposit
def check_revisions_ordering(
    mocker,
    rev_walker_type,
    expected_result,
    truncated_history,
    revisions_list=_revisions_list,
):
    storage = mocker.patch("swh.storage.postgresql.storage.Storage")

    if not truncated_history:
        storage.revision_log.return_value = revisions_list
    else:
        revs_lists_truncated = [
            None if hash_to_hex(rev["id"]) == _rev_missing else rev
            for rev in revisions_list
        ]
        storage.revision_log.return_value = revs_lists_truncated

    revs_walker = get_revisions_walker(
        rev_walker_type, storage, hash_to_bytes(_rev_start)
    )

    assert list(map(hash_to_bytes, expected_result)) == [
        rev["id"] for rev in revs_walker
    ]

    assert revs_walker.is_history_truncated() == truncated_history

    if truncated_history:
        missing_revs = revs_walker.missing_revisions()
        assert missing_revs == {hash_to_bytes(_rev_missing)}
    else:
        assert revs_walker.missing_revisions() == set()
def setUp(self):
    super().setUp()
    # replace actual license computation with a mock
    self.orig_compute_license = fossology_license.compute_license
    fossology_license.compute_license = mock_compute_license

    self.indexer = FossologyLicenseIndexer(CONFIG)
    self.indexer.catch_exceptions = False
    self.idx_storage = self.indexer.idx_storage
    fill_storage(self.indexer.storage)
    fill_obj_storage(self.indexer.objstorage)

    self.id0 = "01c9379dfc33803963d07c1ccc748d3fe4c96bb5"
    self.id1 = "688a5ef812c53907562fe379d4b3851e69c7cb15"
    self.id2 = "da39a3ee5e6b4b0d3255bfef95601890afd80709"  # empty content

    tool = {k.replace("tool_", ""): v for (k, v) in self.indexer.tool.items()}
    # then
    self.expected_results = [
        *[
            ContentLicenseRow(id=hash_to_bytes(self.id0), tool=tool, license=license)
            for license in SHA1_TO_LICENSES[self.id0]
        ],
        *[
            ContentLicenseRow(id=hash_to_bytes(self.id1), tool=tool, license=license)
            for license in SHA1_TO_LICENSES[self.id1]
        ],
        *[],  # self.id2
    ]
def test_dulwich_tag_to_release_no_author_no_date(self):
    sha = hash_to_bytes("f6e367357b446bd1315276de5e88ba3d0d99e136")
    target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
    message = b"some release message"
    tag = dulwich.objects.Tag()
    tag.name = b"blah"
    tag.object = (dulwich.objects.Commit, target)
    tag.message = message
    tag.signature = None
    tag.tagger = None
    tag.tag_time = None
    tag.tag_timezone = None
    assert tag.sha().digest() == sha

    # when
    actual_release = converters.dulwich_tag_to_release(tag)

    # then
    expected_release = Release(
        author=None,
        date=None,
        id=sha,
        message=message,
        metadata=None,
        name=b"blah",
        synthetic=False,
        target=hash_to_bytes(target.decode()),
        target_type=ObjectType.REVISION,
    )

    assert actual_release == expected_release
def test_dulwich_tag_to_release_signature(self):
    target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
    message = b"some release message"
    sha = hash_to_bytes("46fff489610ed733d2cc904e363070dadee05c71")
    tag = dulwich.objects.Tag()
    tag.name = b"blah"
    tag.object = (dulwich.objects.Commit, target)
    tag.message = message
    tag.signature = GPGSIG
    tag.tagger = None
    tag.tag_time = None
    tag.tag_timezone = None
    assert tag.sha().digest() == sha

    # when
    actual_release = converters.dulwich_tag_to_release(tag)

    # then
    expected_release = Release(
        author=None,
        date=None,
        id=sha,
        message=message + GPGSIG,
        metadata=None,
        name=b"blah",
        synthetic=False,
        target=hash_to_bytes(target.decode()),
        target_type=ObjectType.REVISION,
    )

    assert actual_release == expected_release
def test_journal_client_origin_visit_status_duplicated_messages(swh_scheduler):
    """A duplicated message must be ignored"""
    visit_status = {
        "origin": "foo",
        "visit": 1,
        "status": "full",
        "date": DATE1,
        "type": "git",
        "snapshot": hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"),
    }

    process_journal_objects(
        {"origin_visit_status": [visit_status]}, scheduler=swh_scheduler
    )
    process_journal_objects(
        {"origin_visit_status": [visit_status]}, scheduler=swh_scheduler
    )

    actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("foo", "git")])
    assert_visit_stats_ok(
        actual_origin_visit_stats[0],
        OriginVisitStats(
            url="foo",
            visit_type="git",
            last_successful=DATE1,
            last_visit=DATE1,
            last_visit_status=LastVisitStatus.successful,
            last_snapshot=hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"),
            successive_visits=1,
        ),
    )
def test_npm_artifact_use_mtime_if_no_time(swh_storage, requests_mock_datadir):
    """With no upload time, the artifact's mtime is used instead and the
    artifact is loaded"""
    package = "jammit-express"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("33b8f105d48ce16b6c59158af660e0cc78bcbef4")

    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    # artifact is used
    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"HEAD": SnapshotBranch(
                target_type=TargetType.ALIAS, target=b"releases/0.0.1"
            ),
            b"releases/0.0.1": SnapshotBranch(
                target_type=TargetType.RELEASE,
                target=hash_to_bytes("3e3b800570869fa9b3dbc302500553e62400cc06"),
            ),
        },
    )

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot.id
    )
    check_snapshot(expected_snapshot, swh_storage)
def test_weird_tree(self):
    """Tests a tree with entries in the wrong order"""
    raw_manifest = (
        b"0644 file2\x00"
        b"d\x1f\xb6\xe0\x8d\xdb.O\xd0\x96\xdc\xf1\x8e\x80\xb8\x94\xbf~%\xce"
        b"0644 file1\x00"
        b"d\x1f\xb6\xe0\x8d\xdb.O\xd0\x96\xdc\xf1\x8e\x80\xb8\x94\xbf~%\xce"
    )
    tree = dulwich.objects.Tree.from_raw_string(b"tree", raw_manifest)

    assert converters.dulwich_tree_to_directory(tree) == Directory(
        entries=(
            # in alphabetical order, as it should be
            DirectoryEntry(
                name=b"file1",
                type="file",
                target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
                perms=0o644,
            ),
            DirectoryEntry(
                name=b"file2",
                type="file",
                target=hash_to_bytes("641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"),
                perms=0o644,
            ),
        ),
        raw_manifest=b"tree 62\x00" + raw_manifest,
    )
def test_pypi_visit_1_release_with_2_artifacts(swh_storage, requests_mock_datadir):
    """With no prior visit, loading a pypi project ends up with 1 snapshot"""
    url = "https://pypi.org/project/nexter"
    loader = PyPILoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("1394b2e59351a944cc763bd9d26d90ce8e8121a8")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id
    )

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"releases/1.1.0/nexter-1.1.0.zip": SnapshotBranch(
                target=hash_to_bytes("f7d43faeb65b64d3faa67e4f46559db57d26b9a4"),
                target_type=TargetType.RELEASE,
            ),
            b"releases/1.1.0/nexter-1.1.0.tar.gz": SnapshotBranch(
                target=hash_to_bytes("732bb9dc087e6015884daaebb8b82559be729b5a"),
                target_type=TargetType.RELEASE,
            ),
        },
    )
    check_snapshot(expected_snapshot, swh_storage)
def test_load(self):
    with requests_mock.Mocker() as m:
        for file_ in self.files.values():
            path = os.path.join(RESOURCES_PATH, file_['name'])
            with open(path, 'rb') as fd:
                m.get(file_['uri'], content=fd.read())
        self._load()

    self.assertCountSnapshots(1)
    self.assertCountReleases(0)  # FIXME: Why not 1?
    self.assertCountRevisions(1)
    self.assertCountDirectories(14)
    self.assertCountContents(315)

    # Check the root dir was loaded, and contains 'src/'
    root_hash = 'c906789049d2327a69b81cca6a1c1737321c836f'
    ls_root = list(self.storage.directory_ls(hash_to_bytes(root_hash)))
    src_dirs = [x for x in ls_root if x['name'] == b'src']
    self.assertEqual(len(src_dirs), 1, src_dirs)

    # Check 'src/hello.c' exists
    src_dir_hash = src_dirs[0]['target']
    ls_src = list(self.storage.directory_ls(src_dir_hash))
    hello_c = [x for x in ls_src if x['name'] == b'hello.c']
    self.assertEqual(len(hello_c), 1, hello_c)

    # Check 'src/hello.c' was loaded and has the right hash
    hello_c_hash = 'b60a061ac9dd25b29d57b756b5959aadc1fe6386'
    self.assertEqual(hello_c[0]['sha1'], hash_to_bytes(hello_c_hash))

    missing = list(
        self.storage.content_missing([{'sha1': hash_to_bytes(hello_c_hash)}])
    )
    self.assertEqual(missing, [])
def test_load_empty_tree(self):
    empty_dir_id = "4b825dc642cb6eb9a060e54bf8d69288fbee4904"

    # Check the empty tree does not already exist for some reason
    # (it would make this test pointless)
    assert list(
        self.loader.storage.directory_missing([hash_to_bytes(empty_dir_id)])
    ) == [hash_to_bytes(empty_dir_id)]

    empty_tree = dulwich.objects.Tree()
    assert empty_tree.id.decode() == empty_dir_id
    self.repo.object_store.add_object(empty_tree)

    self.repo.do_commit(b"remove all bugs\n", tree=empty_tree.id)

    res = self.loader.load()
    assert res == {"status": "eventful"}

    assert (
        list(self.loader.storage.directory_missing([hash_to_bytes(empty_dir_id)]))
        == []
    )

    results = self.loader.storage.directory_get_entries(hash_to_bytes(empty_dir_id))
    assert results.next_page_token is None
    assert results.results == []
def test_eoferror(swh_storage, requests_mock_datadir):
    """Load a truncated, invalid archive so that the uncompress function
    raises EOFError. Then check that a snapshot is still created, meaning
    the error is handled gracefully.

    """
    sources = (
        "https://nix-community.github.io/nixpkgs-swh/sources-EOFError.json"  # noqa
    )
    loader = NixGuixLoader(swh_storage, sources)
    loader.load()

    expected_snapshot = Snapshot(
        id=hash_to_bytes("4257fa2350168c6bfec726a06452ea27a2c0cb33"),
        branches={
            b"evaluation": SnapshotBranch(
                target=hash_to_bytes("cc4e04c26672dd74e5fd0fecb78b435fb55368f7"),
                target_type=TargetType.REVISION,
            ),
        },
    )

    check_snapshot(expected_snapshot, storage=swh_storage)
def test_api_vault_cook_notfound(
    api_client, mocker, directory, revision, unknown_directory, unknown_revision
):
    mock_vault = mocker.patch("swh.web.common.archive.vault")
    mock_vault.cook.side_effect = NotFoundExc("object not found")
    mock_vault.fetch.side_effect = NotFoundExc("cooked archive not found")
    mock_vault.progress.side_effect = NotFoundExc("cooking request not found")

    for obj_type, obj_id in (
        ("directory", directory),
        ("revision_gitfast", revision),
    ):
        obj_name = obj_type.split("_")[0]

        url = reverse(
            f"api-1-vault-cook-{obj_type}",
            url_args={f"{obj_type[:3]}_id": obj_id},
        )

        rv = check_api_get_responses(api_client, url, status_code=404)
        assert rv.data["exception"] == "NotFoundExc"
        assert (
            rv.data["reason"]
            == f"Cooking of {obj_name} '{obj_id}' was never requested."
        )
        mock_vault.progress.assert_called_with(obj_type, hashutil.hash_to_bytes(obj_id))

    for obj_type, obj_id in (
        ("directory", unknown_directory),
        ("revision_gitfast", unknown_revision),
    ):
        obj_name = obj_type.split("_")[0]

        url = reverse(
            f"api-1-vault-cook-{obj_type}", url_args={f"{obj_type[:3]}_id": obj_id}
        )
        rv = check_api_post_responses(api_client, url, data=None, status_code=404)
        assert rv.data["exception"] == "NotFoundExc"
        assert rv.data["reason"] == f"{obj_name.title()} '{obj_id}' not found."
        mock_vault.cook.assert_called_with(
            obj_type, hashutil.hash_to_bytes(obj_id), email=None
        )

        fetch_url = reverse(
            f"api-1-vault-fetch-{obj_type}",
            url_args={f"{obj_type[:3]}_id": obj_id},
        )

        rv = check_api_get_responses(api_client, fetch_url, status_code=404)
        assert rv.data["exception"] == "NotFoundExc"
        assert (
            rv.data["reason"] == f"Cooked archive for {obj_name} '{obj_id}' not found."
        )
        mock_vault.fetch.assert_called_with(obj_type, hashutil.hash_to_bytes(obj_id))
def test_api_release_target_type_not_a_revision(
        self, new_rel1, new_rel2, new_rel3, content, directory, release):
    for new_rel_id, target_type, target in (
            (new_rel1, 'content', content),
            (new_rel2, 'directory', directory),
            (new_rel3, 'release', release)):
        if target_type == 'content':
            target = target['sha1_git']

        sample_release = {
            'author': {
                'email': b'*****@*****.**',
                'fullname': b'author <*****@*****.**>',
                'name': b'author'
            },
            'date': {
                'timestamp': int(datetime.now().timestamp()),
                'offset': 0,
                'negative_utc': False,
            },
            'id': hash_to_bytes(new_rel_id),
            'message': b'sample release message',
            'name': b'sample release',
            'synthetic': False,
            'target': hash_to_bytes(target),
            'target_type': target_type
        }

        self.storage.release_add([sample_release])

        url = reverse('api-release', url_args={'sha1_git': new_rel_id})

        rv = self.client.get(url)

        expected_release = self.release_get(new_rel_id)
        author_id = expected_release['author']['id']
        author_url = reverse('api-person', url_args={'person_id': author_id})

        if target_type == 'content':
            url_args = {'q': 'sha1_git:%s' % target}
        else:
            url_args = {'sha1_git': target}

        target_url = reverse('api-%s' % target_type, url_args=url_args)
        expected_release['author_url'] = author_url
        expected_release['target_url'] = target_url

        self.assertEqual(rv.status_code, 200)
        self.assertEqual(rv['Content-Type'], 'application/json')
        self.assertEqual(rv.data, expected_release)
def test_from_release(self):
    release_input = {
        'id': hashutil.hash_to_bytes('aad23fa492a0c5fed0708a6703be875448c86884'),
        'target': hashutil.hash_to_bytes('5e46d564378afc44b31bb89f99d5675195fbdf67'),
        'target_type': 'revision',
        'date': {
            'timestamp': datetime.datetime(
                2015, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
            ).timestamp(),
            'offset': 0,
            'negative_utc': False,
        },
        'author': {
            'name': b'author name',
            'fullname': b'Author Name author@email',
            'email': b'author@email',
        },
        'name': b'v0.0.1',
        'message': b'some comment on release',
        'synthetic': True,
    }

    expected_release = {
        'id': 'aad23fa492a0c5fed0708a6703be875448c86884',
        'target': '5e46d564378afc44b31bb89f99d5675195fbdf67',
        'target_type': 'revision',
        'date': '2015-01-01T22:00:00+00:00',
        'author': {
            'name': 'author name',
            'fullname': 'Author Name author@email',
            'email': 'author@email',
        },
        'name': 'v0.0.1',
        'message': 'some comment on release',
        'synthetic': True,
    }

    # when
    actual_release = converters.from_release(release_input)

    # then
    self.assertEqual(actual_release, expected_release)
def test_api_vault_cook(api_client, mocker, directory, revision):
    mock_archive = mocker.patch("swh.web.api.views.vault.archive")

    for obj_type, obj_id in (
        ("directory", directory),
        ("revision_gitfast", revision),
    ):
        fetch_url = reverse(
            f"api-1-vault-fetch-{obj_type}",
            url_args={f"{obj_type[:3]}_id": obj_id},
        )
        stub_cook = {
            "type": obj_type,
            "progress_msg": None,
            "task_id": 1,
            "task_status": "done",
            "object_id": obj_id,
        }
        stub_fetch = b"content"

        mock_archive.vault_cook.return_value = stub_cook
        mock_archive.vault_fetch.return_value = stub_fetch

        email = "*****@*****.**"
        url = reverse(
            f"api-1-vault-cook-{obj_type}",
            url_args={f"{obj_type[:3]}_id": obj_id},
            query_params={"email": email},
        )

        rv = check_api_post_responses(api_client, url, data=None, status_code=200)
        assert rv.data == {
            "fetch_url": rv.wsgi_request.build_absolute_uri(fetch_url),
            "obj_type": obj_type,
            "progress_message": None,
            "id": 1,
            "status": "done",
            "obj_id": obj_id,
        }
        mock_archive.vault_cook.assert_called_with(
            obj_type, hashutil.hash_to_bytes(obj_id), email
        )

        rv = check_http_get_response(api_client, fetch_url, status_code=200)
        assert rv["Content-Type"] == "application/gzip"
        assert rv.content == stub_fetch
        mock_archive.vault_fetch.assert_called_with(
            obj_type, hashutil.hash_to_bytes(obj_id)
        )
def test_pypi_visit_with_missing_artifact(
    swh_storage, requests_mock_datadir_missing_one
):
    """Loading a pypi project with some missing artifacts ends up with 1 snapshot"""
    url = "https://pypi.org/project/0805nexter"
    loader = PyPILoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("00785a38479abe5fbfa402df96be26d2ddf89c97")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage,
        url,
        status="partial",
        type="pypi",
        snapshot=expected_snapshot_id,
    )

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"releases/1.2.0": SnapshotBranch(
                target=hash_to_bytes("fbbcb817f01111b06442cdcc93140ab3cc777d68"),
                target_type=TargetType.RELEASE,
            ),
            b"HEAD": SnapshotBranch(
                target=b"releases/1.2.0",
                target_type=TargetType.ALIAS,
            ),
        },
    )
    check_snapshot(expected_snapshot, storage=swh_storage)

    stats = get_stats(swh_storage)

    assert {
        "content": 3,
        "directory": 2,
        "origin": 1,
        "origin_visit": 1,
        "release": 1,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
def process_put(
    self,
    request,
    headers: ParsedRequestHeaders,
    collection_name: str,
    deposit: Deposit,
) -> None:
    """Update the deposit with status and SWHIDs

    Returns:
        204 No content
        400 Bad request if checks fail

    """
    data = request.data
    status = data["status"]
    deposit.status = status
    if status == DEPOSIT_STATUS_LOAD_SUCCESS:
        origin_url = data["origin_url"]
        directory_id = data["directory_id"]
        release_id = data["release_id"]
        dir_id = CoreSWHID(
            object_type=ObjectType.DIRECTORY, object_id=hash_to_bytes(directory_id)
        )
        snp_id = CoreSWHID(
            object_type=ObjectType.SNAPSHOT,
            object_id=hash_to_bytes(data["snapshot_id"]),
        )
        rel_id = CoreSWHID(
            object_type=ObjectType.RELEASE, object_id=hash_to_bytes(release_id)
        )

        deposit.swhid = str(dir_id)
        # new id with contextual information
        deposit.swhid_context = str(
            QualifiedSWHID(
                object_type=ObjectType.DIRECTORY,
                object_id=hash_to_bytes(directory_id),
                origin=origin_url,
                visit=snp_id,
                anchor=rel_id,
                path=b"/",  # QualifiedSWHID expects the path as bytes
            )
        )
    else:  # rejected
        deposit.status = status

    if "status_detail" in data:
        deposit.status_detail = data["status_detail"]

    deposit.save()
def test_lookup_missing_hashes_some_present(archive_data, content, directory):
    missing_rev = random_sha1()
    missing_rel = random_sha1()
    missing_snp = random_sha1()

    grouped_swhids = {
        CONTENT: [hash_to_bytes(content["sha1_git"])],
        DIRECTORY: [hash_to_bytes(directory)],
        REVISION: [hash_to_bytes(missing_rev)],
        RELEASE: [hash_to_bytes(missing_rel)],
        SNAPSHOT: [hash_to_bytes(missing_snp)],
    }

    actual_result = archive.lookup_missing_hashes(grouped_swhids)

    assert actual_result == {missing_rev, missing_rel, missing_snp}
def test_commit_without_manifest(self):
    """Tests a Revision can still be produced when the manifest is not
    understood by the custom parser in dulwich_commit_to_revision."""
    target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
    message = b"some commit message"
    author = Person(
        fullname=b"Foo <*****@*****.**>", name=b"Foo", email=b"*****@*****.**"
    )
    commit = dulwich.objects.Commit()
    commit.tree = target
    commit.message = message
    commit.author = commit.committer = b"Foo <*****@*****.**>"
    commit.author_time = commit.commit_time = 1641980946
    commit.author_timezone = commit.commit_timezone = 3600

    assert converters.dulwich_commit_to_revision(commit) == Revision(
        message=b"some commit message",
        author=author,
        committer=author,
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1641980946, microseconds=0),
            offset_bytes=b"+0100",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1641980946, microseconds=0),
            offset_bytes=b"+0100",
        ),
        type=RevisionType.GIT,
        directory=hash_to_bytes(target.decode()),
        synthetic=False,
        metadata=None,
        parents=(),
    )
def test_corrupt_commit(self, attribute):
    sha = hash_to_bytes("3f0ac5a6d15d89cf928209a57334e3b77c5651b9")
    target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
    message = b"some commit message"
    commit = dulwich.objects.Commit()
    commit.tree = target
    commit.message = message
    commit.gpgsig = GPGSIG
    commit.author = commit.committer = b"Foo <*****@*****.**>"
    commit.author_time = commit.commit_time = 1641980946
    commit.author_timezone = commit.commit_timezone = 3600
    converters.dulwich_commit_to_revision(commit)
    assert commit.sha().digest() == sha

    original_sha = commit.sha()

    setattr(commit, attribute, b"abcde")
    commit.sha()  # reset commit._needs_serialization
    commit._sha = original_sha  # force the wrong hash

    with pytest.raises(converters.HashMismatch):
        converters.dulwich_commit_to_revision(commit)

    if attribute == "_gpgsig":
        setattr(commit, attribute, None)
        commit.sha()  # reset commit._needs_serialization
        commit._sha = original_sha  # force the wrong hash
        with pytest.raises(converters.HashMismatch):
            converters.dulwich_commit_to_revision(commit)
def test_corrupt_tag(self, attribute):
    sha = hash_to_bytes("46fff489610ed733d2cc904e363070dadee05c71")
    target = b"641fb6e08ddb2e4fd096dcf18e80b894bf7e25ce"
    message = b"some release message"
    tag = dulwich.objects.Tag()
    tag.name = b"blah"
    tag.object = (dulwich.objects.Commit, target)
    tag.message = message
    tag.signature = GPGSIG
    tag.tagger = None
    tag.tag_time = None
    tag.tag_timezone = None
    assert tag.sha().digest() == sha
    converters.dulwich_tag_to_release(tag)

    original_sha = tag.sha()

    setattr(tag, attribute, b"abcde")
    tag.sha()  # reset tag._needs_serialization
    tag._sha = original_sha  # force the wrong hash

    with pytest.raises(converters.HashMismatch):
        converters.dulwich_tag_to_release(tag)

    if attribute == "signature":
        setattr(tag, attribute, None)
        tag.sha()  # reset tag._needs_serialization
        tag._sha = original_sha  # force the wrong hash
        with pytest.raises(converters.HashMismatch):
            converters.dulwich_tag_to_release(tag)
def revision_log(self, rev_id, limit=None):
    rev_id_bytes = hash_to_bytes(rev_id)
    return list(
        map(
            converters.from_revision,
            self.storage.revision_log([rev_id_bytes], limit=limit),
        )
    )
def content_find(self, content: Dict[str, Any]) -> Dict[str, Any]:
    cnt_ids_bytes = {
        algo_hash: hash_to_bytes(content[algo_hash])
        for algo_hash in ALGORITHMS
        if content.get(algo_hash)
    }
    cnt = self.storage.content_find(cnt_ids_bytes)
    return converters.from_content(cnt[0].to_dict()) if cnt else cnt
def _list_contents_to_index(
    self, partition_id: int, nb_partitions: int, indexed: Set[Sha1]
) -> Iterable[Sha1]:
    """Compute from storage the new contents to index in the partition
    ``partition_id``. Contents that have already been indexed are skipped.

    Args:
        partition_id: index of the partition to fetch data from
        nb_partitions: total number of partitions
        indexed: set of contents already indexed

    Yields:
        sha1 (bytes) of contents to index

    """
    if not isinstance(partition_id, int) or not isinstance(nb_partitions, int):
        raise TypeError(
            f"identifiers must be int, not {partition_id!r} and {nb_partitions!r}."
        )
    next_page_token = None
    while True:
        result = self.storage.content_get_partition(
            partition_id, nb_partitions, page_token=next_page_token
        )
        contents = result.results
        for c in contents:
            _id = hashutil.hash_to_bytes(c.sha1)
            if _id in indexed:
                continue
            yield _id
        next_page_token = result.next_page_token
        if next_page_token is None:
            break