def test_maven_loader_extrinsic_metadata(swh_storage, expected_releases,
                                         expected_json_metadata,
                                         expected_pom_metadata):
    """With no prior visit, loading a jar ends up with 1 snapshot.
    Extrinsic metadata is the pom file associated to the source jar.
    """
    loader = MavenLoader(swh_storage, MVN_ORIGIN_URL, artifacts=MVN_ARTIFACTS)

    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"

    for i, expected_release in enumerate(expected_releases):

        expected_release_id = expected_release.id
        release = swh_storage.release_get([expected_release_id])[0]
        assert release is not None

        release_swhid = CoreSWHID(object_type=ObjectType.RELEASE,
                                  object_id=expected_release_id)
        directory_swhid = ExtendedSWHID(
            object_type=ExtendedObjectType.DIRECTORY, object_id=release.target)
        metadata_authority = MetadataAuthority(
            type=MetadataAuthorityType.FORGE,
            url=REPO_BASE_URL,
        )

        expected_metadata = [
            RawExtrinsicMetadata(
                target=directory_swhid,
                authority=metadata_authority,
                fetcher=MetadataFetcher(
                    name="swh.loader.package.maven.loader.MavenLoader",
                    version=__version__,
                ),
                discovery_date=loader.visit_date,
                format="maven-pom",
                metadata=expected_pom_metadata[i],
                origin=MVN_ORIGIN_URL,
                release=release_swhid,
            ),
            RawExtrinsicMetadata(
                target=directory_swhid,
                authority=metadata_authority,
                fetcher=MetadataFetcher(
                    name="swh.loader.package.maven.loader.MavenLoader",
                    version=__version__,
                ),
                discovery_date=loader.visit_date,
                format="maven-json",
                metadata=json.dumps(expected_json_metadata[i]).encode(),
                origin=MVN_ORIGIN_URL,
                release=release_swhid,
            ),
        ]

        res = swh_storage.raw_extrinsic_metadata_get(directory_swhid,
                                                     metadata_authority)
        assert res.next_page_token is None
        assert set(res.results) == set(expected_metadata)
def db_to_raw_extrinsic_metadata(row) -> RawExtrinsicMetadata:
    target = row["raw_extrinsic_metadata.target"]
    if not target.startswith("swh:1:"):
        warnings.warn("Fetching raw_extrinsic_metadata row with URL target",
                      DeprecationWarning)
        target = str(Origin(url=target).swhid())

    return RawExtrinsicMetadata(
        target=ExtendedSWHID.from_string(target),
        authority=MetadataAuthority(
            type=MetadataAuthorityType(row["metadata_authority.type"]),
            url=row["metadata_authority.url"],
        ),
        fetcher=MetadataFetcher(
            name=row["metadata_fetcher.name"],
            version=row["metadata_fetcher.version"],
        ),
        discovery_date=row["discovery_date"],
        format=row["format"],
        metadata=row["raw_extrinsic_metadata.metadata"],
        origin=row["origin"],
        visit=row["visit"],
        snapshot=map_optional(CoreSWHID.from_string, row["snapshot"]),
        release=map_optional(CoreSWHID.from_string, row["release"]),
        revision=map_optional(CoreSWHID.from_string, row["revision"]),
        path=row["path"],
        directory=map_optional(CoreSWHID.from_string, row["directory"]),
    )
Example #3
0
def row_to_raw_extrinsic_metadata(
        row: RawExtrinsicMetadataRow) -> RawExtrinsicMetadata:
    discovery_date = row.discovery_date.replace(tzinfo=datetime.timezone.utc)

    return RawExtrinsicMetadata(
        target=ExtendedSWHID.from_string(row.target),
        authority=MetadataAuthority(
            type=MetadataAuthorityType(row.authority_type),
            url=row.authority_url,
        ),
        fetcher=MetadataFetcher(
            name=row.fetcher_name,
            version=row.fetcher_version,
        ),
        discovery_date=discovery_date,
        format=row.format,
        metadata=row.metadata,
        origin=row.origin,
        visit=row.visit,
        snapshot=map_optional(CoreSWHID.from_string, row.snapshot),
        release=map_optional(CoreSWHID.from_string, row.release),
        revision=map_optional(CoreSWHID.from_string, row.revision),
        path=row.path,
        directory=map_optional(CoreSWHID.from_string, row.directory),
    )
 def get_metadata_fetcher(self) -> MetadataFetcher:
     tool = self.metadata()["tool"]
     return MetadataFetcher(
         name=tool["name"],
         version=tool["version"],
         metadata=tool["configuration"],
     )
Example #5
0
def test_deposit_metadata_origin(
    url,
    authenticated_client,
    deposit_collection,
    atom_dataset,
    swh_storage,
):
    """Posting a swhid reference is stored on raw extrinsic metadata storage"""
    xml_data = atom_dataset["entry-data-with-origin-reference"].format(url=url)
    origin_swhid = Origin(url).swhid()
    deposit_client = authenticated_client.deposit_client
    swh_storage.origin_add([Origin(url)])
    response = post_atom(
        authenticated_client,
        reverse(COL_IRI, args=[deposit_collection.name]),
        data=xml_data,
    )

    assert response.status_code == status.HTTP_201_CREATED, response.content.decode(
    )
    response_content = ElementTree.fromstring(response.content)
    # Ensure the deposit is finalized
    deposit_id = int(
        response_content.findtext("swh:deposit_id", namespaces=NAMESPACES))
    deposit = Deposit.objects.get(pk=deposit_id)
    # we got not swhid as input so we cannot have those
    assert deposit.swhid is None
    assert deposit.swhid_context is None
    assert deposit.complete_date == deposit.reception_date
    assert deposit.complete_date is not None
    assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS

    # Ensure metadata stored in the metadata storage is consistent
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url=deposit_client.provider_url,
    )

    actual_authority = swh_storage.metadata_authority_get(
        MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url)
    assert actual_authority == metadata_authority

    config = APIConfig()
    metadata_fetcher = MetadataFetcher(
        name=config.tool["name"],
        version=config.tool["version"],
    )

    actual_fetcher = swh_storage.metadata_fetcher_get(config.tool["name"],
                                                      config.tool["version"])
    assert actual_fetcher == metadata_fetcher

    # Get the deposited metadata object and check it:

    page_results = swh_storage.raw_extrinsic_metadata_get(
        origin_swhid, metadata_authority)

    assert len(page_results.results) == 1
    assert page_results.next_page_token is None

    metadata = RawExtrinsicMetadata(
        target=origin_swhid,
        discovery_date=deposit.complete_date,
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="sword-v2-atom-codemeta",
        metadata=xml_data.encode(),
    )
    assert page_results == PagedResult(
        results=[metadata],
        next_page_token=None,
    )

    # Get metadata about the deposited metadata object and check it:
    _assert_deposit_info_on_metadata(swh_storage, metadata.swhid(), deposit,
                                     metadata_fetcher)
Example #6
0
def test_deposit_metadata_swhid(
    swhid,
    authenticated_client,
    deposit_collection,
    atom_dataset,
    swh_storage,
):
    """Posting a swhid reference is stored on raw extrinsic metadata storage"""
    swhid_reference = QualifiedSWHID.from_string(swhid)
    swhid_target = extended_swhid_from_qualified(swhid_reference)

    xml_data = atom_dataset["entry-data-with-swhid"].format(
        swhid=swhid,
        metadata_provenance_url=
        "https://hal-test.archives-ouvertes.fr/hal-abcdefgh",
    )
    deposit_client = authenticated_client.deposit_client

    _insert_object(swh_storage, swhid_reference)

    response = post_atom(
        authenticated_client,
        reverse(COL_IRI, args=[deposit_collection.name]),
        data=xml_data,
    )

    assert response.status_code == status.HTTP_201_CREATED, response.content.decode(
    )
    response_content = ElementTree.fromstring(response.content)

    # Ensure the deposit is finalized
    deposit_id = int(
        response_content.findtext("swh:deposit_id", namespaces=NAMESPACES))
    deposit = Deposit.objects.get(pk=deposit_id)
    assert deposit.swhid == str(swhid_target)
    assert deposit.swhid_context == str(swhid_reference)
    assert deposit.complete_date == deposit.reception_date
    assert deposit.complete_date is not None
    assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS

    # Ensure metadata stored in the metadata storage is consistent
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url=deposit_client.provider_url,
    )

    actual_authority = swh_storage.metadata_authority_get(
        MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url)
    assert actual_authority == metadata_authority

    config = APIConfig()
    metadata_fetcher = MetadataFetcher(
        name=config.tool["name"],
        version=config.tool["version"],
    )

    actual_fetcher = swh_storage.metadata_fetcher_get(config.tool["name"],
                                                      config.tool["version"])
    assert actual_fetcher == metadata_fetcher

    # Get the deposited metadata object and check it:

    page_results = swh_storage.raw_extrinsic_metadata_get(
        swhid_target, metadata_authority)

    assert len(page_results.results) == 1
    assert page_results.next_page_token is None

    metadata_context = compute_metadata_context(swhid_reference)
    metadata = RawExtrinsicMetadata(
        target=swhid_target,
        discovery_date=deposit.complete_date,
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="sword-v2-atom-codemeta",
        metadata=xml_data.encode(),
        **metadata_context,
    )
    assert page_results == PagedResult(
        results=[metadata],
        next_page_token=None,
    )

    # Get metadata about the deposited metadata object and check it:
    _assert_deposit_info_on_metadata(swh_storage, metadata.swhid(), deposit,
                                     metadata_fetcher)
Example #7
0
def test_pypi_release_metadata_structure(
    swh_storage, requests_mock_datadir, _0805nexter_api_info
):
    url = "https://pypi.org/project/0805nexter"
    loader = PyPILoader(swh_storage, url)

    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"
    assert actual_load_status["snapshot_id"] is not None

    expected_release_id = hash_to_bytes("fbbcb817f01111b06442cdcc93140ab3cc777d68")

    expected_snapshot = Snapshot(
        id=hash_to_bytes(actual_load_status["snapshot_id"]),
        branches={
            b"HEAD": SnapshotBranch(
                target=b"releases/1.2.0",
                target_type=TargetType.ALIAS,
            ),
            b"releases/1.1.0": SnapshotBranch(
                target=hash_to_bytes("f8789ff3ed70a5f570c35d885c7bcfda7b23b091"),
                target_type=TargetType.RELEASE,
            ),
            b"releases/1.2.0": SnapshotBranch(
                target=expected_release_id,
                target_type=TargetType.RELEASE,
            ),
        },
    )

    assert_last_visit_matches(
        swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot.id
    )

    check_snapshot(expected_snapshot, swh_storage)

    release = swh_storage.release_get([expected_release_id])[0]
    assert release is not None

    release_swhid = CoreSWHID(
        object_type=ObjectType.RELEASE, object_id=expected_release_id
    )
    directory_swhid = ExtendedSWHID(
        object_type=ExtendedObjectType.DIRECTORY, object_id=release.target
    )
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url="https://pypi.org/",
    )
    expected_metadata = [
        RawExtrinsicMetadata(
            target=directory_swhid,
            authority=metadata_authority,
            fetcher=MetadataFetcher(
                name="swh.loader.package.pypi.loader.PyPILoader",
                version=__version__,
            ),
            discovery_date=loader.visit_date,
            format="pypi-project-json",
            metadata=json.dumps(
                json.loads(_0805nexter_api_info)["releases"]["1.2.0"][0]
            ).encode(),
            origin=url,
            release=release_swhid,
        )
    ]
    assert swh_storage.raw_extrinsic_metadata_get(
        directory_swhid,
        metadata_authority,
    ) == PagedResult(
        next_page_token=None,
        results=expected_metadata,
    )
Example #8
0
    RawExtrinsicMetadata,
    Snapshot,
    SnapshotBranch,
    TargetType,
)
from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID
from swh.storage import get_storage
from swh.storage.interface import PagedResult
from swh.storage.migrate_extrinsic_metadata import (
    handle_row,
    pypi_origin_from_filename,
    pypi_project_from_filename,
)

FETCHER = MetadataFetcher(
    name="migrate-extrinsic-metadata-from-revisions",
    version="0.0.1",
)
PYPI_AUTHORITY = MetadataAuthority(
    type=MetadataAuthorityType.FORGE,
    url="https://pypi.org/",
)
SWH_AUTHORITY = MetadataAuthority(
    type=MetadataAuthorityType.REGISTRY,
    url="https://softwareheritage.org/",
)

DIRECTORY_ID = b"a" * 20
DIRECTORY_SWHID = ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY,
                                object_id=DIRECTORY_ID)

Example #9
0
def test_deposit_loading_ok(swh_storage, deposit_client,
                            requests_mock_datadir):
    url = "https://hal-test.archives-ouvertes.fr/some-external-id"
    deposit_id = 666
    loader = DepositLoader(swh_storage,
                           url,
                           deposit_id,
                           deposit_client,
                           default_filename="archive.zip")

    actual_load_status = loader.load()
    expected_snapshot_id = "338b45d87e02fb5cbf324694bc4a898623d6a30f"
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id,
    }

    assert_last_visit_matches(
        loader.storage,
        url,
        status="full",
        type="deposit",
        snapshot=hash_to_bytes(expected_snapshot_id),
    )

    release_id_hex = "2566a64a27bc00362e265be9666d7606750530a1"
    release_id = hash_to_bytes(release_id_hex)

    expected_snapshot = Snapshot(
        id=hash_to_bytes(expected_snapshot_id),
        branches={
            b"HEAD":
            SnapshotBranch(
                target=release_id,
                target_type=TargetType.RELEASE,
            ),
        },
    )
    check_snapshot(expected_snapshot, storage=loader.storage)

    release = loader.storage.release_get([release_id])[0]
    date = TimestampWithTimezone.from_datetime(
        datetime.datetime(2017, 10, 7, 15, 17, 8,
                          tzinfo=datetime.timezone.utc))
    person = Person(
        fullname=b"Software Heritage",
        name=b"Software Heritage",
        email=b"*****@*****.**",
    )
    assert release == Release(
        id=release_id,
        name=b"HEAD",
        message=b"hal: Deposit 666 in collection hal\n",
        author=person,
        date=date,
        target_type=ModelObjectType.DIRECTORY,
        target=b"\xfd-\xf1-\xc5SL\x1d\xa1\xe9\x18\x0b\x91Q\x02\xfbo`\x1d\x19",
        synthetic=True,
        metadata=None,
    )

    # check metadata

    fetcher = MetadataFetcher(
        name="swh-deposit",
        version="0.0.1",
    )

    authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url="https://hal-test.archives-ouvertes.fr/",
    )

    # Check origin metadata
    orig_meta = loader.storage.raw_extrinsic_metadata_get(
        Origin(url).swhid(), authority)
    assert orig_meta.next_page_token is None
    raw_meta = loader.client.metadata_get(deposit_id)
    raw_metadata: str = raw_meta["raw_metadata"]
    # 2 raw metadata xml + 1 json dict
    assert len(orig_meta.results) == 2
    orig_meta0 = orig_meta.results[0]
    assert orig_meta0.authority == authority
    assert orig_meta0.fetcher == fetcher

    # Check directory metadata
    assert release.target_type == ModelObjectType.DIRECTORY
    directory_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                                object_id=release.target)
    actual_dir_meta = loader.storage.raw_extrinsic_metadata_get(
        directory_swhid, authority)
    assert actual_dir_meta.next_page_token is None
    assert len(actual_dir_meta.results) == 1
    dir_meta = actual_dir_meta.results[0]
    assert dir_meta.authority == authority
    assert dir_meta.fetcher == fetcher
    assert dir_meta.metadata.decode() == raw_metadata

    # Retrieve the information for deposit status update query to the deposit
    urls = [
        m for m in requests_mock_datadir.request_history
        if m.url == f"{DEPOSIT_URL}/{deposit_id}/update/"
    ]

    assert len(urls) == 1
    update_query = urls[0]

    body = update_query.json()
    expected_body = {
        "status": "done",
        "release_id": release_id_hex,
        "directory_id": hash_to_hex(release.target),
        "snapshot_id": expected_snapshot_id,
        "origin_url": url,
    }

    assert body == expected_body

    stats = get_stats(loader.storage)
    assert {
        "content": 303,
        "directory": 12,
        "origin": 1,
        "origin_visit": 1,
        "release": 1,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
Example #10
0
class StorageData:
    """Data model objects to use within tests."""

    content = Content(
        data=b"42\n",
        length=3,
        sha1=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"),
        sha1_git=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
        sha256=hash_to_bytes(
            "084c799cd551dd1d8d5c5f9a5d593b2e931f5e36122ee5c793c1d08a19839cc0"
        ),
        blake2s256=hash_to_bytes(
            "d5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d"
        ),
        status="visible",
    )
    content2 = Content(
        data=b"4242\n",
        length=5,
        sha1=hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7"),
        sha1_git=hash_to_bytes("36fade77193cb6d2bd826161a0979d64c28ab4fa"),
        sha256=hash_to_bytes(
            "859f0b154fdb2d630f45e1ecae4a862915435e663248bb8461d914696fc047cd"
        ),
        blake2s256=hash_to_bytes(
            "849c20fad132b7c2d62c15de310adfe87be94a379941bed295e8141c6219810d"
        ),
        status="visible",
    )
    content3 = Content(
        data=b"424242\n",
        length=7,
        sha1=hash_to_bytes("3e21cc4942a4234c9e5edd8a9cacd1670fe59f13"),
        sha1_git=hash_to_bytes("c932c7649c6dfa4b82327d121215116909eb3bea"),
        sha256=hash_to_bytes(
            "92fb72daf8c6818288a35137b72155f507e5de8d892712ab96277aaed8cf8a36"
        ),
        blake2s256=hash_to_bytes(
            "76d0346f44e5a27f6bafdd9c2befd304aff83780f93121d801ab6a1d4769db11"
        ),
        status="visible",
        ctime=datetime.datetime(2019, 12, 1, tzinfo=datetime.timezone.utc),
    )
    contents: Tuple[Content, ...] = (content, content2, content3)

    skipped_content = SkippedContent(
        length=1024 * 1024 * 200,
        sha1_git=hash_to_bytes("33e45d56f88993aae6a0198013efa80716fd8920"),
        sha1=hash_to_bytes("43e45d56f88993aae6a0198013efa80716fd8920"),
        sha256=hash_to_bytes(
            "7bbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a"
        ),
        blake2s256=hash_to_bytes(
            "ade18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b"
        ),
        reason="Content too long",
        status="absent",
        origin="file:///dev/zero",
    )
    skipped_content2 = SkippedContent(
        length=1024 * 1024 * 300,
        sha1_git=hash_to_bytes("44e45d56f88993aae6a0198013efa80716fd8921"),
        sha1=hash_to_bytes("54e45d56f88993aae6a0198013efa80716fd8920"),
        sha256=hash_to_bytes(
            "8cbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a"
        ),
        blake2s256=hash_to_bytes(
            "9ce18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b"
        ),
        reason="Content too long",
        status="absent",
    )
    skipped_contents: Tuple[SkippedContent,
                            ...] = (skipped_content, skipped_content2)

    directory5 = Directory(
        id=hash_to_bytes("4b825dc642cb6eb9a060e54bf8d69288fbee4904"),
        entries=(),
    )
    directory = Directory(
        id=hash_to_bytes("5256e856a0a0898966d6ba14feb4388b8b82d302"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=content.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"bar\xc3",
                type="dir",
                target=directory5.id,
                perms=from_disk.DentryPerms.directory,
            ),
        ], ),
    )
    directory2 = Directory(
        id=hash_to_bytes("8505808532953da7d2581741f01b29c04b1cb9ab"),
        entries=tuple([
            DirectoryEntry(
                name=b"oof",
                type="file",
                target=content2.sha1_git,
                perms=from_disk.DentryPerms.content,
            )
        ], ),
    )
    directory3 = Directory(
        id=hash_to_bytes("13089e6e544f78df7c9a40a3059050d10dee686a"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=content.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"subdir",
                type="dir",
                target=directory.id,
                perms=from_disk.DentryPerms.directory,
            ),
            DirectoryEntry(
                name=b"hello",
                type="file",
                target=content2.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
        ], ),
    )
    directory4 = Directory(
        id=hash_to_bytes("cd5dfd9c09d9e99ed123bc7937a0d5fddc3cd531"),
        entries=tuple([
            DirectoryEntry(
                name=b"subdir1",
                type="dir",
                target=directory3.id,
                perms=from_disk.DentryPerms.directory,
            )
        ], ),
    )

    directory6 = Directory(
        id=hash_to_bytes("afa0105cfcaa14fdbacee344e96659170bb1bda5"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=b"\x00" * 20,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"bar",
                type="dir",
                target=b"\x01" * 20,
                perms=from_disk.DentryPerms.directory,
            ),
        ], ),
        raw_manifest=(
            b"tree 61\x00"
            b"100644 foo\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"  # noqa
            b"40000 bar\x00\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"  # noqa
        ),
    )

    directories: Tuple[Directory, ...] = (
        directory2,
        directory,
        directory3,
        directory4,
        directory5,
        directory6,
    )

    revision = Revision(
        id=hash_to_bytes("01a7114f36fddd5ef2511b2cadda237a68adbb12"),
        message=b"hello",
        author=Person(
            name=b"Nicolas Dandrimont",
            email=b"*****@*****.**",
            fullname=b"Nicolas Dandrimont <*****@*****.**> ",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0200",
        ),
        committer=Person(
            name=b"St\xc3fano Zacchiroli",
            email=b"*****@*****.**",
            fullname=b"St\xc3fano Zacchiroli <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1123456789, microseconds=0),
            offset_bytes=b"+0200",
        ),
        parents=(),
        type=RevisionType.GIT,
        directory=directory.id,
        metadata={
            "checksums": {
                "sha1": "tarball-sha1",
                "sha256": "tarball-sha256",
            },
            "signed-off-by": "some-dude",
        },
        extra_headers=(
            (b"gpgsig", b"test123"),
            (b"mergetag", b"foo\\bar"),
            (b"mergetag", b"\x22\xaf\x89\x80\x01\x00"),
        ),
        synthetic=True,
    )
    revision2 = Revision(
        id=hash_to_bytes("a646dd94c912829659b22a1e7e143d2fa5ebde1b"),
        message=b"hello again",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1123456789,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([revision.id]),
        type=RevisionType.GIT,
        directory=directory2.id,
        metadata=None,
        extra_headers=(),
        synthetic=False,
    )
    revision3 = Revision(
        id=hash_to_bytes("beb2844dff30658e27573cb46eb55980e974b391"),
        message=b"a simple revision with no parents this time",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1127351742,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([revision.id, revision2.id]),
        type=RevisionType.GIT,
        directory=directory2.id,
        metadata=None,
        extra_headers=(),
        synthetic=True,
    )
    revision4 = Revision(
        id=hash_to_bytes("ae860aec43700c7f5a295e2ef47e2ae41b535dfe"),
        message=b"parent of self.revision2",
        author=Person(
            name=b"me",
            email=b"*****@*****.**",
            fullname=b"me <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"committer-dude",
            email=b"*****@*****.**",
            fullname=b"committer-dude <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1244567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        parents=tuple([revision3.id]),
        type=RevisionType.GIT,
        directory=directory.id,
        metadata=None,
        extra_headers=(),
        synthetic=False,
    )
    git_revisions: Tuple[Revision,
                         ...] = (revision, revision2, revision3, revision4)

    hg_revision = Revision(
        id=hash_to_bytes("951c9503541e7beaf002d7aebf2abd1629084c68"),
        message=b"hello",
        author=Person(
            name=b"Nicolas Dandrimont",
            email=b"*****@*****.**",
            fullname=b"Nicolas Dandrimont <*****@*****.**> ",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0200",
        ),
        committer=Person(
            name=b"St\xc3fano Zacchiroli",
            email=b"*****@*****.**",
            fullname=b"St\xc3fano Zacchiroli <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1123456789, microseconds=0),
            offset_bytes=b"+0200",
        ),
        parents=(),
        type=RevisionType.MERCURIAL,
        directory=directory.id,
        metadata={
            "checksums": {
                "sha1": "tarball-sha1",
                "sha256": "tarball-sha256",
            },
            "signed-off-by": "some-dude",
            "node": "a316dfb434af2b451c1f393496b7eaeda343f543",
        },
        extra_headers=(),
        synthetic=True,
    )
    hg_revision2 = Revision(
        id=hash_to_bytes("df4afb063236300eb13b96a0d7fff03f7b7cbbaf"),
        message=b"hello again",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1123456789,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([hg_revision.id]),
        type=RevisionType.MERCURIAL,
        directory=directory2.id,
        metadata=None,
        extra_headers=(
            (b"node",
             hash_to_bytes("fa1b7c84a9b40605b67653700f268349a6d6aca1")), ),
        synthetic=False,
    )
    hg_revision3 = Revision(
        id=hash_to_bytes("84d8e7081b47ebb88cad9fa1f25de5f330872a37"),
        message=b"a simple revision with no parents this time",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1127351742,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([hg_revision.id, hg_revision2.id]),
        type=RevisionType.MERCURIAL,
        directory=directory2.id,
        metadata=None,
        extra_headers=(
            (b"node",
             hash_to_bytes("7f294a01c49065a90b3fe8b4ad49f08ce9656ef6")), ),
        synthetic=True,
    )
    hg_revision4 = Revision(
        id=hash_to_bytes("4683324ba26dfe941a72cc7552e86eaaf7c27fe3"),
        message=b"parent of self.revision2",
        author=Person(
            name=b"me",
            email=b"*****@*****.**",
            fullname=b"me <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"committer-dude",
            email=b"*****@*****.**",
            fullname=b"committer-dude <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1244567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        parents=tuple([hg_revision3.id]),
        type=RevisionType.MERCURIAL,
        directory=directory.id,
        metadata=None,
        extra_headers=(
            (b"node",
             hash_to_bytes("f4160af0485c85823d9e829bae2c00b00a2e6297")), ),
        synthetic=False,
    )
    hg_revisions: Tuple[Revision, ...] = (
        hg_revision,
        hg_revision2,
        hg_revision3,
        hg_revision4,
    )
    revisions: Tuple[Revision, ...] = git_revisions + hg_revisions

    origins: Tuple[Origin, ...] = (
        Origin(url="https://github.com/user1/repo1"),
        Origin(url="https://github.com/user2/repo1"),
        Origin(url="https://github.com/user3/repo1"),
        Origin(url="https://gitlab.com/user1/repo1"),
        Origin(url="https://gitlab.com/user2/repo1"),
        Origin(url="https://forge.softwareheritage.org/source/repo1"),
        Origin(url="https://example.рф/🏛️.txt"),
    )
    origin, origin2 = origins[:2]

    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url="http://hal.inria.example.com/",
    )
    metadata_authority2 = MetadataAuthority(
        type=MetadataAuthorityType.REGISTRY,
        url="http://wikidata.example.com/",
    )
    authorities: Tuple[MetadataAuthority, ...] = (
        metadata_authority,
        metadata_authority2,
    )

    metadata_fetcher = MetadataFetcher(
        name="swh-deposit",
        version="0.0.1",
    )
    metadata_fetcher2 = MetadataFetcher(
        name="swh-example",
        version="0.0.1",
    )
    fetchers: Tuple[MetadataFetcher,
                    ...] = (metadata_fetcher, metadata_fetcher2)

    date_visit1 = datetime.datetime(2015,
                                    1,
                                    1,
                                    23,
                                    0,
                                    0,
                                    tzinfo=datetime.timezone.utc)
    date_visit2 = datetime.datetime(2017,
                                    1,
                                    1,
                                    23,
                                    0,
                                    0,
                                    tzinfo=datetime.timezone.utc)
    date_visit3 = datetime.datetime(2018,
                                    1,
                                    1,
                                    23,
                                    0,
                                    0,
                                    tzinfo=datetime.timezone.utc)

    type_visit1 = "git"
    type_visit2 = "hg"
    type_visit3 = "deb"

    origin_visit = OriginVisit(
        origin=origin.url,
        visit=1,
        date=date_visit1,
        type=type_visit1,
    )
    origin_visit2 = OriginVisit(
        origin=origin.url,
        visit=2,
        date=date_visit2,
        type=type_visit1,
    )
    origin_visit3 = OriginVisit(
        origin=origin2.url,
        visit=1,
        date=date_visit1,
        type=type_visit2,
    )
    origin_visits: Tuple[OriginVisit, ...] = (
        origin_visit,
        origin_visit2,
        origin_visit3,
    )

    release = Release(
        id=hash_to_bytes("f7f222093a18ec60d781070abec4a630c850b837"),
        name=b"v0.0.1",
        author=Person(
            name=b"olasd",
            email=b"*****@*****.**",
            fullname=b"olasd <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0042",
        ),
        target=revision.id,
        target_type=ObjectType.REVISION,
        message=b"synthetic release",
        synthetic=True,
    )
    release2 = Release(
        id=hash_to_bytes("db81a26783a3f4a9db07b4759ffc37621f159bb2"),
        name=b"v0.0.2",
        author=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1634366813, microseconds=0),
            offset_bytes=b"-0200",
        ),
        target=revision2.id,
        target_type=ObjectType.REVISION,
        message=b"v0.0.2\nMisc performance improvements + bug fixes",
        synthetic=False,
    )
    release3 = Release(
        id=hash_to_bytes("1c5d42e603ce2eea44917fadca76c78bad76aeb9"),
        name=b"v0.0.2",
        author=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1634366813, microseconds=0),
            offset_bytes=b"-0200",
        ),
        target=revision3.id,
        target_type=ObjectType.REVISION,
        message=b"yet another synthetic release",
        synthetic=True,
    )

    releases: Tuple[Release, ...] = (release, release2, release3)

    snapshot = Snapshot(
        id=hash_to_bytes("9b922e6d8d5b803c1582aabe5525b7b91150788e"),
        branches={
            b"master":
            SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
        },
    )
    empty_snapshot = Snapshot(
        id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"),
        branches={},
    )
    complete_snapshot = Snapshot(
        id=hash_to_bytes("db99fda25b43dc5cd90625ee4b0744751799c917"),
        branches={
            b"directory":
            SnapshotBranch(
                target=directory.id,
                target_type=TargetType.DIRECTORY,
            ),
            b"directory2":
            SnapshotBranch(
                target=directory2.id,
                target_type=TargetType.DIRECTORY,
            ),
            b"content":
            SnapshotBranch(
                target=content.sha1_git,
                target_type=TargetType.CONTENT,
            ),
            b"alias":
            SnapshotBranch(
                target=b"revision",
                target_type=TargetType.ALIAS,
            ),
            b"revision":
            SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
            b"release":
            SnapshotBranch(
                target=release.id,
                target_type=TargetType.RELEASE,
            ),
            b"snapshot":
            SnapshotBranch(
                target=empty_snapshot.id,
                target_type=TargetType.SNAPSHOT,
            ),
            b"dangling":
            None,
        },
    )

    snapshots: Tuple[Snapshot,
                     ...] = (snapshot, empty_snapshot, complete_snapshot)

    content_metadata1 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        origin=origin.url,
        discovery_date=datetime.datetime(2015,
                                         1,
                                         1,
                                         21,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="json",
        metadata=b'{"foo": "bar"}',
    )
    content_metadata2 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        origin=origin2.url,
        discovery_date=datetime.datetime(2017,
                                         1,
                                         1,
                                         22,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="yaml",
        metadata=b"foo: bar",
    )
    content_metadata3 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        discovery_date=datetime.datetime(2017,
                                         1,
                                         1,
                                         22,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority2, metadata=None),
        fetcher=attr.evolve(metadata_fetcher2, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
        origin=origin.url,
        visit=42,
        snapshot=snapshot.swhid(),
        release=release.swhid(),
        revision=revision.swhid(),
        directory=directory.swhid(),
        path=b"/foo/bar",
    )

    content_metadata: Tuple[RawExtrinsicMetadata, ...] = (
        content_metadata1,
        content_metadata2,
        content_metadata3,
    )

    origin_metadata1 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(2015,
                                         1,
                                         1,
                                         21,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority, metadata=None),
        fetcher=attr.evolve(metadata_fetcher, metadata=None),
        format="json",
        metadata=b'{"foo": "bar"}',
    )
    origin_metadata2 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(2017,
                                         1,
                                         1,
                                         22,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority, metadata=None),
        fetcher=attr.evolve(metadata_fetcher, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
    )
    origin_metadata3 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(2017,
                                         1,
                                         1,
                                         22,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority2, metadata=None),
        fetcher=attr.evolve(metadata_fetcher2, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
    )

    origin_metadata: Tuple[RawExtrinsicMetadata, ...] = (
        origin_metadata1,
        origin_metadata2,
        origin_metadata3,
    )

    extid1 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.REVISION,
                         object_id=revision.id),
        extid_type="git",
        extid=revision.id,
    )

    extid2 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.REVISION,
                         object_id=hg_revision.id),
        extid_type="mercurial",
        extid=hash_to_bytes("a316dfb434af2b451c1f393496b7eaeda343f543"),
    )

    extid3 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.DIRECTORY,
                         object_id=directory.id),
        extid_type="directory",
        extid=b"something",
    )
    extid4 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.DIRECTORY,
                         object_id=directory2.id),
        extid_type="directory",
        extid=b"something",
        extid_version=2,
    )

    extids: Tuple[ExtID, ...] = (
        extid1,
        extid2,
        extid3,
        extid4,
    )
def test_loader_one_visit(swh_storage, requests_mock_datadir, raw_sources):
    loader = NixGuixLoader(swh_storage, sources_url)
    load_status = loader.load()
    expected_snapshot_id = SNAPSHOT1.id
    expected_snapshot_id_hex = expected_snapshot_id.hex()
    assert load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id_hex,
    }

    release_id = SNAPSHOT1.branches[
        b"https://github.com/owner-1/repository-1/revision-1.tgz"].target
    check_snapshot(SNAPSHOT1, storage=swh_storage)

    assert swh_storage.release_get([release_id])[0] == Release(
        id=release_id,
        name=b"https://github.com/owner-1/repository-1/revision-1.tgz",
        message=None,
        target=hash_to_bytes("4de2e07d3742718d928e974b8a4c721b9f7b33bf"),
        target_type=ObjectType.DIRECTORY,
        synthetic=True,
        author=Person.from_fullname(b""),
        date=None,
    )

    stats = get_stats(swh_storage)
    assert {
        "content": 1,
        "directory": 3,
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    # The visit is partial because urls pointing to non tarball file
    # are not handled yet
    assert_last_visit_matches(swh_storage,
                              sources_url,
                              status="partial",
                              type="nixguix")

    visit_status = origin_get_latest_visit_status(swh_storage, sources_url)
    snapshot_swhid = ExtendedSWHID(object_type=ExtendedObjectType.SNAPSHOT,
                                   object_id=visit_status.snapshot)
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url=sources_url,
    )
    expected_metadata = [
        RawExtrinsicMetadata(
            target=snapshot_swhid,
            authority=metadata_authority,
            fetcher=MetadataFetcher(
                name="swh.loader.package.nixguix.loader.NixGuixLoader",
                version=__version__,
            ),
            discovery_date=loader.visit_date,
            format="nixguix-sources-json",
            metadata=raw_sources,
            origin=sources_url,
        )
    ]
    assert swh_storage.raw_extrinsic_metadata_get(
        snapshot_swhid,
        metadata_authority,
    ) == PagedResult(
        next_page_token=None,
        results=expected_metadata,
    )
Example #12
0
def test_opam_metadata(tmpdir, requests_mock_datadir, fake_opam_root,
                       swh_storage, datadir):
    opam_url = f"file://{datadir}/fake_opam_repo"
    opam_root = fake_opam_root
    opam_instance = "loadertest"

    opam_package = "ocb"
    url = f"opam+{opam_url}/packages/{opam_package}"

    loader = OpamLoader(
        swh_storage,
        url,
        opam_root,
        opam_instance,
        opam_url,
        opam_package,
        initialize_opam_root=True,
    )

    actual_load_status = loader.load()

    assert actual_load_status["status"] == "eventful"

    expected_release_id = hash_to_bytes(
        "c231e541eb29c712635ada394b04127ac69e9fb0")

    expected_snapshot = Snapshot(
        id=hash_to_bytes(actual_load_status["snapshot_id"]),
        branches={
            b"HEAD":
            SnapshotBranch(
                target=b"ocb.0.1",
                target_type=TargetType.ALIAS,
            ),
            b"ocb.0.1":
            SnapshotBranch(
                target=expected_release_id,
                target_type=TargetType.RELEASE,
            ),
        },
    )

    assert_last_visit_matches(swh_storage,
                              url,
                              status="full",
                              type="opam",
                              snapshot=expected_snapshot.id)

    check_snapshot(expected_snapshot, swh_storage)

    release = swh_storage.release_get([expected_release_id])[0]
    assert release is not None

    release_swhid = CoreSWHID(object_type=ObjectType.RELEASE,
                              object_id=expected_release_id)
    directory_swhid = ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY,
                                    object_id=release.target)
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url=opam_url,
    )
    expected_metadata = [
        RawExtrinsicMetadata(
            target=directory_swhid,
            authority=metadata_authority,
            fetcher=MetadataFetcher(
                name="swh.loader.package.opam.loader.OpamLoader",
                version=__version__,
            ),
            discovery_date=loader.visit_date,
            format="opam-package-definition",
            metadata=OCB_METADATA,
            origin=url,
            release=release_swhid,
        )
    ]
    assert swh_storage.raw_extrinsic_metadata_get(
        directory_swhid,
        metadata_authority,
    ) == PagedResult(
        next_page_token=None,
        results=expected_metadata,
    )
Example #13
0
 def swh_deposit_fetcher(self):
     return MetadataFetcher(
         name=self.tool["name"],
         version=self.tool["version"],
     )
Example #14
0
def test_deposit_loading_ok_2(swh_storage, deposit_client,
                              requests_mock_datadir):
    """Field dates should be se appropriately"""
    external_id = "some-external-id"
    url = f"https://hal-test.archives-ouvertes.fr/{external_id}"
    deposit_id = 777
    loader = DepositLoader(swh_storage,
                           url,
                           deposit_id,
                           deposit_client,
                           default_filename="archive.zip")

    actual_load_status = loader.load()
    expected_snapshot_id = "3449b8ff31abeacefd33cca60e3074c1649dc3a1"

    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id,
    }
    assert_last_visit_matches(
        loader.storage,
        url,
        status="full",
        type="deposit",
        snapshot=hash_to_bytes(expected_snapshot_id),
    )

    release_id = "ba6c9a59ae3256e765d32b211cc183dc2380aed7"
    expected_snapshot = Snapshot(
        id=hash_to_bytes(expected_snapshot_id),
        branches={
            b"HEAD":
            SnapshotBranch(target=hash_to_bytes(release_id),
                           target_type=TargetType.RELEASE)
        },
    )

    check_snapshot(expected_snapshot, storage=loader.storage)

    raw_meta = loader.client.metadata_get(deposit_id)
    # Ensure the date fields are set appropriately in the release

    # Retrieve the release
    release = loader.storage.release_get([hash_to_bytes(release_id)])[0]
    assert release
    # swh-deposit uses the numeric 'offset_minutes' instead of the bytes offset
    # attribute, because its dates are always well-formed, and it can only send
    # JSON-serializable data.
    release_date_dict = {
        "timestamp": release.date.timestamp.to_dict(),
        "offset": release.date.offset_minutes(),
    }

    assert release_date_dict == raw_meta["deposit"]["author_date"]

    assert not release.metadata

    provider = {
        "provider_name": "hal",
        "provider_type": "deposit_client",
        "provider_url": "https://hal-test.archives-ouvertes.fr/",
        "metadata": None,
    }
    tool = {
        "name": "swh-deposit",
        "version": "0.0.1",
        "configuration": {
            "sword_version": "2"
        },
    }

    fetcher = MetadataFetcher(
        name="swh-deposit",
        version="0.0.1",
    )

    authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url="https://hal-test.archives-ouvertes.fr/",
    )

    # Check the origin metadata swh side
    origin_extrinsic_metadata = loader.storage.raw_extrinsic_metadata_get(
        Origin(url).swhid(), authority)
    assert origin_extrinsic_metadata.next_page_token is None
    raw_metadata: str = raw_meta["raw_metadata"]
    # 1 raw metadata xml + 1 json dict
    assert len(origin_extrinsic_metadata.results) == 2

    origin_swhid = Origin(url).swhid()

    expected_metadata = []
    origin_meta = origin_extrinsic_metadata.results[0]
    expected_metadata.append(
        RawExtrinsicMetadata(
            target=origin_swhid,
            discovery_date=origin_meta.discovery_date,
            metadata=raw_metadata.encode(),
            format="sword-v2-atom-codemeta-v2",
            authority=authority,
            fetcher=fetcher,
        ))

    origin_metadata = {
        "metadata": [raw_metadata],
        "provider": provider,
        "tool": tool,
    }
    expected_metadata.append(
        RawExtrinsicMetadata(
            target=origin_swhid,
            discovery_date=origin_extrinsic_metadata.results[-1].
            discovery_date,
            metadata=json.dumps(origin_metadata).encode(),
            format="original-artifacts-json",
            authority=authority,
            fetcher=fetcher,
        ))

    assert sorted(
        origin_extrinsic_metadata.results) == sorted(expected_metadata)

    # Check the release metadata swh side
    assert release.target_type == ModelObjectType.DIRECTORY
    directory_swhid = ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY,
                                    object_id=release.target)
    actual_directory_metadata = loader.storage.raw_extrinsic_metadata_get(
        directory_swhid, authority)

    assert actual_directory_metadata.next_page_token is None
    assert len(actual_directory_metadata.results) == 1

    release_swhid = CoreSWHID(object_type=ObjectType.RELEASE,
                              object_id=hash_to_bytes(release_id))
    dir_metadata_template = RawExtrinsicMetadata(
        target=directory_swhid,
        format="sword-v2-atom-codemeta-v2",
        authority=authority,
        fetcher=fetcher,
        origin=url,
        release=release_swhid,
        # to satisfy the constructor
        discovery_date=now(),
        metadata=b"",
    )

    expected_directory_metadata = []
    dir_metadata = actual_directory_metadata.results[0]
    expected_directory_metadata.append(
        RawExtrinsicMetadata.from_dict({
            **{
                k: v
                for (k, v) in dir_metadata_template.to_dict().items() if k != "id"
            },
            "discovery_date": dir_metadata.discovery_date,
            "metadata": raw_metadata.encode(),
        }))

    assert sorted(actual_directory_metadata.results) == sorted(
        expected_directory_metadata)

    # Retrieve the information for deposit status update query to the deposit
    urls = [
        m for m in requests_mock_datadir.request_history
        if m.url == f"{DEPOSIT_URL}/{deposit_id}/update/"
    ]

    assert len(urls) == 1
    update_query = urls[0]

    body = update_query.json()
    expected_body = {
        "status": "done",
        "release_id": release_id,
        "directory_id": hash_to_hex(release.target),
        "snapshot_id": expected_snapshot_id,
        "origin_url": url,
    }

    assert body == expected_body
Example #15
0
def test_put_update_metadata_done_deposit_nominal(
    tmp_path,
    authenticated_client,
    complete_deposit,
    deposit_collection,
    atom_dataset,
    sample_data,
    swh_storage,
):
    """Nominal scenario, client send an update of metadata on a deposit with status "done"
    with an existing swhid. Such swhid has its metadata updated accordingly both in
    the deposit backend and in the metadata storage.

    Response: 204

    """
    deposit_swhid = CoreSWHID.from_string(complete_deposit.swhid)
    assert deposit_swhid.object_type == ObjectType.DIRECTORY
    directory_id = hash_to_bytes(deposit_swhid.object_id)

    # directory targeted by the complete_deposit does not exist in the storage
    assert list(swh_storage.directory_missing([directory_id
                                               ])) == [directory_id]

    # so let's create a directory reference in the storage (current deposit targets an
    # unknown swhid)
    existing_directory = sample_data.directory
    swh_storage.directory_add([existing_directory])
    assert list(swh_storage.directory_missing([existing_directory.id])) == []

    # and patch one complete deposit swhid so it targets said reference
    complete_deposit.swhid = str(existing_directory.swhid())
    complete_deposit.save()

    actual_existing_requests_archive = DepositRequest.objects.filter(
        deposit=complete_deposit, type="archive")
    nb_archives = len(actual_existing_requests_archive)
    actual_existing_requests_metadata = DepositRequest.objects.filter(
        deposit=complete_deposit, type="metadata")
    nb_metadata = len(actual_existing_requests_metadata)

    update_uri = reverse(EDIT_IRI,
                         args=[deposit_collection.name, complete_deposit.id])
    response = put_atom(
        authenticated_client,
        update_uri,
        data=atom_dataset["entry-data1"],
        HTTP_X_CHECK_SWHID=complete_deposit.swhid,
    )

    assert response.status_code == status.HTTP_204_NO_CONTENT

    new_requests_meta = DepositRequest.objects.filter(deposit=complete_deposit,
                                                      type="metadata")
    assert len(new_requests_meta) == nb_metadata + 1
    request_meta1 = new_requests_meta[0]
    raw_metadata1 = request_meta1.raw_metadata
    assert raw_metadata1 == atom_dataset["entry-data1"]

    # check we did not touch the other parts
    requests_archive1 = DepositRequest.objects.filter(deposit=complete_deposit,
                                                      type="archive")
    assert len(requests_archive1) == nb_archives
    assert set(actual_existing_requests_archive) == set(requests_archive1)

    # Ensure metadata stored in the metadata storage is consistent
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url=complete_deposit.client.provider_url,
    )

    actual_authority = swh_storage.metadata_authority_get(
        MetadataAuthorityType.DEPOSIT_CLIENT,
        url=complete_deposit.client.provider_url)
    assert actual_authority == metadata_authority

    config = APIConfig()
    metadata_fetcher = MetadataFetcher(
        name=config.tool["name"],
        version=config.tool["version"],
    )

    actual_fetcher = swh_storage.metadata_fetcher_get(config.tool["name"],
                                                      config.tool["version"])
    assert actual_fetcher == metadata_fetcher

    directory_swhid = ExtendedSWHID.from_string(complete_deposit.swhid)
    page_results = swh_storage.raw_extrinsic_metadata_get(
        directory_swhid, metadata_authority)
    assert page_results == PagedResult(
        results=[
            RawExtrinsicMetadata(
                target=directory_swhid,
                discovery_date=request_meta1.date,
                authority=metadata_authority,
                fetcher=metadata_fetcher,
                format="sword-v2-atom-codemeta",
                metadata=raw_metadata1.encode(),
                origin=complete_deposit.origin_url,
            )
        ],
        next_page_token=None,
    )
Example #16
0
AUTHORITY = MetadataAuthority(
    type=MetadataAuthorityType.FORGE,
    url="http://example.org/",
)
ORIGIN_URL = "http://example.org/archive.tgz"
ORIGIN_SWHID = Origin(ORIGIN_URL).swhid()

REVISION_ID = hash_to_bytes("8ff44f081d43176474b267de5451f2c2e88089d0")
RELEASE_ID = hash_to_bytes("9477a708196b44e59efb4e47b7d979a4146bd428")
RELEASE_SWHID = CoreSWHID.from_string(f"swh:1:rel:{RELEASE_ID.hex()}")
DIRECTORY_ID = hash_to_bytes("aa" * 20)
DIRECTORY_SWHID = ExtendedSWHID.from_string(f"swh:1:dir:{DIRECTORY_ID.hex()}")

FETCHER = MetadataFetcher(
    name="swh.loader.package.tests.test_loader_metadata.MetadataTestLoader",
    version=__version__,
)

DISCOVERY_DATE = datetime.datetime.now(tz=datetime.timezone.utc)

DIRECTORY_METADATA = [
    RawExtrinsicMetadata(
        target=DIRECTORY_SWHID,
        discovery_date=DISCOVERY_DATE,
        authority=AUTHORITY,
        fetcher=FETCHER,
        format="test-format1",
        metadata=b"foo bar",
        origin=ORIGIN_URL,
        release=RELEASE_SWHID,
    ),
Example #17
0
        },
    ),
]

METADATA_AUTHORITIES = [
    MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url="http://example.org/",
        metadata={},
    ),
]

METADATA_FETCHERS = [
    MetadataFetcher(
        name="test-fetcher",
        version="1.0.0",
        metadata={},
    )
]

RAW_EXTRINSIC_METADATA = [
    RawExtrinsicMetadata(
        target=Origin("http://example.org/foo.git").swhid(),
        discovery_date=datetime.datetime(2020, 7, 30, 17, 8, 20, tzinfo=UTC),
        authority=attr.evolve(METADATA_AUTHORITIES[0], metadata=None),
        fetcher=attr.evolve(METADATA_FETCHERS[0], metadata=None),
        format="json",
        metadata=b'{"foo": "bar"}',
    ),
    RawExtrinsicMetadata(
        target=ExtendedSWHID.from_string(str(CONTENTS[0].swhid())),
Example #18
0
def test_npm_loader_first_visit(swh_storage, requests_mock_datadir, org_api_info):
    package = "org"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("0996ca28d6280499abcf485b51c4e3941b057249")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
    )

    release_id = "d38cc0b571cd41f3c85513864e049766b42032a7"
    versions = [
        ("0.0.2", release_id),
        ("0.0.3", "62bf7076bae9aa2cb4d6cb3bf7ce0ea4fdd5b295"),
        ("0.0.4", "6e976db82f6c310596b21fb0ed8b11f507631434"),
    ]

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"HEAD": SnapshotBranch(
                target=b"releases/0.0.4", target_type=TargetType.ALIAS
            ),
            **{
                b"releases/"
                + version_name.encode(): SnapshotBranch(
                    target=hash_to_bytes(version_id),
                    target_type=TargetType.RELEASE,
                )
                for (version_name, version_id) in versions
            },
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    assert swh_storage.release_get([hash_to_bytes(release_id)])[0] == Release(
        name=b"0.0.2",
        message=b"Synthetic release for NPM source package org version 0.0.2\n",
        target=hash_to_bytes("42753c0c2ab00c4501b552ac4671c68f3cf5aece"),
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
        author=Person(
            fullname=b"mooz <*****@*****.**>",
            name=b"mooz",
            email=b"*****@*****.**",
        ),
        date=TimestampWithTimezone.from_datetime(
            datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc)
        ),
        id=hash_to_bytes(release_id),
    )

    contents = swh_storage.content_get(_expected_new_contents_first_visit)
    count = sum(0 if content is None else 1 for content in contents)
    assert count == len(_expected_new_contents_first_visit)

    assert (
        list(swh_storage.directory_missing(_expected_new_directories_first_visit)) == []
    )

    assert list(swh_storage.release_missing(_expected_new_releases_first_visit)) == []

    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url="https://npmjs.com/",
    )

    for (version_name, release_id) in versions:
        release = swh_storage.release_get([hash_to_bytes(release_id)])[0]
        assert release.target_type == ModelObjectType.DIRECTORY
        directory_id = release.target
        directory_swhid = ExtendedSWHID(
            object_type=ExtendedObjectType.DIRECTORY,
            object_id=directory_id,
        )
        release_swhid = CoreSWHID(
            object_type=ObjectType.RELEASE,
            object_id=hash_to_bytes(release_id),
        )
        expected_metadata = [
            RawExtrinsicMetadata(
                target=directory_swhid,
                authority=metadata_authority,
                fetcher=MetadataFetcher(
                    name="swh.loader.package.npm.loader.NpmLoader",
                    version=__version__,
                ),
                discovery_date=loader.visit_date,
                format="replicate-npm-package-json",
                metadata=json.dumps(
                    json.loads(org_api_info)["versions"][version_name]
                ).encode(),
                origin="https://www.npmjs.com/package/org",
                release=release_swhid,
            )
        ]
        assert swh_storage.raw_extrinsic_metadata_get(
            directory_swhid,
            metadata_authority,
        ) == PagedResult(
            next_page_token=None,
            results=expected_metadata,
        )

    stats = get_stats(swh_storage)

    assert {
        "content": len(_expected_new_contents_first_visit),
        "directory": len(_expected_new_directories_first_visit),
        "origin": 1,
        "origin_visit": 1,
        "release": len(_expected_new_releases_first_visit),
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
Example #19
0
    RawExtrinsicMetadata,
    Snapshot,
)
import swh.storage.exc

ORIGIN = Origin(url="some-url")
PARENT_ORIGIN = Origin(url="base-origin-url")

METADATA_AUTHORITY = MetadataAuthority(type=MetadataAuthorityType.FORGE,
                                       url="http://example.org/")
REMD = RawExtrinsicMetadata(
    target=ORIGIN.swhid(),
    discovery_date=datetime.datetime.now(tz=datetime.timezone.utc),
    authority=METADATA_AUTHORITY,
    fetcher=MetadataFetcher(
        name="test fetcher",
        version="0.0.1",
    ),
    format="test-format",
    metadata=b'{"foo": "bar"}',
)


class DummyLoader:
    """Base Loader to overload and simplify the base class (technical: to avoid repetition
    in other *Loader classes)"""

    visit_type = "git"

    def __init__(self, storage, *args, **kwargs):
        super().__init__(storage, ORIGIN.url, *args, **kwargs)