Example #1
def _assert_deposit_info_on_metadata(swh_storage, metadata_swhid, deposit,
                                     metadata_fetcher):
    swh_authority = MetadataAuthority(
        MetadataAuthorityType.REGISTRY,
        "http://deposit.softwareheritage.example/",
    )
    page_results = swh_storage.raw_extrinsic_metadata_get(
        metadata_swhid, swh_authority)

    assert len(page_results.results) == 1
    assert page_results.next_page_token is None

    expected_xml_data = textwrap.dedent(f"""\
        <deposit xmlns="https://www.softwareheritage.org/schema/2018/deposit">
            <deposit_id>{deposit.id}</deposit_id>
            <deposit_client>https://hal-test.archives-ouvertes.fr/</deposit_client>
            <deposit_collection>test</deposit_collection>
        </deposit>
        """)
    assert page_results == PagedResult(
        results=[
            RawExtrinsicMetadata(
                target=metadata_swhid,
                discovery_date=deposit.complete_date,
                authority=swh_authority,
                fetcher=metadata_fetcher,
                format="xml-deposit-info",
                metadata=expected_xml_data.encode(),
            )
        ],
        next_page_token=None,
    )
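
The assertions above assume the whole result fits in one page, hence the check
that next_page_token is None. When it may not, that token drives the iteration;
a minimal sketch, assuming raw_extrinsic_metadata_get accepts a page_token
keyword (as the PagedResult API suggests):

def iter_all_raw_metadata(storage, target, authority):
    # Drain every page of raw extrinsic metadata for one target/authority pair.
    page_token = None
    while True:
        page = storage.raw_extrinsic_metadata_get(
            target, authority, page_token=page_token)
        yield from page.results
        page_token = page.next_page_token
        if page_token is None:
            break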
Example #2
def test_load_artifact_metadata(swh_storage, caplog):
    loader = MetadataTestLoader(swh_storage, ORIGIN_URL)

    load_status = loader.load()
    assert load_status == {
        "status": "eventful",
        "snapshot_id": FULL_SNAPSHOT_ID,
    }

    authority = MetadataAuthority(
        type=MetadataAuthorityType.REGISTRY,
        url="https://softwareheritage.org/",
    )

    result = swh_storage.raw_extrinsic_metadata_get(
        DIRECTORY_SWHID,
        authority,
    )
    assert result.next_page_token is None
    assert len(result.results) == 1
    assert result.results[0] == RawExtrinsicMetadata(
        target=DIRECTORY_SWHID,
        discovery_date=result.results[0].discovery_date,
        authority=authority,
        fetcher=FETCHER,
        format="original-artifacts-json",
        metadata=b'[{"artifact_key": "value", "length": 0}]',
        origin=ORIGIN_URL,
        release=RELEASE_SWHID,
    )
Example #3
def row_to_raw_extrinsic_metadata(
        row: RawExtrinsicMetadataRow) -> RawExtrinsicMetadata:
    discovery_date = row.discovery_date.replace(tzinfo=datetime.timezone.utc)

    return RawExtrinsicMetadata(
        target=ExtendedSWHID.from_string(row.target),
        authority=MetadataAuthority(
            type=MetadataAuthorityType(row.authority_type),
            url=row.authority_url,
        ),
        fetcher=MetadataFetcher(
            name=row.fetcher_name,
            version=row.fetcher_version,
        ),
        discovery_date=discovery_date,
        format=row.format,
        metadata=row.metadata,
        origin=row.origin,
        visit=row.visit,
        snapshot=map_optional(CoreSWHID.from_string, row.snapshot),
        release=map_optional(CoreSWHID.from_string, row.release),
        revision=map_optional(CoreSWHID.from_string, row.revision),
        path=row.path,
        directory=map_optional(CoreSWHID.from_string, row.directory),
    )
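
Both this converter and the database-backed one in the next examples rely on a
map_optional helper that is not shown here. A minimal sketch of what it
presumably does, judging by its call sites (apply a function only when the
value is not None):

from typing import Callable, Optional, TypeVar

T = TypeVar("T")
U = TypeVar("U")

def map_optional(f: Callable[[T], U], value: Optional[T]) -> Optional[U]:
    # Propagate None unchanged; otherwise apply the conversion function.
    return f(value) if value is not None else None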
Example #4
def get_metadata_authority(self):
    p_url = urlparse(self.origin.url)
    return MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url=f"{p_url.scheme}://{p_url.netloc}/",
        metadata={},
    )
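
For illustration: truncating the origin URL to its scheme and network location
means all origins hosted on the same forge map to the same metadata authority.
A quick check with a made-up origin URL:

from urllib.parse import urlparse

p_url = urlparse("https://gitea.example.org/user/project.git")  # hypothetical
assert f"{p_url.scheme}://{p_url.netloc}/" == "https://gitea.example.org/"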
Example #5
def db_to_raw_extrinsic_metadata(row) -> RawExtrinsicMetadata:
    target = row["raw_extrinsic_metadata.target"]
    if not target.startswith("swh:1:"):
        warnings.warn("Fetching raw_extrinsic_metadata row with URL target",
                      DeprecationWarning)
        target = str(Origin(url=target).swhid())

    return RawExtrinsicMetadata(
        target=ExtendedSWHID.from_string(target),
        authority=MetadataAuthority(
            type=MetadataAuthorityType(row["metadata_authority.type"]),
            url=row["metadata_authority.url"],
        ),
        fetcher=MetadataFetcher(
            name=row["metadata_fetcher.name"],
            version=row["metadata_fetcher.version"],
        ),
        discovery_date=row["discovery_date"],
        format=row["format"],
        metadata=row["raw_extrinsic_metadata.metadata"],
        origin=row["origin"],
        visit=row["visit"],
        snapshot=map_optional(CoreSWHID.from_string, row["snapshot"]),
        release=map_optional(CoreSWHID.from_string, row["release"]),
        revision=map_optional(CoreSWHID.from_string, row["revision"]),
        path=row["path"],
        directory=map_optional(CoreSWHID.from_string, row["directory"]),
    )
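
The deprecation branch above rewrites legacy URL targets into origin SWHIDs.
A short illustration of what that conversion produces, using a made-up URL
(the object id of an origin SWHID is derived by hashing the URL):

from swh.model.model import Origin

target = "https://github.com/user/repo"  # hypothetical legacy target value
if not target.startswith("swh:1:"):
    target = str(Origin(url=target).swhid())
# target now has the form "swh:1:ori:<40 hex digits>"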
Example #6
def test_maven_loader_extrinsic_metadata(swh_storage, expected_releases,
                                         expected_json_metadata,
                                         expected_pom_metadata):
    """With no prior visit, loading a jar ends up with 1 snapshot.
    Extrinsic metadata is the pom file associated to the source jar.
    """
    loader = MavenLoader(swh_storage, MVN_ORIGIN_URL, artifacts=MVN_ARTIFACTS)

    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"

    for i, expected_release in enumerate(expected_releases):

        expected_release_id = expected_release.id
        release = swh_storage.release_get([expected_release_id])[0]
        assert release is not None

        release_swhid = CoreSWHID(object_type=ObjectType.RELEASE,
                                  object_id=expected_release_id)
        directory_swhid = ExtendedSWHID(
            object_type=ExtendedObjectType.DIRECTORY, object_id=release.target)
        metadata_authority = MetadataAuthority(
            type=MetadataAuthorityType.FORGE,
            url=REPO_BASE_URL,
        )

        expected_metadata = [
            RawExtrinsicMetadata(
                target=directory_swhid,
                authority=metadata_authority,
                fetcher=MetadataFetcher(
                    name="swh.loader.package.maven.loader.MavenLoader",
                    version=__version__,
                ),
                discovery_date=loader.visit_date,
                format="maven-pom",
                metadata=expected_pom_metadata[i],
                origin=MVN_ORIGIN_URL,
                release=release_swhid,
            ),
            RawExtrinsicMetadata(
                target=directory_swhid,
                authority=metadata_authority,
                fetcher=MetadataFetcher(
                    name="swh.loader.package.maven.loader.MavenLoader",
                    version=__version__,
                ),
                discovery_date=loader.visit_date,
                format="maven-json",
                metadata=json.dumps(expected_json_metadata[i]).encode(),
                origin=MVN_ORIGIN_URL,
                release=release_swhid,
            ),
        ]

        res = swh_storage.raw_extrinsic_metadata_get(directory_swhid,
                                                     metadata_authority)
        assert res.next_page_token is None
        assert set(res.results) == set(expected_metadata)
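
Note that the test builds two flavors of SWHID from the loaded objects: a
CoreSWHID for the release context field, and an ExtendedSWHID for the metadata
target. A minimal sketch of the distinction, with a made-up object id (the
import path assumes a recent swh.model):

from swh.model.swhids import (
    CoreSWHID, ExtendedSWHID, ObjectType, ExtendedObjectType)

object_id = bytes.fromhex("94a9ed024d3859793618152ea559a168bbcbb5e2")
release_swhid = CoreSWHID(object_type=ObjectType.RELEASE, object_id=object_id)
directory_swhid = ExtendedSWHID(
    object_type=ExtendedObjectType.DIRECTORY, object_id=object_id)
# str(release_swhid)   -> "swh:1:rel:94a9ed02..."
# str(directory_swhid) -> "swh:1:dir:94a9ed02..."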
Example #7
def get_metadata_authority(self) -> MetadataAuthority:
    provider = self.metadata()["provider"]
    assert provider["provider_type"] == MetadataAuthorityType.DEPOSIT_CLIENT.value
    return MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url=provider["provider_url"],
        metadata={
            "name": provider["provider_name"],
            **(provider["metadata"] or {}),
        },
    )
Example #8
def handle_row(row: Dict[str, Any], storage, deposit_cur, dry_run: bool):
    type_ = row["type"]

    # default date in case we can't find a better one
    discovery_date = row["date"] or row["committer_date"]

    metadata = row["metadata"]

    if metadata is None:
        return

    if type_ == "dsc":
        origin = None  # it will be defined later, using debian_origins_from_row

        # TODO: the debian loader writes the changelog date as the revision's
        # author date and committer date. Instead, we should use the visit's date

        if "extrinsic" in metadata:
            extrinsic_files = metadata["extrinsic"]["raw"]["files"]
            for artifact_entry in metadata["original_artifact"]:
                extrinsic_file = extrinsic_files[artifact_entry["filename"]]
                for key in ("sha256", ):
                    assert artifact_entry["checksums"][key] == extrinsic_file[
                        key]
                    artifact_entry["url"] = extrinsic_file["uri"]
            del metadata["extrinsic"]

    elif type_ == "tar":
        provider = metadata.get("extrinsic", {}).get("provider")
        if provider is not None:
            # This is the format all the package loaders currently write, and
            # it is the easiest to handle, thanks to the 'provider' and 'when'
            # fields, which have all the information we need to tell the
            # loaders apart and generate accurate metadata.

            discovery_date = iso8601.parse_date(metadata["extrinsic"]["when"])

            # New versions of the loaders write the provider; use it.
            if provider.startswith("https://replicate.npmjs.com/"):
                # npm loader format 1

                parsed_url = urlparse(provider)
                assert re.match("^/[^/]+/?$", parsed_url.path), parsed_url
                package_name = unquote(parsed_url.path.strip("/"))
                origin = "https://www.npmjs.com/package/" + package_name
                assert_origin_exists(storage, origin)

                load_metadata(
                    storage,
                    row["id"],
                    row["directory"],
                    discovery_date,
                    metadata["extrinsic"]["raw"],
                    NPM_FORMAT,
                    authority=AUTHORITIES["npmjs"],
                    origin=origin,
                    dry_run=dry_run,
                )
                del metadata["extrinsic"]

            elif provider.startswith("https://pypi.org/"):
                # pypi loader format 1

                match = re.match(
                    "https://pypi.org/pypi/(?P<project_name>.*)/json",
                    provider)
                assert match, f"unexpected provider URL format: {provider}"
                project_name = match.group("project_name")
                origin = f"https://pypi.org/project/{project_name}/"
                assert_origin_exists(storage, origin)

                load_metadata(
                    storage,
                    row["id"],
                    row["directory"],
                    discovery_date,
                    metadata["extrinsic"]["raw"],
                    PYPI_FORMAT,
                    authority=AUTHORITIES["pypi"],
                    origin=origin,
                    dry_run=dry_run,
                )
                del metadata["extrinsic"]

            elif provider.startswith("https://cran.r-project.org/"):
                # cran loader

                provider = metadata["extrinsic"]["provider"]
                if provider.startswith("https://cran.r-project.org/package="):
                    origin = metadata["extrinsic"]["provider"]
                else:
                    package_name = cran_package_from_url(provider)
                    origin = f"https://cran.r-project.org/package={package_name}"
                assert origin is not None

                # Ideally we should assert the origin exists, but we can't:
                # https://forge.softwareheritage.org/T2536
                if (hashlib.sha1(origin.encode()).digest() not in _origins
                        and storage.origin_get([origin])[0] is None):
                    return

                raw_extrinsic_metadata = metadata["extrinsic"]["raw"]

                # this is actually intrinsic, ignore it
                if "version" in raw_extrinsic_metadata:
                    del raw_extrinsic_metadata["version"]

                # Copy the URL to the original_artifacts metadata
                assert len(metadata["original_artifact"]) == 1
                if "url" in metadata["original_artifact"][0]:
                    assert (metadata["original_artifact"][0]["url"] ==
                            raw_extrinsic_metadata["url"]), row
                else:
                    metadata["original_artifact"][0][
                        "url"] = raw_extrinsic_metadata["url"]
                del raw_extrinsic_metadata["url"]

                assert (
                    raw_extrinsic_metadata == {}
                ), f"Unexpected metadata keys: {list(raw_extrinsic_metadata)}"

                del metadata["extrinsic"]

            elif (provider.startswith(
                    "https://nix-community.github.io/nixpkgs-swh/")
                  or provider == "https://guix.gnu.org/sources.json"):
                # nixguix loader
                origin = provider
                assert_origin_exists(storage, origin)

                authority = MetadataAuthority(
                    type=MetadataAuthorityType.FORGE,
                    url=provider,
                    metadata={},
                )
                assert row["date"] is None  # the nixguix loader does not write dates

                load_metadata(
                    storage,
                    row["id"],
                    row["directory"],
                    discovery_date,
                    metadata["extrinsic"]["raw"],
                    NIXGUIX_FORMAT,
                    authority=authority,
                    origin=origin,
                    dry_run=dry_run,
                )
                del metadata["extrinsic"]

            elif provider.startswith("https://ftp.gnu.org/"):
                # archive loader format 1

                origin = provider
                assert_origin_exists(storage, origin)

                assert len(metadata["original_artifact"]) == 1
                metadata["original_artifact"][0]["url"] = metadata[
                    "extrinsic"]["raw"]["url"]

                # Remove duplicate keys of original_artifacts
                for key in ("url", "time", "length", "version", "filename"):
                    del metadata["extrinsic"]["raw"][key]

                assert metadata["extrinsic"]["raw"] == {}
                del metadata["extrinsic"]

            elif provider.startswith("https://deposit.softwareheritage.org/"):
                origin = metadata["extrinsic"]["raw"]["origin"]["url"]
                assert_origin_exists(storage, origin)

                if "@xmlns" in metadata:
                    assert metadata["@xmlns"] == ATOM_NS
                    assert metadata["@xmlns:codemeta"] in (CODEMETA_NS,
                                                           [CODEMETA_NS])
                    assert "intrinsic" not in metadata
                    assert "extra_headers" not in metadata

                    # deposit loader format 1
                    # in this case, the metadata seems to be both directly in metadata
                    # and in metadata["extrinsic"]["raw"]["metadata"]

                    (origin, discovery_date) = handle_deposit_row(
                        row, discovery_date, origin, storage, deposit_cur,
                        dry_run)

                    remove_atom_codemeta_metadata_with_xmlns(metadata)
                    if "client" in metadata:
                        del metadata["client"]
                    del metadata["extrinsic"]
                else:
                    # deposit loader format 2
                    actual_metadata = metadata["extrinsic"]["raw"][
                        "origin_metadata"]["metadata"]
                    if isinstance(actual_metadata, str):
                        # new format introduced in
                        # https://forge.softwareheritage.org/D4105
                        actual_metadata = json.loads(actual_metadata)
                    if "@xmlns" in actual_metadata:
                        assert actual_metadata["@xmlns"] == ATOM_NS
                        assert actual_metadata["@xmlns:codemeta"] in (
                            CODEMETA_NS,
                            [CODEMETA_NS],
                        )
                    elif "{http://www.w3.org/2005/Atom}id" in actual_metadata:
                        assert (
                            "{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author"
                            in actual_metadata)
                    else:
                        # new format introduced in
                        # https://forge.softwareheritage.org/D4065
                        # it's the same as the first case, but with the @xmlns
                        # declarations stripped
                        # Most of them should have the "id", but some revisions,
                        # like 4d3890004fade1f4ec3bf7004a4af0c490605128, are missing
                        # this field
                        assert ("id" in actual_metadata
                                or "title" in actual_metadata
                                or "atom:title" in actual_metadata)
                        assert "codemeta:author" in actual_metadata

                    (origin, discovery_date) = handle_deposit_row(
                        row, discovery_date, origin, storage, deposit_cur,
                        dry_run)

                    del metadata["extrinsic"]
            else:
                assert False, f"unknown provider {provider}"

        # Older versions don't write the provider; use heuristics instead.
        elif (metadata.get("package_source", {}).get(
                "url", "").startswith("https://registry.npmjs.org/")):
            # npm loader format 2

            package_source_url = metadata["package_source"]["url"]
            package_name = npm_package_from_source_url(package_source_url)
            origin = "https://www.npmjs.com/package/" + package_name
            assert_origin_exists(storage, origin)

            load_metadata(
                storage,
                row["id"],
                row["directory"],
                discovery_date,
                metadata["package"],
                NPM_FORMAT,
                authority=AUTHORITIES["npmjs"],
                origin=origin,
                dry_run=dry_run,
            )
            del metadata["package"]

            assert "original_artifact" not in metadata

            # rebuild an "original_artifact"-like metadata dict from what we
            # can salvage of "package_source"
            package_source_metadata = metadata["package_source"]
            keep_keys = {"blake2s256", "filename", "sha1", "sha256", "url"}
            discard_keys = {
                "date",  # is equal to the revision date
                "name",  # was loaded above
                "version",  # same
            }
            assert set(package_source_metadata) == keep_keys | discard_keys, (
                package_source_metadata)

            # will be loaded below
            metadata["original_artifact"] = [{
                "filename":
                package_source_metadata["filename"],
                "checksums": {
                    "sha1": package_source_metadata["sha1"],
                    "sha256": package_source_metadata["sha256"],
                    "blake2s256": package_source_metadata["blake2s256"],
                },
                "url":
                package_source_metadata["url"],
            }]
            del metadata["package_source"]

        elif "@xmlns" in metadata:
            assert metadata["@xmlns:codemeta"] in (CODEMETA_NS, [CODEMETA_NS])
            assert "intrinsic" not in metadata
            assert "extra_headers" not in metadata

            # deposit loader format 3

            if row["message"] == b"swh: Deposit 159 in collection swh":
                # There is no deposit 159 in the deposit DB, for some reason
                assert (hash_to_hex(row["id"])
                        == "8e9cee14a6ad39bca4347077b87fb5bbd8953bb1")
                return
            elif row["message"] == b"hal: Deposit 342 in collection hal":
                # They have status 'failed' and no swhid
                return

            origin = None  # TODO
            discovery_date = None  # TODO

            (origin, discovery_date) = handle_deposit_row(
                row, discovery_date, origin, storage, deposit_cur, dry_run)
            remove_atom_codemeta_metadata_with_xmlns(metadata)
            if "client" in metadata:
                del metadata["client"]  # found in the deposit db
            if "committer" in metadata:
                del metadata["committer"]  # found on the revision object

        elif "{http://www.w3.org/2005/Atom}id" in metadata:
            assert ("{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author"
                    in metadata
                    or "{http://www.w3.org/2005/Atom}author" in metadata)
            assert "intrinsic" not in metadata
            assert "extra_headers" not in metadata

            # deposit loader format 4

            origin = None
            discovery_date = None  # TODO

            (origin, discovery_date) = handle_deposit_row(
                row, discovery_date, origin, storage, deposit_cur, dry_run)
            remove_atom_codemeta_metadata_without_xmlns(metadata)

        elif hash_to_hex(row["id"]) == "a86747d201ab8f8657d145df4376676d5e47cf9f":
            # deposit 91 is missing "{http://www.w3.org/2005/Atom}id" for some
            # reason, and has an invalid origin
            return

        elif (isinstance(metadata.get("original_artifact"), dict)
              and metadata["original_artifact"]["url"].startswith(
                  "https://files.pythonhosted.org/")) or (
                      isinstance(metadata.get("original_artifact"), list)
                      and len(metadata.get("original_artifact")) == 1
                      and metadata["original_artifact"][0].get(
                          "url",
                          "").startswith("https://files.pythonhosted.org/")):
            if isinstance(metadata.get("original_artifact"), dict):
                metadata["original_artifact"] = [metadata["original_artifact"]]

            assert len(metadata["original_artifact"]) == 1

            version = metadata.get("project", {}).get("version")
            filename = metadata["original_artifact"][0]["filename"]
            if version:
                origin = pypi_origin_from_project_name(
                    filename.split("-" + version)[0])
                if not _check_revision_in_origin(storage, origin, row["id"]):
                    origin = None
            else:
                origin = None
            if origin is None:
                origin = pypi_origin_from_filename(storage, row["id"],
                                                   filename)

            if "project" in metadata:
                # pypi loader format 2
                load_metadata(
                    storage,
                    row["id"],
                    row["directory"],
                    discovery_date,
                    metadata["project"],
                    PYPI_FORMAT,
                    authority=AUTHORITIES["pypi"],
                    origin=origin,
                    dry_run=dry_run,
                )
                del metadata["project"]
            else:
                assert set(metadata) == {"original_artifact"}, set(metadata)
                # pypi loader format 3
                pass  # nothing to do, there's no metadata

        elif row["message"] == b"synthetic revision message":
            assert isinstance(metadata["original_artifact"], list), metadata
            assert not any("url" in d for d in metadata["original_artifact"])

            # archive loader format 2

            origin = None

        elif deposit_revision_message_re.match(row["message"]):
            # deposit without metadata in the revision

            assert set(metadata) == {"original_artifact"}, metadata

            origin = None  # TODO
            discovery_date = None

            (origin, discovery_date) = handle_deposit_row(
                row, discovery_date, origin, storage, deposit_cur, dry_run)
        else:
            assert False, f"Unable to detect type of metadata for row: {row}"

    # Ignore common intrinsic metadata keys
    for key in ("intrinsic", "extra_headers"):
        if key in metadata:
            del metadata[key]

    # Ignore loader-specific intrinsic metadata keys
    if type_ == "hg":
        del metadata["node"]
    elif type_ == "dsc":
        if "package_info" in metadata:
            del metadata["package_info"]

    if "original_artifact" in metadata:
        for original_artifact in metadata["original_artifact"]:
            # Rename keys to the expected format of original-artifacts-json.
            rename_keys = [
                ("name", "filename"),  # eg. from old Debian loader
                ("size", "length"),  # eg. from old PyPI loader
            ]
            for (old_name, new_name) in rename_keys:
                if old_name in original_artifact:
                    assert new_name not in original_artifact
                    original_artifact[new_name] = original_artifact.pop(old_name)

            # Move the checksums to their own subdict, which is the expected format
            # of original-artifacts-json.
            if "sha1" in original_artifact:
                assert "checksums" not in original_artifact
                original_artifact["checksums"] = {}
                for key in ("sha1", "sha256", "sha1_git", "blake2s256"):
                    if key in original_artifact:
                        original_artifact["checksums"][
                            key] = original_artifact.pop(key)

            if "date" in original_artifact:
                # The information comes from the package repository rather than SWH,
                # so it shouldn't be in the 'original-artifacts' metadata
                # (which has SWH as authority).
                # Moreover, it's not a very useful information, so let's just drop it.
                del original_artifact["date"]

            allowed_keys = {
                "checksums",
                "filename",
                "length",
                "url",
                "archive_type",
            }
            assert set(original_artifact) <= allowed_keys, set(original_artifact)

        if type_ == "dsc":
            assert origin is None
            origins = debian_origins_from_row(row, storage)
            if not origins:
                print(
                    f"Missing Debian origin for revision: {hash_to_hex(row['id'])}"
                )
        else:
            origins = [origin]

        for origin in origins:
            load_metadata(
                storage,
                row["id"],
                row["directory"],
                discovery_date,
                metadata["original_artifact"],
                ORIGINAL_ARTIFACT_FORMAT,
                authority=AUTHORITIES["swh"],
                origin=origin,
                dry_run=dry_run,
            )
        del metadata["original_artifact"]

    assert metadata == {}, (
        f"remaining metadata keys for {row['id'].hex()} (type: {row['type']}): "
        f"{metadata}")
Example #9
def test_deposit_loading_ok_2(swh_storage, deposit_client,
                              requests_mock_datadir):
    """Field dates should be se appropriately"""
    external_id = "some-external-id"
    url = f"https://hal-test.archives-ouvertes.fr/{external_id}"
    deposit_id = 777
    loader = DepositLoader(swh_storage,
                           url,
                           deposit_id,
                           deposit_client,
                           default_filename="archive.zip")

    actual_load_status = loader.load()
    expected_snapshot_id = "3449b8ff31abeacefd33cca60e3074c1649dc3a1"

    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id,
    }
    assert_last_visit_matches(
        loader.storage,
        url,
        status="full",
        type="deposit",
        snapshot=hash_to_bytes(expected_snapshot_id),
    )

    release_id = "ba6c9a59ae3256e765d32b211cc183dc2380aed7"
    expected_snapshot = Snapshot(
        id=hash_to_bytes(expected_snapshot_id),
        branches={
            b"HEAD": SnapshotBranch(
                target=hash_to_bytes(release_id),
                target_type=TargetType.RELEASE,
            )
        },
    )

    check_snapshot(expected_snapshot, storage=loader.storage)

    raw_meta = loader.client.metadata_get(deposit_id)
    # Ensure the date fields are set appropriately in the release

    # Retrieve the release
    release = loader.storage.release_get([hash_to_bytes(release_id)])[0]
    assert release
    # swh-deposit uses the numeric 'offset_minutes' instead of the bytes offset
    # attribute, because its dates are always well-formed, and it can only send
    # JSON-serializable data.
    release_date_dict = {
        "timestamp": release.date.timestamp.to_dict(),
        "offset": release.date.offset_minutes(),
    }

    assert release_date_dict == raw_meta["deposit"]["author_date"]

    assert not release.metadata

    provider = {
        "provider_name": "hal",
        "provider_type": "deposit_client",
        "provider_url": "https://hal-test.archives-ouvertes.fr/",
        "metadata": None,
    }
    tool = {
        "name": "swh-deposit",
        "version": "0.0.1",
        "configuration": {
            "sword_version": "2"
        },
    }

    fetcher = MetadataFetcher(
        name="swh-deposit",
        version="0.0.1",
    )

    authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url="https://hal-test.archives-ouvertes.fr/",
    )

    # Check the origin metadata swh side
    origin_extrinsic_metadata = loader.storage.raw_extrinsic_metadata_get(
        Origin(url).swhid(), authority)
    assert origin_extrinsic_metadata.next_page_token is None
    raw_metadata: str = raw_meta["raw_metadata"]
    # 1 raw metadata xml + 1 json dict
    assert len(origin_extrinsic_metadata.results) == 2

    origin_swhid = Origin(url).swhid()

    expected_metadata = []
    origin_meta = origin_extrinsic_metadata.results[0]
    expected_metadata.append(
        RawExtrinsicMetadata(
            target=origin_swhid,
            discovery_date=origin_meta.discovery_date,
            metadata=raw_metadata.encode(),
            format="sword-v2-atom-codemeta-v2",
            authority=authority,
            fetcher=fetcher,
        ))

    origin_metadata = {
        "metadata": [raw_metadata],
        "provider": provider,
        "tool": tool,
    }
    expected_metadata.append(
        RawExtrinsicMetadata(
            target=origin_swhid,
            discovery_date=origin_extrinsic_metadata.results[-1].discovery_date,
            metadata=json.dumps(origin_metadata).encode(),
            format="original-artifacts-json",
            authority=authority,
            fetcher=fetcher,
        ))

    assert sorted(origin_extrinsic_metadata.results) == sorted(expected_metadata)

    # Check the release metadata swh side
    assert release.target_type == ModelObjectType.DIRECTORY
    directory_swhid = ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY,
                                    object_id=release.target)
    actual_directory_metadata = loader.storage.raw_extrinsic_metadata_get(
        directory_swhid, authority)

    assert actual_directory_metadata.next_page_token is None
    assert len(actual_directory_metadata.results) == 1

    release_swhid = CoreSWHID(object_type=ObjectType.RELEASE,
                              object_id=hash_to_bytes(release_id))
    dir_metadata_template = RawExtrinsicMetadata(
        target=directory_swhid,
        format="sword-v2-atom-codemeta-v2",
        authority=authority,
        fetcher=fetcher,
        origin=url,
        release=release_swhid,
        # to satisfy the constructor
        discovery_date=now(),
        metadata=b"",
    )

    expected_directory_metadata = []
    dir_metadata = actual_directory_metadata.results[0]
    expected_directory_metadata.append(
        RawExtrinsicMetadata.from_dict({
            **{k: v for (k, v) in dir_metadata_template.to_dict().items()
               if k != "id"},
            "discovery_date": dir_metadata.discovery_date,
            "metadata": raw_metadata.encode(),
        }))

    assert sorted(actual_directory_metadata.results) == sorted(
        expected_directory_metadata)

    # Retrieve the information for deposit status update query to the deposit
    urls = [
        m for m in requests_mock_datadir.request_history
        if m.url == f"{DEPOSIT_URL}/{deposit_id}/update/"
    ]

    assert len(urls) == 1
    update_query = urls[0]

    body = update_query.json()
    expected_body = {
        "status": "done",
        "release_id": release_id,
        "directory_id": hash_to_hex(release.target),
        "snapshot_id": expected_snapshot_id,
        "origin_url": url,
    }

    assert body == expected_body
Example #10
class StorageData:
    """Data model objects to use within tests."""

    content = Content(
        data=b"42\n",
        length=3,
        sha1=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"),
        sha1_git=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
        sha256=hash_to_bytes(
            "084c799cd551dd1d8d5c5f9a5d593b2e931f5e36122ee5c793c1d08a19839cc0"
        ),
        blake2s256=hash_to_bytes(
            "d5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d"
        ),
        status="visible",
    )
    content2 = Content(
        data=b"4242\n",
        length=5,
        sha1=hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7"),
        sha1_git=hash_to_bytes("36fade77193cb6d2bd826161a0979d64c28ab4fa"),
        sha256=hash_to_bytes(
            "859f0b154fdb2d630f45e1ecae4a862915435e663248bb8461d914696fc047cd"
        ),
        blake2s256=hash_to_bytes(
            "849c20fad132b7c2d62c15de310adfe87be94a379941bed295e8141c6219810d"
        ),
        status="visible",
    )
    content3 = Content(
        data=b"424242\n",
        length=7,
        sha1=hash_to_bytes("3e21cc4942a4234c9e5edd8a9cacd1670fe59f13"),
        sha1_git=hash_to_bytes("c932c7649c6dfa4b82327d121215116909eb3bea"),
        sha256=hash_to_bytes(
            "92fb72daf8c6818288a35137b72155f507e5de8d892712ab96277aaed8cf8a36"
        ),
        blake2s256=hash_to_bytes(
            "76d0346f44e5a27f6bafdd9c2befd304aff83780f93121d801ab6a1d4769db11"
        ),
        status="visible",
        ctime=datetime.datetime(2019, 12, 1, tzinfo=datetime.timezone.utc),
    )
    contents: Tuple[Content, ...] = (content, content2, content3)

    skipped_content = SkippedContent(
        length=1024 * 1024 * 200,
        sha1_git=hash_to_bytes("33e45d56f88993aae6a0198013efa80716fd8920"),
        sha1=hash_to_bytes("43e45d56f88993aae6a0198013efa80716fd8920"),
        sha256=hash_to_bytes(
            "7bbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a"
        ),
        blake2s256=hash_to_bytes(
            "ade18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b"
        ),
        reason="Content too long",
        status="absent",
        origin="file:///dev/zero",
    )
    skipped_content2 = SkippedContent(
        length=1024 * 1024 * 300,
        sha1_git=hash_to_bytes("44e45d56f88993aae6a0198013efa80716fd8921"),
        sha1=hash_to_bytes("54e45d56f88993aae6a0198013efa80716fd8920"),
        sha256=hash_to_bytes(
            "8cbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a"
        ),
        blake2s256=hash_to_bytes(
            "9ce18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b"
        ),
        reason="Content too long",
        status="absent",
    )
    skipped_contents: Tuple[SkippedContent, ...] = (
        skipped_content, skipped_content2)

    directory5 = Directory(
        id=hash_to_bytes("4b825dc642cb6eb9a060e54bf8d69288fbee4904"),
        entries=(),
    )
    directory = Directory(
        id=hash_to_bytes("5256e856a0a0898966d6ba14feb4388b8b82d302"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=content.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"bar\xc3",
                type="dir",
                target=directory5.id,
                perms=from_disk.DentryPerms.directory,
            ),
        ], ),
    )
    directory2 = Directory(
        id=hash_to_bytes("8505808532953da7d2581741f01b29c04b1cb9ab"),
        entries=tuple([
            DirectoryEntry(
                name=b"oof",
                type="file",
                target=content2.sha1_git,
                perms=from_disk.DentryPerms.content,
            )
        ], ),
    )
    directory3 = Directory(
        id=hash_to_bytes("13089e6e544f78df7c9a40a3059050d10dee686a"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=content.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"subdir",
                type="dir",
                target=directory.id,
                perms=from_disk.DentryPerms.directory,
            ),
            DirectoryEntry(
                name=b"hello",
                type="file",
                target=content2.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
        ], ),
    )
    directory4 = Directory(
        id=hash_to_bytes("cd5dfd9c09d9e99ed123bc7937a0d5fddc3cd531"),
        entries=tuple([
            DirectoryEntry(
                name=b"subdir1",
                type="dir",
                target=directory3.id,
                perms=from_disk.DentryPerms.directory,
            )
        ], ),
    )

    directory6 = Directory(
        id=hash_to_bytes("afa0105cfcaa14fdbacee344e96659170bb1bda5"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=b"\x00" * 20,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"bar",
                type="dir",
                target=b"\x01" * 20,
                perms=from_disk.DentryPerms.directory,
            ),
        ], ),
        raw_manifest=(
            b"tree 61\x00"
            b"100644 foo\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"  # noqa
            b"40000 bar\x00\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"  # noqa
        ),
    )
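
    # directory6 carries an explicit raw_manifest because its entry targets
    # are synthetic (all-zero and all-one ids). As a sanity check, the 61-byte
    # payload announced by the "tree 61\x00" header follows from the git tree
    # entry format "<perms> <name>\x00" plus a 20-byte binary target:
    #   b"100644 foo\x00" + 20 bytes -> 11 + 20 = 31 bytes
    #   b"40000 bar\x00" + 20 bytes  -> 10 + 20 = 30 bytes
    #   total: 31 + 30 = 61 bytes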

    directories: Tuple[Directory, ...] = (
        directory2,
        directory,
        directory3,
        directory4,
        directory5,
        directory6,
    )

    revision = Revision(
        id=hash_to_bytes("01a7114f36fddd5ef2511b2cadda237a68adbb12"),
        message=b"hello",
        author=Person(
            name=b"Nicolas Dandrimont",
            email=b"*****@*****.**",
            fullname=b"Nicolas Dandrimont <*****@*****.**> ",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0200",
        ),
        committer=Person(
            name=b"St\xc3fano Zacchiroli",
            email=b"*****@*****.**",
            fullname=b"St\xc3fano Zacchiroli <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1123456789, microseconds=0),
            offset_bytes=b"+0200",
        ),
        parents=(),
        type=RevisionType.GIT,
        directory=directory.id,
        metadata={
            "checksums": {
                "sha1": "tarball-sha1",
                "sha256": "tarball-sha256",
            },
            "signed-off-by": "some-dude",
        },
        extra_headers=(
            (b"gpgsig", b"test123"),
            (b"mergetag", b"foo\\bar"),
            (b"mergetag", b"\x22\xaf\x89\x80\x01\x00"),
        ),
        synthetic=True,
    )
    revision2 = Revision(
        id=hash_to_bytes("a646dd94c912829659b22a1e7e143d2fa5ebde1b"),
        message=b"hello again",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1123456789,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([revision.id]),
        type=RevisionType.GIT,
        directory=directory2.id,
        metadata=None,
        extra_headers=(),
        synthetic=False,
    )
    revision3 = Revision(
        id=hash_to_bytes("beb2844dff30658e27573cb46eb55980e974b391"),
        message=b"a simple revision with no parents this time",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1127351742,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([revision.id, revision2.id]),
        type=RevisionType.GIT,
        directory=directory2.id,
        metadata=None,
        extra_headers=(),
        synthetic=True,
    )
    revision4 = Revision(
        id=hash_to_bytes("ae860aec43700c7f5a295e2ef47e2ae41b535dfe"),
        message=b"parent of self.revision2",
        author=Person(
            name=b"me",
            email=b"*****@*****.**",
            fullname=b"me <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"committer-dude",
            email=b"*****@*****.**",
            fullname=b"committer-dude <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1244567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        parents=tuple([revision3.id]),
        type=RevisionType.GIT,
        directory=directory.id,
        metadata=None,
        extra_headers=(),
        synthetic=False,
    )
    git_revisions: Tuple[Revision, ...] = (
        revision, revision2, revision3, revision4)

    hg_revision = Revision(
        id=hash_to_bytes("951c9503541e7beaf002d7aebf2abd1629084c68"),
        message=b"hello",
        author=Person(
            name=b"Nicolas Dandrimont",
            email=b"*****@*****.**",
            fullname=b"Nicolas Dandrimont <*****@*****.**> ",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0200",
        ),
        committer=Person(
            name=b"St\xc3fano Zacchiroli",
            email=b"*****@*****.**",
            fullname=b"St\xc3fano Zacchiroli <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1123456789, microseconds=0),
            offset_bytes=b"+0200",
        ),
        parents=(),
        type=RevisionType.MERCURIAL,
        directory=directory.id,
        metadata={
            "checksums": {
                "sha1": "tarball-sha1",
                "sha256": "tarball-sha256",
            },
            "signed-off-by": "some-dude",
            "node": "a316dfb434af2b451c1f393496b7eaeda343f543",
        },
        extra_headers=(),
        synthetic=True,
    )
    hg_revision2 = Revision(
        id=hash_to_bytes("df4afb063236300eb13b96a0d7fff03f7b7cbbaf"),
        message=b"hello again",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1123456789,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([hg_revision.id]),
        type=RevisionType.MERCURIAL,
        directory=directory2.id,
        metadata=None,
        extra_headers=(
            (b"node", hash_to_bytes("fa1b7c84a9b40605b67653700f268349a6d6aca1")),
        ),
        synthetic=False,
    )
    hg_revision3 = Revision(
        id=hash_to_bytes("84d8e7081b47ebb88cad9fa1f25de5f330872a37"),
        message=b"a simple revision with no parents this time",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1127351742,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([hg_revision.id, hg_revision2.id]),
        type=RevisionType.MERCURIAL,
        directory=directory2.id,
        metadata=None,
        extra_headers=(
            (b"node", hash_to_bytes("7f294a01c49065a90b3fe8b4ad49f08ce9656ef6")),
        ),
        synthetic=True,
    )
    hg_revision4 = Revision(
        id=hash_to_bytes("4683324ba26dfe941a72cc7552e86eaaf7c27fe3"),
        message=b"parent of self.revision2",
        author=Person(
            name=b"me",
            email=b"*****@*****.**",
            fullname=b"me <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"committer-dude",
            email=b"*****@*****.**",
            fullname=b"committer-dude <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1244567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        parents=tuple([hg_revision3.id]),
        type=RevisionType.MERCURIAL,
        directory=directory.id,
        metadata=None,
        extra_headers=(
            (b"node", hash_to_bytes("f4160af0485c85823d9e829bae2c00b00a2e6297")),
        ),
        synthetic=False,
    )
    hg_revisions: Tuple[Revision, ...] = (
        hg_revision,
        hg_revision2,
        hg_revision3,
        hg_revision4,
    )
    revisions: Tuple[Revision, ...] = git_revisions + hg_revisions

    origins: Tuple[Origin, ...] = (
        Origin(url="https://github.com/user1/repo1"),
        Origin(url="https://github.com/user2/repo1"),
        Origin(url="https://github.com/user3/repo1"),
        Origin(url="https://gitlab.com/user1/repo1"),
        Origin(url="https://gitlab.com/user2/repo1"),
        Origin(url="https://forge.softwareheritage.org/source/repo1"),
        Origin(url="https://example.рф/🏛️.txt"),
    )
    origin, origin2 = origins[:2]

    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url="http://hal.inria.example.com/",
    )
    metadata_authority2 = MetadataAuthority(
        type=MetadataAuthorityType.REGISTRY,
        url="http://wikidata.example.com/",
    )
    authorities: Tuple[MetadataAuthority, ...] = (
        metadata_authority,
        metadata_authority2,
    )

    metadata_fetcher = MetadataFetcher(
        name="swh-deposit",
        version="0.0.1",
    )
    metadata_fetcher2 = MetadataFetcher(
        name="swh-example",
        version="0.0.1",
    )
    fetchers: Tuple[MetadataFetcher, ...] = (
        metadata_fetcher, metadata_fetcher2)

    date_visit1 = datetime.datetime(
        2015, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)
    date_visit2 = datetime.datetime(
        2017, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)
    date_visit3 = datetime.datetime(
        2018, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)

    type_visit1 = "git"
    type_visit2 = "hg"
    type_visit3 = "deb"

    origin_visit = OriginVisit(
        origin=origin.url,
        visit=1,
        date=date_visit1,
        type=type_visit1,
    )
    origin_visit2 = OriginVisit(
        origin=origin.url,
        visit=2,
        date=date_visit2,
        type=type_visit1,
    )
    origin_visit3 = OriginVisit(
        origin=origin2.url,
        visit=1,
        date=date_visit1,
        type=type_visit2,
    )
    origin_visits: Tuple[OriginVisit, ...] = (
        origin_visit,
        origin_visit2,
        origin_visit3,
    )

    release = Release(
        id=hash_to_bytes("f7f222093a18ec60d781070abec4a630c850b837"),
        name=b"v0.0.1",
        author=Person(
            name=b"olasd",
            email=b"*****@*****.**",
            fullname=b"olasd <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0042",
        ),
        target=revision.id,
        target_type=ObjectType.REVISION,
        message=b"synthetic release",
        synthetic=True,
    )
    release2 = Release(
        id=hash_to_bytes("db81a26783a3f4a9db07b4759ffc37621f159bb2"),
        name=b"v0.0.2",
        author=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1634366813, microseconds=0),
            offset_bytes=b"-0200",
        ),
        target=revision2.id,
        target_type=ObjectType.REVISION,
        message=b"v0.0.2\nMisc performance improvements + bug fixes",
        synthetic=False,
    )
    release3 = Release(
        id=hash_to_bytes("1c5d42e603ce2eea44917fadca76c78bad76aeb9"),
        name=b"v0.0.2",
        author=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1634366813, microseconds=0),
            offset_bytes=b"-0200",
        ),
        target=revision3.id,
        target_type=ObjectType.REVISION,
        message=b"yet another synthetic release",
        synthetic=True,
    )

    releases: Tuple[Release, ...] = (release, release2, release3)

    snapshot = Snapshot(
        id=hash_to_bytes("9b922e6d8d5b803c1582aabe5525b7b91150788e"),
        branches={
            b"master": SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
        },
    )
    empty_snapshot = Snapshot(
        id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"),
        branches={},
    )
    complete_snapshot = Snapshot(
        id=hash_to_bytes("db99fda25b43dc5cd90625ee4b0744751799c917"),
        branches={
            b"directory": SnapshotBranch(
                target=directory.id,
                target_type=TargetType.DIRECTORY,
            ),
            b"directory2": SnapshotBranch(
                target=directory2.id,
                target_type=TargetType.DIRECTORY,
            ),
            b"content": SnapshotBranch(
                target=content.sha1_git,
                target_type=TargetType.CONTENT,
            ),
            b"alias": SnapshotBranch(
                target=b"revision",
                target_type=TargetType.ALIAS,
            ),
            b"revision": SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
            b"release": SnapshotBranch(
                target=release.id,
                target_type=TargetType.RELEASE,
            ),
            b"snapshot": SnapshotBranch(
                target=empty_snapshot.id,
                target_type=TargetType.SNAPSHOT,
            ),
            b"dangling": None,
        },
    )

    snapshots: Tuple[Snapshot, ...] = (
        snapshot, empty_snapshot, complete_snapshot)

    content_metadata1 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        origin=origin.url,
        discovery_date=datetime.datetime(
            2015, 1, 1, 21, 0, 0, tzinfo=datetime.timezone.utc),
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="json",
        metadata=b'{"foo": "bar"}',
    )
    content_metadata2 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        origin=origin2.url,
        discovery_date=datetime.datetime(
            2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc),
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="yaml",
        metadata=b"foo: bar",
    )
    content_metadata3 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        discovery_date=datetime.datetime(2017,
                                         1,
                                         1,
                                         22,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority2, metadata=None),
        fetcher=attr.evolve(metadata_fetcher2, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
        origin=origin.url,
        visit=42,
        snapshot=snapshot.swhid(),
        release=release.swhid(),
        revision=revision.swhid(),
        directory=directory.swhid(),
        path=b"/foo/bar",
    )

    content_metadata: Tuple[RawExtrinsicMetadata, ...] = (
        content_metadata1,
        content_metadata2,
        content_metadata3,
    )

    origin_metadata1 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(
            2015, 1, 1, 21, 0, 0, tzinfo=datetime.timezone.utc
        ),
        authority=attr.evolve(metadata_authority, metadata=None),
        fetcher=attr.evolve(metadata_fetcher, metadata=None),
        format="json",
        metadata=b'{"foo": "bar"}',
    )
    origin_metadata2 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(
            2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
        ),
        authority=attr.evolve(metadata_authority, metadata=None),
        fetcher=attr.evolve(metadata_fetcher, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
    )
    origin_metadata3 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(
            2017, 1, 1, 22, 0, 0, tzinfo=datetime.timezone.utc
        ),
        authority=attr.evolve(metadata_authority2, metadata=None),
        fetcher=attr.evolve(metadata_fetcher2, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
    )

    origin_metadata: Tuple[RawExtrinsicMetadata, ...] = (
        origin_metadata1,
        origin_metadata2,
        origin_metadata3,
    )

    extid1 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.REVISION,
                         object_id=revision.id),
        extid_type="git",
        extid=revision.id,
    )

    extid2 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.REVISION,
                         object_id=hg_revision.id),
        extid_type="mercurial",
        extid=hash_to_bytes("a316dfb434af2b451c1f393496b7eaeda343f543"),
    )

    extid3 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.DIRECTORY,
                         object_id=directory.id),
        extid_type="directory",
        extid=b"something",
    )
    extid4 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.DIRECTORY,
                         object_id=directory2.id),
        extid_type="directory",
        extid=b"something",
        extid_version=2,
    )

    extids: Tuple[ExtID, ...] = (
        extid1,
        extid2,
        extid3,
        extid4,
    )
Example #11
    def _store_metadata_deposit(
        self,
        deposit: Deposit,
        swhid_reference: Union[str, QualifiedSWHID],
        metadata_tree: ElementTree.Element,
        raw_metadata: bytes,
        deposit_origin: Optional[str] = None,
    ) -> Tuple[ExtendedSWHID, Deposit, DepositRequest]:
        """When all user inputs pass the checks, this associates the raw_metadata to the
           swhid_reference in the raw extrinsic metadata storage. In case of any issues,
           a bad request response is returned to the user with the details.

            Checks:
            - metadata are technically parsable
            - metadata pass the functional checks
            - SWHID (if any) is technically valid

        Args:
            deposit: Deposit reference
            swhid_reference: The swhid or the origin to attach metadata information to
            metadata_tree: Full element tree of metadata to check for validity
              (parsed out of raw_metadata)
            raw_metadata: The actual raw metadata to store in the metadata storage
            deposit_origin: Optional deposit origin url to use if any (e.g. deposit
              update scenario provides one)

        Raises:
            DepositError in case of incorrect inputs from the deposit client
            (e.g. functionally invalid metadata, ...)

        Returns:
            Tuple of target swhid, deposit, and deposit request

        """
        metadata_ok, error_details = check_metadata(metadata_tree)
        if not metadata_ok:
            assert error_details, "Details should be set when a failure occurs"
            raise DepositError(
                BAD_REQUEST,
                "Functional metadata checks failure",
                convert_status_detail(error_details),
            )

        metadata_authority = MetadataAuthority(
            type=MetadataAuthorityType.DEPOSIT_CLIENT,
            url=deposit.client.provider_url,
        )

        metadata_fetcher = self.swh_deposit_fetcher()

        # replace metadata within the deposit backend
        deposit_request_data = {
            RAW_METADATA_KEY: raw_metadata,
        }

        # actually add the metadata to the completed deposit
        deposit_request = self._deposit_request_put(deposit, deposit_request_data)

        target_swhid: ExtendedSWHID  # origin URL or CoreSWHID
        if isinstance(swhid_reference, str):
            target_swhid = Origin(swhid_reference).swhid()
            metadata_context = {}
        else:
            metadata_context = compute_metadata_context(swhid_reference)
            if deposit_origin:  # metadata deposit update on completed deposit
                metadata_context["origin"] = deposit_origin

            target_swhid = extended_swhid_from_qualified(swhid_reference)

        self._check_swhid_in_archive(target_swhid)

        # metadata deposited by the client
        metadata_object = RawExtrinsicMetadata(
            target=target_swhid,  # core swhid or origin
            discovery_date=deposit_request.date,
            authority=metadata_authority,
            fetcher=metadata_fetcher,
            format="sword-v2-atom-codemeta",
            metadata=raw_metadata,
            **metadata_context,
        )

        # metadata on the metadata object
        swh_deposit_authority = self.swh_deposit_authority()
        swh_deposit_fetcher = self.swh_deposit_fetcher()
        metametadata_object = RawExtrinsicMetadata(
            target=metadata_object.swhid(),
            discovery_date=deposit_request.date,
            authority=swh_deposit_authority,
            fetcher=swh_deposit_fetcher,
            format="xml-deposit-info",
            metadata=render_to_string(
                "deposit/deposit_info.xml", context={"deposit": deposit}
            ).encode(),
        )

        # write to metadata storage
        self.storage_metadata.metadata_authority_add(
            [metadata_authority, swh_deposit_authority]
        )
        self.storage_metadata.metadata_fetcher_add(
            [metadata_fetcher, swh_deposit_fetcher]
        )
        self.storage_metadata.raw_extrinsic_metadata_add(
            [metadata_object, metametadata_object]
        )

        return (target_swhid, deposit, deposit_request)
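
The method above stores metadata at two levels: the client's metadata targets the deposit's origin or SWHID, while provenance about that metadata record targets the record's own SWHID. A minimal sketch of the pattern follows; the URLs, dates, and payloads are illustrative stand-ins, not the deposit server's actual configuration:

import datetime

from swh.model.model import (
    MetadataAuthority,
    MetadataAuthorityType,
    MetadataFetcher,
    Origin,
    RawExtrinsicMetadata,
)

authority = MetadataAuthority(
    type=MetadataAuthorityType.DEPOSIT_CLIENT,
    url="https://hal.example.org/",  # illustrative client URL
)
fetcher = MetadataFetcher(name="swh-deposit", version="1.0.0")

# Metadata deposited by the client, attached to the origin's SWHID:
metadata_object = RawExtrinsicMetadata(
    target=Origin("https://hal.example.org/hal-1234").swhid(),
    discovery_date=datetime.datetime(2022, 1, 1, tzinfo=datetime.timezone.utc),
    authority=authority,
    fetcher=fetcher,
    format="sword-v2-atom-codemeta",
    metadata=b"<entry>...</entry>",
)

# The metadata record is itself addressable, so deposit provenance
# ("metametadata") can target its SWHID:
metametadata_object = RawExtrinsicMetadata(
    target=metadata_object.swhid(),
    discovery_date=metadata_object.discovery_date,
    authority=MetadataAuthority(
        type=MetadataAuthorityType.REGISTRY,
        url="https://deposit.example.org/",  # illustrative registry URL
    ),
    fetcher=fetcher,
    format="xml-deposit-info",
    metadata=b"<deposit>...</deposit>",
)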
Example #12
def test_deposit_metadata_origin(
    url,
    authenticated_client,
    deposit_collection,
    atom_dataset,
    swh_storage,
):
    """Posting a swhid reference is stored on raw extrinsic metadata storage"""
    xml_data = atom_dataset["entry-data-with-origin-reference"].format(url=url)
    origin_swhid = Origin(url).swhid()
    deposit_client = authenticated_client.deposit_client
    swh_storage.origin_add([Origin(url)])
    response = post_atom(
        authenticated_client,
        reverse(COL_IRI, args=[deposit_collection.name]),
        data=xml_data,
    )

    assert response.status_code == status.HTTP_201_CREATED, response.content.decode()
    response_content = ElementTree.fromstring(response.content)
    # Ensure the deposit is finalized
    deposit_id = int(
        response_content.findtext("swh:deposit_id", namespaces=NAMESPACES))
    deposit = Deposit.objects.get(pk=deposit_id)
    # we got no swhid as input, so those fields cannot be set
    assert deposit.swhid is None
    assert deposit.swhid_context is None
    assert deposit.complete_date == deposit.reception_date
    assert deposit.complete_date is not None
    assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS

    # Ensure metadata stored in the metadata storage is consistent
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url=deposit_client.provider_url,
    )

    actual_authority = swh_storage.metadata_authority_get(
        MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url)
    assert actual_authority == metadata_authority

    config = APIConfig()
    metadata_fetcher = MetadataFetcher(
        name=config.tool["name"],
        version=config.tool["version"],
    )

    actual_fetcher = swh_storage.metadata_fetcher_get(config.tool["name"],
                                                      config.tool["version"])
    assert actual_fetcher == metadata_fetcher

    # Get the deposited metadata object and check it:

    page_results = swh_storage.raw_extrinsic_metadata_get(
        origin_swhid, metadata_authority)

    assert len(page_results.results) == 1
    assert page_results.next_page_token is None

    metadata = RawExtrinsicMetadata(
        target=origin_swhid,
        discovery_date=deposit.complete_date,
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="sword-v2-atom-codemeta",
        metadata=xml_data.encode(),
    )
    assert page_results == PagedResult(
        results=[metadata],
        next_page_token=None,
    )

    # Get metadata about the deposited metadata object and check it:
    _assert_deposit_info_on_metadata(swh_storage, metadata.swhid(), deposit,
                                     metadata_fetcher)
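
Note that the origin SWHID used as the metadata target above is derived from the origin URL alone, so it can be computed without querying the archive. A short sketch with an illustrative URL:

from swh.model.model import Origin

# The extended SWHID of an origin is derived from its URL:
origin_swhid = Origin("https://example.org/repo.git").swhid()
assert str(origin_swhid).startswith("swh:1:ori:")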
Example #13
def test_pypi_release_metadata_structure(
    swh_storage, requests_mock_datadir, _0805nexter_api_info
):
    url = "https://pypi.org/project/0805nexter"
    loader = PyPILoader(swh_storage, url)

    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"
    assert actual_load_status["snapshot_id"] is not None

    expected_release_id = hash_to_bytes("fbbcb817f01111b06442cdcc93140ab3cc777d68")

    expected_snapshot = Snapshot(
        id=hash_to_bytes(actual_load_status["snapshot_id"]),
        branches={
            b"HEAD": SnapshotBranch(
                target=b"releases/1.2.0",
                target_type=TargetType.ALIAS,
            ),
            b"releases/1.1.0": SnapshotBranch(
                target=hash_to_bytes("f8789ff3ed70a5f570c35d885c7bcfda7b23b091"),
                target_type=TargetType.RELEASE,
            ),
            b"releases/1.2.0": SnapshotBranch(
                target=expected_release_id,
                target_type=TargetType.RELEASE,
            ),
        },
    )

    assert_last_visit_matches(
        swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot.id
    )

    check_snapshot(expected_snapshot, swh_storage)

    release = swh_storage.release_get([expected_release_id])[0]
    assert release is not None

    release_swhid = CoreSWHID(
        object_type=ObjectType.RELEASE, object_id=expected_release_id
    )
    directory_swhid = ExtendedSWHID(
        object_type=ExtendedObjectType.DIRECTORY, object_id=release.target
    )
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url="https://pypi.org/",
    )
    expected_metadata = [
        RawExtrinsicMetadata(
            target=directory_swhid,
            authority=metadata_authority,
            fetcher=MetadataFetcher(
                name="swh.loader.package.pypi.loader.PyPILoader",
                version=__version__,
            ),
            discovery_date=loader.visit_date,
            format="pypi-project-json",
            metadata=json.dumps(
                json.loads(_0805nexter_api_info)["releases"]["1.2.0"][0]
            ).encode(),
            origin=url,
            release=release_swhid,
        )
    ]
    assert swh_storage.raw_extrinsic_metadata_get(
        directory_swhid,
        metadata_authority,
    ) == PagedResult(
        next_page_token=None,
        results=expected_metadata,
    )
Example #14
def test_loader_one_visit(swh_storage, requests_mock_datadir, raw_sources):
    loader = NixGuixLoader(swh_storage, sources_url)
    load_status = loader.load()
    expected_snapshot_id = SNAPSHOT1.id
    expected_snapshot_id_hex = expected_snapshot_id.hex()
    assert load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id_hex,
    }

    release_id = SNAPSHOT1.branches[
        b"https://github.com/owner-1/repository-1/revision-1.tgz"].target
    check_snapshot(SNAPSHOT1, storage=swh_storage)

    assert swh_storage.release_get([release_id])[0] == Release(
        id=release_id,
        name=b"https://github.com/owner-1/repository-1/revision-1.tgz",
        message=None,
        target=hash_to_bytes("4de2e07d3742718d928e974b8a4c721b9f7b33bf"),
        target_type=ObjectType.DIRECTORY,
        synthetic=True,
        author=Person.from_fullname(b""),
        date=None,
    )

    stats = get_stats(swh_storage)
    assert {
        "content": 1,
        "directory": 3,
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    # The visit is partial because URLs pointing to non-tarball files
    # are not handled yet
    assert_last_visit_matches(swh_storage,
                              sources_url,
                              status="partial",
                              type="nixguix")

    visit_status = origin_get_latest_visit_status(swh_storage, sources_url)
    snapshot_swhid = ExtendedSWHID(object_type=ExtendedObjectType.SNAPSHOT,
                                   object_id=visit_status.snapshot)
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url=sources_url,
    )
    expected_metadata = [
        RawExtrinsicMetadata(
            target=snapshot_swhid,
            authority=metadata_authority,
            fetcher=MetadataFetcher(
                name="swh.loader.package.nixguix.loader.NixGuixLoader",
                version=__version__,
            ),
            discovery_date=loader.visit_date,
            format="nixguix-sources-json",
            metadata=raw_sources,
            origin=sources_url,
        )
    ]
    assert swh_storage.raw_extrinsic_metadata_get(
        snapshot_swhid,
        metadata_authority,
    ) == PagedResult(
        next_page_token=None,
        results=expected_metadata,
    )
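
The snapshot SWHID targeted above is built from the raw 20-byte snapshot id carried by the visit status. A minimal sketch, with an illustrative id:

from swh.model.swhids import ExtendedObjectType, ExtendedSWHID

snapshot_swhid = ExtendedSWHID(
    object_type=ExtendedObjectType.SNAPSHOT,
    object_id=bytes.fromhex("0e7f84ede9a254f2cd55649ad5240783f557e65f"),
)
assert str(snapshot_swhid) == "swh:1:snp:0e7f84ede9a254f2cd55649ad5240783f557e65f"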
Example #15
def test_opam_metadata(tmpdir, requests_mock_datadir, fake_opam_root,
                       swh_storage, datadir):
    opam_url = f"file://{datadir}/fake_opam_repo"
    opam_root = fake_opam_root
    opam_instance = "loadertest"

    opam_package = "ocb"
    url = f"opam+{opam_url}/packages/{opam_package}"

    loader = OpamLoader(
        swh_storage,
        url,
        opam_root,
        opam_instance,
        opam_url,
        opam_package,
        initialize_opam_root=True,
    )

    actual_load_status = loader.load()

    assert actual_load_status["status"] == "eventful"

    expected_release_id = hash_to_bytes(
        "c231e541eb29c712635ada394b04127ac69e9fb0")

    expected_snapshot = Snapshot(
        id=hash_to_bytes(actual_load_status["snapshot_id"]),
        branches={
            b"HEAD": SnapshotBranch(
                target=b"ocb.0.1",
                target_type=TargetType.ALIAS,
            ),
            b"ocb.0.1": SnapshotBranch(
                target=expected_release_id,
                target_type=TargetType.RELEASE,
            ),
        },
    )

    assert_last_visit_matches(swh_storage,
                              url,
                              status="full",
                              type="opam",
                              snapshot=expected_snapshot.id)

    check_snapshot(expected_snapshot, swh_storage)

    release = swh_storage.release_get([expected_release_id])[0]
    assert release is not None

    release_swhid = CoreSWHID(object_type=ObjectType.RELEASE,
                              object_id=expected_release_id)
    directory_swhid = ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY,
                                    object_id=release.target)
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url=opam_url,
    )
    expected_metadata = [
        RawExtrinsicMetadata(
            target=directory_swhid,
            authority=metadata_authority,
            fetcher=MetadataFetcher(
                name="swh.loader.package.opam.loader.OpamLoader",
                version=__version__,
            ),
            discovery_date=loader.visit_date,
            format="opam-package-definition",
            metadata=OCB_METADATA,
            origin=url,
            release=release_swhid,
        )
    ]
    assert swh_storage.raw_extrinsic_metadata_get(
        directory_swhid,
        metadata_authority,
    ) == PagedResult(
        next_page_token=None,
        results=expected_metadata,
    )
Example #16
def swh_deposit_authority(self):
    return MetadataAuthority(
        type=MetadataAuthorityType.REGISTRY,
        url=self.config["swh_authority_url"],
    )
Example #17
from swh.model.model import (
    MetadataAuthority,
    MetadataAuthorityType,
    MetadataFetcher,
    Origin,
    RawExtrinsicMetadata,
)
from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID
from swh.storage.migrate_extrinsic_metadata import cran_package_from_url, handle_row

FETCHER = MetadataFetcher(
    name="migrate-extrinsic-metadata-from-revisions",
    version="0.0.1",
)
SWH_AUTHORITY = MetadataAuthority(
    type=MetadataAuthorityType.REGISTRY,
    url="https://softwareheritage.org/",
    metadata={},
)

DIRECTORY_ID = b"a" * 20
DIRECTORY_SWHID = ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY,
                                object_id=DIRECTORY_ID)


def test_gnu():
    original_artifacts = [{
        "length": 842501,
        "filename": "gperf-3.0.1.tar.gz",
        "checksums": {
            "sha1":
            "c4453ee492032b369006ee464f4dd4e2c0c0e650",
Example #18
def get_metadata_authority(self):
    return MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url="https://npmjs.com/",
        metadata={},
    )
Example #19
import datetime

from swh.model.model import (
    MetadataAuthority,
    MetadataAuthorityType,
    MetadataFetcher,
)
from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID
from swh.storage import get_storage
from swh.storage.interface import PagedResult
from swh.storage.migrate_extrinsic_metadata import (
    handle_row,
    pypi_origin_from_filename,
    pypi_project_from_filename,
)

FETCHER = MetadataFetcher(
    name="migrate-extrinsic-metadata-from-revisions",
    version="0.0.1",
)
PYPI_AUTHORITY = MetadataAuthority(
    type=MetadataAuthorityType.FORGE,
    url="https://pypi.org/",
)
SWH_AUTHORITY = MetadataAuthority(
    type=MetadataAuthorityType.REGISTRY,
    url="https://softwareheritage.org/",
)

DIRECTORY_ID = b"a" * 20
DIRECTORY_SWHID = ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY,
                                object_id=DIRECTORY_ID)


def now():
    return datetime.datetime.now(tz=datetime.timezone.utc)

Example #20
def get_metadata_authority(self):
    return MetadataAuthority(
        type=MetadataAuthorityType.FORGE, url=self.opam_url
    )
Example #21
def test_deposit_metadata_swhid(
    swhid,
    authenticated_client,
    deposit_collection,
    atom_dataset,
    swh_storage,
):
    """Posting a swhid reference is stored on raw extrinsic metadata storage"""
    swhid_reference = QualifiedSWHID.from_string(swhid)
    swhid_target = extended_swhid_from_qualified(swhid_reference)

    xml_data = atom_dataset["entry-data-with-swhid"].format(
        swhid=swhid,
        metadata_provenance_url=
        "https://hal-test.archives-ouvertes.fr/hal-abcdefgh",
    )
    deposit_client = authenticated_client.deposit_client

    _insert_object(swh_storage, swhid_reference)

    response = post_atom(
        authenticated_client,
        reverse(COL_IRI, args=[deposit_collection.name]),
        data=xml_data,
    )

    assert response.status_code == status.HTTP_201_CREATED, response.content.decode()
    response_content = ElementTree.fromstring(response.content)

    # Ensure the deposit is finalized
    deposit_id = int(
        response_content.findtext("swh:deposit_id", namespaces=NAMESPACES))
    deposit = Deposit.objects.get(pk=deposit_id)
    assert deposit.swhid == str(swhid_target)
    assert deposit.swhid_context == str(swhid_reference)
    assert deposit.complete_date == deposit.reception_date
    assert deposit.complete_date is not None
    assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS

    # Ensure metadata stored in the metadata storage is consistent
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url=deposit_client.provider_url,
    )

    actual_authority = swh_storage.metadata_authority_get(
        MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url)
    assert actual_authority == metadata_authority

    config = APIConfig()
    metadata_fetcher = MetadataFetcher(
        name=config.tool["name"],
        version=config.tool["version"],
    )

    actual_fetcher = swh_storage.metadata_fetcher_get(config.tool["name"],
                                                      config.tool["version"])
    assert actual_fetcher == metadata_fetcher

    # Get the deposited metadata object and check it:

    page_results = swh_storage.raw_extrinsic_metadata_get(
        swhid_target, metadata_authority)

    assert len(page_results.results) == 1
    assert page_results.next_page_token is None

    metadata_context = compute_metadata_context(swhid_reference)
    metadata = RawExtrinsicMetadata(
        target=swhid_target,
        discovery_date=deposit.complete_date,
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="sword-v2-atom-codemeta",
        metadata=xml_data.encode(),
        **metadata_context,
    )
    assert page_results == PagedResult(
        results=[metadata],
        next_page_token=None,
    )

    # Get metadata about the deposited metadata object and check it:
    _assert_deposit_info_on_metadata(swh_storage, metadata.swhid(), deposit,
                                     metadata_fetcher)
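
The extended_swhid_from_qualified call used above keeps only the core part of the qualified SWHID; its qualifiers (origin, visit, path, ...) become the metadata context instead. A sketch of the equivalent transformation, using swh.model only (this mirrors the helper's observable behavior, not its actual implementation):

from swh.model.swhids import ExtendedObjectType, ExtendedSWHID, QualifiedSWHID

qualified = QualifiedSWHID.from_string(
    "swh:1:dir:" + "aa" * 20 + ";origin=https://example.org/repo"
)
# Drop the qualifiers, keep the object type and id:
target = ExtendedSWHID(
    object_type=ExtendedObjectType[qualified.object_type.name],
    object_id=qualified.object_id,
)
assert str(target) == "swh:1:dir:" + "aa" * 20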
Example #22
from swh.loader.package import __version__
from swh.model.hashutil import hash_to_bytes
from swh.model.model import (
    MetadataAuthority,
    MetadataAuthorityType,
    MetadataFetcher,
    ObjectType,
    Origin,
    Person,
    RawExtrinsicMetadata,
    Release,
    Sha1Git,
)
from swh.model.swhids import CoreSWHID, ExtendedSWHID

EMPTY_SNAPSHOT_ID = "1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"
FULL_SNAPSHOT_ID = "4ac5730a9393f5099b63a35a17b6c33d36d70c3a"

AUTHORITY = MetadataAuthority(
    type=MetadataAuthorityType.FORGE,
    url="http://example.org/",
)
ORIGIN_URL = "http://example.org/archive.tgz"
ORIGIN_SWHID = Origin(ORIGIN_URL).swhid()

REVISION_ID = hash_to_bytes("8ff44f081d43176474b267de5451f2c2e88089d0")
RELEASE_ID = hash_to_bytes("9477a708196b44e59efb4e47b7d979a4146bd428")
RELEASE_SWHID = CoreSWHID.from_string(f"swh:1:rel:{RELEASE_ID.hex()}")
DIRECTORY_ID = hash_to_bytes("aa" * 20)
DIRECTORY_SWHID = ExtendedSWHID.from_string(f"swh:1:dir:{DIRECTORY_ID.hex()}")

FETCHER = MetadataFetcher(
    name="swh.loader.package.tests.test_loader_metadata.MetadataTestLoader",
    version=__version__,
)
Example #23
from swh.model.model import (
    MetadataAuthority,
    MetadataAuthorityType,
    MetadataFetcher,
    Origin,
    RawExtrinsicMetadata,
)
from swh.model.swhids import CoreSWHID, ExtendedObjectType, ExtendedSWHID
from swh.storage.migrate_extrinsic_metadata import cran_package_from_url, handle_row

FETCHER = MetadataFetcher(
    name="migrate-extrinsic-metadata-from-revisions",
    version="0.0.1",
)
SWH_AUTHORITY = MetadataAuthority(
    type=MetadataAuthorityType.REGISTRY,
    url="https://softwareheritage.org/",
    metadata={},
)
NIX_UNSTABLE_AUTHORITY = MetadataAuthority(
    type=MetadataAuthorityType.FORGE,
    url="https://nix-community.github.io/nixpkgs-swh/sources-unstable.json",
    metadata={},
)

DIRECTORY_ID = b"a" * 20
DIRECTORY_SWHID = ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY,
                                object_id=DIRECTORY_ID)


def test_nixguix():
    extrinsic_metadata = {
Example #24
def get_metadata_authority(self):
    return MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url=self.origin.url,
        metadata={},
    )
Example #25
def test_put_update_metadata_done_deposit_nominal(
    tmp_path,
    authenticated_client,
    complete_deposit,
    deposit_collection,
    atom_dataset,
    sample_data,
    swh_storage,
):
    """Nominal scenario, client send an update of metadata on a deposit with status "done"
    with an existing swhid. Such swhid has its metadata updated accordingly both in
    the deposit backend and in the metadata storage.

    Response: 204

    """
    deposit_swhid = CoreSWHID.from_string(complete_deposit.swhid)
    assert deposit_swhid.object_type == ObjectType.DIRECTORY
    directory_id = hash_to_bytes(deposit_swhid.object_id)

    # directory targeted by the complete_deposit does not exist in the storage
    assert list(swh_storage.directory_missing([directory_id])) == [directory_id]

    # so let's create a directory reference in the storage (current deposit targets an
    # unknown swhid)
    existing_directory = sample_data.directory
    swh_storage.directory_add([existing_directory])
    assert list(swh_storage.directory_missing([existing_directory.id])) == []

    # and patch one complete deposit swhid so it targets said reference
    complete_deposit.swhid = str(existing_directory.swhid())
    complete_deposit.save()

    actual_existing_requests_archive = DepositRequest.objects.filter(
        deposit=complete_deposit, type="archive")
    nb_archives = len(actual_existing_requests_archive)
    actual_existing_requests_metadata = DepositRequest.objects.filter(
        deposit=complete_deposit, type="metadata")
    nb_metadata = len(actual_existing_requests_metadata)

    update_uri = reverse(EDIT_IRI,
                         args=[deposit_collection.name, complete_deposit.id])
    response = put_atom(
        authenticated_client,
        update_uri,
        data=atom_dataset["entry-data1"],
        HTTP_X_CHECK_SWHID=complete_deposit.swhid,
    )

    assert response.status_code == status.HTTP_204_NO_CONTENT

    new_requests_meta = DepositRequest.objects.filter(deposit=complete_deposit,
                                                      type="metadata")
    assert len(new_requests_meta) == nb_metadata + 1
    request_meta1 = new_requests_meta[0]
    raw_metadata1 = request_meta1.raw_metadata
    assert raw_metadata1 == atom_dataset["entry-data1"]

    # check we did not touch the other parts
    requests_archive1 = DepositRequest.objects.filter(deposit=complete_deposit,
                                                      type="archive")
    assert len(requests_archive1) == nb_archives
    assert set(actual_existing_requests_archive) == set(requests_archive1)

    # Ensure metadata stored in the metadata storage is consistent
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url=complete_deposit.client.provider_url,
    )

    actual_authority = swh_storage.metadata_authority_get(
        MetadataAuthorityType.DEPOSIT_CLIENT,
        url=complete_deposit.client.provider_url)
    assert actual_authority == metadata_authority

    config = APIConfig()
    metadata_fetcher = MetadataFetcher(
        name=config.tool["name"],
        version=config.tool["version"],
    )

    actual_fetcher = swh_storage.metadata_fetcher_get(config.tool["name"],
                                                      config.tool["version"])
    assert actual_fetcher == metadata_fetcher

    directory_swhid = ExtendedSWHID.from_string(complete_deposit.swhid)
    page_results = swh_storage.raw_extrinsic_metadata_get(
        directory_swhid, metadata_authority)
    assert page_results == PagedResult(
        results=[
            RawExtrinsicMetadata(
                target=directory_swhid,
                discovery_date=request_meta1.date,
                authority=metadata_authority,
                fetcher=metadata_fetcher,
                format="sword-v2-atom-codemeta",
                metadata=raw_metadata1.encode(),
                origin=complete_deposit.origin_url,
            )
        ],
        next_page_token=None,
    )
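
raw_extrinsic_metadata_get returns a PagedResult; the tests above expect a single page, but a consumer draining all records for a (target, authority) pair would follow next_page_token, roughly as sketched below (assuming the storage interface's page_token parameter):

def iter_raw_extrinsic_metadata(storage, target, authority):
    # Yield every metadata record, following pagination until exhaustion.
    page_token = None
    while True:
        page = storage.raw_extrinsic_metadata_get(
            target, authority, page_token=page_token
        )
        yield from page.results
        page_token = page.next_page_token
        if page_token is None:
            break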
Example #26
            SnapshotBranch(target_type=TargetType.RELEASE, target=RELEASES[0].id),
            b"target/snapshot": SnapshotBranch(
                target_type=TargetType.SNAPSHOT,
                target=hash_to_bytes("9e78d7105c5e0f886487511e2a92377b4ee4c32a"),
            ),
        },
    ),
]

METADATA_AUTHORITIES = [
    MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url="http://example.org/",
        metadata={},
    ),
]

METADATA_FETCHERS = [
    MetadataFetcher(
        name="test-fetcher",
        version="1.0.0",
        metadata={},
    )
]

RAW_EXTRINSIC_METADATA = [
    RawExtrinsicMetadata(
        target=Origin("http://example.org/foo.git").swhid(),
Example #27
NIXGUIX_FORMAT = "nixguix-sources-json"
NPM_FORMAT = "replicate-npm-package-json"
ORIGINAL_ARTIFACT_FORMAT = "original-artifacts-json"
PYPI_FORMAT = "pypi-project-json"

# Information about this script, for traceability
FETCHER = MetadataFetcher(
    name="migrate-extrinsic-metadata-from-revisions",
    version="0.0.1",
)

# Authorities that we got the metadata from
AUTHORITIES = {
    "npmjs": MetadataAuthority(
        type=MetadataAuthorityType.FORGE, url="https://npmjs.com/", metadata={}
    ),
    "pypi": MetadataAuthority(
        type=MetadataAuthorityType.FORGE, url="https://pypi.org/", metadata={}
    ),
    "gnu": MetadataAuthority(
        type=MetadataAuthorityType.FORGE, url="https://ftp.gnu.org/", metadata={}
    ),
    "swh": MetadataAuthority(
        type=MetadataAuthorityType.REGISTRY,
        url="https://softwareheritage.org/",
        metadata={},
    ),  # for original_artifact (which are checksums computed by SWH)
Example #28
def handle_deposit_row(
    row,
    discovery_date: Optional[datetime.datetime],
    origin,
    storage,
    deposit_cur,
    dry_run: bool,
):
    """Loads metadata from the deposit database (which is more reliable as the
    metadata on the revision object, as some versions of the deposit loader were
    a bit lossy; and they used very different format for the field in the
    revision table).
    """
    parsed_message = deposit_revision_message_re.match(row["message"])
    assert parsed_message is not None, row["message"]

    deposit_id = int(parsed_message.group("deposit_id"))
    collection = parsed_message.group("collection").decode()
    client_name = parsed_message.group("client").decode()

    deposit_cur.execute(
        f"SELECT {', '.join(DEPOSIT_COLS)} FROM deposit "
        f"INNER JOIN deposit_collection "
        f" ON (deposit.collection_id=deposit_collection.id) "
        f"INNER JOIN deposit_client ON (deposit.client_id=deposit_client.user_ptr_id) "
        f"INNER JOIN auth_user ON (deposit.client_id=auth_user.id) "
        f"INNER JOIN deposit_request ON (deposit.id=deposit_request.deposit_id) "
        f"WHERE deposit.id = %s",
        (deposit_id, ),
    )

    provider_urls = set()
    swhids = set()
    metadata_entries = []
    dates = set()
    external_identifiers = set()
    for deposit_request_row in deposit_cur:
        deposit_request = dict(zip(DEPOSIT_COLS, deposit_request_row))

        # Sanity checks to make sure we selected the right deposit
        assert deposit_request["deposit.id"] == deposit_id
        assert deposit_request["deposit_collection.name"] == collection, deposit_request
        if client_name != "":
            # Sometimes it's missing from the commit message
            assert deposit_request["auth_user.username"] == client_name

        # Date of the deposit request (either the initial request or subsequent ones)
        date = deposit_request["deposit_request.date"]
        dates.add(date)

        if deposit_request["deposit.external_id"] == "hal-02355563":
            # Failed deposit
            swhids.add("swh:1:rev:9293f230baca9814490d4fff7ac53d487a20edb6"
                       ";origin=https://hal.archives-ouvertes.fr/hal-02355563")
        else:
            assert deposit_request["deposit.swhid_context"], deposit_request
            swhids.add(deposit_request["deposit.swhid_context"])
        external_identifiers.add(deposit_request["deposit.external_id"])

        # Client of the deposit
        provider_urls.add(deposit_request["deposit_client.provider_url"])

        metadata = deposit_request["deposit_request.metadata"]
        if metadata is not None:
            json.dumps(metadata).encode()  # check it's valid
            if "@xmlns" in metadata:
                assert metadata["@xmlns"] == ATOM_NS
                assert metadata["@xmlns:codemeta"] in (CODEMETA_NS,
                                                       [CODEMETA_NS])
                format = NEW_DEPOSIT_FORMAT
            elif "{http://www.w3.org/2005/Atom}id" in metadata:
                assert ("{https://doi.org/10.5063/SCHEMA/CODEMETA-2.0}author"
                        in metadata
                        or "{http://www.w3.org/2005/Atom}author" in metadata)
                format = OLD_DEPOSIT_FORMAT
            else:
                # new format introduced in
                # https://forge.softwareheritage.org/D4065
                # it's the same as the first case, but with the @xmlns
                # declarations stripped
                # Most of them should have the "id", but some revisions,
                # like 4d3890004fade1f4ec3bf7004a4af0c490605128, are missing
                # this field
                assert "id" in metadata or "title" in metadata
                assert "codemeta:author" in metadata
                format = NEW_DEPOSIT_FORMAT
            metadata_entries.append((date, format, metadata))

    if discovery_date is None:
        discovery_date = max(dates)

    # Sanity checks to make sure deposit requests are consistent with each other
    assert len(metadata_entries) >= 1, deposit_id
    assert len(provider_urls) == 1, f"expected 1 provider url, got {provider_urls}"
    (provider_url,) = provider_urls
    assert len(swhids) == 1
    (swhid,) = swhids
    assert (
        len(external_identifiers) == 1
    ), f"expected 1 external identifier, got {external_identifiers}"
    (external_identifier,) = external_identifiers

    # compute the origin from the external_identifier if we don't have one
    if origin is None:
        origin = f"{provider_url.strip('/')}/{external_identifier}"

        # explicit list of mistakes that happened in the past, but shouldn't
        # happen again:
        if origin == "https://hal.archives-ouvertes.fr/hal-01588781":
            # deposit id 75
            origin = "https://inria.halpreprod.archives-ouvertes.fr/hal-01588781"
        elif origin == "https://hal.archives-ouvertes.fr/hal-01588782":
            # deposit id 76
            origin = "https://inria.halpreprod.archives-ouvertes.fr/hal-01588782"
        elif origin == "https://hal.archives-ouvertes.fr/hal-01592430":
            # deposit id 143
            origin = "https://hal-preprod.archives-ouvertes.fr/hal-01592430"
        elif origin == "https://hal.archives-ouvertes.fr/hal-01588927":
            origin = "https://inria.halpreprod.archives-ouvertes.fr/hal-01588927"
        elif origin == "https://hal.archives-ouvertes.fr/hal-01593875":
            # deposit id 175
            origin = "https://hal-preprod.archives-ouvertes.fr/hal-01593875"
        elif deposit_id == 160:
            assert origin == "https://www.softwareheritage.org/je-suis-gpl", origin
            origin = "https://forge.softwareheritage.org/source/jesuisgpl/"
        elif origin == "https://hal.archives-ouvertes.fr/hal-01588942":
            # deposit id 90
            origin = "https://inria.halpreprod.archives-ouvertes.fr/hal-01588942"
        elif origin == "https://hal.archives-ouvertes.fr/hal-01592499":
            # deposit id 162
            origin = "https://hal-preprod.archives-ouvertes.fr/hal-01592499"
        elif origin == "https://hal.archives-ouvertes.fr/hal-01588935":
            # deposit id 89
            origin = "https://hal-preprod.archives-ouvertes.fr/hal-01588935"

        assert_origin_exists(storage, origin)

    # check the origin we computed matches the one in the deposit db
    swhid_origin = QualifiedSWHID.from_string(swhid).origin
    if origin is not None:
        # explicit list of mistakes that happened in the past, but shouldn't
        # happen again:
        exceptions = [
            (
                # deposit id 229
                "https://hal.archives-ouvertes.fr/hal-01243573",
                "https://hal-test.archives-ouvertes.fr/hal-01243573",
            ),
            (
                # deposit id 199
                "https://hal.archives-ouvertes.fr/hal-01243065",
                "https://hal-test.archives-ouvertes.fr/hal-01243065",
            ),
            (
                # deposit id 164
                "https://hal.archives-ouvertes.fr/hal-01593855",
                "https://hal-preprod.archives-ouvertes.fr/hal-01593855",
            ),
        ]
        if (origin, swhid_origin) not in exceptions:
            assert origin == swhid_origin, (
                f"the origin we guessed from the deposit db or revision ({origin}) "
                f"doesn't match the one in the deposit db's SWHID ({swhid})")

    authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url=provider_url,
        metadata={},
    )

    for (date, format, metadata) in metadata_entries:
        load_metadata(
            storage,
            row["id"],
            row["directory"],
            date,
            metadata,
            format,
            authority=authority,
            origin=origin,
            dry_run=dry_run,
        )

    return (origin, discovery_date)
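
The origin derivation in the function above is simply the client's provider URL joined with the deposit's external identifier. A one-function sketch with illustrative values:

def origin_from_deposit(provider_url: str, external_identifier: str) -> str:
    # Same expression as in handle_deposit_row above.
    return f"{provider_url.strip('/')}/{external_identifier}"

assert (
    origin_from_deposit("https://hal.example.org/", "hal-01234567")
    == "https://hal.example.org/hal-01234567"
)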
Example #29
def test_npm_loader_first_visit(swh_storage, requests_mock_datadir, org_api_info):
    package = "org"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("0996ca28d6280499abcf485b51c4e3941b057249")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
    )

    release_id = "d38cc0b571cd41f3c85513864e049766b42032a7"
    versions = [
        ("0.0.2", release_id),
        ("0.0.3", "62bf7076bae9aa2cb4d6cb3bf7ce0ea4fdd5b295"),
        ("0.0.4", "6e976db82f6c310596b21fb0ed8b11f507631434"),
    ]

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"HEAD": SnapshotBranch(
                target=b"releases/0.0.4", target_type=TargetType.ALIAS
            ),
            **{
                b"releases/"
                + version_name.encode(): SnapshotBranch(
                    target=hash_to_bytes(version_id),
                    target_type=TargetType.RELEASE,
                )
                for (version_name, version_id) in versions
            },
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    assert swh_storage.release_get([hash_to_bytes(release_id)])[0] == Release(
        name=b"0.0.2",
        message=b"Synthetic release for NPM source package org version 0.0.2\n",
        target=hash_to_bytes("42753c0c2ab00c4501b552ac4671c68f3cf5aece"),
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
        author=Person(
            fullname=b"mooz <*****@*****.**>",
            name=b"mooz",
            email=b"*****@*****.**",
        ),
        date=TimestampWithTimezone.from_datetime(
            datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc)
        ),
        id=hash_to_bytes(release_id),
    )

    contents = swh_storage.content_get(_expected_new_contents_first_visit)
    count = sum(0 if content is None else 1 for content in contents)
    assert count == len(_expected_new_contents_first_visit)

    assert (
        list(swh_storage.directory_missing(_expected_new_directories_first_visit)) == []
    )

    assert list(swh_storage.release_missing(_expected_new_releases_first_visit)) == []

    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url="https://npmjs.com/",
    )

    for (version_name, release_id) in versions:
        release = swh_storage.release_get([hash_to_bytes(release_id)])[0]
        assert release.target_type == ModelObjectType.DIRECTORY
        directory_id = release.target
        directory_swhid = ExtendedSWHID(
            object_type=ExtendedObjectType.DIRECTORY,
            object_id=directory_id,
        )
        release_swhid = CoreSWHID(
            object_type=ObjectType.RELEASE,
            object_id=hash_to_bytes(release_id),
        )
        expected_metadata = [
            RawExtrinsicMetadata(
                target=directory_swhid,
                authority=metadata_authority,
                fetcher=MetadataFetcher(
                    name="swh.loader.package.npm.loader.NpmLoader",
                    version=__version__,
                ),
                discovery_date=loader.visit_date,
                format="replicate-npm-package-json",
                metadata=json.dumps(
                    json.loads(org_api_info)["versions"][version_name]
                ).encode(),
                origin="https://www.npmjs.com/package/org",
                release=release_swhid,
            )
        ]
        assert swh_storage.raw_extrinsic_metadata_get(
            directory_swhid,
            metadata_authority,
        ) == PagedResult(
            next_page_token=None,
            results=expected_metadata,
        )

    stats = get_stats(swh_storage)

    assert {
        "content": len(_expected_new_contents_first_visit),
        "directory": len(_expected_new_directories_first_visit),
        "origin": 1,
        "origin_visit": 1,
        "release": len(_expected_new_releases_first_visit),
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
Example #30
def test_deposit_loading_ok(swh_storage, deposit_client,
                            requests_mock_datadir):
    url = "https://hal-test.archives-ouvertes.fr/some-external-id"
    deposit_id = 666
    loader = DepositLoader(swh_storage,
                           url,
                           deposit_id,
                           deposit_client,
                           default_filename="archive.zip")

    actual_load_status = loader.load()
    expected_snapshot_id = "338b45d87e02fb5cbf324694bc4a898623d6a30f"
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id,
    }

    assert_last_visit_matches(
        loader.storage,
        url,
        status="full",
        type="deposit",
        snapshot=hash_to_bytes(expected_snapshot_id),
    )

    release_id_hex = "2566a64a27bc00362e265be9666d7606750530a1"
    release_id = hash_to_bytes(release_id_hex)

    expected_snapshot = Snapshot(
        id=hash_to_bytes(expected_snapshot_id),
        branches={
            b"HEAD": SnapshotBranch(
                target=release_id,
                target_type=TargetType.RELEASE,
            ),
        },
    )
    check_snapshot(expected_snapshot, storage=loader.storage)

    release = loader.storage.release_get([release_id])[0]
    date = TimestampWithTimezone.from_datetime(
        datetime.datetime(2017, 10, 7, 15, 17, 8,
                          tzinfo=datetime.timezone.utc))
    person = Person(
        fullname=b"Software Heritage",
        name=b"Software Heritage",
        email=b"*****@*****.**",
    )
    assert release == Release(
        id=release_id,
        name=b"HEAD",
        message=b"hal: Deposit 666 in collection hal\n",
        author=person,
        date=date,
        target_type=ModelObjectType.DIRECTORY,
        target=b"\xfd-\xf1-\xc5SL\x1d\xa1\xe9\x18\x0b\x91Q\x02\xfbo`\x1d\x19",
        synthetic=True,
        metadata=None,
    )

    # check metadata

    fetcher = MetadataFetcher(
        name="swh-deposit",
        version="0.0.1",
    )

    authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url="https://hal-test.archives-ouvertes.fr/",
    )

    # Check origin metadata
    orig_meta = loader.storage.raw_extrinsic_metadata_get(
        Origin(url).swhid(), authority)
    assert orig_meta.next_page_token is None
    raw_meta = loader.client.metadata_get(deposit_id)
    raw_metadata: str = raw_meta["raw_metadata"]
    # 2 raw metadata xml + 1 json dict
    assert len(orig_meta.results) == 2
    orig_meta0 = orig_meta.results[0]
    assert orig_meta0.authority == authority
    assert orig_meta0.fetcher == fetcher

    # Check directory metadata
    assert release.target_type == ModelObjectType.DIRECTORY
    directory_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                                object_id=release.target)
    actual_dir_meta = loader.storage.raw_extrinsic_metadata_get(
        directory_swhid, authority)
    assert actual_dir_meta.next_page_token is None
    assert len(actual_dir_meta.results) == 1
    dir_meta = actual_dir_meta.results[0]
    assert dir_meta.authority == authority
    assert dir_meta.fetcher == fetcher
    assert dir_meta.metadata.decode() == raw_metadata

    # Retrieve the information for deposit status update query to the deposit
    urls = [
        m for m in requests_mock_datadir.request_history
        if m.url == f"{DEPOSIT_URL}/{deposit_id}/update/"
    ]

    assert len(urls) == 1
    update_query = urls[0]

    body = update_query.json()
    expected_body = {
        "status": "done",
        "release_id": release_id_hex,
        "directory_id": hash_to_hex(release.target),
        "snapshot_id": expected_snapshot_id,
        "origin_url": url,
    }

    assert body == expected_body

    stats = get_stats(loader.storage)
    assert {
        "content": 303,
        "directory": 12,
        "origin": 1,
        "origin_visit": 1,
        "release": 1,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
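
The ids checked in the update body above move between hex strings (on the deposit API side) and raw bytes (on the storage side); hash_to_bytes and hash_to_hex from swh.model.hashutil convert between the two, e.g.:

from swh.model.hashutil import hash_to_bytes, hash_to_hex

hex_id = "2566a64a27bc00362e265be9666d7606750530a1"
assert hash_to_hex(hash_to_bytes(hex_id)) == hex_id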