Example #1
    def test_microsecond_insensitive(self):
        """Checks the microseconds of the datetime.datetime does not affect the
        hashed manifest."""
        metadata = {
            **self.minimal,
            "discovery_date":
            datetime.datetime(
                2021,
                1,
                25,
                11,
                27,
                51,
                123456,
                tzinfo=datetime.timezone.utc,
            ),
        }

        self.assertEqual(
            git_objects.raw_extrinsic_metadata_git_object(
                RawExtrinsicMetadata.from_dict(self.minimal)),
            git_objects.raw_extrinsic_metadata_git_object(
                RawExtrinsicMetadata.from_dict(metadata)),
        )
        self.assertEqual(
            RawExtrinsicMetadata.from_dict(self.minimal).id,
            RawExtrinsicMetadata.from_dict(metadata).id,
        )
        self.assertEqual(
            RawExtrinsicMetadata.from_dict(metadata).id,
            _x("5c13f20ba336e44549baf3d7b9305b027ec9f43d"),
        )
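The `_x` helper used in these assertions is not defined in the snippets; a minimal stand-in, assuming it simply decodes a hex digest into raw bytes, could be:

def _x(hex_digest: str) -> bytes:
    """Decode a hex-encoded sha1 digest into its 20-byte form (assumed
    behaviour of the helper used by these tests)."""
    return bytes.fromhex(hex_digest)

assert len(_x("5c13f20ba336e44549baf3d7b9305b027ec9f43d")) == 20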
Example #2
    def test_nonascii_path(self):
        metadata = {
            **self.minimal,
            "path": b"/ab\nc/d\xf0\x9f\xa4\xb7e\x00f",
        }
        git_object = (
            b"raw_extrinsic_metadata 231\0"
            b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
            b"discovery_date 1611574071\n"
            b"authority forge https://forge.softwareheritage.org/\n"
            b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
            b"format json\n"
            b"path /ab\n"
            b" c/d\xf0\x9f\xa4\xb7e\x00f\n"
            b"\n"
            b'{"foo": "bar"}')

        self.assertEqual(
            git_objects.raw_extrinsic_metadata_git_object(
                RawExtrinsicMetadata.from_dict(metadata)),
            git_object,
        )
        self.assertEqual(
            RawExtrinsicMetadata.from_dict(metadata).id,
            hashlib.sha1(git_object).digest(),
        )
        self.assertEqual(
            RawExtrinsicMetadata.from_dict(metadata).id,
            _x("7cc83fd1912176510c083f5df43f01b09af4b333"),
        )
Example #3
    def test_timezone_insensitive(self):
        """Checks the timezone of the datetime.datetime does not affect the
        hashed git_object."""
        utc_plus_one = datetime.timezone(datetime.timedelta(hours=1))
        metadata = {
            **self.minimal,
            "discovery_date":
            datetime.datetime(
                2021,
                1,
                25,
                12,
                27,
                51,
                tzinfo=utc_plus_one,
            ),
        }

        self.assertEqual(
            git_objects.raw_extrinsic_metadata_git_object(
                RawExtrinsicMetadata.from_dict(self.minimal)),
            git_objects.raw_extrinsic_metadata_git_object(
                RawExtrinsicMetadata.from_dict(metadata)),
        )
        self.assertEqual(
            RawExtrinsicMetadata.from_dict(self.minimal).id,
            RawExtrinsicMetadata.from_dict(metadata).id,
        )
        self.assertEqual(
            RawExtrinsicMetadata.from_dict(metadata).id,
            _x("5c13f20ba336e44549baf3d7b9305b027ec9f43d"),
        )
Example #4
    def test_maximal(self):
        git_object = (
            b"raw_extrinsic_metadata 533\0"
            b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
            b"discovery_date 1611574071\n"
            b"authority forge https://forge.softwareheritage.org/\n"
            b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
            b"format json\n"
            b"origin https://forge.softwareheritage.org/source/swh-model/\n"
            b"visit 42\n"
            b"snapshot swh:1:snp:0000000000000000000000000000000000000000\n"
            b"release swh:1:rel:0101010101010101010101010101010101010101\n"
            b"revision swh:1:rev:0202020202020202020202020202020202020202\n"
            b"path /abc/def\n"
            b"directory swh:1:dir:0303030303030303030303030303030303030303\n"
            b"\n"
            b'{"foo": "bar"}')

        self.assertEqual(
            git_objects.raw_extrinsic_metadata_git_object(
                RawExtrinsicMetadata.from_dict(self.maximal)),
            git_object,
        )
        self.assertEqual(
            RawExtrinsicMetadata.from_dict(self.maximal).id,
            hashlib.sha1(git_object).digest(),
        )
        self.assertEqual(
            RawExtrinsicMetadata.from_dict(self.maximal).id,
            _x("f96966e1093d15236a31fde07e47d5b1c9428049"),
        )
Example #5
def test_maven_loader_extrinsic_metadata(swh_storage, expected_releases,
                                         expected_json_metadata,
                                         expected_pom_metadata):
    """With no prior visit, loading a jar ends up with 1 snapshot.
    Extrinsic metadata is the pom file associated with the source jar.
    """
    loader = MavenLoader(swh_storage, MVN_ORIGIN_URL, artifacts=MVN_ARTIFACTS)

    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"

    for i, expected_release in enumerate(expected_releases):

        expected_release_id = expected_release.id
        release = swh_storage.release_get([expected_release_id])[0]
        assert release is not None

        release_swhid = CoreSWHID(object_type=ObjectType.RELEASE,
                                  object_id=expected_release_id)
        directory_swhid = ExtendedSWHID(
            object_type=ExtendedObjectType.DIRECTORY, object_id=release.target)
        metadata_authority = MetadataAuthority(
            type=MetadataAuthorityType.FORGE,
            url=REPO_BASE_URL,
        )

        expected_metadata = [
            RawExtrinsicMetadata(
                target=directory_swhid,
                authority=metadata_authority,
                fetcher=MetadataFetcher(
                    name="swh.loader.package.maven.loader.MavenLoader",
                    version=__version__,
                ),
                discovery_date=loader.visit_date,
                format="maven-pom",
                metadata=expected_pom_metadata[i],
                origin=MVN_ORIGIN_URL,
                release=release_swhid,
            ),
            RawExtrinsicMetadata(
                target=directory_swhid,
                authority=metadata_authority,
                fetcher=MetadataFetcher(
                    name="swh.loader.package.maven.loader.MavenLoader",
                    version=__version__,
                ),
                discovery_date=loader.visit_date,
                format="maven-json",
                metadata=json.dumps(expected_json_metadata[i]).encode(),
                origin=MVN_ORIGIN_URL,
                release=release_swhid,
            ),
        ]

        res = swh_storage.raw_extrinsic_metadata_get(directory_swhid,
                                                     metadata_authority)
        assert res.next_page_token is None
        assert set(res.results) == set(expected_metadata)
Example #6
def db_to_raw_extrinsic_metadata(row) -> RawExtrinsicMetadata:
    target = row["raw_extrinsic_metadata.target"]
    if not target.startswith("swh:1:"):
        warnings.warn("Fetching raw_extrinsic_metadata row with URL target",
                      DeprecationWarning)
        target = str(Origin(url=target).swhid())

    return RawExtrinsicMetadata(
        target=ExtendedSWHID.from_string(target),
        authority=MetadataAuthority(
            type=MetadataAuthorityType(row["metadata_authority.type"]),
            url=row["metadata_authority.url"],
        ),
        fetcher=MetadataFetcher(
            name=row["metadata_fetcher.name"],
            version=row["metadata_fetcher.version"],
        ),
        discovery_date=row["discovery_date"],
        format=row["format"],
        metadata=row["raw_extrinsic_metadata.metadata"],
        origin=row["origin"],
        visit=row["visit"],
        snapshot=map_optional(CoreSWHID.from_string, row["snapshot"]),
        release=map_optional(CoreSWHID.from_string, row["release"]),
        revision=map_optional(CoreSWHID.from_string, row["revision"]),
        path=row["path"],
        directory=map_optional(CoreSWHID.from_string, row["directory"]),
    )
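The `map_optional` helper is likewise not shown here; a plausible sketch, assuming it only applies the conversion function when the database column is not NULL, is:

from typing import Callable, Optional, TypeVar

T = TypeVar("T")
U = TypeVar("U")


def map_optional(f: Callable[[T], U], x: Optional[T]) -> Optional[U]:
    """Apply f to x unless x is None (assumed behaviour of the helper above)."""
    return f(x) if x is not None else None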
Example #7
def _assert_deposit_info_on_metadata(swh_storage, metadata_swhid, deposit,
                                     metadata_fetcher):
    swh_authority = MetadataAuthority(
        MetadataAuthorityType.REGISTRY,
        "http://deposit.softwareheritage.example/",
    )
    page_results = swh_storage.raw_extrinsic_metadata_get(
        metadata_swhid, swh_authority)

    assert len(page_results.results) == 1
    assert page_results.next_page_token is None

    expected_xml_data = textwrap.dedent(f"""\
        <deposit xmlns="https://www.softwareheritage.org/schema/2018/deposit">
            <deposit_id>{deposit.id}</deposit_id>
            <deposit_client>https://hal-test.archives-ouvertes.fr/</deposit_client>
            <deposit_collection>test</deposit_collection>
        </deposit>
        """)
    assert page_results == PagedResult(
        results=[
            RawExtrinsicMetadata(
                target=metadata_swhid,
                discovery_date=deposit.complete_date,
                authority=swh_authority,
                fetcher=metadata_fetcher,
                format="xml-deposit-info",
                metadata=expected_xml_data.encode(),
            )
        ],
        next_page_token=None,
    )
Example #8
def test_load_artifact_metadata(swh_storage, caplog):
    loader = MetadataTestLoader(swh_storage, ORIGIN_URL)

    load_status = loader.load()
    assert load_status == {
        "status": "eventful",
        "snapshot_id": FULL_SNAPSHOT_ID,
    }

    authority = MetadataAuthority(
        type=MetadataAuthorityType.REGISTRY,
        url="https://softwareheritage.org/",
    )

    result = swh_storage.raw_extrinsic_metadata_get(
        DIRECTORY_SWHID,
        authority,
    )
    assert result.next_page_token is None
    assert len(result.results) == 1
    assert result.results[0] == RawExtrinsicMetadata(
        target=DIRECTORY_SWHID,
        discovery_date=result.results[0].discovery_date,
        authority=authority,
        fetcher=FETCHER,
        format="original-artifacts-json",
        metadata=b'[{"artifact_key": "value", "length": 0}]',
        origin=ORIGIN_URL,
        release=RELEASE_SWHID,
    )
Example #9
def row_to_raw_extrinsic_metadata(
        row: RawExtrinsicMetadataRow) -> RawExtrinsicMetadata:
    discovery_date = row.discovery_date.replace(tzinfo=datetime.timezone.utc)

    return RawExtrinsicMetadata(
        target=ExtendedSWHID.from_string(row.target),
        authority=MetadataAuthority(
            type=MetadataAuthorityType(row.authority_type),
            url=row.authority_url,
        ),
        fetcher=MetadataFetcher(
            name=row.fetcher_name,
            version=row.fetcher_version,
        ),
        discovery_date=discovery_date,
        format=row.format,
        metadata=row.metadata,
        origin=row.origin,
        visit=row.visit,
        snapshot=map_optional(CoreSWHID.from_string, row.snapshot),
        release=map_optional(CoreSWHID.from_string, row.release),
        revision=map_optional(CoreSWHID.from_string, row.revision),
        path=row.path,
        directory=map_optional(CoreSWHID.from_string, row.directory),
    )
Example #10
def load_metadata(
    storage,
    revision_id,
    directory_id,
    discovery_date: datetime.datetime,
    metadata: Dict[str, Any],
    format: str,
    authority: MetadataAuthority,
    origin: Optional[str],
    dry_run: bool,
):
    """Does the actual loading to swh-storage."""
    directory_swhid = ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY,
                                    object_id=directory_id)
    revision_swhid = CoreSWHID(object_type=ObjectType.REVISION,
                               object_id=revision_id)
    obj = RawExtrinsicMetadata(
        target=directory_swhid,
        discovery_date=discovery_date,
        authority=authority,
        fetcher=FETCHER,
        format=format,
        metadata=json.dumps(metadata).encode(),
        origin=origin,
        revision=revision_swhid,
    )
    if not dry_run:
        storage.raw_extrinsic_metadata_add([obj])
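A hypothetical dry-run call of this loader function; the in-memory storage backend is taken from the examples further down, but the identifiers, URLs and metadata below are made-up illustration values:

from swh.storage import get_storage

storage = get_storage("memory")
load_metadata(
    storage,
    revision_id=bytes.fromhex("00" * 20),
    directory_id=bytes.fromhex("11" * 20),
    discovery_date=datetime.datetime.now(tz=datetime.timezone.utc),
    metadata={"foo": "bar"},
    format="example-json",
    authority=MetadataAuthority(
        type=MetadataAuthorityType.REGISTRY,
        url="https://example.org/",
    ),
    origin="https://example.org/project",
    dry_run=True,  # dry_run skips storage.raw_extrinsic_metadata_add()
)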
Example #11
    def test_negative_epoch(self):
        metadata = {
            **self.minimal,
            "discovery_date":
            datetime.datetime(
                1969,
                12,
                31,
                23,
                59,
                59,
                1,
                tzinfo=datetime.timezone.utc,
            ),
        }

        git_object = (
            b"raw_extrinsic_metadata 202\0"
            b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
            b"discovery_date -1\n"
            b"authority forge https://forge.softwareheritage.org/\n"
            b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
            b"format json\n"
            b"\n"
            b'{"foo": "bar"}')

        self.assertEqual(
            git_objects.raw_extrinsic_metadata_git_object(
                RawExtrinsicMetadata.from_dict(metadata)),
            git_object,
        )
        self.assertEqual(
            RawExtrinsicMetadata.from_dict(metadata).id,
            hashlib.sha1(git_object).digest(),
        )
        self.assertEqual(
            RawExtrinsicMetadata.from_dict(metadata).id,
            _x("be7154a8fd49d87f81547ea634d1e2152907d089"),
        )
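The `discovery_date -1` line above is consistent with the manifest encoding the discovery date as whole seconds since the Unix epoch, rounded down; this is an inference from the expected values in these tests, not the library's actual serialization code:

import datetime
import math

dt = datetime.datetime(1969, 12, 31, 23, 59, 59, 1,
                       tzinfo=datetime.timezone.utc)
assert math.floor(dt.timestamp()) == -1

dt = datetime.datetime(2021, 1, 25, 11, 27, 51, 123456,
                       tzinfo=datetime.timezone.utc)
assert math.floor(dt.timestamp()) == 1611574071  # matches test_minimal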
Example #12
    def test_negative_timestamp(self):
        metadata = {
            **self.minimal,
            "discovery_date":
            datetime.datetime(
                1960,
                1,
                25,
                11,
                27,
                51,
                tzinfo=datetime.timezone.utc,
            ),
        }

        git_object = (
            b"raw_extrinsic_metadata 210\0"
            b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
            b"discovery_date -313504329\n"
            b"authority forge https://forge.softwareheritage.org/\n"
            b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
            b"format json\n"
            b"\n"
            b'{"foo": "bar"}')

        self.assertEqual(
            git_objects.raw_extrinsic_metadata_git_object(
                RawExtrinsicMetadata.from_dict(metadata)),
            git_object,
        )
        self.assertEqual(
            RawExtrinsicMetadata.from_dict(metadata).id,
            hashlib.sha1(git_object).digest(),
        )
        self.assertEqual(
            RawExtrinsicMetadata.from_dict(metadata).id,
            _x("895d0821a2991dd376ddc303424aceb7c68280f9"),
        )
Example #13
    def test_epoch(self):
        metadata = {
            **self.minimal,
            "discovery_date":
            datetime.datetime(
                1970,
                1,
                1,
                0,
                0,
                0,
                tzinfo=datetime.timezone.utc,
            ),
        }

        git_object = (
            b"raw_extrinsic_metadata 201\0"
            b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
            b"discovery_date 0\n"
            b"authority forge https://forge.softwareheritage.org/\n"
            b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
            b"format json\n"
            b"\n"
            b'{"foo": "bar"}')

        self.assertEqual(
            git_objects.raw_extrinsic_metadata_git_object(
                RawExtrinsicMetadata.from_dict(metadata)),
            git_object,
        )
        self.assertEqual(
            RawExtrinsicMetadata.from_dict(metadata).id,
            hashlib.sha1(git_object).digest(),
        )
        self.assertEqual(
            RawExtrinsicMetadata.from_dict(metadata).id,
            _x("27a53df54ace35ebd910493cdc70b334d6b7cb88"),
        )
Example #14
    def test_minimal(self):
        git_object = (
            b"raw_extrinsic_metadata 210\0"
            b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
            b"discovery_date 1611574071\n"
            b"authority forge https://forge.softwareheritage.org/\n"
            b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
            b"format json\n"
            b"\n"
            b'{"foo": "bar"}')

        self.assertEqual(
            git_objects.raw_extrinsic_metadata_git_object(
                RawExtrinsicMetadata.from_dict(self.minimal)),
            git_object,
        )
        self.assertEqual(
            RawExtrinsicMetadata.from_dict(self.minimal).id,
            hashlib.sha1(git_object).digest(),
        )
        self.assertEqual(
            RawExtrinsicMetadata.from_dict(self.minimal).id,
            _x("5c13f20ba336e44549baf3d7b9305b027ec9f43d"),
        )
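These manifests follow the usual git object layout: a header "raw_extrinsic_metadata <length>\0" followed by the body, with the object id being the sha1 of the whole thing. A small check built only from values shown in this minimal example:

import hashlib

body = (
    b"target swh:1:cnt:568aaf43d83b2c3df8067f3bedbb97d83260be6d\n"
    b"discovery_date 1611574071\n"
    b"authority forge https://forge.softwareheritage.org/\n"
    b"fetcher swh-phabricator-metadata-fetcher 0.0.1\n"
    b"format json\n"
    b"\n"
    b'{"foo": "bar"}'
)
git_object = b"raw_extrinsic_metadata %d\x00" % len(body) + body
assert len(body) == 210
assert hashlib.sha1(git_object).hexdigest() == (
    "5c13f20ba336e44549baf3d7b9305b027ec9f43d")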
Example #15
DIRECTORY_ID = hash_to_bytes("aa" * 20)
DIRECTORY_SWHID = ExtendedSWHID.from_string(f"swh:1:dir:{DIRECTORY_ID.hex()}")

FETCHER = MetadataFetcher(
    name="swh.loader.package.tests.test_loader_metadata.MetadataTestLoader",
    version=__version__,
)

DISCOVERY_DATE = datetime.datetime.now(tz=datetime.timezone.utc)

DIRECTORY_METADATA = [
    RawExtrinsicMetadata(
        target=DIRECTORY_SWHID,
        discovery_date=DISCOVERY_DATE,
        authority=AUTHORITY,
        fetcher=FETCHER,
        format="test-format1",
        metadata=b"foo bar",
        origin=ORIGIN_URL,
        release=RELEASE_SWHID,
    ),
    RawExtrinsicMetadata(
        target=DIRECTORY_SWHID,
        discovery_date=DISCOVERY_DATE + datetime.timedelta(seconds=1),
        authority=AUTHORITY,
        fetcher=FETCHER,
        format="test-format2",
        metadata=b"bar baz",
        origin=ORIGIN_URL,
        release=RELEASE_SWHID,
    ),
]
Example #16
def test_put_update_metadata_done_deposit_nominal(
    tmp_path,
    authenticated_client,
    complete_deposit,
    deposit_collection,
    atom_dataset,
    sample_data,
    swh_storage,
):
    """Nominal scenario, client send an update of metadata on a deposit with status "done"
    with an existing swhid. Such swhid has its metadata updated accordingly both in
    the deposit backend and in the metadata storage.

    Response: 204

    """
    deposit_swhid = CoreSWHID.from_string(complete_deposit.swhid)
    assert deposit_swhid.object_type == ObjectType.DIRECTORY
    directory_id = hash_to_bytes(deposit_swhid.object_id)

    # directory targeted by the complete_deposit does not exist in the storage
    assert list(swh_storage.directory_missing([directory_id])) == [directory_id]

    # so let's create a directory reference in the storage (current deposit targets an
    # unknown swhid)
    existing_directory = sample_data.directory
    swh_storage.directory_add([existing_directory])
    assert list(swh_storage.directory_missing([existing_directory.id])) == []

    # and patch one complete deposit swhid so it targets said reference
    complete_deposit.swhid = str(existing_directory.swhid())
    complete_deposit.save()

    actual_existing_requests_archive = DepositRequest.objects.filter(
        deposit=complete_deposit, type="archive")
    nb_archives = len(actual_existing_requests_archive)
    actual_existing_requests_metadata = DepositRequest.objects.filter(
        deposit=complete_deposit, type="metadata")
    nb_metadata = len(actual_existing_requests_metadata)

    update_uri = reverse(EDIT_IRI,
                         args=[deposit_collection.name, complete_deposit.id])
    response = put_atom(
        authenticated_client,
        update_uri,
        data=atom_dataset["entry-data1"],
        HTTP_X_CHECK_SWHID=complete_deposit.swhid,
    )

    assert response.status_code == status.HTTP_204_NO_CONTENT

    new_requests_meta = DepositRequest.objects.filter(deposit=complete_deposit,
                                                      type="metadata")
    assert len(new_requests_meta) == nb_metadata + 1
    request_meta1 = new_requests_meta[0]
    raw_metadata1 = request_meta1.raw_metadata
    assert raw_metadata1 == atom_dataset["entry-data1"]

    # check we did not touch the other parts
    requests_archive1 = DepositRequest.objects.filter(deposit=complete_deposit,
                                                      type="archive")
    assert len(requests_archive1) == nb_archives
    assert set(actual_existing_requests_archive) == set(requests_archive1)

    # Ensure metadata stored in the metadata storage is consistent
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url=complete_deposit.client.provider_url,
    )

    actual_authority = swh_storage.metadata_authority_get(
        MetadataAuthorityType.DEPOSIT_CLIENT,
        url=complete_deposit.client.provider_url)
    assert actual_authority == metadata_authority

    config = APIConfig()
    metadata_fetcher = MetadataFetcher(
        name=config.tool["name"],
        version=config.tool["version"],
    )

    actual_fetcher = swh_storage.metadata_fetcher_get(config.tool["name"],
                                                      config.tool["version"])
    assert actual_fetcher == metadata_fetcher

    directory_swhid = ExtendedSWHID.from_string(complete_deposit.swhid)
    page_results = swh_storage.raw_extrinsic_metadata_get(
        directory_swhid, metadata_authority)
    assert page_results == PagedResult(
        results=[
            RawExtrinsicMetadata(
                target=directory_swhid,
                discovery_date=request_meta1.date,
                authority=metadata_authority,
                fetcher=metadata_fetcher,
                format="sword-v2-atom-codemeta",
                metadata=raw_metadata1.encode(),
                origin=complete_deposit.origin_url,
            )
        ],
        next_page_token=None,
    )
Example #17
def test_gnu():
    original_artifacts = [{
        "length": 842501,
        "filename": "gperf-3.0.1.tar.gz",
        "checksums": {
            "sha1":
            "c4453ee492032b369006ee464f4dd4e2c0c0e650",
            "sha256":
            "5be283ef62e1bd26abdaaf88b416dbea4b14c360b09befcda2f055656dc43f87",
            "sha1_git":
            "bf1d5bb57d571101dd7b6acab2b78ae11bb861de",
            "blake2s256":
            "661f84afeb1e0b914defe2b249d424af1dfe380a96016b3282ae758c70e19a70",
        },
    }]

    row = {
        "id":
        b"\x00\x1cqE\x8e@[%\xba\xcc\xc8\x0b\x99\xf6cM\xff\x9d+\x18",
        "directory":
        DIRECTORY_ID,
        "date":
        datetime.datetime(2003, 6, 13, 0, 11, tzinfo=datetime.timezone.utc),
        "committer_date":
        datetime.datetime(2003, 6, 13, 0, 11, tzinfo=datetime.timezone.utc),
        "type":
        "tar",
        "message":
        b"swh-loader-package: synthetic revision message",
        "metadata": {
            "extrinsic": {
                "raw": {
                    "url": "https://ftp.gnu.org/gnu/gperf/gperf-3.0.1.tar.gz",
                    "time": "2003-06-13T00:11:00+00:00",
                    "length": 842501,
                    "version": "3.0.1",
                    "filename": "gperf-3.0.1.tar.gz",
                },
                "when": "2019-11-27T11:17:38.318997+00:00",
                "provider": "https://ftp.gnu.org/gnu/gperf/",
            },
            "intrinsic": {},
            "original_artifact": original_artifacts,
        },
    }

    origin_url = "https://ftp.gnu.org/gnu/gperf/"

    storage = Mock()

    def origin_get(urls):
        assert urls == [origin_url]
        return [Origin(url=origin_url)]

    storage.origin_get.side_effect = origin_get
    deposit_cur = None
    handle_row(row, storage, deposit_cur, dry_run=False)

    assert storage.method_calls == [
        call.origin_get([origin_url]),
        call.raw_extrinsic_metadata_add([
            RawExtrinsicMetadata(
                target=DIRECTORY_SWHID,
                discovery_date=datetime.datetime(2019,
                                                 11,
                                                 27,
                                                 11,
                                                 17,
                                                 38,
                                                 318997,
                                                 tzinfo=datetime.timezone.utc),
                authority=SWH_AUTHORITY,
                fetcher=FETCHER,
                format="original-artifacts-json",
                metadata=json.dumps(original_artifacts).encode(),
                origin=origin_url,
                revision=CoreSWHID.from_string(
                    "swh:1:rev:001c71458e405b25baccc80b99f6634dff9d2b18"),
            ),
        ]),
    ]
Example #18
def test_deposit_metadata_origin(
    url,
    authenticated_client,
    deposit_collection,
    atom_dataset,
    swh_storage,
):
    """Posting a swhid reference is stored on raw extrinsic metadata storage"""
    xml_data = atom_dataset["entry-data-with-origin-reference"].format(url=url)
    origin_swhid = Origin(url).swhid()
    deposit_client = authenticated_client.deposit_client
    swh_storage.origin_add([Origin(url)])
    response = post_atom(
        authenticated_client,
        reverse(COL_IRI, args=[deposit_collection.name]),
        data=xml_data,
    )

    assert response.status_code == status.HTTP_201_CREATED, response.content.decode()
    response_content = ElementTree.fromstring(response.content)
    # Ensure the deposit is finalized
    deposit_id = int(
        response_content.findtext("swh:deposit_id", namespaces=NAMESPACES))
    deposit = Deposit.objects.get(pk=deposit_id)
    # we got no swhid as input so we cannot have those
    assert deposit.swhid is None
    assert deposit.swhid_context is None
    assert deposit.complete_date == deposit.reception_date
    assert deposit.complete_date is not None
    assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS

    # Ensure metadata stored in the metadata storage is consistent
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url=deposit_client.provider_url,
    )

    actual_authority = swh_storage.metadata_authority_get(
        MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url)
    assert actual_authority == metadata_authority

    config = APIConfig()
    metadata_fetcher = MetadataFetcher(
        name=config.tool["name"],
        version=config.tool["version"],
    )

    actual_fetcher = swh_storage.metadata_fetcher_get(config.tool["name"],
                                                      config.tool["version"])
    assert actual_fetcher == metadata_fetcher

    # Get the deposited metadata object and check it:

    page_results = swh_storage.raw_extrinsic_metadata_get(
        origin_swhid, metadata_authority)

    assert len(page_results.results) == 1
    assert page_results.next_page_token is None

    metadata = RawExtrinsicMetadata(
        target=origin_swhid,
        discovery_date=deposit.complete_date,
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="sword-v2-atom-codemeta",
        metadata=xml_data.encode(),
    )
    assert page_results == PagedResult(
        results=[metadata],
        next_page_token=None,
    )

    # Get metadata about the deposited metadata object and check it:
    _assert_deposit_info_on_metadata(swh_storage, metadata.swhid(), deposit,
                                     metadata_fetcher)
Example #19
def test_deposit_metadata_swhid(
    swhid,
    authenticated_client,
    deposit_collection,
    atom_dataset,
    swh_storage,
):
    """Posting a swhid reference is stored on raw extrinsic metadata storage"""
    swhid_reference = QualifiedSWHID.from_string(swhid)
    swhid_target = extended_swhid_from_qualified(swhid_reference)

    xml_data = atom_dataset["entry-data-with-swhid"].format(
        swhid=swhid,
        metadata_provenance_url=
        "https://hal-test.archives-ouvertes.fr/hal-abcdefgh",
    )
    deposit_client = authenticated_client.deposit_client

    _insert_object(swh_storage, swhid_reference)

    response = post_atom(
        authenticated_client,
        reverse(COL_IRI, args=[deposit_collection.name]),
        data=xml_data,
    )

    assert response.status_code == status.HTTP_201_CREATED, response.content.decode()
    response_content = ElementTree.fromstring(response.content)

    # Ensure the deposit is finalized
    deposit_id = int(
        response_content.findtext("swh:deposit_id", namespaces=NAMESPACES))
    deposit = Deposit.objects.get(pk=deposit_id)
    assert deposit.swhid == str(swhid_target)
    assert deposit.swhid_context == str(swhid_reference)
    assert deposit.complete_date == deposit.reception_date
    assert deposit.complete_date is not None
    assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS

    # Ensure metadata stored in the metadata storage is consistent
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url=deposit_client.provider_url,
    )

    actual_authority = swh_storage.metadata_authority_get(
        MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url)
    assert actual_authority == metadata_authority

    config = APIConfig()
    metadata_fetcher = MetadataFetcher(
        name=config.tool["name"],
        version=config.tool["version"],
    )

    actual_fetcher = swh_storage.metadata_fetcher_get(config.tool["name"],
                                                      config.tool["version"])
    assert actual_fetcher == metadata_fetcher

    # Get the deposited metadata object and check it:

    page_results = swh_storage.raw_extrinsic_metadata_get(
        swhid_target, metadata_authority)

    assert len(page_results.results) == 1
    assert page_results.next_page_token is None

    metadata_context = compute_metadata_context(swhid_reference)
    metadata = RawExtrinsicMetadata(
        target=swhid_target,
        discovery_date=deposit.complete_date,
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="sword-v2-atom-codemeta",
        metadata=xml_data.encode(),
        **metadata_context,
    )
    assert page_results == PagedResult(
        results=[metadata],
        next_page_token=None,
    )

    # Get metadata about the deposited metadata object and check it:
    _assert_deposit_info_on_metadata(swh_storage, metadata.swhid(), deposit,
                                     metadata_fetcher)
Example #20
def test_nixguix():
    extrinsic_metadata = {
        "url":
        "https://files.pythonhosted.org/packages/source/a/alerta/alerta-7.4.5.tar.gz",
        "integrity": "sha256-km8RAaG1ep+tYR8eHVr3UWk+/MNEqdsBr1Di/g02LYQ=",
    }
    original_artifacts = [{
        "length": 34903,
        "filename": "alerta-7.4.5.tar.gz",
        "checksums": {
            "sha1":
            "66db4398b664de272fd5aa6610caa776b5e64651",
            "sha256":
            "926f1101a1b57a9fad611f1e1d5af751693efcc344a9db01af50e2fe0d362d84",
        },
    }]

    row = {
        "id":
        b"\x00\x01\xbaM\xd0S\x94\x85\x02\x11\xd7\xb3\x85M\x99\x13\xd2:\xe3y",
        "directory": DIRECTORY_ID,
        "date": None,
        "committer_date": None,
        "type": "tar",
        "message": b"",
        "metadata": {
            "extrinsic": {
                "raw":
                extrinsic_metadata,
                "when":
                "2020-06-03T11:25:05.259341+00:00",
                "provider":
                "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json",
            },
            "original_artifact": original_artifacts,
        },
    }

    origin_url = "https://nix-community.github.io/nixpkgs-swh/sources-unstable.json"

    storage = Mock()

    def origin_get(urls):
        assert urls == [origin_url]
        return [Origin(url=origin_url)]

    storage.origin_get.side_effect = origin_get
    deposit_cur = None
    handle_row(row, storage, deposit_cur, dry_run=False)

    assert storage.method_calls == [
        call.origin_get([origin_url]),
        call.raw_extrinsic_metadata_add([
            RawExtrinsicMetadata(
                target=DIRECTORY_SWHID,
                discovery_date=datetime.datetime(2020,
                                                 6,
                                                 3,
                                                 11,
                                                 25,
                                                 5,
                                                 259341,
                                                 tzinfo=datetime.timezone.utc),
                authority=NIX_UNSTABLE_AUTHORITY,
                fetcher=FETCHER,
                format="nixguix-sources-json",
                metadata=json.dumps(extrinsic_metadata).encode(),
                origin=origin_url,
                revision=CoreSWHID.from_string(
                    "swh:1:rev:0001ba4dd05394850211d7b3854d9913d23ae379"),
            ),
        ]),
        call.raw_extrinsic_metadata_add([
            RawExtrinsicMetadata(
                target=DIRECTORY_SWHID,
                discovery_date=datetime.datetime(2020,
                                                 6,
                                                 3,
                                                 11,
                                                 25,
                                                 5,
                                                 259341,
                                                 tzinfo=datetime.timezone.utc),
                authority=SWH_AUTHORITY,
                fetcher=FETCHER,
                format="original-artifacts-json",
                metadata=json.dumps(original_artifacts).encode(),
                origin=origin_url,
                revision=CoreSWHID.from_string(
                    "swh:1:rev:0001ba4dd05394850211d7b3854d9913d23ae379"),
            ),
        ]),
    ]
Example #21
def test_pypi_good_origin():
    """Tests loading a revision whose origin we can find"""

    source_original_artifact = {
        "url":
        "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz",
        "date": "2014-05-07T22:03:00",
        "sha1": "3289269f75b4111dd00eaea53e00330db9a1db12",
        "size": 46644,
        "sha256":
        "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824",
        "filename": "PyPDFLite-0.1.32.tar.gz",
        "sha1_git": "1e5c38014731242cfa8594839bcba8a0c4e158c5",
        "blake2s256":
        "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385",
        "archive_type": "tar",
    }

    dest_original_artifacts = [{
        "url":
        "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz",
        "filename": "PyPDFLite-0.1.32.tar.gz",
        "archive_type": "tar",
        "length": 46644,
        "checksums": {
            "sha1":
            "3289269f75b4111dd00eaea53e00330db9a1db12",
            "sha256":
            "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824",
            "sha1_git":
            "1e5c38014731242cfa8594839bcba8a0c4e158c5",
            "blake2s256":
            "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385",
        },
    }]

    revision_id = b"N\xa9\x91|\xdfS\xcd\x13SJ\x04.N\xb3x{\x86\xc84\xd2"
    row = {
        "id":
        revision_id,
        "directory":
        DIRECTORY_ID,
        "date":
        datetime.datetime(2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc),
        "committer_date":
        datetime.datetime(2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc),
        "type":
        "tar",
        "message":
        b"0.1.32",
        "metadata": {
            "original_artifact": source_original_artifact
        },
    }

    origin_url = "https://pypi.org/project/PyPDFLite/"

    storage = get_storage("memory")

    snapshot_id = b"42" * 10
    storage.origin_add([Origin(url=origin_url)])
    storage.origin_visit_add(
        [OriginVisit(origin=origin_url, visit=1, date=now(), type="pypi")])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin_url,
            visit=1,
            date=now(),
            status="partial",
            snapshot=snapshot_id,
        )
    ])
    storage.snapshot_add([
        Snapshot(
            id=snapshot_id,
            branches={
                b"foo":
                SnapshotBranch(
                    target_type=TargetType.REVISION,
                    target=revision_id,
                )
            },
        )
    ])
    storage.metadata_authority_add([
        attr.evolve(PYPI_AUTHORITY, metadata={}),
        attr.evolve(SWH_AUTHORITY, metadata={}),
    ])
    storage.metadata_fetcher_add([FETCHER])
    deposit_cur = None
    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)

    revision_swhid = CoreSWHID.from_string(
        "swh:1:rev:4ea9917cdf53cd13534a042e4eb3787b86c834d2")

    assert storage.raw_extrinsic_metadata_get(
        DIRECTORY_SWHID,
        authority=PYPI_AUTHORITY,
    ) == PagedResult(
        results=[],
        next_page_token=None,
    )
    assert storage.raw_extrinsic_metadata_get(
        DIRECTORY_SWHID,
        authority=SWH_AUTHORITY,
    ) == PagedResult(
        results=[
            RawExtrinsicMetadata(
                target=DIRECTORY_SWHID,
                discovery_date=datetime.datetime(
                    2014,
                    5,
                    7,
                    22,
                    3,
                    tzinfo=datetime.timezone.utc,
                ),
                authority=SWH_AUTHORITY,
                fetcher=FETCHER,
                format="original-artifacts-json",
                metadata=json.dumps(dest_original_artifacts).encode(),
                origin=origin_url,
                revision=revision_swhid,
            ),
        ],
        next_page_token=None,
    )
Example #22
def test_pypi_3(mocker):
    """Tests loading a revision generated by a very old PyPI loader that
    does not have a provider or has 'project' metadata."""

    mocker.patch(
        "swh.storage.migrate_extrinsic_metadata.urlopen",
        side_effect=urllib.error.HTTPError(None, 404, "Not Found", None, None),
    )

    source_original_artifact = {
        "url":
        "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz",
        "date": "2014-05-07T22:03:00",
        "sha1": "3289269f75b4111dd00eaea53e00330db9a1db12",
        "size": 46644,
        "sha256":
        "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824",
        "filename": "PyPDFLite-0.1.32.tar.gz",
        "sha1_git": "1e5c38014731242cfa8594839bcba8a0c4e158c5",
        "blake2s256":
        "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385",
        "archive_type": "tar",
    }

    dest_original_artifacts = [{
        "url":
        "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz",
        "filename": "PyPDFLite-0.1.32.tar.gz",
        "archive_type": "tar",
        "length": 46644,
        "checksums": {
            "sha1":
            "3289269f75b4111dd00eaea53e00330db9a1db12",
            "sha256":
            "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824",
            "sha1_git":
            "1e5c38014731242cfa8594839bcba8a0c4e158c5",
            "blake2s256":
            "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385",
        },
    }]

    row = {
        "id":
        b"N\xa9\x91|\xdfS\xcd\x13SJ\x04.N\xb3x{\x86\xc84\xd2",
        "directory":
        DIRECTORY_ID,
        "date":
        datetime.datetime(2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc),
        "committer_date":
        datetime.datetime(2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc),
        "type":
        "tar",
        "message":
        b"0.1.32",
        "metadata": {
            "original_artifact": source_original_artifact
        },
    }

    origin_url = "https://pypi.org/project/PyPDFLite/"

    storage = get_storage("memory")

    storage.origin_add([Origin(url=origin_url)])
    storage.metadata_authority_add([
        attr.evolve(PYPI_AUTHORITY, metadata={}),
        attr.evolve(SWH_AUTHORITY, metadata={}),
    ])
    storage.metadata_fetcher_add([FETCHER])
    deposit_cur = None
    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)

    revision_swhid = CoreSWHID.from_string(
        "swh:1:rev:4ea9917cdf53cd13534a042e4eb3787b86c834d2")

    assert storage.raw_extrinsic_metadata_get(
        DIRECTORY_SWHID,
        authority=PYPI_AUTHORITY,
    ) == PagedResult(
        results=[],
        next_page_token=None,
    )
    assert storage.raw_extrinsic_metadata_get(
        DIRECTORY_SWHID,
        authority=SWH_AUTHORITY,
    ) == PagedResult(
        results=[
            RawExtrinsicMetadata(
                target=DIRECTORY_SWHID,
                discovery_date=datetime.datetime(
                    2014,
                    5,
                    7,
                    22,
                    3,
                    tzinfo=datetime.timezone.utc,
                ),
                authority=SWH_AUTHORITY,
                fetcher=FETCHER,
                format="original-artifacts-json",
                metadata=json.dumps(dest_original_artifacts).encode(),
                origin=None,
                revision=revision_swhid,
            ),
        ],
        next_page_token=None,
    )
Example #23
def test_pypi_2(mocker):
    """Tests loading a revision generated by an old PyPI loader that
    does not have a provider, but has 'project' metadata."""

    mocker.patch(
        "swh.storage.migrate_extrinsic_metadata.urlopen",
        side_effect=urllib.error.HTTPError(None, 404, "Not Found", None, None),
    )

    extrinsic_metadata = {
        "name": "jupyterhub-simx",
        "author": "Jupyter Development Team",
        "license": "BSD",
        "summary": "JupyterHub: A multi-user server for Jupyter notebooks",
        "version": "1.0.5",
        # ...
    }

    source_original_artifacts = [{
        "url":
        "https://files.pythonhosted.org/packages/72/28/a8098763d78e2c4607cb67602c0d726a97ac38d4c1f531aac28f49de2e1a/jupyterhub-simx-1.0.5.tar.gz",
        "date": "2019-01-23T22:10:55",
        "sha1": "ede3eadd5a06e70912e3ba7cfccef789c4ad3168",
        "size": 2346538,
        "sha256":
        "0399d7f5f0d90c525d369f0507ad0e8ef8729c1c7fa63aadfc46a27514d14a46",
        "filename": "jupyterhub-simx-1.0.5.tar.gz",
        "sha1_git": "734301124712182eb30fc90e97cc18cef5432f02",
        "blake2s256":
        "bb4aa82ffb5891a05dcf6d4dce3ad56fd2c18e9abdba9d20972910649d869322",
        "archive_type": "tar",
    }]

    dest_original_artifacts = [{
        "url":
        "https://files.pythonhosted.org/packages/72/28/a8098763d78e2c4607cb67602c0d726a97ac38d4c1f531aac28f49de2e1a/jupyterhub-simx-1.0.5.tar.gz",
        "filename": "jupyterhub-simx-1.0.5.tar.gz",
        "archive_type": "tar",
        "length": 2346538,
        "checksums": {
            "sha1":
            "ede3eadd5a06e70912e3ba7cfccef789c4ad3168",
            "sha256":
            "0399d7f5f0d90c525d369f0507ad0e8ef8729c1c7fa63aadfc46a27514d14a46",
            "sha1_git":
            "734301124712182eb30fc90e97cc18cef5432f02",
            "blake2s256":
            "bb4aa82ffb5891a05dcf6d4dce3ad56fd2c18e9abdba9d20972910649d869322",
        },
    }]

    row = {
        "id":
        b"\x00\x00\x04\xd68,J\xd4\xc0Q\x92fbl6U\x1f\x0eQ\xca",
        "directory":
        DIRECTORY_ID,
        "date":
        datetime.datetime(2019,
                          1,
                          23,
                          22,
                          10,
                          55,
                          tzinfo=datetime.timezone.utc),
        "committer_date":
        datetime.datetime(2019,
                          1,
                          23,
                          22,
                          10,
                          55,
                          tzinfo=datetime.timezone.utc),
        "type":
        "tar",
        "message":
        b"1.0.5",
        "metadata": {
            "project": extrinsic_metadata,
            "original_artifact": source_original_artifacts,
        },
    }

    origin_url = "https://pypi.org/project/jupyterhub-simx/"

    storage = get_storage("memory")

    storage.origin_add([Origin(url=origin_url)])
    storage.metadata_authority_add([
        attr.evolve(PYPI_AUTHORITY, metadata={}),
        attr.evolve(SWH_AUTHORITY, metadata={}),
    ])
    storage.metadata_fetcher_add([FETCHER])
    deposit_cur = None

    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)

    revision_swhid = CoreSWHID.from_string(
        "swh:1:rev:000004d6382c4ad4c0519266626c36551f0e51ca")
    assert storage.raw_extrinsic_metadata_get(
        DIRECTORY_SWHID,
        authority=PYPI_AUTHORITY,
    ) == PagedResult(
        results=[
            RawExtrinsicMetadata(
                target=DIRECTORY_SWHID,
                discovery_date=datetime.datetime(
                    2019,
                    1,
                    23,
                    22,
                    10,
                    55,
                    tzinfo=datetime.timezone.utc,
                ),
                authority=PYPI_AUTHORITY,
                fetcher=FETCHER,
                format="pypi-project-json",
                metadata=json.dumps(extrinsic_metadata).encode(),
                origin=None,
                revision=revision_swhid,
            ),
        ],
        next_page_token=None,
    )
    assert storage.raw_extrinsic_metadata_get(
        DIRECTORY_SWHID,
        authority=SWH_AUTHORITY,
    ) == PagedResult(
        results=[
            RawExtrinsicMetadata(
                target=DIRECTORY_SWHID,
                discovery_date=datetime.datetime(
                    2019,
                    1,
                    23,
                    22,
                    10,
                    55,
                    tzinfo=datetime.timezone.utc,
                ),
                authority=SWH_AUTHORITY,
                fetcher=FETCHER,
                format="original-artifacts-json",
                metadata=json.dumps(dest_original_artifacts).encode(),
                origin=None,
                revision=revision_swhid,
            ),
        ],
        next_page_token=None,
    )
Example #24
def test_pypi_1():
    """Tests loading a revision generated by a new PyPI loader that
    has a provider."""

    extrinsic_metadata = {
        "url":
        "https://files.pythonhosted.org/packages/70/89/a498245baf1bf3dde73d3da00b4b067a8aa7c7378ad83472078803ea3e43/m3-ui-2.2.73.tar.gz",
        "size": 3933168,
        "digests": {
            "md5":
            "a374ac3f655e97df5db5335e2142d344",
            "sha256":
            "1bc2756f7d0d2e15cf5880ca697682ff35e8b58116bf73eb9c78b3db358c5b7d",
        },
        "has_sig": False,
        "filename": "m3-ui-2.2.73.tar.gz",
        "downloads": -1,
        "md5_digest": "a374ac3f655e97df5db5335e2142d344",
        "packagetype": "sdist",
        "upload_time": "2019-11-11T06:21:20",
        "comment_text": "",
        "python_version": "source",
        "requires_python": None,
        "upload_time_iso_8601": "2019-11-11T06:21:20.073082Z",
    }

    original_artifacts = [{
        "length": 3933168,
        "filename": "m3-ui-2.2.73.tar.gz",
        "checksums": {
            "sha1":
            "9f4ec7ce64b7fea4b122e85d47ea31146c367b03",
            "sha256":
            "1bc2756f7d0d2e15cf5880ca697682ff35e8b58116bf73eb9c78b3db358c5b7d",
        },
    }]

    row = {
        "id":
        b"\x00\x00\x07a{S\xe7\xb1E\x8fi]\xd0}\xe4\xceU\xaf\x15\x17",
        "directory":
        DIRECTORY_ID,
        "date":
        datetime.datetime(
            2019,
            11,
            11,
            6,
            21,
            20,
            tzinfo=datetime.timezone.utc,
        ),
        "committer_date":
        datetime.datetime(
            2019,
            11,
            11,
            6,
            21,
            20,
            tzinfo=datetime.timezone.utc,
        ),
        "type":
        "tar",
        "message":
        b"2.2.73",
        "metadata": {
            "extrinsic": {
                "raw": extrinsic_metadata,
                "when": "2020-01-23T18:43:09.109407+00:00",
                "provider": "https://pypi.org/pypi/m3-ui/json",
            },
            "intrinsic": {
                "raw": {
                    "name": "m3-ui",
                    "summary": "======",
                    "version": "2.2.73",
                    # ...
                    "metadata_version": "1.1",
                },
                "tool": "PKG-INFO",
            },
            "original_artifact": original_artifacts,
        },
    }

    origin_url = "https://pypi.org/project/m3-ui/"

    storage = get_storage("memory")
    storage.origin_add([Origin(url=origin_url)])
    storage.metadata_authority_add([
        attr.evolve(PYPI_AUTHORITY, metadata={}),
        attr.evolve(SWH_AUTHORITY, metadata={}),
    ])
    storage.metadata_fetcher_add([FETCHER])

    deposit_cur = None
    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)

    revision_swhid = CoreSWHID.from_string(
        "swh:1:rev:000007617b53e7b1458f695dd07de4ce55af1517")
    assert storage.raw_extrinsic_metadata_get(
        DIRECTORY_SWHID,
        authority=PYPI_AUTHORITY,
    ) == PagedResult(
        results=[
            RawExtrinsicMetadata(
                target=DIRECTORY_SWHID,
                discovery_date=datetime.datetime(
                    2020,
                    1,
                    23,
                    18,
                    43,
                    9,
                    109407,
                    tzinfo=datetime.timezone.utc,
                ),
                authority=PYPI_AUTHORITY,
                fetcher=FETCHER,
                format="pypi-project-json",
                metadata=json.dumps(extrinsic_metadata).encode(),
                origin=origin_url,
                revision=revision_swhid,
            ),
        ],
        next_page_token=None,
    )
    assert storage.raw_extrinsic_metadata_get(
        DIRECTORY_SWHID,
        authority=SWH_AUTHORITY,
    ) == PagedResult(
        results=[
            RawExtrinsicMetadata(
                target=DIRECTORY_SWHID,
                discovery_date=datetime.datetime(
                    2020,
                    1,
                    23,
                    18,
                    43,
                    9,
                    109407,
                    tzinfo=datetime.timezone.utc,
                ),
                authority=SWH_AUTHORITY,
                fetcher=FETCHER,
                format="original-artifacts-json",
                metadata=json.dumps(original_artifacts).encode(),
                origin=origin_url,
                revision=revision_swhid,
            ),
        ],
        next_page_token=None,
    )
Example #25
def test_opam_metadata(tmpdir, requests_mock_datadir, fake_opam_root,
                       swh_storage, datadir):
    opam_url = f"file://{datadir}/fake_opam_repo"
    opam_root = fake_opam_root
    opam_instance = "loadertest"

    opam_package = "ocb"
    url = f"opam+{opam_url}/packages/{opam_package}"

    loader = OpamLoader(
        swh_storage,
        url,
        opam_root,
        opam_instance,
        opam_url,
        opam_package,
        initialize_opam_root=True,
    )

    actual_load_status = loader.load()

    assert actual_load_status["status"] == "eventful"

    expected_release_id = hash_to_bytes(
        "c231e541eb29c712635ada394b04127ac69e9fb0")

    expected_snapshot = Snapshot(
        id=hash_to_bytes(actual_load_status["snapshot_id"]),
        branches={
            b"HEAD":
            SnapshotBranch(
                target=b"ocb.0.1",
                target_type=TargetType.ALIAS,
            ),
            b"ocb.0.1":
            SnapshotBranch(
                target=expected_release_id,
                target_type=TargetType.RELEASE,
            ),
        },
    )

    assert_last_visit_matches(swh_storage,
                              url,
                              status="full",
                              type="opam",
                              snapshot=expected_snapshot.id)

    check_snapshot(expected_snapshot, swh_storage)

    release = swh_storage.release_get([expected_release_id])[0]
    assert release is not None

    release_swhid = CoreSWHID(object_type=ObjectType.RELEASE,
                              object_id=expected_release_id)
    directory_swhid = ExtendedSWHID(object_type=ExtendedObjectType.DIRECTORY,
                                    object_id=release.target)
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url=opam_url,
    )
    expected_metadata = [
        RawExtrinsicMetadata(
            target=directory_swhid,
            authority=metadata_authority,
            fetcher=MetadataFetcher(
                name="swh.loader.package.opam.loader.OpamLoader",
                version=__version__,
            ),
            discovery_date=loader.visit_date,
            format="opam-package-definition",
            metadata=OCB_METADATA,
            origin=url,
            release=release_swhid,
        )
    ]
    assert swh_storage.raw_extrinsic_metadata_get(
        directory_swhid,
        metadata_authority,
    ) == PagedResult(
        next_page_token=None,
        results=expected_metadata,
    )
Example #26
    ),
]

METADATA_FETCHERS = [
    MetadataFetcher(
        name="test-fetcher",
        version="1.0.0",
        metadata={},
    )
]

RAW_EXTRINSIC_METADATA = [
    RawExtrinsicMetadata(
        target=Origin("http://example.org/foo.git").swhid(),
        discovery_date=datetime.datetime(2020, 7, 30, 17, 8, 20, tzinfo=UTC),
        authority=attr.evolve(METADATA_AUTHORITIES[0], metadata=None),
        fetcher=attr.evolve(METADATA_FETCHERS[0], metadata=None),
        format="json",
        metadata=b'{"foo": "bar"}',
    ),
    RawExtrinsicMetadata(
        target=ExtendedSWHID.from_string(str(CONTENTS[0].swhid())),
        discovery_date=datetime.datetime(2020, 7, 30, 17, 8, 20, tzinfo=UTC),
        authority=attr.evolve(METADATA_AUTHORITIES[0], metadata=None),
        fetcher=attr.evolve(METADATA_FETCHERS[0], metadata=None),
        format="json",
        metadata=b'{"foo": "bar"}',
    ),
]

TEST_OBJECTS: Dict[str, Sequence[BaseModel]] = {
    "content": CONTENTS,
Example #27
class StorageData:
    """Data model objects to use within tests."""

    content = Content(
        data=b"42\n",
        length=3,
        sha1=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"),
        sha1_git=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
        sha256=hash_to_bytes(
            "084c799cd551dd1d8d5c5f9a5d593b2e931f5e36122ee5c793c1d08a19839cc0"
        ),
        blake2s256=hash_to_bytes(
            "d5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d"
        ),
        status="visible",
    )
    content2 = Content(
        data=b"4242\n",
        length=5,
        sha1=hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7"),
        sha1_git=hash_to_bytes("36fade77193cb6d2bd826161a0979d64c28ab4fa"),
        sha256=hash_to_bytes(
            "859f0b154fdb2d630f45e1ecae4a862915435e663248bb8461d914696fc047cd"
        ),
        blake2s256=hash_to_bytes(
            "849c20fad132b7c2d62c15de310adfe87be94a379941bed295e8141c6219810d"
        ),
        status="visible",
    )
    content3 = Content(
        data=b"424242\n",
        length=7,
        sha1=hash_to_bytes("3e21cc4942a4234c9e5edd8a9cacd1670fe59f13"),
        sha1_git=hash_to_bytes("c932c7649c6dfa4b82327d121215116909eb3bea"),
        sha256=hash_to_bytes(
            "92fb72daf8c6818288a35137b72155f507e5de8d892712ab96277aaed8cf8a36"
        ),
        blake2s256=hash_to_bytes(
            "76d0346f44e5a27f6bafdd9c2befd304aff83780f93121d801ab6a1d4769db11"
        ),
        status="visible",
        ctime=datetime.datetime(2019, 12, 1, tzinfo=datetime.timezone.utc),
    )
    contents: Tuple[Content, ...] = (content, content2, content3)

    skipped_content = SkippedContent(
        length=1024 * 1024 * 200,
        sha1_git=hash_to_bytes("33e45d56f88993aae6a0198013efa80716fd8920"),
        sha1=hash_to_bytes("43e45d56f88993aae6a0198013efa80716fd8920"),
        sha256=hash_to_bytes(
            "7bbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a"
        ),
        blake2s256=hash_to_bytes(
            "ade18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b"
        ),
        reason="Content too long",
        status="absent",
        origin="file:///dev/zero",
    )
    skipped_content2 = SkippedContent(
        length=1024 * 1024 * 300,
        sha1_git=hash_to_bytes("44e45d56f88993aae6a0198013efa80716fd8921"),
        sha1=hash_to_bytes("54e45d56f88993aae6a0198013efa80716fd8920"),
        sha256=hash_to_bytes(
            "8cbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a"
        ),
        blake2s256=hash_to_bytes(
            "9ce18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b"
        ),
        reason="Content too long",
        status="absent",
    )
    skipped_contents: Tuple[SkippedContent,
                            ...] = (skipped_content, skipped_content2)

    directory5 = Directory(
        id=hash_to_bytes("4b825dc642cb6eb9a060e54bf8d69288fbee4904"),
        entries=(),
    )
    directory = Directory(
        id=hash_to_bytes("5256e856a0a0898966d6ba14feb4388b8b82d302"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=content.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"bar\xc3",
                type="dir",
                target=directory5.id,
                perms=from_disk.DentryPerms.directory,
            ),
        ], ),
    )
    directory2 = Directory(
        id=hash_to_bytes("8505808532953da7d2581741f01b29c04b1cb9ab"),
        entries=tuple([
            DirectoryEntry(
                name=b"oof",
                type="file",
                target=content2.sha1_git,
                perms=from_disk.DentryPerms.content,
            )
        ], ),
    )
    directory3 = Directory(
        id=hash_to_bytes("13089e6e544f78df7c9a40a3059050d10dee686a"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=content.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"subdir",
                type="dir",
                target=directory.id,
                perms=from_disk.DentryPerms.directory,
            ),
            DirectoryEntry(
                name=b"hello",
                type="file",
                target=content2.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
        ], ),
    )
    directory4 = Directory(
        id=hash_to_bytes("cd5dfd9c09d9e99ed123bc7937a0d5fddc3cd531"),
        entries=tuple([
            DirectoryEntry(
                name=b"subdir1",
                type="dir",
                target=directory3.id,
                perms=from_disk.DentryPerms.directory,
            )
        ], ),
    )

    directory6 = Directory(
        id=hash_to_bytes("afa0105cfcaa14fdbacee344e96659170bb1bda5"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=b"\x00" * 20,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"bar",
                type="dir",
                target=b"\x01" * 20,
                perms=from_disk.DentryPerms.directory,
            ),
        ], ),
        raw_manifest=(
            b"tree 61\x00"
            b"100644 foo\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"  # noqa
            b"40000 bar\x00\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"  # noqa
        ),
    )

    directories: Tuple[Directory, ...] = (
        directory2,
        directory,
        directory3,
        directory4,
        directory5,
        directory6,
    )

    revision = Revision(
        id=hash_to_bytes("01a7114f36fddd5ef2511b2cadda237a68adbb12"),
        message=b"hello",
        author=Person(
            name=b"Nicolas Dandrimont",
            email=b"*****@*****.**",
            fullname=b"Nicolas Dandrimont <*****@*****.**> ",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0200",
        ),
        committer=Person(
            name=b"St\xc3fano Zacchiroli",
            email=b"*****@*****.**",
            fullname=b"St\xc3fano Zacchiroli <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1123456789, microseconds=0),
            offset_bytes=b"+0200",
        ),
        parents=(),
        type=RevisionType.GIT,
        directory=directory.id,
        metadata={
            "checksums": {
                "sha1": "tarball-sha1",
                "sha256": "tarball-sha256",
            },
            "signed-off-by": "some-dude",
        },
        extra_headers=(
            (b"gpgsig", b"test123"),
            (b"mergetag", b"foo\\bar"),
            (b"mergetag", b"\x22\xaf\x89\x80\x01\x00"),
        ),
        synthetic=True,
    )
    revision2 = Revision(
        id=hash_to_bytes("a646dd94c912829659b22a1e7e143d2fa5ebde1b"),
        message=b"hello again",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1123456789,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([revision.id]),
        type=RevisionType.GIT,
        directory=directory2.id,
        metadata=None,
        extra_headers=(),
        synthetic=False,
    )
    revision3 = Revision(
        id=hash_to_bytes("beb2844dff30658e27573cb46eb55980e974b391"),
        message=b"a simple revision with no parents this time",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1127351742,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([revision.id, revision2.id]),
        type=RevisionType.GIT,
        directory=directory2.id,
        metadata=None,
        extra_headers=(),
        synthetic=True,
    )
    revision4 = Revision(
        id=hash_to_bytes("ae860aec43700c7f5a295e2ef47e2ae41b535dfe"),
        message=b"parent of self.revision2",
        author=Person(
            name=b"me",
            email=b"*****@*****.**",
            fullname=b"me <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"committer-dude",
            email=b"*****@*****.**",
            fullname=b"committer-dude <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1244567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        parents=tuple([revision3.id]),
        type=RevisionType.GIT,
        directory=directory.id,
        metadata=None,
        extra_headers=(),
        synthetic=False,
    )
    git_revisions: Tuple[Revision,
                         ...] = (revision, revision2, revision3, revision4)

    hg_revision = Revision(
        id=hash_to_bytes("951c9503541e7beaf002d7aebf2abd1629084c68"),
        message=b"hello",
        author=Person(
            name=b"Nicolas Dandrimont",
            email=b"*****@*****.**",
            fullname=b"Nicolas Dandrimont <*****@*****.**> ",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0200",
        ),
        committer=Person(
            name=b"St\xc3fano Zacchiroli",
            email=b"*****@*****.**",
            fullname=b"St\xc3fano Zacchiroli <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1123456789, microseconds=0),
            offset_bytes=b"+0200",
        ),
        parents=(),
        type=RevisionType.MERCURIAL,
        directory=directory.id,
        metadata={
            "checksums": {
                "sha1": "tarball-sha1",
                "sha256": "tarball-sha256",
            },
            "signed-off-by": "some-dude",
            "node": "a316dfb434af2b451c1f393496b7eaeda343f543",
        },
        extra_headers=(),
        synthetic=True,
    )
    hg_revision2 = Revision(
        id=hash_to_bytes("df4afb063236300eb13b96a0d7fff03f7b7cbbaf"),
        message=b"hello again",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1123456789,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([hg_revision.id]),
        type=RevisionType.MERCURIAL,
        directory=directory2.id,
        metadata=None,
        extra_headers=(
            (b"node",
             hash_to_bytes("fa1b7c84a9b40605b67653700f268349a6d6aca1")), ),
        synthetic=False,
    )
    hg_revision3 = Revision(
        id=hash_to_bytes("84d8e7081b47ebb88cad9fa1f25de5f330872a37"),
        message=b"a simple revision with no parents this time",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1127351742,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([hg_revision.id, hg_revision2.id]),
        type=RevisionType.MERCURIAL,
        directory=directory2.id,
        metadata=None,
        extra_headers=(
            (b"node",
             hash_to_bytes("7f294a01c49065a90b3fe8b4ad49f08ce9656ef6")), ),
        synthetic=True,
    )
    hg_revision4 = Revision(
        id=hash_to_bytes("4683324ba26dfe941a72cc7552e86eaaf7c27fe3"),
        message=b"parent of self.revision2",
        author=Person(
            name=b"me",
            email=b"*****@*****.**",
            fullname=b"me <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"committer-dude",
            email=b"*****@*****.**",
            fullname=b"committer-dude <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1244567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        parents=tuple([hg_revision3.id]),
        type=RevisionType.MERCURIAL,
        directory=directory.id,
        metadata=None,
        extra_headers=(
            (b"node",
             hash_to_bytes("f4160af0485c85823d9e829bae2c00b00a2e6297")), ),
        synthetic=False,
    )
    hg_revisions: Tuple[Revision, ...] = (
        hg_revision,
        hg_revision2,
        hg_revision3,
        hg_revision4,
    )
    revisions: Tuple[Revision, ...] = git_revisions + hg_revisions

    origins: Tuple[Origin, ...] = (
        Origin(url="https://github.com/user1/repo1"),
        Origin(url="https://github.com/user2/repo1"),
        Origin(url="https://github.com/user3/repo1"),
        Origin(url="https://gitlab.com/user1/repo1"),
        Origin(url="https://gitlab.com/user2/repo1"),
        Origin(url="https://forge.softwareheritage.org/source/repo1"),
        Origin(url="https://example.рф/🏛️.txt"),
    )
    origin, origin2 = origins[:2]

    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url="http://hal.inria.example.com/",
    )
    metadata_authority2 = MetadataAuthority(
        type=MetadataAuthorityType.REGISTRY,
        url="http://wikidata.example.com/",
    )
    authorities: Tuple[MetadataAuthority, ...] = (
        metadata_authority,
        metadata_authority2,
    )

    metadata_fetcher = MetadataFetcher(
        name="swh-deposit",
        version="0.0.1",
    )
    metadata_fetcher2 = MetadataFetcher(
        name="swh-example",
        version="0.0.1",
    )
    fetchers: Tuple[MetadataFetcher,
                    ...] = (metadata_fetcher, metadata_fetcher2)

    date_visit1 = datetime.datetime(2015,
                                    1,
                                    1,
                                    23,
                                    0,
                                    0,
                                    tzinfo=datetime.timezone.utc)
    date_visit2 = datetime.datetime(2017,
                                    1,
                                    1,
                                    23,
                                    0,
                                    0,
                                    tzinfo=datetime.timezone.utc)
    date_visit3 = datetime.datetime(2018,
                                    1,
                                    1,
                                    23,
                                    0,
                                    0,
                                    tzinfo=datetime.timezone.utc)

    type_visit1 = "git"
    type_visit2 = "hg"
    type_visit3 = "deb"

    origin_visit = OriginVisit(
        origin=origin.url,
        visit=1,
        date=date_visit1,
        type=type_visit1,
    )
    origin_visit2 = OriginVisit(
        origin=origin.url,
        visit=2,
        date=date_visit2,
        type=type_visit1,
    )
    origin_visit3 = OriginVisit(
        origin=origin2.url,
        visit=1,
        date=date_visit1,
        type=type_visit2,
    )
    origin_visits: Tuple[OriginVisit, ...] = (
        origin_visit,
        origin_visit2,
        origin_visit3,
    )

    release = Release(
        id=hash_to_bytes("f7f222093a18ec60d781070abec4a630c850b837"),
        name=b"v0.0.1",
        author=Person(
            name=b"olasd",
            email=b"*****@*****.**",
            fullname=b"olasd <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0042",
        ),
        target=revision.id,
        target_type=ObjectType.REVISION,
        message=b"synthetic release",
        synthetic=True,
    )
    release2 = Release(
        id=hash_to_bytes("db81a26783a3f4a9db07b4759ffc37621f159bb2"),
        name=b"v0.0.2",
        author=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1634366813, microseconds=0),
            offset_bytes=b"-0200",
        ),
        target=revision2.id,
        target_type=ObjectType.REVISION,
        message=b"v0.0.2\nMisc performance improvements + bug fixes",
        synthetic=False,
    )
    release3 = Release(
        id=hash_to_bytes("1c5d42e603ce2eea44917fadca76c78bad76aeb9"),
        name=b"v0.0.2",
        author=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1634366813, microseconds=0),
            offset_bytes=b"-0200",
        ),
        target=revision3.id,
        target_type=ObjectType.REVISION,
        message=b"yet another synthetic release",
        synthetic=True,
    )

    releases: Tuple[Release, ...] = (release, release2, release3)

    snapshot = Snapshot(
        id=hash_to_bytes("9b922e6d8d5b803c1582aabe5525b7b91150788e"),
        branches={
            b"master":
            SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
        },
    )
    empty_snapshot = Snapshot(
        id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"),
        branches={},
    )
    complete_snapshot = Snapshot(
        id=hash_to_bytes("db99fda25b43dc5cd90625ee4b0744751799c917"),
        branches={
            b"directory":
            SnapshotBranch(
                target=directory.id,
                target_type=TargetType.DIRECTORY,
            ),
            b"directory2":
            SnapshotBranch(
                target=directory2.id,
                target_type=TargetType.DIRECTORY,
            ),
            b"content":
            SnapshotBranch(
                target=content.sha1_git,
                target_type=TargetType.CONTENT,
            ),
            b"alias":
            SnapshotBranch(
                target=b"revision",
                target_type=TargetType.ALIAS,
            ),
            b"revision":
            SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
            b"release":
            SnapshotBranch(
                target=release.id,
                target_type=TargetType.RELEASE,
            ),
            b"snapshot":
            SnapshotBranch(
                target=empty_snapshot.id,
                target_type=TargetType.SNAPSHOT,
            ),
            b"dangling":
            None,
        },
    )

    snapshots: Tuple[Snapshot,
                     ...] = (snapshot, empty_snapshot, complete_snapshot)

    content_metadata1 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        origin=origin.url,
        discovery_date=datetime.datetime(2015,
                                         1,
                                         1,
                                         21,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="json",
        metadata=b'{"foo": "bar"}',
    )
    content_metadata2 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        origin=origin2.url,
        discovery_date=datetime.datetime(2017,
                                         1,
                                         1,
                                         22,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="yaml",
        metadata=b"foo: bar",
    )
    content_metadata3 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        discovery_date=datetime.datetime(2017,
                                         1,
                                         1,
                                         22,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority2, metadata=None),
        fetcher=attr.evolve(metadata_fetcher2, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
        origin=origin.url,
        visit=42,
        snapshot=snapshot.swhid(),
        release=release.swhid(),
        revision=revision.swhid(),
        directory=directory.swhid(),
        path=b"/foo/bar",
    )

    content_metadata: Tuple[RawExtrinsicMetadata, ...] = (
        content_metadata1,
        content_metadata2,
        content_metadata3,
    )

    origin_metadata1 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(2015,
                                         1,
                                         1,
                                         21,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority, metadata=None),
        fetcher=attr.evolve(metadata_fetcher, metadata=None),
        format="json",
        metadata=b'{"foo": "bar"}',
    )
    origin_metadata2 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(2017,
                                         1,
                                         1,
                                         22,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority, metadata=None),
        fetcher=attr.evolve(metadata_fetcher, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
    )
    origin_metadata3 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(2017,
                                         1,
                                         1,
                                         22,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority2, metadata=None),
        fetcher=attr.evolve(metadata_fetcher2, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
    )

    origin_metadata: Tuple[RawExtrinsicMetadata, ...] = (
        origin_metadata1,
        origin_metadata2,
        origin_metadata3,
    )

    extid1 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.REVISION,
                         object_id=revision.id),
        extid_type="git",
        extid=revision.id,
    )

    extid2 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.REVISION,
                         object_id=hg_revision.id),
        extid_type="mercurial",
        extid=hash_to_bytes("a316dfb434af2b451c1f393496b7eaeda343f543"),
    )

    extid3 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.DIRECTORY,
                         object_id=directory.id),
        extid_type="directory",
        extid=b"something",
    )
    extid4 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.DIRECTORY,
                         object_id=directory2.id),
        extid_type="directory",
        extid=b"something",
        extid_version=2,
    )

    extids: Tuple[ExtID, ...] = (
        extid1,
        extid2,
        extid3,
        extid4,
    )
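
# Hypothetical usage sketch (not part of the original module): how a test
# might consume this fixture class, assuming a `swh_storage` fixture that
# exposes the standard swh.storage interface.
def test_fixture_roundtrip(swh_storage):
    swh_storage.content_add(list(StorageData.contents))
    swh_storage.directory_add(list(StorageData.directories))
    # All directories were just inserted, so none should be reported missing.
    assert list(
        swh_storage.directory_missing([d.id for d in StorageData.directories])
    ) == []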
Example #28
0
def test_pypi_release_metadata_structure(
    swh_storage, requests_mock_datadir, _0805nexter_api_info
):
    url = "https://pypi.org/project/0805nexter"
    loader = PyPILoader(swh_storage, url)

    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"
    assert actual_load_status["snapshot_id"] is not None

    expected_release_id = hash_to_bytes("fbbcb817f01111b06442cdcc93140ab3cc777d68")

    expected_snapshot = Snapshot(
        id=hash_to_bytes(actual_load_status["snapshot_id"]),
        branches={
            b"HEAD": SnapshotBranch(
                target=b"releases/1.2.0",
                target_type=TargetType.ALIAS,
            ),
            b"releases/1.1.0": SnapshotBranch(
                target=hash_to_bytes("f8789ff3ed70a5f570c35d885c7bcfda7b23b091"),
                target_type=TargetType.RELEASE,
            ),
            b"releases/1.2.0": SnapshotBranch(
                target=expected_release_id,
                target_type=TargetType.RELEASE,
            ),
        },
    )

    assert_last_visit_matches(
        swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot.id
    )

    check_snapshot(expected_snapshot, swh_storage)

    release = swh_storage.release_get([expected_release_id])[0]
    assert release is not None

    release_swhid = CoreSWHID(
        object_type=ObjectType.RELEASE, object_id=expected_release_id
    )
    directory_swhid = ExtendedSWHID(
        object_type=ExtendedObjectType.DIRECTORY, object_id=release.target
    )
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url="https://pypi.org/",
    )
    expected_metadata = [
        RawExtrinsicMetadata(
            target=directory_swhid,
            authority=metadata_authority,
            fetcher=MetadataFetcher(
                name="swh.loader.package.pypi.loader.PyPILoader",
                version=__version__,
            ),
            discovery_date=loader.visit_date,
            format="pypi-project-json",
            metadata=json.dumps(
                json.loads(_0805nexter_api_info)["releases"]["1.2.0"][0]
            ).encode(),
            origin=url,
            release=release_swhid,
        )
    ]
    assert swh_storage.raw_extrinsic_metadata_get(
        directory_swhid,
        metadata_authority,
    ) == PagedResult(
        next_page_token=None,
        results=expected_metadata,
    )
Example #29
0
def test_npm_loader_first_visit(swh_storage, requests_mock_datadir, org_api_info):
    package = "org"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("0996ca28d6280499abcf485b51c4e3941b057249")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
    )

    release_id = "d38cc0b571cd41f3c85513864e049766b42032a7"
    versions = [
        ("0.0.2", release_id),
        ("0.0.3", "62bf7076bae9aa2cb4d6cb3bf7ce0ea4fdd5b295"),
        ("0.0.4", "6e976db82f6c310596b21fb0ed8b11f507631434"),
    ]

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"HEAD": SnapshotBranch(
                target=b"releases/0.0.4", target_type=TargetType.ALIAS
            ),
            **{
                b"releases/"
                + version_name.encode(): SnapshotBranch(
                    target=hash_to_bytes(version_id),
                    target_type=TargetType.RELEASE,
                )
                for (version_name, version_id) in versions
            },
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    assert swh_storage.release_get([hash_to_bytes(release_id)])[0] == Release(
        name=b"0.0.2",
        message=b"Synthetic release for NPM source package org version 0.0.2\n",
        target=hash_to_bytes("42753c0c2ab00c4501b552ac4671c68f3cf5aece"),
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
        author=Person(
            fullname=b"mooz <*****@*****.**>",
            name=b"mooz",
            email=b"*****@*****.**",
        ),
        date=TimestampWithTimezone.from_datetime(
            datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc)
        ),
        id=hash_to_bytes(release_id),
    )

    contents = swh_storage.content_get(_expected_new_contents_first_visit)
    count = sum(0 if content is None else 1 for content in contents)
    assert count == len(_expected_new_contents_first_visit)

    assert (
        list(swh_storage.directory_missing(_expected_new_directories_first_visit)) == []
    )

    assert list(swh_storage.release_missing(_expected_new_releases_first_visit)) == []

    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url="https://npmjs.com/",
    )

    for (version_name, release_id) in versions:
        release = swh_storage.release_get([hash_to_bytes(release_id)])[0]
        assert release.target_type == ModelObjectType.DIRECTORY
        directory_id = release.target
        directory_swhid = ExtendedSWHID(
            object_type=ExtendedObjectType.DIRECTORY,
            object_id=directory_id,
        )
        release_swhid = CoreSWHID(
            object_type=ObjectType.RELEASE,
            object_id=hash_to_bytes(release_id),
        )
        expected_metadata = [
            RawExtrinsicMetadata(
                target=directory_swhid,
                authority=metadata_authority,
                fetcher=MetadataFetcher(
                    name="swh.loader.package.npm.loader.NpmLoader",
                    version=__version__,
                ),
                discovery_date=loader.visit_date,
                format="replicate-npm-package-json",
                metadata=json.dumps(
                    json.loads(org_api_info)["versions"][version_name]
                ).encode(),
                origin="https://www.npmjs.com/package/org",
                release=release_swhid,
            )
        ]
        assert swh_storage.raw_extrinsic_metadata_get(
            directory_swhid,
            metadata_authority,
        ) == PagedResult(
            next_page_token=None,
            results=expected_metadata,
        )

    stats = get_stats(swh_storage)

    assert {
        "content": len(_expected_new_contents_first_visit),
        "directory": len(_expected_new_directories_first_visit),
        "origin": 1,
        "origin_visit": 1,
        "release": len(_expected_new_releases_first_visit),
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats


def test_loader_one_visit(swh_storage, requests_mock_datadir, raw_sources):
    loader = NixGuixLoader(swh_storage, sources_url)
    load_status = loader.load()
    expected_snapshot_id = SNAPSHOT1.id
    expected_snapshot_id_hex = expected_snapshot_id.hex()
    assert load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id_hex,
    }

    release_id = SNAPSHOT1.branches[
        b"https://github.com/owner-1/repository-1/revision-1.tgz"].target
    check_snapshot(SNAPSHOT1, storage=swh_storage)

    assert swh_storage.release_get([release_id])[0] == Release(
        id=release_id,
        name=b"https://github.com/owner-1/repository-1/revision-1.tgz",
        message=None,
        target=hash_to_bytes("4de2e07d3742718d928e974b8a4c721b9f7b33bf"),
        target_type=ObjectType.DIRECTORY,
        synthetic=True,
        author=Person.from_fullname(b""),
        date=None,
    )

    stats = get_stats(swh_storage)
    assert {
        "content": 1,
        "directory": 3,
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    # The visit is partial because URLs pointing to non-tarball files
    # are not handled yet
    assert_last_visit_matches(swh_storage,
                              sources_url,
                              status="partial",
                              type="nixguix")

    visit_status = origin_get_latest_visit_status(swh_storage, sources_url)
    snapshot_swhid = ExtendedSWHID(object_type=ExtendedObjectType.SNAPSHOT,
                                   object_id=visit_status.snapshot)
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url=sources_url,
    )
    expected_metadata = [
        RawExtrinsicMetadata(
            target=snapshot_swhid,
            authority=metadata_authority,
            fetcher=MetadataFetcher(
                name="swh.loader.package.nixguix.loader.NixGuixLoader",
                version=__version__,
            ),
            discovery_date=loader.visit_date,
            format="nixguix-sources-json",
            metadata=raw_sources,
            origin=sources_url,
        )
    ]
    assert swh_storage.raw_extrinsic_metadata_get(
        snapshot_swhid,
        metadata_authority,
    ) == PagedResult(
        next_page_token=None,
        results=expected_metadata,
    )