Example #1
def test_iter_origins(swh_storage):
    origins = [
        Origin(url="bar"),
        Origin(url="qux"),
        Origin(url="quuz"),
    ]
    assert swh_storage.origin_add(origins) == {"origin:add": 3}

    # all of these calls return the same origins; only the number of page requests differs
    assert list(iter_origins(swh_storage)) == origins
    assert list(iter_origins(swh_storage, limit=1)) == origins
    assert list(iter_origins(swh_storage, limit=2)) == origins
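
A minimal sketch of how such an iterator can be built on top of the paginated origin_list endpoint (assuming a PagedResult with results and next_page_token, as exposed by swh.storage; the helper name iter_origins_sketch is illustrative):

from typing import Iterator, Optional

from swh.model.model import Origin


def iter_origins_sketch(storage, limit: int = 10000) -> Iterator[Origin]:
    """Yield every origin by following pagination tokens until exhausted."""
    page_token: Optional[str] = None
    while True:
        page = storage.origin_list(page_token=page_token, limit=limit)
        yield from page.results
        page_token = page.next_page_token
        if page_token is None:
            break
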
Example #2
def db_to_raw_extrinsic_metadata(row) -> RawExtrinsicMetadata:
    target = row["raw_extrinsic_metadata.target"]
    if not target.startswith("swh:1:"):
        warnings.warn("Fetching raw_extrinsic_metadata row with URL target",
                      DeprecationWarning)
        target = str(Origin(url=target).swhid())

    return RawExtrinsicMetadata(
        target=ExtendedSWHID.from_string(target),
        authority=MetadataAuthority(
            type=MetadataAuthorityType(row["metadata_authority.type"]),
            url=row["metadata_authority.url"],
        ),
        fetcher=MetadataFetcher(
            name=row["metadata_fetcher.name"],
            version=row["metadata_fetcher.version"],
        ),
        discovery_date=row["discovery_date"],
        format=row["format"],
        metadata=row["raw_extrinsic_metadata.metadata"],
        origin=row["origin"],
        visit=row["visit"],
        snapshot=map_optional(CoreSWHID.from_string, row["snapshot"]),
        release=map_optional(CoreSWHID.from_string, row["release"]),
        revision=map_optional(CoreSWHID.from_string, row["revision"]),
        path=row["path"],
        directory=map_optional(CoreSWHID.from_string, row["directory"]),
    )
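
The converter relies on a small map_optional helper for the nullable SWHID columns; its assumed behavior is simply "apply the function unless the value is None":

from typing import Callable, Optional, TypeVar

T = TypeVar("T")
U = TypeVar("U")


def map_optional(f: Callable[[T], U], x: Optional[T]) -> Optional[U]:
    # Assumed semantics: None stays None, anything else goes through f.
    return f(x) if x is not None else None
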
Example #3
    def init(self, swh_storage, datadir, tmp_path, mocker):
        archive_name = "testrepo"
        archive_path = os.path.join(datadir, f"{archive_name}.tgz")
        tmp_path = str(tmp_path)
        self.repo_url = prepare_repository_from_archive(archive_path,
                                                        archive_name,
                                                        tmp_path=tmp_path)
        self.destination_path = os.path.join(tmp_path, archive_name)

        self.fetcher = MagicMock()
        self.fetcher.get_origin_metadata.return_value = []
        self.fetcher.get_parent_origins.return_value = [
            Origin(url=f"base://{self.repo_url}")
        ]
        self.fetcher_cls = MagicMock(return_value=self.fetcher)
        self.fetcher_cls.SUPPORTED_LISTERS = ["fake-lister"]
        mocker.patch(
            "swh.loader.core.metadata_fetchers._fetchers",
            return_value=[self.fetcher_cls],
        )

        self.loader = GitLoader(
            MagicMock(wraps=swh_storage),
            self.repo_url,
            lister_name="fake-lister",
            lister_instance_name="",
        )
        self.repo = dulwich.repo.Repo(self.destination_path)
Example #4
def test_gen_origins_max():
    nmax = len(ORIGINS)
    origins = gen_origins(nmax + 1)
    assert len(origins) == nmax
    models = {Origin.from_dict(d).url for d in origins}
    # ensure we did not generate the same origin twice
    assert len(origins) == len(models)
Example #5
def _add_origin(storage,
                search,
                origin_url,
                visit_type="git",
                snapshot_branches={}):
    storage.origin_add([Origin(url=origin_url)])
    search.origin_update([{
        "url": origin_url,
        "has_visits": True,
        "visit_types": [visit_type]
    }])
    date = now()
    visit = OriginVisit(origin=origin_url, date=date, type=visit_type)
    visit = storage.origin_visit_add([visit])[0]
    snapshot = Snapshot.from_dict({"branches": snapshot_branches})
    storage.snapshot_add([snapshot])
    visit_status = OriginVisitStatus(
        origin=origin_url,
        visit=visit.visit,
        date=date + timedelta(minutes=1),
        type=visit.type,
        status="full",
        snapshot=snapshot.id,
    )
    storage.origin_visit_status_add([visit_status])
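
A hedged usage sketch of the helper above; the URL, branch name, and target bytes are illustrative. It registers the origin in both the storage and the search backend, then records a single "full" visit pointing at the given snapshot:

_add_origin(
    storage,
    search,
    "https://example.org/repo.git",
    visit_type="git",
    snapshot_branches={
        b"refs/heads/master": {
            "target_type": "revision",
            "target": b"\x01" * 20,  # placeholder 20-byte revision id
        }
    },
)
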
Example #6
def test_content_id_optional_parts_browse(client, archive_data, content):
    cnt_sha1_git = content["sha1_git"]
    origin_url = "https://github.com/user/repo"

    archive_data.origin_add([Origin(url=origin_url)])

    swhid = gen_swhid(
        CONTENT,
        cnt_sha1_git,
        metadata={
            "lines": "4-20",
            "origin": origin_url
        },
    )
    url = reverse("browse-swhid", url_args={"swhid": swhid})

    query_string = "sha1_git:" + cnt_sha1_git
    content_browse_url = reverse(
        "browse-content",
        url_args={"query_string": query_string},
        query_params={"origin_url": origin_url},
    )
    content_browse_url += "#L4-L20"

    resp = check_html_get_response(client, url, status_code=302)
    assert resp["location"] == content_browse_url
Example #7
def _insert_objects(object_type: str, objects: List[BaseModel],
                    storage: StorageInterface) -> None:
    """Insert objects of type object_type in the storage."""
    if object_type not in OBJECT_CONVERTERS:
        logger.warning("Received a series of %s, this should not happen",
                       object_type)
        return

    method = getattr(storage, f"{object_type}_add")
    if object_type == "skipped_content":
        method = partial(collision_aware_content_add, content_add_fn=method)
    elif object_type == "content":
        method = partial(collision_aware_content_add,
                         content_add_fn=storage.content_add_metadata)
    elif object_type in ("origin_visit", "origin_visit_status"):
        origins: List[Origin] = []
        for obj in cast(List[Union[OriginVisit, OriginVisitStatus]], objects):
            origins.append(Origin(url=obj.origin))
        storage.origin_add(origins)
    elif object_type == "raw_extrinsic_metadata":
        emds = cast(List[RawExtrinsicMetadata], objects)
        authorities = {emd.authority for emd in emds}
        fetchers = {emd.fetcher for emd in emds}
        storage.metadata_authority_add(list(authorities))
        storage.metadata_fetcher_add(list(fetchers))
    method(objects)
Example #8
    def __init__(
        self,
        storage: StorageInterface,
        origin_url: str,
        logging_class: Optional[str] = None,
        save_data_path: Optional[str] = None,
        max_content_size: Optional[int] = None,
        lister_name: Optional[str] = None,
        lister_instance_name: Optional[str] = None,
        metadata_fetcher_credentials: CredentialsType = None,
    ):
        if lister_name == "":
            raise ValueError("lister_name must not be the empty string")
        if lister_name is None and lister_instance_name is not None:
            raise ValueError(
                f"lister_name is None but lister_instance_name is {lister_instance_name!r}"
            )
        if lister_name is not None and lister_instance_name is None:
            raise ValueError(
                f"lister_instance_name is None but lister_name is {lister_name!r}"
            )

        self.storage = storage
        self.origin = Origin(url=origin_url)
        self.max_content_size = int(
            max_content_size) if max_content_size else None
        self.lister_name = lister_name
        self.lister_instance_name = lister_instance_name
        self.metadata_fetcher_credentials = metadata_fetcher_credentials or {}

        if logging_class is None:
            logging_class = "%s.%s" % (
                self.__class__.__module__,
                self.__class__.__name__,
            )
        self.log = logging.getLogger(logging_class)

        _log = logging.getLogger("requests.packages.urllib3.connectionpool")
        _log.setLevel(logging.WARN)

        # possibly overridden in self.prepare method
        self.visit_date = datetime.datetime.now(tz=datetime.timezone.utc)

        self.loaded_snapshot_id = None

        if save_data_path:
            path = save_data_path
            os.stat(path)
            if not os.access(path, os.R_OK | os.W_OK):
                raise PermissionError("Permission denied: %r" % path)

        self.save_data_path = save_data_path

        self.parent_origins = None

        self.statsd = Statsd(namespace="swh_loader",
                             constant_tags={"visit_type": self.visit_type})
Example #9
def test_resolve_swhid_with_malformed_origin_url(archive_data, directory):
    origin_url = "http://example.org/project/abc"
    malformed_origin_url = "http:/example.org/project/abc"
    archive_data.origin_add([Origin(url=origin_url)])
    swhid = gen_swhid(DIRECTORY,
                      directory,
                      metadata={"origin": malformed_origin_url})
    resolved_swhid = resolve_swhid(swhid)
    assert origin_url in resolved_swhid["browse_url"]
Example #10
    def test_deposit(self):
        origin_url = "https://forge.softwareheritage.org/source/jesuisgpl/"
        self.indexer.storage.origin_add([Origin(url=origin_url)])
        self.indexer.run([origin_url])
        rev_id = b"\xe7n\xa4\x9c\x9f\xfb\xb7\xf76\x11\x08{\xa6\xe9\x99\xb1\x9e]q\xeb"
        self.assertEqual(
            self.indexer.results,
            [{
                "revision_id": rev_id,
                "origin_url": origin_url,
            }],
        )
Example #11
def test_browse_swhid_special_characters_escaping(client, archive_data,
                                                  directory):
    origin = "http://example.org/?project=abc;"
    archive_data.origin_add([Origin(url=origin)])
    origin_swhid_escaped = quote(origin, safe="/?:@&")
    origin_swhid_url_escaped = quote(origin, safe="/:@;")
    swhid = gen_swhid(DIRECTORY,
                      directory,
                      metadata={"origin": origin_swhid_escaped})
    url = reverse("browse-swhid", url_args={"swhid": swhid})

    resp = check_html_get_response(client, url, status_code=302)
    assert origin_swhid_url_escaped in resp["location"]
Example #12
def test_pypi_origin_from_project_name(mocker):
    origin_url = "https://pypi.org/project/ProjectName/"

    storage = get_storage("memory")

    revision_id = b"41" * 10
    snapshot_id = b"42" * 10
    storage.origin_add([Origin(url=origin_url)])
    storage.origin_visit_add(
        [OriginVisit(origin=origin_url, visit=1, date=now(), type="pypi")])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin_url,
            visit=1,
            date=now(),
            status="partial",
            snapshot=snapshot_id,
        )
    ])
    storage.snapshot_add([
        Snapshot(
            id=snapshot_id,
            branches={
                b"foo":
                SnapshotBranch(
                    target_type=TargetType.REVISION,
                    target=revision_id,
                )
            },
        )
    ])

    class response:
        code = 200

        def read(self):
            return b'{"info": {"name": "ProjectName"}}'

    mock_urlopen = mocker.patch(
        "swh.storage.migrate_extrinsic_metadata.urlopen",
        return_value=response(),
    )

    assert (pypi_origin_from_filename(
        storage, revision_id, "ProjectName-1.0.0.tar.gz") == origin_url)
    mock_urlopen.assert_not_called()
    assert (pypi_origin_from_filename(
        storage, revision_id, "projectname-1.0.0.tar.gz") == origin_url)
    mock_urlopen.assert_called_once_with(
        "https://pypi.org/pypi/projectname/json/")
Example #13
def test_loader_save_data_path(swh_storage, tmp_path):
    loader = DummyBaseLoader(swh_storage,
                             "some.logger.name.1",
                             save_data_path=tmp_path)
    url = "http://bitbucket.org/something"
    loader.origin = Origin(url=url)
    loader.visit_date = datetime.datetime(year=2019, month=10, day=1)

    hash_url = hashlib.sha1(url.encode("utf-8")).hexdigest()
    expected_save_path = "%s/sha1:%s/%s/2019" % (str(tmp_path), hash_url[0:2],
                                                 hash_url)

    save_path = loader.get_save_data_path()
    assert save_path == expected_save_path
Example #14
def test_origin_metadata_indexer_missing_head(
    swh_indexer_config,
    idx_storage: IndexerStorageInterface,
    storage: StorageInterface,
    obj_storage,
) -> None:
    storage.origin_add([Origin(url="https://example.com")])

    indexer = OriginMetadataIndexer(config=swh_indexer_config)
    indexer.run(["https://example.com"])

    origin = "https://example.com"

    results = list(indexer.idx_storage.origin_intrinsic_metadata_get([origin]))
    assert results == []
Example #15
    def test_no_previous_snapshot(self, mocker):
        statsd_report = mocker.patch.object(self.loader.statsd, "_report")
        res = self.loader.load()
        assert res == {"status": "eventful"}

        self.fetcher_cls.assert_called_once_with(
            credentials={},
            lister_name="fake-lister",
            lister_instance_name="",
            origin=Origin(url=self.repo_url),
        )
        self.fetcher.get_parent_origins.assert_called_once_with()

        # First tries the same origin
        assert self.loader.storage.origin_visit_get_latest.mock_calls == [
            call(
                self.repo_url,
                allowed_statuses=None,
                require_snapshot=True,
                type=None,
            ),
            # As it does not already have a snapshot, fall back to the parent origin
            call(
                f"base://{self.repo_url}",
                allowed_statuses=None,
                require_snapshot=True,
                type=None,
            ),
        ]

        # TODO: assert "incremental" is added to constant tags before these
        # metrics are sent
        assert [
            c for c in statsd_report.mock_calls if c[1][0].startswith("git_")
        ] == [
            call("git_total", "c", 1, {}, 1),
            call("git_ignored_refs_percent", "h", 0.0, {}, 1),
            call("git_known_refs_percent", "h", 0.0, {}, 1),
        ]
        assert self.loader.statsd.constant_tags == {
            "visit_type": "git",
            "incremental_enabled": True,
            "has_parent_snapshot": False,
            "has_previous_snapshot": False,
            "has_parent_origins": True,
        }
Example #16
def _fix_raw_extrinsic_metadata(obj_dict: Dict) -> Dict:
    """Fix legacy RawExtrinsicMetadata with type which is no longer part of the model.

    >>> _fix_raw_extrinsic_metadata({
    ...     'type': 'directory',
    ...     'target': 'swh:1:dir:460a586d1c95d120811eaadb398d534e019b5243',
    ... })
    {'target': 'swh:1:dir:460a586d1c95d120811eaadb398d534e019b5243'}
    >>> _fix_raw_extrinsic_metadata({
    ...     'type': 'origin',
    ...     'target': 'https://inria.halpreprod.archives-ouvertes.fr/hal-01667309',
    ... })
    {'target': 'swh:1:ori:155291d5b9ada4570672510509f93fcfd9809882'}

    """
    o = obj_dict.copy()
    if o.pop("type", None) == "origin":
        o["target"] = str(Origin(o["target"]).swhid())
    return o
Example #17
def test_api_origin_search_limit(api_client, archive_data, tests_data, mocker,
                                 backend):
    if backend == "swh-search":
        tests_data["search"].origin_update([{
            "url": "http://foobar/{}".format(i)
        } for i in range(2000)])
    else:
        # equivalent to not configuring search in the config
        mocker.patch("swh.web.common.archive.search", None)

        archive_data.origin_add(
            [Origin(url="http://foobar/{}".format(i)) for i in range(2000)])

    url = reverse(
        "api-1-origin-search",
        url_args={"url_pattern": "foobar"},
        query_params={"limit": 1050},
    )
    rv = check_api_get_responses(api_client, url, status_code=200)
    assert len(rv.data) == 1000
Example #18
    def test_pypi_missing_branch(self):
        origin_url = "https://pypi.org/project/abcdef/"
        self.indexer.storage.origin_add([Origin(url=origin_url)])
        visit = self.indexer.storage.origin_visit_add([
            OriginVisit(
                origin=origin_url,
                date=datetime(2019, 2, 27, tzinfo=timezone.utc),
                type="pypi",
            )
        ])[0]
        self.indexer.storage.snapshot_add([SAMPLE_SNAPSHOT])
        visit_status = OriginVisitStatus(
            origin=origin_url,
            visit=visit.visit,
            date=now(),
            status="full",
            snapshot=SAMPLE_SNAPSHOT.id,
        )
        self.indexer.storage.origin_visit_status_add([visit_status])
        self.indexer.run(["https://pypi.org/project/abcdef/"])
        self.assertEqual(self.indexer.results, [])
Example #19
    def test_git_partial_snapshot(self):
        """Checks partial snapshots are ignored."""
        origin_url = "https://github.com/SoftwareHeritage/swh-core"
        self.indexer.storage.origin_add([Origin(url=origin_url)])
        visit = self.indexer.storage.origin_visit_add([
            OriginVisit(
                origin=origin_url,
                date=datetime(2019, 2, 27, tzinfo=timezone.utc),
                type="git",
            )
        ])[0]
        self.indexer.storage.snapshot_add([SAMPLE_SNAPSHOT])
        visit_status = OriginVisitStatus(
            origin=origin_url,
            visit=visit.visit,
            date=now(),
            status="partial",
            snapshot=SAMPLE_SNAPSHOT.id,
        )
        self.indexer.storage.origin_visit_status_add([visit_status])
        self.indexer.run([origin_url])
        self.assertEqual(self.indexer.results, [])
Example #20
def test_origin_metadata_indexer_partial_missing_head(
    swh_indexer_config,
    idx_storage: IndexerStorageInterface,
    storage: StorageInterface,
    obj_storage,
) -> None:

    origin1 = "https://example.com"
    origin2 = "https://github.com/librariesio/yarn-parser"
    storage.origin_add([Origin(url=origin1)])
    indexer = OriginMetadataIndexer(config=swh_indexer_config)
    indexer.run([origin1, origin2])

    rev_id = REVISION.id

    rev_results = list(
        indexer.idx_storage.revision_intrinsic_metadata_get([rev_id]))
    assert rev_results == [
        RevisionIntrinsicMetadataRow(
            id=rev_id,
            metadata=YARN_PARSER_METADATA,
            mappings=["npm"],
            tool=rev_results[0].tool,
        )
    ]

    orig_results = list(
        indexer.idx_storage.origin_intrinsic_metadata_get([origin1, origin2]))
    for orig_result in orig_results:
        assert orig_results == [
            OriginIntrinsicMetadataRow(
                id=origin2,
                from_revision=rev_id,
                metadata=YARN_PARSER_METADATA,
                mappings=["npm"],
                tool=orig_results[0].tool,
            )
        ]
Example #21
def test_load_extids() -> None:
    """Checks PackageLoader.load() skips iff it should, and writes (only)
    the new ExtIDs"""
    storage = get_storage("memory")

    dir_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                          object_id=b"e" * 20)

    rels = [
        Release(
            name=f"v{i}.0".encode(),
            message=b"blah\n",
            target=dir_swhid.object_id,
            target_type=ModelObjectType.DIRECTORY,
            synthetic=True,
        ) for i in (1, 2, 3, 4)
    ]
    storage.release_add(rels[0:3])

    origin = "http://example.org"
    rel1_swhid = rels[0].swhid()
    rel2_swhid = rels[1].swhid()
    rel3_swhid = rels[2].swhid()
    rel4_swhid = rels[3].swhid()

    # Results of a previous load
    storage.extid_add([
        ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid),
        ExtID("extid-type2", b"extid-of-v2.0", rel2_swhid),
    ])
    last_snapshot = Snapshot(
        branches={
            b"v1.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel1_swhid.object_id),
            b"v2.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel2_swhid.object_id),
            b"v3.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel3_swhid.object_id),
        })
    storage.snapshot_add([last_snapshot])
    date = datetime.datetime.now(tz=datetime.timezone.utc)
    storage.origin_add([Origin(url=origin)])
    storage.origin_visit_add([
        OriginVisit(origin="http://example.org",
                    visit=1,
                    date=date,
                    type="tar")
    ])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin,
            visit=1,
            status="full",
            date=date,
            snapshot=last_snapshot.id,
        )
    ])

    loader = StubPackageLoader(storage, "http://example.org")
    patch.object(
        loader,
        "_load_release",
        return_value=(rel4_swhid.object_id, dir_swhid.object_id),
        autospec=True,
    ).start()

    loader.load()

    assert loader._load_release.mock_calls == [  # type: ignore
        # v1.0: not loaded because there is already its (extid_type, extid, rel)
        #       in the storage.
        # v2.0: loaded, because the extid already in storage has a different extid_type
        call(
            StubPackageInfo(origin, "example-v2.0.tar", "v2.0"),
            Origin(url=origin),
        ),
        # v3.0: loaded despite having an (extid_type, extid) in storage, because
        #       the target of the extid is not in the previous snapshot
        call(
            StubPackageInfo(origin, "example-v3.0.tar", "v3.0"),
            Origin(url=origin),
        ),
        # v4.0: loaded, because there isn't its extid
        call(
            StubPackageInfo(origin, "example-v4.0.tar", "v4.0"),
            Origin(url=origin),
        ),
    ]

    # then check the snapshot has all the branches.
    # versions 2.0 to 4.0 all point to rel4_swhid (instead of the value of the last
    # snapshot), because they had to be loaded (mismatched extid), and the mocked
    # _load_release always returns rel4_swhid.
    snapshot = Snapshot(
        branches={
            b"branch-v1.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel1_swhid.object_id),
            b"branch-v2.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel4_swhid.object_id),
            b"branch-v3.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel4_swhid.object_id),
            b"branch-v4.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel4_swhid.object_id),
        })
    assert snapshot_get_latest(storage, origin) == snapshot

    extids = storage.extid_get_from_target(
        ObjectType.RELEASE,
        [
            rel1_swhid.object_id,
            rel2_swhid.object_id,
            rel3_swhid.object_id,
            rel4_swhid.object_id,
        ],
    )

    assert set(extids) == {
        # What we inserted at the beginning of the test:
        ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid),
        ExtID("extid-type2", b"extid-of-v2.0", rel2_swhid),
        # Added by the loader:
        ExtID("extid-type1", b"extid-of-v2.0", rel4_swhid),
        ExtID("extid-type2", b"extid-of-v3.0", rel4_swhid),
        ExtID("extid-type2", b"extid-of-v4.0", rel4_swhid),
    }
Example #22
def test_pypi_3(mocker):
    """Tests loading a revision generated by a very old PyPI loader that
    does not have a provider or 'project' metadata."""

    mocker.patch(
        "swh.storage.migrate_extrinsic_metadata.urlopen",
        side_effect=urllib.error.HTTPError(None, 404, "Not Found", None, None),
    )

    source_original_artifact = {
        "url":
        "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz",
        "date": "2014-05-07T22:03:00",
        "sha1": "3289269f75b4111dd00eaea53e00330db9a1db12",
        "size": 46644,
        "sha256":
        "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824",
        "filename": "PyPDFLite-0.1.32.tar.gz",
        "sha1_git": "1e5c38014731242cfa8594839bcba8a0c4e158c5",
        "blake2s256":
        "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385",
        "archive_type": "tar",
    }

    dest_original_artifacts = [{
        "url":
        "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz",
        "filename": "PyPDFLite-0.1.32.tar.gz",
        "archive_type": "tar",
        "length": 46644,
        "checksums": {
            "sha1":
            "3289269f75b4111dd00eaea53e00330db9a1db12",
            "sha256":
            "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824",
            "sha1_git":
            "1e5c38014731242cfa8594839bcba8a0c4e158c5",
            "blake2s256":
            "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385",
        },
    }]

    row = {
        "id":
        b"N\xa9\x91|\xdfS\xcd\x13SJ\x04.N\xb3x{\x86\xc84\xd2",
        "directory":
        DIRECTORY_ID,
        "date":
        datetime.datetime(2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc),
        "committer_date":
        datetime.datetime(2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc),
        "type":
        "tar",
        "message":
        b"0.1.32",
        "metadata": {
            "original_artifact": source_original_artifact
        },
    }

    origin_url = "https://pypi.org/project/PyPDFLite/"

    storage = get_storage("memory")

    storage.origin_add([Origin(url=origin_url)])
    storage.metadata_authority_add([
        attr.evolve(PYPI_AUTHORITY, metadata={}),
        attr.evolve(SWH_AUTHORITY, metadata={}),
    ])
    storage.metadata_fetcher_add([FETCHER])
    deposit_cur = None
    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)

    revision_swhid = CoreSWHID.from_string(
        "swh:1:rev:4ea9917cdf53cd13534a042e4eb3787b86c834d2")

    assert storage.raw_extrinsic_metadata_get(
        DIRECTORY_SWHID,
        authority=PYPI_AUTHORITY,
    ) == PagedResult(
        results=[],
        next_page_token=None,
    )
    assert storage.raw_extrinsic_metadata_get(
        DIRECTORY_SWHID,
        authority=SWH_AUTHORITY,
    ) == PagedResult(
        results=[
            RawExtrinsicMetadata(
                target=DIRECTORY_SWHID,
                discovery_date=datetime.datetime(
                    2014,
                    5,
                    7,
                    22,
                    3,
                    tzinfo=datetime.timezone.utc,
                ),
                authority=SWH_AUTHORITY,
                fetcher=FETCHER,
                format="original-artifacts-json",
                metadata=json.dumps(dest_original_artifacts).encode(),
                origin=None,
                revision=revision_swhid,
            ),
        ],
        next_page_token=None,
    )
Example #23
class StorageData:
    """Data model objects to use within tests."""

    content = Content(
        data=b"42\n",
        length=3,
        sha1=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"),
        sha1_git=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
        sha256=hash_to_bytes(
            "084c799cd551dd1d8d5c5f9a5d593b2e931f5e36122ee5c793c1d08a19839cc0"
        ),
        blake2s256=hash_to_bytes(
            "d5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d"
        ),
        status="visible",
    )
    content2 = Content(
        data=b"4242\n",
        length=5,
        sha1=hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7"),
        sha1_git=hash_to_bytes("36fade77193cb6d2bd826161a0979d64c28ab4fa"),
        sha256=hash_to_bytes(
            "859f0b154fdb2d630f45e1ecae4a862915435e663248bb8461d914696fc047cd"
        ),
        blake2s256=hash_to_bytes(
            "849c20fad132b7c2d62c15de310adfe87be94a379941bed295e8141c6219810d"
        ),
        status="visible",
    )
    content3 = Content(
        data=b"424242\n",
        length=7,
        sha1=hash_to_bytes("3e21cc4942a4234c9e5edd8a9cacd1670fe59f13"),
        sha1_git=hash_to_bytes("c932c7649c6dfa4b82327d121215116909eb3bea"),
        sha256=hash_to_bytes(
            "92fb72daf8c6818288a35137b72155f507e5de8d892712ab96277aaed8cf8a36"
        ),
        blake2s256=hash_to_bytes(
            "76d0346f44e5a27f6bafdd9c2befd304aff83780f93121d801ab6a1d4769db11"
        ),
        status="visible",
        ctime=datetime.datetime(2019, 12, 1, tzinfo=datetime.timezone.utc),
    )
    contents: Tuple[Content, ...] = (content, content2, content3)

    skipped_content = SkippedContent(
        length=1024 * 1024 * 200,
        sha1_git=hash_to_bytes("33e45d56f88993aae6a0198013efa80716fd8920"),
        sha1=hash_to_bytes("43e45d56f88993aae6a0198013efa80716fd8920"),
        sha256=hash_to_bytes(
            "7bbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a"
        ),
        blake2s256=hash_to_bytes(
            "ade18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b"
        ),
        reason="Content too long",
        status="absent",
        origin="file:///dev/zero",
    )
    skipped_content2 = SkippedContent(
        length=1024 * 1024 * 300,
        sha1_git=hash_to_bytes("44e45d56f88993aae6a0198013efa80716fd8921"),
        sha1=hash_to_bytes("54e45d56f88993aae6a0198013efa80716fd8920"),
        sha256=hash_to_bytes(
            "8cbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a"
        ),
        blake2s256=hash_to_bytes(
            "9ce18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b"
        ),
        reason="Content too long",
        status="absent",
    )
    skipped_contents: Tuple[SkippedContent,
                            ...] = (skipped_content, skipped_content2)

    directory5 = Directory(
        id=hash_to_bytes("4b825dc642cb6eb9a060e54bf8d69288fbee4904"),
        entries=(),
    )
    directory = Directory(
        id=hash_to_bytes("5256e856a0a0898966d6ba14feb4388b8b82d302"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=content.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"bar\xc3",
                type="dir",
                target=directory5.id,
                perms=from_disk.DentryPerms.directory,
            ),
        ], ),
    )
    directory2 = Directory(
        id=hash_to_bytes("8505808532953da7d2581741f01b29c04b1cb9ab"),
        entries=tuple([
            DirectoryEntry(
                name=b"oof",
                type="file",
                target=content2.sha1_git,
                perms=from_disk.DentryPerms.content,
            )
        ], ),
    )
    directory3 = Directory(
        id=hash_to_bytes("13089e6e544f78df7c9a40a3059050d10dee686a"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=content.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"subdir",
                type="dir",
                target=directory.id,
                perms=from_disk.DentryPerms.directory,
            ),
            DirectoryEntry(
                name=b"hello",
                type="file",
                target=content2.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
        ], ),
    )
    directory4 = Directory(
        id=hash_to_bytes("cd5dfd9c09d9e99ed123bc7937a0d5fddc3cd531"),
        entries=tuple([
            DirectoryEntry(
                name=b"subdir1",
                type="dir",
                target=directory3.id,
                perms=from_disk.DentryPerms.directory,
            )
        ], ),
    )

    directory6 = Directory(
        id=hash_to_bytes("afa0105cfcaa14fdbacee344e96659170bb1bda5"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=b"\x00" * 20,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"bar",
                type="dir",
                target=b"\x01" * 20,
                perms=from_disk.DentryPerms.directory,
            ),
        ], ),
        raw_manifest=(
            b"tree 61\x00"
            b"100644 foo\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"  # noqa
            b"40000 bar\x00\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"  # noqa
        ),
    )

    directories: Tuple[Directory, ...] = (
        directory2,
        directory,
        directory3,
        directory4,
        directory5,
        directory6,
    )

    revision = Revision(
        id=hash_to_bytes("01a7114f36fddd5ef2511b2cadda237a68adbb12"),
        message=b"hello",
        author=Person(
            name=b"Nicolas Dandrimont",
            email=b"*****@*****.**",
            fullname=b"Nicolas Dandrimont <*****@*****.**> ",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0200",
        ),
        committer=Person(
            name=b"St\xc3fano Zacchiroli",
            email=b"*****@*****.**",
            fullname=b"St\xc3fano Zacchiroli <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1123456789, microseconds=0),
            offset_bytes=b"+0200",
        ),
        parents=(),
        type=RevisionType.GIT,
        directory=directory.id,
        metadata={
            "checksums": {
                "sha1": "tarball-sha1",
                "sha256": "tarball-sha256",
            },
            "signed-off-by": "some-dude",
        },
        extra_headers=(
            (b"gpgsig", b"test123"),
            (b"mergetag", b"foo\\bar"),
            (b"mergetag", b"\x22\xaf\x89\x80\x01\x00"),
        ),
        synthetic=True,
    )
    revision2 = Revision(
        id=hash_to_bytes("a646dd94c912829659b22a1e7e143d2fa5ebde1b"),
        message=b"hello again",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1123456789,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([revision.id]),
        type=RevisionType.GIT,
        directory=directory2.id,
        metadata=None,
        extra_headers=(),
        synthetic=False,
    )
    revision3 = Revision(
        id=hash_to_bytes("beb2844dff30658e27573cb46eb55980e974b391"),
        message=b"a simple revision with no parents this time",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1127351742,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([revision.id, revision2.id]),
        type=RevisionType.GIT,
        directory=directory2.id,
        metadata=None,
        extra_headers=(),
        synthetic=True,
    )
    revision4 = Revision(
        id=hash_to_bytes("ae860aec43700c7f5a295e2ef47e2ae41b535dfe"),
        message=b"parent of self.revision2",
        author=Person(
            name=b"me",
            email=b"*****@*****.**",
            fullname=b"me <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"committer-dude",
            email=b"*****@*****.**",
            fullname=b"committer-dude <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1244567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        parents=tuple([revision3.id]),
        type=RevisionType.GIT,
        directory=directory.id,
        metadata=None,
        extra_headers=(),
        synthetic=False,
    )
    git_revisions: Tuple[Revision,
                         ...] = (revision, revision2, revision3, revision4)

    hg_revision = Revision(
        id=hash_to_bytes("951c9503541e7beaf002d7aebf2abd1629084c68"),
        message=b"hello",
        author=Person(
            name=b"Nicolas Dandrimont",
            email=b"*****@*****.**",
            fullname=b"Nicolas Dandrimont <*****@*****.**> ",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0200",
        ),
        committer=Person(
            name=b"St\xc3fano Zacchiroli",
            email=b"*****@*****.**",
            fullname=b"St\xc3fano Zacchiroli <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1123456789, microseconds=0),
            offset_bytes=b"+0200",
        ),
        parents=(),
        type=RevisionType.MERCURIAL,
        directory=directory.id,
        metadata={
            "checksums": {
                "sha1": "tarball-sha1",
                "sha256": "tarball-sha256",
            },
            "signed-off-by": "some-dude",
            "node": "a316dfb434af2b451c1f393496b7eaeda343f543",
        },
        extra_headers=(),
        synthetic=True,
    )
    hg_revision2 = Revision(
        id=hash_to_bytes("df4afb063236300eb13b96a0d7fff03f7b7cbbaf"),
        message=b"hello again",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1123456789,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([hg_revision.id]),
        type=RevisionType.MERCURIAL,
        directory=directory2.id,
        metadata=None,
        extra_headers=(
            (b"node",
             hash_to_bytes("fa1b7c84a9b40605b67653700f268349a6d6aca1")), ),
        synthetic=False,
    )
    hg_revision3 = Revision(
        id=hash_to_bytes("84d8e7081b47ebb88cad9fa1f25de5f330872a37"),
        message=b"a simple revision with no parents this time",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1127351742,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([hg_revision.id, hg_revision2.id]),
        type=RevisionType.MERCURIAL,
        directory=directory2.id,
        metadata=None,
        extra_headers=(
            (b"node",
             hash_to_bytes("7f294a01c49065a90b3fe8b4ad49f08ce9656ef6")), ),
        synthetic=True,
    )
    hg_revision4 = Revision(
        id=hash_to_bytes("4683324ba26dfe941a72cc7552e86eaaf7c27fe3"),
        message=b"parent of self.revision2",
        author=Person(
            name=b"me",
            email=b"*****@*****.**",
            fullname=b"me <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"committer-dude",
            email=b"*****@*****.**",
            fullname=b"committer-dude <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1244567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        parents=tuple([hg_revision3.id]),
        type=RevisionType.MERCURIAL,
        directory=directory.id,
        metadata=None,
        extra_headers=(
            (b"node",
             hash_to_bytes("f4160af0485c85823d9e829bae2c00b00a2e6297")), ),
        synthetic=False,
    )
    hg_revisions: Tuple[Revision, ...] = (
        hg_revision,
        hg_revision2,
        hg_revision3,
        hg_revision4,
    )
    revisions: Tuple[Revision, ...] = git_revisions + hg_revisions

    origins: Tuple[Origin, ...] = (
        Origin(url="https://github.com/user1/repo1"),
        Origin(url="https://github.com/user2/repo1"),
        Origin(url="https://github.com/user3/repo1"),
        Origin(url="https://gitlab.com/user1/repo1"),
        Origin(url="https://gitlab.com/user2/repo1"),
        Origin(url="https://forge.softwareheritage.org/source/repo1"),
        Origin(url="https://example.рф/🏛️.txt"),
    )
    origin, origin2 = origins[:2]

    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url="http://hal.inria.example.com/",
    )
    metadata_authority2 = MetadataAuthority(
        type=MetadataAuthorityType.REGISTRY,
        url="http://wikidata.example.com/",
    )
    authorities: Tuple[MetadataAuthority, ...] = (
        metadata_authority,
        metadata_authority2,
    )

    metadata_fetcher = MetadataFetcher(
        name="swh-deposit",
        version="0.0.1",
    )
    metadata_fetcher2 = MetadataFetcher(
        name="swh-example",
        version="0.0.1",
    )
    fetchers: Tuple[MetadataFetcher,
                    ...] = (metadata_fetcher, metadata_fetcher2)

    date_visit1 = datetime.datetime(2015,
                                    1,
                                    1,
                                    23,
                                    0,
                                    0,
                                    tzinfo=datetime.timezone.utc)
    date_visit2 = datetime.datetime(2017,
                                    1,
                                    1,
                                    23,
                                    0,
                                    0,
                                    tzinfo=datetime.timezone.utc)
    date_visit3 = datetime.datetime(2018,
                                    1,
                                    1,
                                    23,
                                    0,
                                    0,
                                    tzinfo=datetime.timezone.utc)

    type_visit1 = "git"
    type_visit2 = "hg"
    type_visit3 = "deb"

    origin_visit = OriginVisit(
        origin=origin.url,
        visit=1,
        date=date_visit1,
        type=type_visit1,
    )
    origin_visit2 = OriginVisit(
        origin=origin.url,
        visit=2,
        date=date_visit2,
        type=type_visit1,
    )
    origin_visit3 = OriginVisit(
        origin=origin2.url,
        visit=1,
        date=date_visit1,
        type=type_visit2,
    )
    origin_visits: Tuple[OriginVisit, ...] = (
        origin_visit,
        origin_visit2,
        origin_visit3,
    )

    release = Release(
        id=hash_to_bytes("f7f222093a18ec60d781070abec4a630c850b837"),
        name=b"v0.0.1",
        author=Person(
            name=b"olasd",
            email=b"*****@*****.**",
            fullname=b"olasd <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0042",
        ),
        target=revision.id,
        target_type=ObjectType.REVISION,
        message=b"synthetic release",
        synthetic=True,
    )
    release2 = Release(
        id=hash_to_bytes("db81a26783a3f4a9db07b4759ffc37621f159bb2"),
        name=b"v0.0.2",
        author=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1634366813, microseconds=0),
            offset_bytes=b"-0200",
        ),
        target=revision2.id,
        target_type=ObjectType.REVISION,
        message=b"v0.0.2\nMisc performance improvements + bug fixes",
        synthetic=False,
    )
    release3 = Release(
        id=hash_to_bytes("1c5d42e603ce2eea44917fadca76c78bad76aeb9"),
        name=b"v0.0.2",
        author=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1634366813, microseconds=0),
            offset_bytes=b"-0200",
        ),
        target=revision3.id,
        target_type=ObjectType.REVISION,
        message=b"yet another synthetic release",
        synthetic=True,
    )

    releases: Tuple[Release, ...] = (release, release2, release3)

    snapshot = Snapshot(
        id=hash_to_bytes("9b922e6d8d5b803c1582aabe5525b7b91150788e"),
        branches={
            b"master":
            SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
        },
    )
    empty_snapshot = Snapshot(
        id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"),
        branches={},
    )
    complete_snapshot = Snapshot(
        id=hash_to_bytes("db99fda25b43dc5cd90625ee4b0744751799c917"),
        branches={
            b"directory":
            SnapshotBranch(
                target=directory.id,
                target_type=TargetType.DIRECTORY,
            ),
            b"directory2":
            SnapshotBranch(
                target=directory2.id,
                target_type=TargetType.DIRECTORY,
            ),
            b"content":
            SnapshotBranch(
                target=content.sha1_git,
                target_type=TargetType.CONTENT,
            ),
            b"alias":
            SnapshotBranch(
                target=b"revision",
                target_type=TargetType.ALIAS,
            ),
            b"revision":
            SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
            b"release":
            SnapshotBranch(
                target=release.id,
                target_type=TargetType.RELEASE,
            ),
            b"snapshot":
            SnapshotBranch(
                target=empty_snapshot.id,
                target_type=TargetType.SNAPSHOT,
            ),
            b"dangling":
            None,
        },
    )

    snapshots: Tuple[Snapshot,
                     ...] = (snapshot, empty_snapshot, complete_snapshot)

    content_metadata1 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        origin=origin.url,
        discovery_date=datetime.datetime(2015,
                                         1,
                                         1,
                                         21,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="json",
        metadata=b'{"foo": "bar"}',
    )
    content_metadata2 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        origin=origin2.url,
        discovery_date=datetime.datetime(2017,
                                         1,
                                         1,
                                         22,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="yaml",
        metadata=b"foo: bar",
    )
    content_metadata3 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        discovery_date=datetime.datetime(2017,
                                         1,
                                         1,
                                         22,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority2, metadata=None),
        fetcher=attr.evolve(metadata_fetcher2, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
        origin=origin.url,
        visit=42,
        snapshot=snapshot.swhid(),
        release=release.swhid(),
        revision=revision.swhid(),
        directory=directory.swhid(),
        path=b"/foo/bar",
    )

    content_metadata: Tuple[RawExtrinsicMetadata, ...] = (
        content_metadata1,
        content_metadata2,
        content_metadata3,
    )

    origin_metadata1 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(2015,
                                         1,
                                         1,
                                         21,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority, metadata=None),
        fetcher=attr.evolve(metadata_fetcher, metadata=None),
        format="json",
        metadata=b'{"foo": "bar"}',
    )
    origin_metadata2 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(2017,
                                         1,
                                         1,
                                         22,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority, metadata=None),
        fetcher=attr.evolve(metadata_fetcher, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
    )
    origin_metadata3 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(2017,
                                         1,
                                         1,
                                         22,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority2, metadata=None),
        fetcher=attr.evolve(metadata_fetcher2, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
    )

    origin_metadata: Tuple[RawExtrinsicMetadata, ...] = (
        origin_metadata1,
        origin_metadata2,
        origin_metadata3,
    )

    extid1 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.REVISION,
                         object_id=revision.id),
        extid_type="git",
        extid=revision.id,
    )

    extid2 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.REVISION,
                         object_id=hg_revision.id),
        extid_type="mercurial",
        extid=hash_to_bytes("a316dfb434af2b451c1f393496b7eaeda343f543"),
    )

    extid3 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.DIRECTORY,
                         object_id=directory.id),
        extid_type="directory",
        extid=b"something",
    )
    extid4 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.DIRECTORY,
                         object_id=directory2.id),
        extid_type="directory",
        extid=b"something",
        extid_version=2,
    )

    extids: Tuple[ExtID, ...] = (
        extid1,
        extid2,
        extid3,
        extid4,
    )
Example #24
    Person,
    RawExtrinsicMetadata,
    Release,
    Sha1Git,
)
from swh.model.swhids import CoreSWHID, ExtendedSWHID

EMPTY_SNAPSHOT_ID = "1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"
FULL_SNAPSHOT_ID = "4ac5730a9393f5099b63a35a17b6c33d36d70c3a"

AUTHORITY = MetadataAuthority(
    type=MetadataAuthorityType.FORGE,
    url="http://example.org/",
)
ORIGIN_URL = "http://example.org/archive.tgz"
ORIGIN_SWHID = Origin(ORIGIN_URL).swhid()

REVISION_ID = hash_to_bytes("8ff44f081d43176474b267de5451f2c2e88089d0")
RELEASE_ID = hash_to_bytes("9477a708196b44e59efb4e47b7d979a4146bd428")
RELEASE_SWHID = CoreSWHID.from_string(f"swh:1:rel:{RELEASE_ID.hex()}")
DIRECTORY_ID = hash_to_bytes("aa" * 20)
DIRECTORY_SWHID = ExtendedSWHID.from_string(f"swh:1:dir:{DIRECTORY_ID.hex()}")

FETCHER = MetadataFetcher(
    name="swh.loader.package.tests.test_loader_metadata.MetadataTestLoader",
    version=__version__,
)

DISCOVERY_DATE = datetime.datetime.now(tz=datetime.timezone.utc)

DIRECTORY_METADATA = [
Example #25
def test_pypi_good_origin():
    """Tests loading a revision whose origin we can find"""

    source_original_artifact = {
        "url":
        "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz",
        "date": "2014-05-07T22:03:00",
        "sha1": "3289269f75b4111dd00eaea53e00330db9a1db12",
        "size": 46644,
        "sha256":
        "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824",
        "filename": "PyPDFLite-0.1.32.tar.gz",
        "sha1_git": "1e5c38014731242cfa8594839bcba8a0c4e158c5",
        "blake2s256":
        "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385",
        "archive_type": "tar",
    }

    dest_original_artifacts = [{
        "url":
        "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz",
        "filename": "PyPDFLite-0.1.32.tar.gz",
        "archive_type": "tar",
        "length": 46644,
        "checksums": {
            "sha1":
            "3289269f75b4111dd00eaea53e00330db9a1db12",
            "sha256":
            "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824",
            "sha1_git":
            "1e5c38014731242cfa8594839bcba8a0c4e158c5",
            "blake2s256":
            "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385",
        },
    }]

    revision_id = b"N\xa9\x91|\xdfS\xcd\x13SJ\x04.N\xb3x{\x86\xc84\xd2"
    row = {
        "id":
        revision_id,
        "directory":
        DIRECTORY_ID,
        "date":
        datetime.datetime(2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc),
        "committer_date":
        datetime.datetime(2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc),
        "type":
        "tar",
        "message":
        b"0.1.32",
        "metadata": {
            "original_artifact": source_original_artifact
        },
    }

    origin_url = "https://pypi.org/project/PyPDFLite/"

    storage = get_storage("memory")

    snapshot_id = b"42" * 10
    storage.origin_add([Origin(url=origin_url)])
    storage.origin_visit_add(
        [OriginVisit(origin=origin_url, visit=1, date=now(), type="pypi")])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin_url,
            visit=1,
            date=now(),
            status="partial",
            snapshot=snapshot_id,
        )
    ])
    storage.snapshot_add([
        Snapshot(
            id=snapshot_id,
            branches={
                b"foo":
                SnapshotBranch(
                    target_type=TargetType.REVISION,
                    target=revision_id,
                )
            },
        )
    ])
    storage.metadata_authority_add([
        attr.evolve(PYPI_AUTHORITY, metadata={}),
        attr.evolve(SWH_AUTHORITY, metadata={}),
    ])
    storage.metadata_fetcher_add([FETCHER])
    deposit_cur = None
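    # handle_row() should extract the embedded original_artifact metadata from
    # the legacy revision row and re-store it as raw extrinsic metadata on the
    # revision's directory (checked by the assertions below).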
    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)

    revision_swhid = CoreSWHID.from_string(
        "swh:1:rev:4ea9917cdf53cd13534a042e4eb3787b86c834d2")

    assert storage.raw_extrinsic_metadata_get(
        DIRECTORY_SWHID,
        authority=PYPI_AUTHORITY,
    ) == PagedResult(
        results=[],
        next_page_token=None,
    )
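    # The original_artifact info is archive-generated, so it is expected under
    # the SWH authority (below) and not under the PyPI authority (above).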
    assert storage.raw_extrinsic_metadata_get(
        DIRECTORY_SWHID,
        authority=SWH_AUTHORITY,
    ) == PagedResult(
        results=[
            RawExtrinsicMetadata(
                target=DIRECTORY_SWHID,
                discovery_date=datetime.datetime(
                    2014,
                    5,
                    7,
                    22,
                    3,
                    tzinfo=datetime.timezone.utc,
                ),
                authority=SWH_AUTHORITY,
                fetcher=FETCHER,
                format="original-artifacts-json",
                metadata=json.dumps(dest_original_artifacts).encode(),
                origin=origin_url,
                revision=revision_swhid,
            ),
        ],
        next_page_token=None,
    )
Ejemplo n.º 26
0
def swhid_of_origin(url):
    from swh.model.model import Origin

    return Origin(url).swhid()
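
# A minimal usage sketch for the helper above; the URL is illustrative and the
# resulting value is an extended SWHID of the "ori" type:
origin_swhid = swhid_of_origin("https://example.org/some/repo")
assert str(origin_swhid).startswith("swh:1:ori:")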
Ejemplo n.º 27
0
def test_load_upgrade_from_revision_extids(caplog):
    """Tests that, when loading incrementally based on a snapshot made by an old
    version of the loader, the loader will convert revisions to releases
    and add them to the storage.

    Also checks that, if an extid exists pointing to a non-existent revision
    (which should never happen, but you never know...), the release is loaded from
    scratch."""

    storage = get_storage("memory")

    origin = "http://example.org"
    dir1_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                           object_id=b"d" * 20)
    dir2_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                           object_id=b"e" * 20)

    date = TimestampWithTimezone.from_datetime(
        datetime.datetime.now(tz=datetime.timezone.utc))
    person = Person.from_fullname(b"Jane Doe <*****@*****.**>")

    rev1 = Revision(
        message=b"blah",
        author=person,
        date=date,
        committer=person,
        committer_date=date,
        directory=dir1_swhid.object_id,
        type=RevisionType.TAR,
        synthetic=True,
    )

    rel1 = Release(
        name=b"v1.0",
        message=b"blah\n",
        author=person,
        date=date,
        target=dir1_swhid.object_id,
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
    )

    rev1_swhid = rev1.swhid()
    rel1_swhid = rel1.swhid()
    rev2_swhid = CoreSWHID(object_type=ObjectType.REVISION,
                           object_id=b"b" * 20)
    rel2_swhid = CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"c" * 20)

    # Results of a previous load
    storage.extid_add([
        ExtID("extid-type1", b"extid-of-v1.0", rev1_swhid, 0),
        ExtID("extid-type1", b"extid-of-v2.0", rev2_swhid, 0),
    ])
    storage.revision_add([rev1])
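    # Only rev1 actually exists in storage; the extid for v2.0 points at a
    # revision that was never added, which is the "non-existent revision" case
    # from the docstring.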
    last_snapshot = Snapshot(
        branches={
            b"v1.0":
            SnapshotBranch(target_type=TargetType.REVISION,
                           target=rev1_swhid.object_id),
            b"v2.0":
            SnapshotBranch(target_type=TargetType.REVISION,
                           target=rev2_swhid.object_id),
        })
    storage.snapshot_add([last_snapshot])
    date = datetime.datetime.now(tz=datetime.timezone.utc)
    storage.origin_add([Origin(url=origin)])
    storage.origin_visit_add([
        OriginVisit(origin="http://example.org",
                    visit=1,
                    date=date,
                    type="tar")
    ])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin,
            visit=1,
            status="full",
            date=date,
            snapshot=last_snapshot.id,
        )
    ])

    loader = StubPackageLoader(storage, "http://example.org")
    patch.object(
        loader,
        "_load_release",
        return_value=(rel2_swhid.object_id, dir2_swhid.object_id),
        autospec=True,
    ).start()
    patch.object(
        loader,
        "get_versions",
        return_value=["v1.0", "v2.0", "v3.0"],
        autospec=True,
    ).start()
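    # _load_release is stubbed to always return (rel2, dir2), so any version
    # that cannot be recovered from a previous-load extid will end up pointing
    # at rel2_swhid in the new snapshot.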

    caplog.set_level(logging.ERROR)

    loader.load()

    assert len(caplog.records) == 1
    (record, ) = caplog.records
    assert record.levelname == "ERROR"
    assert "Failed to upgrade branch branch-v2.0" in record.message

    assert loader._load_release.mock_calls == [
        # v1.0: not loaded because there is already a revision matching it
        # v2.0: loaded, as the revision is missing from the storage even though there
        #       is an extid
        call(StubPackageInfo(origin, "example-v2.0.tar", "v2.0"),
             Origin(url=origin)),
        # v3.0: loaded (did not exist yet)
        call(StubPackageInfo(origin, "example-v3.0.tar", "v3.0"),
             Origin(url=origin)),
    ]

    snapshot = Snapshot(
        branches={
            b"branch-v1.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel1_swhid.object_id),
            b"branch-v2.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel2_swhid.object_id),
            b"branch-v3.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel2_swhid.object_id),
        })
    assert snapshot_get_latest(storage, origin) == snapshot

    extids = storage.extid_get_from_target(
        ObjectType.RELEASE,
        [
            rel1_swhid.object_id,
            rel2_swhid.object_id,
        ],
    )
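
    # Expected: all three versions end up with release-targeted extids; v1.0
    # maps to rel1 (converted from the pre-existing rev1), while v2.0 and v3.0
    # both map to rel2 because _load_release is stubbed to return it.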

    assert set(extids) == {
        ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid),
        ExtID("extid-type1", b"extid-of-v2.0", rel2_swhid),
        ExtID("extid-type2", b"extid-of-v3.0", rel2_swhid),
    }
Ejemplo n.º 28
0
def test_deposit_metadata_origin(
    url,
    authenticated_client,
    deposit_collection,
    atom_dataset,
    swh_storage,
):
    """Posting a swhid reference is stored on raw extrinsic metadata storage"""
    xml_data = atom_dataset["entry-data-with-origin-reference"].format(url=url)
    origin_swhid = Origin(url).swhid()
    deposit_client = authenticated_client.deposit_client
    swh_storage.origin_add([Origin(url)])
    response = post_atom(
        authenticated_client,
        reverse(COL_IRI, args=[deposit_collection.name]),
        data=xml_data,
    )

    assert response.status_code == status.HTTP_201_CREATED, response.content.decode(
    )
    response_content = ElementTree.fromstring(response.content)
    # Ensure the deposit is finalized
    deposit_id = int(
        response_content.findtext("swh:deposit_id", namespaces=NAMESPACES))
    deposit = Deposit.objects.get(pk=deposit_id)
    # no SWHID was given as input, so the deposit cannot carry one (nor its context)
    assert deposit.swhid is None
    assert deposit.swhid_context is None
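    # A metadata-only deposit is finalized on reception, so its completion date
    # matches the reception date and it is already marked as loaded.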
    assert deposit.complete_date == deposit.reception_date
    assert deposit.complete_date is not None
    assert deposit.status == DEPOSIT_STATUS_LOAD_SUCCESS

    # Ensure metadata stored in the metadata storage is consistent
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url=deposit_client.provider_url,
    )

    actual_authority = swh_storage.metadata_authority_get(
        MetadataAuthorityType.DEPOSIT_CLIENT, url=deposit_client.provider_url)
    assert actual_authority == metadata_authority

    config = APIConfig()
    metadata_fetcher = MetadataFetcher(
        name=config.tool["name"],
        version=config.tool["version"],
    )

    actual_fetcher = swh_storage.metadata_fetcher_get(config.tool["name"],
                                                      config.tool["version"])
    assert actual_fetcher == metadata_fetcher

    # Get the deposited metadata object and check it:

    page_results = swh_storage.raw_extrinsic_metadata_get(
        origin_swhid, metadata_authority)

    assert len(page_results.results) == 1
    assert page_results.next_page_token is None
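
    # The expected stored object below keeps the raw Atom entry verbatim as the
    # metadata payload, in "sword-v2-atom-codemeta" format, targeting the
    # origin's SWHID.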

    metadata = RawExtrinsicMetadata(
        target=origin_swhid,
        discovery_date=deposit.complete_date,
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="sword-v2-atom-codemeta",
        metadata=xml_data.encode(),
    )
    assert page_results == PagedResult(
        results=[metadata],
        next_page_token=None,
    )

    # Get metadata about the deposited metadata object and check it:
    _assert_deposit_info_on_metadata(swh_storage, metadata.swhid(), deposit,
                                     metadata_fetcher)
Ejemplo n.º 29
0
def test_lookup_origin_single_slash_after_protocol(archive_data):
    origin_url = "http://snapshot.debian.org/package/r-base/"
    malformed_origin_url = "http:/snapshot.debian.org/package/r-base/"
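    # "http:/..." (a single slash after the scheme) is a common copy/paste
    # mangling; lookup_origin is expected to normalize it back to the stored
    # canonical URL.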
    archive_data.origin_add([Origin(url=origin_url)])
    origin_info = archive.lookup_origin({"url": malformed_origin_url})
    assert origin_info["url"] == origin_url
Ejemplo n.º 30
0
def test_lookup_origin_missing_trailing_slash(archive_data):
    deb_origin = Origin(url="http://snapshot.debian.org/package/r-base/")
    archive_data.origin_add([deb_origin])
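    # The lookup uses the URL without its trailing slash; it should still
    # resolve to the origin as it was stored.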
    origin_info = archive.lookup_origin({"url": deb_origin.url[:-1]})
    assert origin_info["url"] == deb_origin.url