Example #1
def test_pypi_visit_1_release_with_2_artifacts(swh_storage, requests_mock_datadir):
    """With no prior visit, loading a pypi project ends up with 1 snapshot"""
    url = "https://pypi.org/project/nexter"
    loader = PyPILoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("1394b2e59351a944cc763bd9d26d90ce8e8121a8")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id
    )

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"releases/1.1.0/nexter-1.1.0.zip": SnapshotBranch(
                target=hash_to_bytes("f7d43faeb65b64d3faa67e4f46559db57d26b9a4"),
                target_type=TargetType.RELEASE,
            ),
            b"releases/1.1.0/nexter-1.1.0.tar.gz": SnapshotBranch(
                target=hash_to_bytes("732bb9dc087e6015884daaebb8b82559be729b5a"),
                target_type=TargetType.RELEASE,
            ),
        },
    )
    check_snapshot(expected_snapshot, swh_storage)
Example #2
def test_npm_artifact_use_mtime_if_no_time(swh_storage, requests_mock_datadir):
    """With no upload time, the artifact's mtime is used instead and the
    artifact is not skipped"""
    package = "jammit-express"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("33b8f105d48ce16b6c59158af660e0cc78bcbef4")

    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    # artifact is used
    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"HEAD": SnapshotBranch(
                target_type=TargetType.ALIAS, target=b"releases/0.0.1"
            ),
            b"releases/0.0.1": SnapshotBranch(
                target_type=TargetType.RELEASE,
                target=hash_to_bytes("3e3b800570869fa9b3dbc302500553e62400cc06"),
            ),
        },
    )

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot.id
    )

    check_snapshot(expected_snapshot, swh_storage)
Example #3
def test_lookup_snapshot_sizes_with_filtering(archive_data, revision):
    rev_id = hash_to_bytes(revision)
    snapshot = Snapshot(branches={
        b"refs/heads/master":
        SnapshotBranch(
            target=rev_id,
            target_type=TargetType.REVISION,
        ),
        b"refs/heads/incoming":
        SnapshotBranch(
            target=rev_id,
            target_type=TargetType.REVISION,
        ),
        b"refs/pull/1":
        SnapshotBranch(
            target=rev_id,
            target_type=TargetType.REVISION,
        ),
        b"refs/pull/2":
        SnapshotBranch(
            target=rev_id,
            target_type=TargetType.REVISION,
        ),
    }, )
    archive_data.snapshot_add([snapshot])

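    # The two refs/pull/* branches are filtered out by the exclude prefix,
    # leaving only the two refs/heads/* branches, which target revisions.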
    expected_sizes = {"alias": 0, "release": 0, "revision": 2}

    assert (archive.lookup_snapshot_sizes(
        snapshot.id.hex(),
        branch_name_exclude_prefix="refs/pull/") == expected_sizes)
Example #4
def test_snapshot_resolve_alias_cycle_found(swh_storage):
    alias1_name = b"alias_1"
    alias2_name = b"alias_2"
    alias3_name = b"alias_3"
    alias4_name = b"alias_4"

    alias1_branch_info = SnapshotBranch(target=alias2_name,
                                        target_type=TargetType.ALIAS)
    alias2_branch_info = SnapshotBranch(target=alias3_name,
                                        target_type=TargetType.ALIAS)
    alias3_branch_info = SnapshotBranch(target=alias4_name,
                                        target_type=TargetType.ALIAS)
    alias4_branch_info = SnapshotBranch(target=alias2_name,
                                        target_type=TargetType.ALIAS)

    snapshot = Snapshot(
        branches={
            alias1_name: alias1_branch_info,
            alias2_name: alias2_branch_info,
            alias3_name: alias3_branch_info,
            alias4_name: alias4_branch_info,
        })
    swh_storage.snapshot_add([snapshot])

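    # alias_2 -> alias_3 -> alias_4 -> alias_2 forms a cycle, so resolving
    # alias_1 can never reach a concrete branch and must return None.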
    assert snapshot_resolve_alias(swh_storage, snapshot.id,
                                  alias1_name) is None
Example #5
def test_lookup_snapshot_branch_names_filtering(archive_data, revision):
    rev_id = hash_to_bytes(revision)
    snapshot = Snapshot(branches={
        b"refs/heads/master":
        SnapshotBranch(
            target=rev_id,
            target_type=TargetType.REVISION,
        ),
        b"refs/heads/incoming":
        SnapshotBranch(
            target=rev_id,
            target_type=TargetType.REVISION,
        ),
        b"refs/pull/1":
        SnapshotBranch(
            target=rev_id,
            target_type=TargetType.REVISION,
        ),
        b"refs/pull/2":
        SnapshotBranch(
            target=rev_id,
            target_type=TargetType.REVISION,
        ),
        "non_ascii_name_é".encode():
        SnapshotBranch(
            target=rev_id,
            target_type=TargetType.REVISION,
        ),
    }, )
    archive_data.snapshot_add([snapshot])

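    # Each tuple is (include substring, exclude prefix, expected branch count).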
    for include_pattern, exclude_prefix, nb_results in (
        ("pull", None, 2),
        ("incoming", None, 1),
        ("é", None, 1),
        (None, "refs/heads/", 3),
        ("refs", "refs/heads/master", 3),
    ):

        branches = archive.lookup_snapshot(
            hash_to_hex(snapshot.id),
            branch_name_include_substring=include_pattern,
            branch_name_exclude_prefix=exclude_prefix,
        )["branches"]
        assert len(branches) == nb_results
        for branch_name in branches:
            if include_pattern:
                assert include_pattern in branch_name
            if exclude_prefix:
                assert not branch_name.startswith(exclude_prefix)
Example #6
def test_pypi_visit_with_missing_artifact(
    swh_storage, requests_mock_datadir_missing_one
):
    """Loading a pypi project with some missing artifacts still ends up
    with 1 snapshot"""
    url = "https://pypi.org/project/0805nexter"
    loader = PyPILoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("00785a38479abe5fbfa402df96be26d2ddf89c97")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage,
        url,
        status="partial",
        type="pypi",
        snapshot=expected_snapshot_id,
    )

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"releases/1.2.0": SnapshotBranch(
                target=hash_to_bytes("fbbcb817f01111b06442cdcc93140ab3cc777d68"),
                target_type=TargetType.RELEASE,
            ),
            b"HEAD": SnapshotBranch(
                target=b"releases/1.2.0",
                target_type=TargetType.ALIAS,
            ),
        },
    )
    check_snapshot(expected_snapshot, storage=swh_storage)

    stats = get_stats(swh_storage)

    assert {
        "content": 3,
        "directory": 2,
        "origin": 1,
        "origin_visit": 1,
        "release": 1,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
Example #7
    def test_load_dangling_symref(self):
        with open(os.path.join(self.destination_path, ".git/HEAD"), "wb") as f:
            f.write(b"ref: refs/heads/dangling-branch\n")
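        # HEAD is now a symbolic ref to a branch that does not exist in the
        # repository, i.e. a dangling symref.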

        res = self.loader.load()
        assert res == {"status": "eventful"}

        visit_status = assert_last_visit_matches(
            self.loader.storage, self.repo_url, status="full", type="git"
        )
        snapshot_id = visit_status.snapshot
        assert snapshot_id is not None

        snapshot = snapshot_get_all_branches(self.loader.storage, snapshot_id)
        branches = snapshot.branches

        assert branches[b"HEAD"] == SnapshotBranch(
            target=b"refs/heads/dangling-branch",
            target_type=TargetType.ALIAS,
        )
        assert branches[b"refs/heads/dangling-branch"] is None

        stats = get_stats(self.loader.storage)
        assert stats == {
            "content": 4,
            "directory": 7,
            "origin": 1,
            "origin_visit": 1,
            "release": 0,
            "revision": 7,
            "skipped_content": 0,
            "snapshot": 1,
        }
Example #8
def test_origin_empty_snapshot_null_revision(client, archive_data, new_origin):
    snapshot = Snapshot(
        branches={
            b"HEAD": SnapshotBranch(
                target="refs/head/master".encode(), target_type=TargetType.ALIAS,
            ),
            b"refs/head/master": None,
        }
    )
    archive_data.origin_add([new_origin])
    archive_data.snapshot_add([snapshot])
    visit = archive_data.origin_visit_add(
        [OriginVisit(origin=new_origin.url, date=now(), type="git",)]
    )[0]
    visit_status = OriginVisitStatus(
        origin=new_origin.url,
        visit=visit.visit,
        date=now(),
        status="partial",
        snapshot=snapshot.id,
    )
    archive_data.origin_visit_status_add([visit_status])

    url = reverse(
        "browse-origin-directory", query_params={"origin_url": new_origin.url},
    )

    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/directory.html"
    )
    resp_content = resp.content.decode("utf-8")
    assert re.search("snapshot.*is empty", resp_content)
    assert not re.search("swh-tr-link", resp_content)
Example #9
def test_eoferror(swh_storage, requests_mock_datadir):
    """Load a truncated archive, invalid on purpose, so that the uncompress
    function raises EOFError. We then check that a snapshot is still
    created, meaning this error is handled gracefully.

    """
    sources = (
        "https://nix-community.github.io/nixpkgs-swh/sources-EOFError.json"  # noqa
    )
    loader = NixGuixLoader(swh_storage, sources)
    loader.load()

    expected_snapshot = Snapshot(
        id=hash_to_bytes("4257fa2350168c6bfec726a06452ea27a2c0cb33"),
        branches={
            b"evaluation":
            SnapshotBranch(
                target=hash_to_bytes(
                    "cc4e04c26672dd74e5fd0fecb78b435fb55368f7"),
                target_type=TargetType.REVISION,
            ),
        },
    )

    check_snapshot(expected_snapshot, storage=swh_storage)
Example #10
def snapshot_converter(db: BaseDb, snapshot_d: Dict[str, Any]) -> Snapshot:
    """Convert snapshot from the flat representation to swh model
    compatible objects.

    """
    columns = ["name", "target", "target_type"]
    query = """
    select %s
    from snapshot_branches sbs
    inner join snapshot_branch sb on sb.object_id=sbs.branch_id
    where sbs.snapshot_id=%%s
    """ % ", ".join(columns)
    with db.cursor() as cur:
        cur.execute(query, (snapshot_d["object_id"], ))
        branches = {}
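        # Rows with a NULL target or target_type encode dangling branches,
        # which the model represents as None.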
        for name, *row in cur:
            branch_d = dict(zip(columns[1:], row))
            if branch_d["target"] is not None and branch_d[
                    "target_type"] is not None:
                branch: Optional[SnapshotBranch] = SnapshotBranch(
                    target=branch_d["target"],
                    target_type=TargetType(branch_d["target_type"]),
                )
            else:
                branch = None
            branches[name] = branch

    return Snapshot(
        id=snapshot_d["id"],
        branches=branches,
    )
Example #11
    def generate_and_load_snapshot(
        self, revision: Optional[Revision] = None, snapshot: Optional[Snapshot] = None
    ) -> Snapshot:
        """Create the snapshot either from an existing revision or snapshot.

        The revision (supposedly new) has priority over the snapshot
        (supposedly an existing one).

        Args:
            revision: Last revision seen, if any (None by default)
            snapshot: Snapshot to use, if any (None by default)

        Returns:
            The newly created snapshot

        """
        if revision:  # Priority to the revision
            snap = Snapshot(
                branches={
                    DEFAULT_BRANCH: SnapshotBranch(
                        target=revision.id, target_type=TargetType.REVISION
                    )
                }
            )
        elif snapshot:  # Fallback to prior snapshot
            snap = snapshot
        else:
            raise ValueError(
                "generate_and_load_snapshot called with null revision and snapshot!"
            )
        self.log.debug("snapshot: %s", snap)
        self.storage.snapshot_add([snap])
        return snap
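
A minimal sketch of a call site for this helper (not part of the example
above): it assumes a loader where self.storage is configured, and the names
new_revision and last_snapshot are hypothetical values produced earlier in
the visit.

    # Prefer the freshly built revision; fall back to the previous visit's
    # snapshot. Passing None for both raises ValueError, as shown above.
    snap = self.generate_and_load_snapshot(
        revision=new_revision,   # hypothetical; None on an uneventful visit
        snapshot=last_snapshot,  # hypothetical; None on a first visit
    )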
Example #12
def test_pypi_visit_with_1_release_artifact(swh_storage, requests_mock_datadir):
    """With no prior visit, loading a pypi project ends up with 1 snapshot"""
    url = "https://pypi.org/project/0805nexter"
    loader = PyPILoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("3dd50c1a0e48a7625cf1427e3190a65b787c774e")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id
    )

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"releases/1.1.0": SnapshotBranch(
                target=hash_to_bytes("f8789ff3ed70a5f570c35d885c7bcfda7b23b091"),
                target_type=TargetType.RELEASE,
            ),
            b"releases/1.2.0": SnapshotBranch(
                target=hash_to_bytes("fbbcb817f01111b06442cdcc93140ab3cc777d68"),
                target_type=TargetType.RELEASE,
            ),
            b"HEAD": SnapshotBranch(
                target=b"releases/1.2.0",
                target_type=TargetType.ALIAS,
            ),
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    stats = get_stats(swh_storage)
    assert {
        "content": 6,
        "directory": 4,
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
Example #13
def test_npm_loader_version_divergence(swh_storage):
    package = "@aller/shared"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("68eed3d3bc852e7f435a84f18ee77e23f6884be2")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }
    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
    )

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"HEAD": SnapshotBranch(
                target_type=TargetType.ALIAS, target=b"releases/0.1.0"
            ),
            b"releases/0.1.0": SnapshotBranch(
                target_type=TargetType.RELEASE,
                target=hash_to_bytes("0c486b50b407f847ef7581f595c2b6c2062f1089"),
            ),
            b"releases/0.1.1-alpha.14": SnapshotBranch(
                target_type=TargetType.RELEASE,
                target=hash_to_bytes("79d80c87c0a8d104a216cc539baad962a454802a"),
            ),
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    stats = get_stats(swh_storage)

    assert {  # 1 new release artifact
        "content": 534,
        "directory": 153,
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
Example #14
def expected_snapshot(expected_releases):
    return Snapshot(branches={
        b"HEAD":
        SnapshotBranch(
            target_type=TargetType.ALIAS,
            target=b"releases/0.1.1",
        ),
        b"releases/0.1.0":
        SnapshotBranch(
            target_type=TargetType.RELEASE,
            target=expected_releases[0].id,
        ),
        b"releases/0.1.1":
        SnapshotBranch(
            target_type=TargetType.RELEASE,
            target=expected_releases[1].id,
        ),
    }, )
Example #15
def test_replay(
    swh_storage,
    kafka_prefix: str,
    kafka_consumer_group: str,
    kafka_server: str,
):
    kafka_prefix += ".swh.journal.objects"

    producer = Producer(
        {
            "bootstrap.servers": kafka_server,
            "client.id": "test-producer",
            "acks": "all",
        }
    )

    snapshot = Snapshot(
        branches={
            b"HEAD": SnapshotBranch(
                target_type=TargetType.REVISION,
                target=b"\x01" * 20,
            )
        },
    )
    snapshot_dict = snapshot.to_dict()

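    # Publish the snapshot on the journal's snapshot topic, keyed by its id;
    # the replayer invoked below will copy it into swh_storage.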
    producer.produce(
        topic=kafka_prefix + ".snapshot",
        key=key_to_kafka(snapshot.id),
        value=value_to_kafka(snapshot_dict),
    )
    producer.flush()

    logger.debug("Flushed producer")

    result = invoke(
        "replay",
        "--stop-after-objects",
        "1",
        journal_config={
            "brokers": [kafka_server],
            "group_id": kafka_consumer_group,
            "prefix": kafka_prefix,
        },
    )

    expected = r"Done.\n"
    assert result.exit_code == 0, result.output
    assert re.fullmatch(expected, result.output, re.MULTILINE), result.output

    assert swh_storage.snapshot_get(snapshot.id) == {
        **snapshot_dict,
        "next_branch": None,
    }
Example #16
def _check_debian_loading(swh_storage, packages):
    loader = DebianLoader(
        swh_storage,
        URL,
        packages=packages,
    )

    actual_load_status = loader.load()
    expected_snapshot_id = "474c0e3d5796d15363031c333533527d659c559e"
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id,
    }

    assert_last_visit_matches(
        swh_storage,
        URL,
        status="full",
        type="deb",
        snapshot=hash_to_bytes(expected_snapshot_id),
    )

    expected_snapshot = Snapshot(
        id=hash_to_bytes(expected_snapshot_id),
        branches={
            b"releases/stretch/contrib/0.7.2-3":
            SnapshotBranch(
                target_type=TargetType.RELEASE,
                target=hash_to_bytes(
                    "de96ae3d3e136f5c1709117059e2a2c05b8ee5ae"),
            ),
            b"releases/buster/contrib/0.7.2-4":
            SnapshotBranch(
                target_type=TargetType.RELEASE,
                target=hash_to_bytes(
                    "11824484c585319302ea4fde4917faf78dfb1973"),
            ),
        },
    )

    check_snapshot(expected_snapshot, swh_storage)
Example #17
def test_snapshot_resolve_alias_dangling_branch(swh_storage):
    dangling_branch_name = b"dangling_branch"
    alias_name = b"rev_alias"

    alias_branch = SnapshotBranch(target=dangling_branch_name,
                                  target_type=TargetType.ALIAS)

    snapshot = Snapshot(branches={
        dangling_branch_name: None,
        alias_name: alias_branch,
    })
    swh_storage.snapshot_add([snapshot])

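    # The alias targets a branch whose value is None (a dangling branch),
    # so it cannot be resolved to a concrete branch.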
    assert snapshot_resolve_alias(swh_storage, snapshot.id, alias_name) is None
Example #18
def test_pypi_origin_from_project_name(mocker):
    origin_url = "https://pypi.org/project/ProjectName/"

    storage = get_storage("memory")

    revision_id = b"41" * 10
    snapshot_id = b"42" * 10
    storage.origin_add([Origin(url=origin_url)])
    storage.origin_visit_add(
        [OriginVisit(origin=origin_url, visit=1, date=now(), type="pypi")])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin_url,
            visit=1,
            date=now(),
            status="partial",
            snapshot=snapshot_id,
        )
    ])
    storage.snapshot_add([
        Snapshot(
            id=snapshot_id,
            branches={
                b"foo":
                SnapshotBranch(
                    target_type=TargetType.REVISION,
                    target=revision_id,
                )
            },
        )
    ])

    class response:
        code = 200

        def read(self):
            return b'{"info": {"name": "ProjectName"}}'

    mock_urlopen = mocker.patch(
        "swh.storage.migrate_extrinsic_metadata.urlopen",
        return_value=response(),
    )

    assert (pypi_origin_from_filename(
        storage, revision_id, "ProjectName-1.0.0.tar.gz") == origin_url)
    mock_urlopen.assert_not_called()
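    # The lowercased filename does not directly match the stored origin, so
    # the helper falls back to querying the PyPI API for the canonical name.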
    assert (pypi_origin_from_filename(
        storage, revision_id, "projectname-1.0.0.tar.gz") == origin_url)
    mock_urlopen.assert_called_once_with(
        "https://pypi.org/pypi/projectname/json/")
Example #19
def test_snapshot_resolve_alias_missing_branch(swh_storage):
    missing_branch_name = b"missing_branch"
    alias_name = b"rev_alias"

    alias_branch = SnapshotBranch(target=missing_branch_name,
                                  target_type=TargetType.ALIAS)

    snapshot = Snapshot(
        id=b"42" * 10,
        branches={
            alias_name: alias_branch,
        },
    )
    swh_storage.snapshot_add([snapshot])

    assert snapshot_resolve_alias(swh_storage, snapshot.id, alias_name) is None
Example #20
def test_snapshot_resolve_alias(swh_storage, sample_data):
    rev_branch_name = b"revision_branch"
    rel_branch_name = b"release_branch"
    rev_alias1_name = b"rev_alias1"
    rev_alias2_name = b"rev_alias2"
    rev_alias3_name = b"rev_alias3"
    rel_alias_name = b"rel_alias"
    rev_branch_info = SnapshotBranch(
        target=sample_data.revisions[0].id,
        target_type=TargetType.REVISION,
    )
    rel_branch_info = SnapshotBranch(
        target=sample_data.releases[0].id,
        target_type=TargetType.RELEASE,
    )
    rev_alias1_branch_info = SnapshotBranch(target=rev_branch_name,
                                            target_type=TargetType.ALIAS)
    rev_alias2_branch_info = SnapshotBranch(target=rev_alias1_name,
                                            target_type=TargetType.ALIAS)

    rev_alias3_branch_info = SnapshotBranch(target=rev_alias2_name,
                                            target_type=TargetType.ALIAS)
    rel_alias_branch_info = SnapshotBranch(target=rel_branch_name,
                                           target_type=TargetType.ALIAS)

    snapshot = Snapshot(
        branches={
            rev_branch_name: rev_branch_info,
            rel_branch_name: rel_branch_info,
            rev_alias1_name: rev_alias1_branch_info,
            rev_alias2_name: rev_alias2_branch_info,
            rev_alias3_name: rev_alias3_branch_info,
            rel_alias_name: rel_alias_branch_info,
        })
    swh_storage.snapshot_add([snapshot])

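    # Aliases resolve transitively: rev_alias3 -> rev_alias2 -> rev_alias1
    # -> rev_branch, so all three aliases yield the same target branch.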
    for alias_name, expected_branch in (
        (rev_alias1_name, rev_branch_info),
        (rev_alias2_name, rev_branch_info),
        (rev_alias3_name, rev_branch_info),
        (rel_alias_name, rel_branch_info),
    ):
        assert (snapshot_resolve_alias(swh_storage, snapshot.id,
                                       alias_name) == expected_branch)
Example #21
def test_origin_browse_directory_branch_with_non_resolvable_revision(
    client, archive_data, new_origin, unknown_revision
):
    branch_name = "master"
    snapshot = Snapshot(
        branches={
            branch_name.encode(): SnapshotBranch(
                target=hash_to_bytes(unknown_revision), target_type=TargetType.REVISION,
            )
        }
    )
    archive_data.origin_add([new_origin])
    archive_data.snapshot_add([snapshot])
    visit = archive_data.origin_visit_add(
        [OriginVisit(origin=new_origin.url, date=now(), type="git",)]
    )[0]
    visit_status = OriginVisitStatus(
        origin=new_origin.url,
        visit=visit.visit,
        date=now(),
        status="partial",
        snapshot=snapshot.id,
    )
    archive_data.origin_visit_status_add([visit_status])

    url = reverse(
        "browse-origin-directory",
        query_params={"origin_url": new_origin.url, "branch": branch_name},
    )

    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/directory.html"
    )
    assert_contains(
        resp, f"Revision {unknown_revision} could not be found in the archive."
    )
Example #22
def test_npm_loader_duplicate_shasum(swh_storage, requests_mock_datadir):
    """Test with two versions that have exactly the same tarball"""
    package = "org_version_mismatch"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("ac867a4c22ba4e22a022d319f309714477412a5a")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
    )

    beta_release_id = "e6d5490a02ac2a8dcd49702f9ccd5a64c90a46f1"
    release_id = "f6985f437e28db6eb1b7533230e05ed99f2c91f0"
    versions = [
        ("0.0.3-beta", beta_release_id),
        ("0.0.3", release_id),
    ]

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"HEAD": SnapshotBranch(
                target=b"releases/0.0.3", target_type=TargetType.ALIAS
            ),
            **{
                b"releases/"
                + version_name.encode(): SnapshotBranch(
                    target=hash_to_bytes(version_id),
                    target_type=TargetType.RELEASE,
                )
                for (version_name, version_id) in versions
            },
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    assert swh_storage.release_get([hash_to_bytes(beta_release_id)])[0] == Release(
        name=b"0.0.3-beta",
        message=(
            b"Synthetic release for NPM source package org_version_mismatch "
            b"version 0.0.3-beta\n"
        ),
        target=hash_to_bytes("3370d20d6f96dc1c9e50f083e2134881db110f4f"),
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
        author=Person.from_fullname(b"Masafumi Oyamada <*****@*****.**>"),
        date=TimestampWithTimezone.from_datetime(
            datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc)
        ),
        id=hash_to_bytes(beta_release_id),
    )

    assert swh_storage.release_get([hash_to_bytes(release_id)])[0] == Release(
        name=b"0.0.3",
        message=(
            b"Synthetic release for NPM source package org_version_mismatch "
            b"version 0.0.3\n"
        ),
        target=hash_to_bytes("3370d20d6f96dc1c9e50f083e2134881db110f4f"),
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
        author=Person.from_fullname(b"Masafumi Oyamada <*****@*****.**>"),
        date=TimestampWithTimezone.from_datetime(
            datetime.datetime(2014, 1, 1, 15, 55, 45, tzinfo=datetime.timezone.utc)
        ),
        id=hash_to_bytes(release_id),
    )

    # Check incremental re-load keeps it unchanged

    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    assert actual_load_status == {
        "status": "uneventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
    )
Example #23
def test_npm_loader_first_visit(swh_storage, requests_mock_datadir, org_api_info):
    package = "org"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("0996ca28d6280499abcf485b51c4e3941b057249")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
    )

    release_id = "d38cc0b571cd41f3c85513864e049766b42032a7"
    versions = [
        ("0.0.2", release_id),
        ("0.0.3", "62bf7076bae9aa2cb4d6cb3bf7ce0ea4fdd5b295"),
        ("0.0.4", "6e976db82f6c310596b21fb0ed8b11f507631434"),
    ]

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"HEAD": SnapshotBranch(
                target=b"releases/0.0.4", target_type=TargetType.ALIAS
            ),
            **{
                b"releases/"
                + version_name.encode(): SnapshotBranch(
                    target=hash_to_bytes(version_id),
                    target_type=TargetType.RELEASE,
                )
                for (version_name, version_id) in versions
            },
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    assert swh_storage.release_get([hash_to_bytes(release_id)])[0] == Release(
        name=b"0.0.2",
        message=b"Synthetic release for NPM source package org version 0.0.2\n",
        target=hash_to_bytes("42753c0c2ab00c4501b552ac4671c68f3cf5aece"),
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
        author=Person(
            fullname=b"mooz <*****@*****.**>",
            name=b"mooz",
            email=b"*****@*****.**",
        ),
        date=TimestampWithTimezone.from_datetime(
            datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc)
        ),
        id=hash_to_bytes(release_id),
    )

    contents = swh_storage.content_get(_expected_new_contents_first_visit)
    count = sum(0 if content is None else 1 for content in contents)
    assert count == len(_expected_new_contents_first_visit)

    assert (
        list(swh_storage.directory_missing(_expected_new_directories_first_visit)) == []
    )

    assert list(swh_storage.release_missing(_expected_new_releases_first_visit)) == []

    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url="https://npmjs.com/",
    )

    for (version_name, release_id) in versions:
        release = swh_storage.release_get([hash_to_bytes(release_id)])[0]
        assert release.target_type == ModelObjectType.DIRECTORY
        directory_id = release.target
        directory_swhid = ExtendedSWHID(
            object_type=ExtendedObjectType.DIRECTORY,
            object_id=directory_id,
        )
        release_swhid = CoreSWHID(
            object_type=ObjectType.RELEASE,
            object_id=hash_to_bytes(release_id),
        )
        expected_metadata = [
            RawExtrinsicMetadata(
                target=directory_swhid,
                authority=metadata_authority,
                fetcher=MetadataFetcher(
                    name="swh.loader.package.npm.loader.NpmLoader",
                    version=__version__,
                ),
                discovery_date=loader.visit_date,
                format="replicate-npm-package-json",
                metadata=json.dumps(
                    json.loads(org_api_info)["versions"][version_name]
                ).encode(),
                origin="https://www.npmjs.com/package/org",
                release=release_swhid,
            )
        ]
        assert swh_storage.raw_extrinsic_metadata_get(
            directory_swhid,
            metadata_authority,
        ) == PagedResult(
            next_page_token=None,
            results=expected_metadata,
        )

    stats = get_stats(swh_storage)

    assert {
        "content": len(_expected_new_contents_first_visit),
        "directory": len(_expected_new_directories_first_visit),
        "origin": 1,
        "origin_visit": 1,
        "release": len(_expected_new_releases_first_visit),
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
Example #24
def test_ignore_displayname(swh_storage, use_graph):
    """Tests that the original authorship information is used instead of the
    configured display names; otherwise objects would not match their hash,
    and git-fsck/git-clone would fail.

    This is tested both with and without swh-graph, as the two configurations
    use different code paths to fetch revisions.
    """

    date = TimestampWithTimezone.from_numeric_offset(Timestamp(1643882820, 0),
                                                     0, False)
    legacy_person = Person.from_fullname(b"old me <*****@*****.**>")
    current_person = Person.from_fullname(b"me <*****@*****.**>")

    content = Content.from_data(b"foo")
    swh_storage.content_add([content])

    directory = Directory(
        entries=(DirectoryEntry(name=b"file1",
                                type="file",
                                perms=0o100644,
                                target=content.sha1_git), ), )
    swh_storage.directory_add([directory])

    revision = Revision(
        message=b"rev",
        author=legacy_person,
        date=date,
        committer=legacy_person,
        committer_date=date,
        parents=(),
        type=RevisionType.GIT,
        directory=directory.id,
        synthetic=True,
    )
    swh_storage.revision_add([revision])

    release = Release(
        name=b"v1.1.0",
        message=None,
        author=legacy_person,
        date=date,
        target=revision.id,
        target_type=ObjectType.REVISION,
        synthetic=True,
    )
    swh_storage.release_add([release])

    snapshot = Snapshot(
        branches={
            b"refs/tags/v1.1.0":
            SnapshotBranch(target=release.id, target_type=TargetType.RELEASE),
            b"HEAD":
            SnapshotBranch(target=revision.id,
                           target_type=TargetType.REVISION),
        })
    swh_storage.snapshot_add([snapshot])

    # Add all objects to graph
    if use_graph:
        from swh.graph.naive_client import NaiveClient as GraphClient

        nodes = [
            str(x.swhid())
            for x in [content, directory, revision, release, snapshot]
        ]
        edges = [(str(x.swhid()), str(y.swhid())) for (x, y) in [
            (directory, content),
            (revision, directory),
            (release, revision),
            (snapshot, release),
            (snapshot, revision),
        ]]
        swh_graph = unittest.mock.Mock(
            wraps=GraphClient(nodes=nodes, edges=edges))
    else:
        swh_graph = None

    # Set a display name
    with swh_storage.db() as db:
        with db.transaction() as cur:
            cur.execute(
                "UPDATE person set displayname = %s where fullname = %s",
                (current_person.fullname, legacy_person.fullname),
            )

    # Check that the display name was applied in the storage
    assert swh_storage.revision_get([revision.id])[0] == attr.evolve(
        revision,
        author=current_person,
        committer=current_person,
    )

    # Cook
    cooked_swhid = snapshot.swhid()
    backend = InMemoryVaultBackend()
    cooker = GitBareCooker(
        cooked_swhid,
        backend=backend,
        storage=swh_storage,
        graph=swh_graph,
    )

    cooker.cook()

    # Get bundle
    bundle = backend.fetch("git_bare", cooked_swhid)

    # Extract bundle and make sure both revisions are in it
    with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
            tf.extractall(tempdir)

        # If we are here, it means git-fsck succeeded when called by cooker.cook(),
        # so we already know the original person was used. Let's double-check.

        repo = dulwich.repo.Repo(f"{tempdir}/{cooked_swhid}.git")

        tag = repo[b"refs/tags/v1.1.0"]
        assert tag.tagger == legacy_person.fullname

        commit = repo[tag.object[1]]
        assert commit.author == legacy_person.fullname
Example #25
class StorageData:
    """Data model objects to use within tests."""

    content = Content(
        data=b"42\n",
        length=3,
        sha1=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"),
        sha1_git=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
        sha256=hash_to_bytes(
            "084c799cd551dd1d8d5c5f9a5d593b2e931f5e36122ee5c793c1d08a19839cc0"
        ),
        blake2s256=hash_to_bytes(
            "d5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d"
        ),
        status="visible",
    )
    content2 = Content(
        data=b"4242\n",
        length=5,
        sha1=hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7"),
        sha1_git=hash_to_bytes("36fade77193cb6d2bd826161a0979d64c28ab4fa"),
        sha256=hash_to_bytes(
            "859f0b154fdb2d630f45e1ecae4a862915435e663248bb8461d914696fc047cd"
        ),
        blake2s256=hash_to_bytes(
            "849c20fad132b7c2d62c15de310adfe87be94a379941bed295e8141c6219810d"
        ),
        status="visible",
    )
    content3 = Content(
        data=b"424242\n",
        length=7,
        sha1=hash_to_bytes("3e21cc4942a4234c9e5edd8a9cacd1670fe59f13"),
        sha1_git=hash_to_bytes("c932c7649c6dfa4b82327d121215116909eb3bea"),
        sha256=hash_to_bytes(
            "92fb72daf8c6818288a35137b72155f507e5de8d892712ab96277aaed8cf8a36"
        ),
        blake2s256=hash_to_bytes(
            "76d0346f44e5a27f6bafdd9c2befd304aff83780f93121d801ab6a1d4769db11"
        ),
        status="visible",
        ctime=datetime.datetime(2019, 12, 1, tzinfo=datetime.timezone.utc),
    )
    contents: Tuple[Content, ...] = (content, content2, content3)

    skipped_content = SkippedContent(
        length=1024 * 1024 * 200,
        sha1_git=hash_to_bytes("33e45d56f88993aae6a0198013efa80716fd8920"),
        sha1=hash_to_bytes("43e45d56f88993aae6a0198013efa80716fd8920"),
        sha256=hash_to_bytes(
            "7bbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a"
        ),
        blake2s256=hash_to_bytes(
            "ade18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b"
        ),
        reason="Content too long",
        status="absent",
        origin="file:///dev/zero",
    )
    skipped_content2 = SkippedContent(
        length=1024 * 1024 * 300,
        sha1_git=hash_to_bytes("44e45d56f88993aae6a0198013efa80716fd8921"),
        sha1=hash_to_bytes("54e45d56f88993aae6a0198013efa80716fd8920"),
        sha256=hash_to_bytes(
            "8cbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a"
        ),
        blake2s256=hash_to_bytes(
            "9ce18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b"
        ),
        reason="Content too long",
        status="absent",
    )
    skipped_contents: Tuple[SkippedContent,
                            ...] = (skipped_content, skipped_content2)

    directory5 = Directory(
        id=hash_to_bytes("4b825dc642cb6eb9a060e54bf8d69288fbee4904"),
        entries=(),
    )
    directory = Directory(
        id=hash_to_bytes("5256e856a0a0898966d6ba14feb4388b8b82d302"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=content.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"bar\xc3",
                type="dir",
                target=directory5.id,
                perms=from_disk.DentryPerms.directory,
            ),
        ], ),
    )
    directory2 = Directory(
        id=hash_to_bytes("8505808532953da7d2581741f01b29c04b1cb9ab"),
        entries=tuple([
            DirectoryEntry(
                name=b"oof",
                type="file",
                target=content2.sha1_git,
                perms=from_disk.DentryPerms.content,
            )
        ], ),
    )
    directory3 = Directory(
        id=hash_to_bytes("13089e6e544f78df7c9a40a3059050d10dee686a"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=content.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"subdir",
                type="dir",
                target=directory.id,
                perms=from_disk.DentryPerms.directory,
            ),
            DirectoryEntry(
                name=b"hello",
                type="file",
                target=content2.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
        ], ),
    )
    directory4 = Directory(
        id=hash_to_bytes("cd5dfd9c09d9e99ed123bc7937a0d5fddc3cd531"),
        entries=tuple([
            DirectoryEntry(
                name=b"subdir1",
                type="dir",
                target=directory3.id,
                perms=from_disk.DentryPerms.directory,
            )
        ], ),
    )

    directory6 = Directory(
        id=hash_to_bytes("afa0105cfcaa14fdbacee344e96659170bb1bda5"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=b"\x00" * 20,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"bar",
                type="dir",
                target=b"\x01" * 20,
                perms=from_disk.DentryPerms.directory,
            ),
        ], ),
        raw_manifest=(
            b"tree 61\x00"
            b"100644 foo\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"  # noqa
            b"40000 bar\x00\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"  # noqa
        ),
    )

    directories: Tuple[Directory, ...] = (
        directory2,
        directory,
        directory3,
        directory4,
        directory5,
        directory6,
    )

    revision = Revision(
        id=hash_to_bytes("01a7114f36fddd5ef2511b2cadda237a68adbb12"),
        message=b"hello",
        author=Person(
            name=b"Nicolas Dandrimont",
            email=b"*****@*****.**",
            fullname=b"Nicolas Dandrimont <*****@*****.**> ",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0200",
        ),
        committer=Person(
            name=b"St\xc3fano Zacchiroli",
            email=b"*****@*****.**",
            fullname=b"St\xc3fano Zacchiroli <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1123456789, microseconds=0),
            offset_bytes=b"+0200",
        ),
        parents=(),
        type=RevisionType.GIT,
        directory=directory.id,
        metadata={
            "checksums": {
                "sha1": "tarball-sha1",
                "sha256": "tarball-sha256",
            },
            "signed-off-by": "some-dude",
        },
        extra_headers=(
            (b"gpgsig", b"test123"),
            (b"mergetag", b"foo\\bar"),
            (b"mergetag", b"\x22\xaf\x89\x80\x01\x00"),
        ),
        synthetic=True,
    )
    revision2 = Revision(
        id=hash_to_bytes("a646dd94c912829659b22a1e7e143d2fa5ebde1b"),
        message=b"hello again",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1123456789,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([revision.id]),
        type=RevisionType.GIT,
        directory=directory2.id,
        metadata=None,
        extra_headers=(),
        synthetic=False,
    )
    revision3 = Revision(
        id=hash_to_bytes("beb2844dff30658e27573cb46eb55980e974b391"),
        message=b"a simple revision with no parents this time",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1127351742,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([revision.id, revision2.id]),
        type=RevisionType.GIT,
        directory=directory2.id,
        metadata=None,
        extra_headers=(),
        synthetic=True,
    )
    revision4 = Revision(
        id=hash_to_bytes("ae860aec43700c7f5a295e2ef47e2ae41b535dfe"),
        message=b"parent of self.revision2",
        author=Person(
            name=b"me",
            email=b"*****@*****.**",
            fullname=b"me <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"committer-dude",
            email=b"*****@*****.**",
            fullname=b"committer-dude <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1244567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        parents=tuple([revision3.id]),
        type=RevisionType.GIT,
        directory=directory.id,
        metadata=None,
        extra_headers=(),
        synthetic=False,
    )
    git_revisions: Tuple[Revision,
                         ...] = (revision, revision2, revision3, revision4)

    hg_revision = Revision(
        id=hash_to_bytes("951c9503541e7beaf002d7aebf2abd1629084c68"),
        message=b"hello",
        author=Person(
            name=b"Nicolas Dandrimont",
            email=b"*****@*****.**",
            fullname=b"Nicolas Dandrimont <*****@*****.**> ",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0200",
        ),
        committer=Person(
            name=b"St\xc3fano Zacchiroli",
            email=b"*****@*****.**",
            fullname=b"St\xc3fano Zacchiroli <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1123456789, microseconds=0),
            offset_bytes=b"+0200",
        ),
        parents=(),
        type=RevisionType.MERCURIAL,
        directory=directory.id,
        metadata={
            "checksums": {
                "sha1": "tarball-sha1",
                "sha256": "tarball-sha256",
            },
            "signed-off-by": "some-dude",
            "node": "a316dfb434af2b451c1f393496b7eaeda343f543",
        },
        extra_headers=(),
        synthetic=True,
    )
    hg_revision2 = Revision(
        id=hash_to_bytes("df4afb063236300eb13b96a0d7fff03f7b7cbbaf"),
        message=b"hello again",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1123456789,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([hg_revision.id]),
        type=RevisionType.MERCURIAL,
        directory=directory2.id,
        metadata=None,
        extra_headers=(
            (b"node",
             hash_to_bytes("fa1b7c84a9b40605b67653700f268349a6d6aca1")), ),
        synthetic=False,
    )
    hg_revision3 = Revision(
        id=hash_to_bytes("84d8e7081b47ebb88cad9fa1f25de5f330872a37"),
        message=b"a simple revision with no parents this time",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1127351742,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([hg_revision.id, hg_revision2.id]),
        type=RevisionType.MERCURIAL,
        directory=directory2.id,
        metadata=None,
        extra_headers=(
            (b"node",
             hash_to_bytes("7f294a01c49065a90b3fe8b4ad49f08ce9656ef6")), ),
        synthetic=True,
    )
    hg_revision4 = Revision(
        id=hash_to_bytes("4683324ba26dfe941a72cc7552e86eaaf7c27fe3"),
        message=b"parent of self.revision2",
        author=Person(
            name=b"me",
            email=b"*****@*****.**",
            fullname=b"me <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"committer-dude",
            email=b"*****@*****.**",
            fullname=b"committer-dude <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1244567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        parents=tuple([hg_revision3.id]),
        type=RevisionType.MERCURIAL,
        directory=directory.id,
        metadata=None,
        extra_headers=(
            (b"node",
             hash_to_bytes("f4160af0485c85823d9e829bae2c00b00a2e6297")), ),
        synthetic=False,
    )
    hg_revisions: Tuple[Revision, ...] = (
        hg_revision,
        hg_revision2,
        hg_revision3,
        hg_revision4,
    )
    revisions: Tuple[Revision, ...] = git_revisions + hg_revisions

    origins: Tuple[Origin, ...] = (
        Origin(url="https://github.com/user1/repo1"),
        Origin(url="https://github.com/user2/repo1"),
        Origin(url="https://github.com/user3/repo1"),
        Origin(url="https://gitlab.com/user1/repo1"),
        Origin(url="https://gitlab.com/user2/repo1"),
        Origin(url="https://forge.softwareheritage.org/source/repo1"),
        Origin(url="https://example.рф/🏛️.txt"),
    )
    origin, origin2 = origins[:2]

    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url="http://hal.inria.example.com/",
    )
    metadata_authority2 = MetadataAuthority(
        type=MetadataAuthorityType.REGISTRY,
        url="http://wikidata.example.com/",
    )
    authorities: Tuple[MetadataAuthority, ...] = (
        metadata_authority,
        metadata_authority2,
    )

    metadata_fetcher = MetadataFetcher(
        name="swh-deposit",
        version="0.0.1",
    )
    metadata_fetcher2 = MetadataFetcher(
        name="swh-example",
        version="0.0.1",
    )
    fetchers: Tuple[MetadataFetcher,
                    ...] = (metadata_fetcher, metadata_fetcher2)

    date_visit1 = datetime.datetime(2015, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)
    date_visit2 = datetime.datetime(2017, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)
    date_visit3 = datetime.datetime(2018, 1, 1, 23, 0, 0, tzinfo=datetime.timezone.utc)

    type_visit1 = "git"
    type_visit2 = "hg"
    type_visit3 = "deb"

    origin_visit = OriginVisit(
        origin=origin.url,
        visit=1,
        date=date_visit1,
        type=type_visit1,
    )
    origin_visit2 = OriginVisit(
        origin=origin.url,
        visit=2,
        date=date_visit2,
        type=type_visit1,
    )
    origin_visit3 = OriginVisit(
        origin=origin2.url,
        visit=1,
        date=date_visit1,
        type=type_visit2,
    )
    origin_visits: Tuple[OriginVisit, ...] = (
        origin_visit,
        origin_visit2,
        origin_visit3,
    )

    release = Release(
        id=hash_to_bytes("f7f222093a18ec60d781070abec4a630c850b837"),
        name=b"v0.0.1",
        author=Person(
            name=b"olasd",
            email=b"*****@*****.**",
            fullname=b"olasd <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0042",
        ),
        target=revision.id,
        target_type=ObjectType.REVISION,
        message=b"synthetic release",
        synthetic=True,
    )
    release2 = Release(
        id=hash_to_bytes("db81a26783a3f4a9db07b4759ffc37621f159bb2"),
        name=b"v0.0.2",
        author=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1634366813, microseconds=0),
            offset_bytes=b"-0200",
        ),
        target=revision2.id,
        target_type=ObjectType.REVISION,
        message=b"v0.0.2\nMisc performance improvements + bug fixes",
        synthetic=False,
    )
    release3 = Release(
        id=hash_to_bytes("1c5d42e603ce2eea44917fadca76c78bad76aeb9"),
        name=b"v0.0.2",
        author=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1634366813, microseconds=0),
            offset_bytes=b"-0200",
        ),
        target=revision3.id,
        target_type=ObjectType.REVISION,
        message=b"yet another synthetic release",
        synthetic=True,
    )

    releases: Tuple[Release, ...] = (release, release2, release3)

    snapshot = Snapshot(
        id=hash_to_bytes("9b922e6d8d5b803c1582aabe5525b7b91150788e"),
        branches={
            b"master":
            SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
        },
    )
    empty_snapshot = Snapshot(
        id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"),
        branches={},
    )
    complete_snapshot = Snapshot(
        id=hash_to_bytes("db99fda25b43dc5cd90625ee4b0744751799c917"),
        branches={
            b"directory":
            SnapshotBranch(
                target=directory.id,
                target_type=TargetType.DIRECTORY,
            ),
            b"directory2":
            SnapshotBranch(
                target=directory2.id,
                target_type=TargetType.DIRECTORY,
            ),
            b"content":
            SnapshotBranch(
                target=content.sha1_git,
                target_type=TargetType.CONTENT,
            ),
            b"alias":
            SnapshotBranch(
                target=b"revision",
                target_type=TargetType.ALIAS,
            ),
            b"revision":
            SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
            b"release":
            SnapshotBranch(
                target=release.id,
                target_type=TargetType.RELEASE,
            ),
            b"snapshot":
            SnapshotBranch(
                target=empty_snapshot.id,
                target_type=TargetType.SNAPSHOT,
            ),
            b"dangling":
            None,
        },
    )

    snapshots: Tuple[Snapshot, ...] = (snapshot, empty_snapshot, complete_snapshot)

    content_metadata1 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        origin=origin.url,
        discovery_date=datetime.datetime(2015,
                                         1,
                                         1,
                                         21,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="json",
        metadata=b'{"foo": "bar"}',
    )
    content_metadata2 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        origin=origin2.url,
        discovery_date=datetime.datetime(2017,
                                         1,
                                         1,
                                         22,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="yaml",
        metadata=b"foo: bar",
    )
    content_metadata3 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        discovery_date=datetime.datetime(2017,
                                         1,
                                         1,
                                         22,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority2, metadata=None),
        fetcher=attr.evolve(metadata_fetcher2, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
        origin=origin.url,
        visit=42,
        snapshot=snapshot.swhid(),
        release=release.swhid(),
        revision=revision.swhid(),
        directory=directory.swhid(),
        path=b"/foo/bar",
    )

    content_metadata: Tuple[RawExtrinsicMetadata, ...] = (
        content_metadata1,
        content_metadata2,
        content_metadata3,
    )

    origin_metadata1 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(2015,
                                         1,
                                         1,
                                         21,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority, metadata=None),
        fetcher=attr.evolve(metadata_fetcher, metadata=None),
        format="json",
        metadata=b'{"foo": "bar"}',
    )
    origin_metadata2 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(2017,
                                         1,
                                         1,
                                         22,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority, metadata=None),
        fetcher=attr.evolve(metadata_fetcher, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
    )
    origin_metadata3 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(2017,
                                         1,
                                         1,
                                         22,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority2, metadata=None),
        fetcher=attr.evolve(metadata_fetcher2, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
    )

    origin_metadata: Tuple[RawExtrinsicMetadata, ...] = (
        origin_metadata1,
        origin_metadata2,
        origin_metadata3,
    )

    extid1 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.REVISION,
                         object_id=revision.id),
        extid_type="git",
        extid=revision.id,
    )

    extid2 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.REVISION,
                         object_id=hg_revision.id),
        extid_type="mercurial",
        extid=hash_to_bytes("a316dfb434af2b451c1f393496b7eaeda343f543"),
    )

    extid3 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.DIRECTORY,
                         object_id=directory.id),
        extid_type="directory",
        extid=b"something",
    )
    extid4 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.DIRECTORY,
                         object_id=directory2.id),
        extid_type="directory",
        extid=b"something",
        extid_version=2,
    )

    extids: Tuple[ExtID, ...] = (
        extid1,
        extid2,
        extid3,
        extid4,
    )
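
The `complete_snapshot` fixture above exercises every flavour of branch, including an
alias branch and a dangling (None) branch. As a minimal sketch of how such an alias
resolves (the `resolve_alias` helper is hypothetical, not part of the fixture module),
resolution simply follows ALIAS targets until a concrete branch, a dangling None, or a
hop limit is reached:

def resolve_alias(snapshot, name, max_hops=10):
    """Follow ALIAS branches until a non-alias branch or a dangling None
    is reached; give up after max_hops in case of an alias cycle."""
    branch = snapshot.branches.get(name)
    for _ in range(max_hops):
        if branch is None or branch.target_type != TargetType.ALIAS:
            return branch
        branch = snapshot.branches.get(branch.target)
    return None

# e.g. resolve_alias(complete_snapshot, b"alias") yields the b"revision" branch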
Example No. 26
    def get_snapshot(self) -> Snapshot:
        """Get the snapshot for the current visit.

        The main complexity of this function is mapping target objects to their
        types, as the `refs` dictionaries returned by the git server only give
        us the identifiers for the target objects, and not their types.

        The loader itself only knows the types of the objects that it has
        fetched from the server (as it has parsed them while loading them to
        the archive). As we only fetched an increment between the previous
        snapshot and the current state of the server, we are missing the type
        information for the objects that would already have been referenced by
        the previous snapshot, and that the git server didn't send us. We infer
        the type of these objects from the previous snapshot.

        """
        branches: Dict[bytes, Optional[SnapshotBranch]] = {}

        unfetched_refs: Dict[bytes, bytes] = {}

        # Retrieve types from the objects loaded by the current loader
        for ref_name, ref_object in self.remote_refs.items():
            if ref_name in self.symbolic_refs:
                continue
            target = hashutil.hash_to_bytes(ref_object.decode())
            target_type = self.ref_object_types.get(ref_object)
            if target_type:
                branches[ref_name] = SnapshotBranch(
                    target=target, target_type=target_type
                )
            else:
                # The object pointed at by this ref was not fetched, supposedly
                # because it existed in the base snapshot. We record it here,
                # and we can get it from the base snapshot later.
                unfetched_refs[ref_name] = target

        dangling_branches = {}
        # Handle symbolic references as alias branches
        for ref_name, target in self.symbolic_refs.items():
            branches[ref_name] = SnapshotBranch(
                target_type=TargetType.ALIAS,
                target=target,
            )
            if target not in branches and target not in unfetched_refs:
                # This handles the case where the pointer is "dangling".
                # There's a chance that a later symbolic reference
                # overrides this default value, which is totally fine.
                dangling_branches[target] = ref_name
                branches[target] = None

        if unfetched_refs:
            # Handle inference of object types from the contents of the
            # previous snapshot
            unknown_objects = {}

            base_snapshot_reverse_branches = {
                branch.target: branch
                for base_snapshot in reversed(self.base_snapshots)
                for branch in base_snapshot.branches.values()
                if branch and branch.target_type != TargetType.ALIAS
            }
            assert all(
                base_snapshot_reverse_branches[branch.target] == branch
                for branch in self.prev_snapshot.branches.values()
                if branch and branch.target_type != TargetType.ALIAS
            ), "base_snapshot_reverse_branches is not a superset of prev_snapshot"

            for ref_name, target in unfetched_refs.items():
                branch = base_snapshot_reverse_branches.get(target)
                branches[ref_name] = branch
                if not branch:
                    unknown_objects[ref_name] = target

            if unknown_objects:
                # These objects were referenced by the server; we did not fetch
                # them, and we do not know them from the previous snapshot.
                # This is likely a bug in the loader.
                raise RuntimeError(
                    "Unknown objects referenced by remote refs: %s"
                    % (
                        ", ".join(
                            f"{name.decode()}: {hashutil.hash_to_hex(obj)}"
                            for name, obj in unknown_objects.items()
                        )
                    )
                )

        utils.warn_dangling_branches(
            branches, dangling_branches, logger, self.origin.url
        )

        self.snapshot = Snapshot(branches=branches)
        return self.snapshot
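
A condensed, self-contained sketch of the fallback described in the docstring above
(`infer_unfetched_branches` is an illustrative name, not part of the loader): build a
reverse map from targets to branches of the base snapshot, then look each unfetched
ref up by its target.

from typing import Dict, Optional

from swh.model.model import SnapshotBranch, TargetType


def infer_unfetched_branches(
    unfetched_refs: Dict[bytes, bytes],
    base_branches: Dict[bytes, Optional[SnapshotBranch]],
) -> Dict[bytes, Optional[SnapshotBranch]]:
    """Recover target types for unfetched refs from a base snapshot;
    a None value means the target is unknown to the base snapshot."""
    reverse = {
        branch.target: branch
        for branch in base_branches.values()
        if branch and branch.target_type != TargetType.ALIAS
    }
    return {name: reverse.get(target) for name, target in unfetched_refs.items()}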
Example No. 27
def test_check_snapshot_failures(swh_storage):
    """Failure scenarios:

    0. snapshot parameter is not a snapshot
    1. snapshot id is correct but branches mismatched
    2. snapshot id is not correct, it's not found in the storage
    3. snapshot references an alias which does not exist
    4. snapshot is found in storage, targeted revision does not exist
    5. snapshot is found in storage, targeted revision exists but the directory the
       revision targets does not exist
    6. snapshot is found in storage, targeted revision exists, the directory targeted
       by the revision exists, but a content targeted by the directory does not exist.
    7. snapshot is found in storage, targeted release does not exist

    """
    snap_id_hex = "2498dbf535f882bc7f9a18fb16c9ad27fda7bab7"
    snapshot = Snapshot(
        id=hash_to_bytes(snap_id_hex),
        branches={
            b"master":
            SnapshotBranch(
                target=hash_to_bytes(hash_hex),
                target_type=TargetType.REVISION,
            ),
        },
    )

    s = swh_storage.snapshot_add([snapshot])
    assert s == {
        "snapshot:add": 1,
    }

    unexpected_snapshot = Snapshot(
        branches={
            b"tip":
            SnapshotBranch(  # wrong branch
                target=hash_to_bytes(hash_hex),
                target_type=TargetType.RELEASE)
        }, )

    # 0. not a Snapshot object, raise!
    with pytest.raises(
            AssertionError,
            match="argument 'expected_snapshot' must be a snapshot"):
        check_snapshot(ORIGIN_VISIT, swh_storage)

    # 1. snapshot id is correct but branches mismatched
    with pytest.raises(
            AssertionError):  # sadly the debian build raises only a bare AssertionError
        check_snapshot(attr.evolve(unexpected_snapshot, id=snapshot.id),
                       swh_storage)

    # 2. snapshot id is not correct, it's not found in the storage
    wrong_snap_id = hash_to_bytes("999666f535f882bc7f9a18fb16c9ad27fda7bab7")
    with pytest.raises(AssertionError, match="is not found"):
        check_snapshot(attr.evolve(unexpected_snapshot, id=wrong_snap_id),
                       swh_storage)

    # 3. snapshot references an alias which does not exist
    snapshot0 = Snapshot(
        id=hash_to_bytes("123666f535f882bc7f9a18fb16c9ad27fda7bab7"),
        branches={
            b"alias":
            SnapshotBranch(
                target=b"HEAD",
                target_type=TargetType.ALIAS,
            ),
        },
    )
    swh_storage.snapshot_add([snapshot0])

    with pytest.raises(InconsistentAliasBranchError,
                       match="Alias branch HEAD"):
        check_snapshot(snapshot0, swh_storage)

    # 4. snapshot is found in storage, targeted revision does not exist

    rev_not_found = list(swh_storage.revision_missing([REVISION.id]))
    assert len(rev_not_found) == 1

    snapshot1 = Snapshot(
        id=hash_to_bytes("456666f535f882bc7f9a18fb16c9ad27fda7bab7"),
        branches={
            b"alias":
            SnapshotBranch(
                target=b"HEAD",
                target_type=TargetType.ALIAS,
            ),
            b"HEAD":
            SnapshotBranch(
                target=REVISION.id,
                target_type=TargetType.REVISION,
            ),
        },
    )

    swh_storage.snapshot_add([snapshot1])

    with pytest.raises(InexistentObjectsError, match="Branch/Revision"):
        check_snapshot(snapshot1, swh_storage)

    # 5. snapshot is found in storage, targeted revision exists but the directory the
    # revision targets does not exist

    swh_storage.revision_add([REVISION])

    dir_not_found = list(swh_storage.directory_missing([REVISION.directory]))
    assert len(dir_not_found) == 1

    snapshot2 = Snapshot(
        id=hash_to_bytes("987123f535f882bc7f9a18fb16c9ad27fda7bab7"),
        branches={
            b"alias":
            SnapshotBranch(
                target=b"HEAD",
                target_type=TargetType.ALIAS,
            ),
            b"HEAD":
            SnapshotBranch(
                target=REVISION.id,
                target_type=TargetType.REVISION,
            ),
        },
    )

    swh_storage.snapshot_add([snapshot2])
    with pytest.raises(InexistentObjectsError, match="Missing directories"):
        check_snapshot(snapshot2, swh_storage)

    assert DIRECTORY.id == REVISION.directory
    swh_storage.directory_add([DIRECTORY])

    # 6. snapshot is found in storage, targeted revision exists, the directory targeted
    # by the revision exists, but a content targeted by the directory does not exist

    assert DIRECTORY.entries[0].target == CONTENT.sha1_git
    not_found = list(
        swh_storage.content_missing_per_sha1_git([CONTENT.sha1_git]))
    assert len(not_found) == 1

    snapshot3 = Snapshot(
        id=hash_to_bytes("091456f535f882bc7f9a18fb16c9ad27fda7bab7"),
        branches={
            b"alias":
            SnapshotBranch(
                target=b"HEAD",
                target_type=TargetType.ALIAS,
            ),
            b"HEAD":
            SnapshotBranch(
                target=REVISION.id,
                target_type=TargetType.REVISION,
            ),
        },
    )

    swh_storage.snapshot_add([snapshot3])
    with pytest.raises(InexistentObjectsError, match="Missing content(s)"):
        check_snapshot(snapshot3, swh_storage)

    # 7. snapshot is found in storage, targeted release does not exist

    # the release targets a revision which exists
    assert RELEASE.target == REVISION.id

    snapshot4 = Snapshot(
        id=hash_to_bytes("789666f535f882bc7f9a18fb16c9ad27fda7bab7"),
        branches={
            b"alias":
            SnapshotBranch(
                target=b"HEAD",
                target_type=TargetType.ALIAS,
            ),
            b"HEAD":
            SnapshotBranch(
                target=REVISION.id,
                target_type=TargetType.REVISION,
            ),
            b"release/0.1.0":
            SnapshotBranch(
                target=RELEASE.id,
                target_type=TargetType.RELEASE,
            ),
        },
    )

    swh_storage.snapshot_add([snapshot4])

    with pytest.raises(InexistentObjectsError, match="Branch/Release"):
        check_snapshot(snapshot4, swh_storage)
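
For contrast with the failure scenarios, a sketch of the success path (not part of
the original test; it reuses the same module fixtures): once every object the
snapshot transitively targets exists in storage, check_snapshot returns quietly.

def test_check_snapshot_success_sketch(swh_storage):
    # Add the full object graph first: content <- directory <- revision.
    swh_storage.content_add([CONTENT])
    swh_storage.directory_add([DIRECTORY])
    swh_storage.revision_add([REVISION])
    ok_snapshot = Snapshot(
        branches={
            b"HEAD":
            SnapshotBranch(
                target=REVISION.id,
                target_type=TargetType.REVISION,
            ),
        },
    )
    swh_storage.snapshot_add([ok_snapshot])
    check_snapshot(ok_snapshot, swh_storage)  # no exception expected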
Example No. 28
                          22,
                          26,
                          53,
                          tzinfo=datetime.timezone.utc)),
    target=REVISION.id,
    target_type=ObjectType.REVISION,
    message=b"yet another synthetic release",
    synthetic=True,
)

SNAPSHOT = Snapshot(
    id=hash_to_bytes("2498dbf535f882bc7f9a18fb16c9ad27fda7bab7"),
    branches={
        b"release/0.1.0":
        SnapshotBranch(
            target=RELEASE.id,
            target_type=TargetType.RELEASE,
        ),
        b"HEAD":
        SnapshotBranch(
            target=REVISION.id,
            target_type=TargetType.REVISION,
        ),
        b"alias":
        SnapshotBranch(
            target=b"HEAD",
            target_type=TargetType.ALIAS,
        ),
        b"evaluation":
        SnapshotBranch(  # branch dedicated to not exist in storage
            target=hash_to_bytes("cc4e04c26672dd74e5fd0fecb78b435fb55368f7"),
            target_type=TargetType.REVISION,
        ),
    },
)
Example No. 29
def test_load_upgrade_from_revision_extids(caplog):
    """Tests that, when loading incrementally based on a snapshot made by an old
    version of the loader, the loader will convert revisions to releases
    and add them to the storage.

    Also checks that, if an extid exists pointing to a non-existent revision
    (which should never happen, but you never know...), the release is loaded from
    scratch."""

    storage = get_storage("memory")

    origin = "http://example.org"
    dir1_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                           object_id=b"d" * 20)
    dir2_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                           object_id=b"e" * 20)

    date = TimestampWithTimezone.from_datetime(
        datetime.datetime.now(tz=datetime.timezone.utc))
    person = Person.from_fullname(b"Jane Doe <*****@*****.**>")

    rev1 = Revision(
        message=b"blah",
        author=person,
        date=date,
        committer=person,
        committer_date=date,
        directory=dir1_swhid.object_id,
        type=RevisionType.TAR,
        synthetic=True,
    )

    rel1 = Release(
        name=b"v1.0",
        message=b"blah\n",
        author=person,
        date=date,
        target=dir1_swhid.object_id,
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
    )

    rev1_swhid = rev1.swhid()
    rel1_swhid = rel1.swhid()
    rev2_swhid = CoreSWHID(object_type=ObjectType.REVISION,
                           object_id=b"b" * 20)
    rel2_swhid = CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"c" * 20)

    # Results of a previous load
    storage.extid_add([
        ExtID("extid-type1", b"extid-of-v1.0", rev1_swhid, 0),
        ExtID("extid-type1", b"extid-of-v2.0", rev2_swhid, 0),
    ])
    storage.revision_add([rev1])
    last_snapshot = Snapshot(
        branches={
            b"v1.0":
            SnapshotBranch(target_type=TargetType.REVISION,
                           target=rev1_swhid.object_id),
            b"v2.0":
            SnapshotBranch(target_type=TargetType.REVISION,
                           target=rev2_swhid.object_id),
        })
    storage.snapshot_add([last_snapshot])
    date = datetime.datetime.now(tz=datetime.timezone.utc)
    storage.origin_add([Origin(url=origin)])
    storage.origin_visit_add([
        OriginVisit(origin="http://example.org",
                    visit=1,
                    date=date,
                    type="tar")
    ])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin,
            visit=1,
            status="full",
            date=date,
            snapshot=last_snapshot.id,
        )
    ])

    loader = StubPackageLoader(storage, "http://example.org")
    patch.object(
        loader,
        "_load_release",
        return_value=(rel2_swhid.object_id, dir2_swhid.object_id),
        autospec=True,
    ).start()
    patch.object(
        loader,
        "get_versions",
        return_value=["v1.0", "v2.0", "v3.0"],
        autospec=True,
    ).start()

    caplog.set_level(logging.ERROR)

    loader.load()

    assert len(caplog.records) == 1
    (record,) = caplog.records
    assert record.levelname == "ERROR"
    assert "Failed to upgrade branch branch-v2.0" in record.message

    assert loader._load_release.mock_calls == [
        # v1.0: not loaded because there is already a revision matching it
        # v2.0: loaded, as the revision is missing from the storage even though there
        #       is an extid
        call(StubPackageInfo(origin, "example-v2.0.tar", "v2.0"),
             Origin(url=origin)),
        # v3.0: loaded (did not exist yet)
        call(StubPackageInfo(origin, "example-v3.0.tar", "v3.0"),
             Origin(url=origin)),
    ]

    snapshot = Snapshot(
        branches={
            b"branch-v1.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel1_swhid.object_id),
            b"branch-v2.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel2_swhid.object_id),
            b"branch-v3.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel2_swhid.object_id),
        })
    assert snapshot_get_latest(storage, origin) == snapshot

    extids = storage.extid_get_from_target(
        ObjectType.RELEASE,
        [
            rel1_swhid.object_id,
            rel2_swhid.object_id,
        ],
    )

    assert set(extids) == {
        ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid),
        ExtID("extid-type1", b"extid-of-v2.0", rel2_swhid),
        ExtID("extid-type2", b"extid-of-v3.0", rel2_swhid),
    }
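
The upgrade rule this test exercises can be stated compactly: an ExtID written by an
old loader version points at a revision, and the corresponding branch may be reused
(converted to a release) only if that revision is still present in storage. A
standalone sketch of the predicate (illustrative name, not the loader's API; it
reuses CoreSWHID and ObjectType from swh.model.swhids as imported above):

def can_reuse_revision_extid(storage, extid_target: CoreSWHID) -> bool:
    # Only revision-typed extid targets are candidates for the upgrade.
    if extid_target.object_type != ObjectType.REVISION:
        return False
    # The revision must still exist in storage to be convertible.
    missing = set(storage.revision_missing([extid_target.object_id]))
    return extid_target.object_id not in missing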
Example No. 30
def test_load_extids() -> None:
    """Checks PackageLoader.load() skips iff it should, and writes (only)
    the new ExtIDs"""
    storage = get_storage("memory")

    dir_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                          object_id=b"e" * 20)

    rels = [
        Release(
            name=f"v{i}.0".encode(),
            message=b"blah\n",
            target=dir_swhid.object_id,
            target_type=ModelObjectType.DIRECTORY,
            synthetic=True,
        ) for i in (1, 2, 3, 4)
    ]
    storage.release_add(rels[0:3])

    origin = "http://example.org"
    rel1_swhid = rels[0].swhid()
    rel2_swhid = rels[1].swhid()
    rel3_swhid = rels[2].swhid()
    rel4_swhid = rels[3].swhid()

    # Results of a previous load
    storage.extid_add([
        ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid),
        ExtID("extid-type2", b"extid-of-v2.0", rel2_swhid),
    ])
    last_snapshot = Snapshot(
        branches={
            b"v1.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel1_swhid.object_id),
            b"v2.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel2_swhid.object_id),
            b"v3.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel3_swhid.object_id),
        })
    storage.snapshot_add([last_snapshot])
    date = datetime.datetime.now(tz=datetime.timezone.utc)
    storage.origin_add([Origin(url=origin)])
    storage.origin_visit_add([
        OriginVisit(origin="http://example.org",
                    visit=1,
                    date=date,
                    type="tar")
    ])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin,
            visit=1,
            status="full",
            date=date,
            snapshot=last_snapshot.id,
        )
    ])

    loader = StubPackageLoader(storage, "http://example.org")
    patch.object(
        loader,
        "_load_release",
        return_value=(rel4_swhid.object_id, dir_swhid.object_id),
        autospec=True,
    ).start()

    loader.load()

    assert loader._load_release.mock_calls == [  # type: ignore
        # v1.0: not loaded, because its (extid_type, extid, release) triple is
        #       already in the storage.
        # v2.0: loaded, because the extid already in storage has a different
        #       extid_type
        call(
            StubPackageInfo(origin, "example-v2.0.tar", "v2.0"),
            Origin(url=origin),
        ),
        # v3.0: loaded despite having an (extid_type, extid) in storage, because
        #       the target of the extid is not in the previous snapshot
        call(
            StubPackageInfo(origin, "example-v3.0.tar", "v3.0"),
            Origin(url=origin),
        ),
        # v4.0: loaded, because its extid is not in the storage yet
        call(
            StubPackageInfo(origin, "example-v4.0.tar", "v4.0"),
            Origin(url=origin),
        ),
    ]

    # then check the snapshot has all the branches.
    # versions 2.0 to 4.0 all point to rel4_swhid (instead of the value of the last
    # snapshot), because they had to be loaded (mismatched extid), and the mocked
    # _load_release always returns rel4_swhid.
    snapshot = Snapshot(
        branches={
            b"branch-v1.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel1_swhid.object_id),
            b"branch-v2.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel4_swhid.object_id),
            b"branch-v3.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel4_swhid.object_id),
            b"branch-v4.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel4_swhid.object_id),
        })
    assert snapshot_get_latest(storage, origin) == snapshot

    extids = storage.extid_get_from_target(
        ObjectType.RELEASE,
        [
            rel1_swhid.object_id,
            rel2_swhid.object_id,
            rel3_swhid.object_id,
            rel4_swhid.object_id,
        ],
    )

    assert set(extids) == {
        # What we inserted at the beginning of the test:
        ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid),
        ExtID("extid-type2", b"extid-of-v2.0", rel2_swhid),
        # Added by the loader:
        ExtID("extid-type1", b"extid-of-v2.0", rel4_swhid),
        ExtID("extid-type2", b"extid-of-v3.0", rel4_swhid),
        ExtID("extid-type2", b"extid-of-v4.0", rel4_swhid),
    }
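
Taken together, these assertions encode the loader's skip rule: a version is skipped
only when storage already holds the exact (extid_type, extid) pair and that ExtID
targets a branch of the previous snapshot. A hedged sketch of the predicate
(illustrative name and signature, not the loader's API):

from typing import Iterable, Set

from swh.model.model import ExtID


def should_skip_version(
    known_extids: Iterable[ExtID],
    prev_snapshot_targets: Set[bytes],
    extid_type: str,
    extid: bytes,
) -> bool:
    """True iff a matching ExtID exists and its target is still reachable
    from the previous snapshot; otherwise the version must be (re)loaded."""
    return any(
        known.extid_type == extid_type
        and known.extid == extid
        and known.target.object_id in prev_snapshot_targets
        for known in known_extids
    )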