def test_closed_branch_incremental(swh_storage, datadir, tmp_path):
    """Test that a repository with a closed branch does not trip an incremental load"""
    archive_name = "example"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)
    repo_path = repo_url.replace("file://", "")

    loader = HgLoader(swh_storage, repo_path)

    # Test 3 loads: one full, then two incremental.
    assert loader.load() == {"status": "eventful"}
    expected_stats = {
        "content": 7,
        "directory": 16,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 9,
        "skipped_content": 0,
        "snapshot": 1,
    }
    assert get_stats(loader.storage) == expected_stats
    assert loader.load() == {"status": "uneventful"}
    assert get_stats(loader.storage) == {
        **expected_stats, "origin_visit": 1 + 1
    }
    assert loader.load() == {"status": "uneventful"}
    assert get_stats(loader.storage) == {
        **expected_stats, "origin_visit": 2 + 1
    }
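
# The stats dicts compared throughout these tests come from the `get_stats`
# test helper. A minimal sketch of it, modelled on swh.loader.tests.get_stats
# (the exact key filtering is an assumption):
from typing import Dict


def get_stats(storage) -> Dict:
    """Refresh and return the storage's object counters, keyed by object
    type."""
    storage.refresh_stat_counters()
    stats = storage.stat_counters()
    keys = [
        "content", "directory", "origin", "origin_visit",
        "release", "revision", "skipped_content", "snapshot",
    ]
    return {k: stats.get(k) for k in keys}
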
def test_load_unchanged_repo__dangling_extid(swh_storage, datadir, tmp_path):
    """Checks the loader will load revisions targeted by an ExtID if the
    revisions are missing from the storage"""
    archive_name = "hello"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)
    repo_path = repo_url.replace("file://", "")

    loader = HgLoader(swh_storage, repo_path)

    assert loader.load() == {"status": "eventful"}
    assert get_stats(loader.storage) == {
        "content": 3,
        "directory": 3,
        "origin": 1,
        "origin_visit": 1,
        "release": 1,
        "revision": 3,
        "skipped_content": 0,
        "snapshot": 1,
    }

    old_storage = swh_storage

    # Create a new storage, and only copy ExtIDs or head revisions to it.
    # This should be enough for the loader to know revisions were already loaded
    new_storage = _partial_copy_storage(old_storage,
                                        repo_path,
                                        mechanism="extid",
                                        copy_revisions=False)

    # Create a new loader (to start with a clean slate, e.g. remove the caches),
    # with the new, partial, storage
    loader = HgLoader(new_storage, repo_path)

    assert get_stats(loader.storage) == {
        "content": 0,
        "directory": 0,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    }

    assert loader.load() == {"status": "eventful"}

    assert get_stats(loader.storage) == {
        "content": 3,
        "directory": 3,
        "origin": 1,
        "origin_visit": 2,
        "release": 1,
        "revision": 3,
        "skipped_content": 0,
        "snapshot": 1,
    }
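
# For context, a sketch of the `_partial_copy_storage` helper used above,
# reduced to the "extid" path exercised by this test. The storage calls are
# modelled on the swh.storage API and are assumptions, not the verbatim helper:
from swh.model.swhids import ObjectType
from swh.storage import get_storage
from swh.storage.algos.snapshot import snapshot_get_latest


def _partial_copy_storage(old_storage, origin_url, *, mechanism, copy_revisions):
    """Copy only the ExtIDs (and optionally the head revisions), plus the
    origin, its latest visit and its snapshot, into a fresh storage."""
    assert mechanism == "extid"  # the only path used in this test
    new_storage = get_storage(cls="memory")

    snapshot = snapshot_get_latest(old_storage, origin_url)
    heads = [branch.target for branch in snapshot.branches.values()]

    extids = old_storage.extid_get_from_target(ObjectType.REVISION, heads)
    new_storage.extid_add(extids)
    if copy_revisions:
        new_storage.revision_add(old_storage.revision_get(heads))

    # carry over the origin, its latest visit/status, and the snapshot
    new_storage.origin_add(old_storage.origin_get([origin_url]))
    visit = old_storage.origin_visit_get_latest(origin_url)
    new_storage.origin_visit_add([visit])
    statuses = old_storage.origin_visit_status_get(origin_url, visit.visit).results
    new_storage.origin_visit_status_add(statuses)
    new_storage.snapshot_add([snapshot])
    return new_storage
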
def test_npm_loader_incremental_visit(swh_storage, requests_mock_datadir_visits):
    package = "org"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    expected_snapshot_id = hash_to_bytes("0996ca28d6280499abcf485b51c4e3941b057249")
    actual_load_status = loader.load()
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }
    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
    )

    stats = get_stats(swh_storage)

    assert {
        "content": len(_expected_new_contents_first_visit),
        "directory": len(_expected_new_directories_first_visit),
        "origin": 1,
        "origin_visit": 1,
        "release": len(_expected_new_releases_first_visit),
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    # reset loader internal state
    del loader._cached_info
    del loader._cached__raw_info

    actual_load_status2 = loader.load()
    assert actual_load_status2["status"] == "eventful"
    snap_id2 = actual_load_status2["snapshot_id"]
    assert snap_id2 is not None
    assert snap_id2 != actual_load_status["snapshot_id"]

    assert_last_visit_matches(swh_storage, url, status="full", type="npm")

    stats = get_stats(swh_storage)

    assert {  # 3 new release artifacts
        "content": len(_expected_new_contents_first_visit) + 14,
        "directory": len(_expected_new_directories_first_visit) + 15,
        "origin": 1,
        "origin_visit": 2,
        "release": len(_expected_new_releases_first_visit) + 3,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 2,
    } == stats

    urls = [
        m.url
        for m in requests_mock_datadir_visits.request_history
        if m.url.startswith("https://registry.npmjs.org")
    ]
    assert len(urls) == len(set(urls))  # we visited each artifact once across 2 visits
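
# `package_url` maps an npm package name to its origin URL; a one-line sketch
# assumed to match the helper used by these npm tests:
def package_url(package: str) -> str:
    return f"https://www.npmjs.com/package/{package}"
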
def test_loader_hg_extid_filtering(swh_storage, datadir, tmp_path):
    """The first visit of a fork should filter already seen revisions (through extids)"""
    archive_name = "the-sandbox"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)

    loader = HgLoader(swh_storage, url=repo_url)

    assert loader.load() == {"status": "eventful"}
    stats = get_stats(loader.storage)
    expected_stats = {
        "content": 2,
        "directory": 3,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 58,
        "skipped_content": 0,
        "snapshot": 1,
    }
    assert stats == expected_stats

    visit_status = assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="hg",
    )

    # Make a fork of the first repository we ingested
    fork_url = prepare_repository_from_archive(archive_path,
                                               "the-sandbox-reloaded",
                                               tmp_path)
    loader2 = HgLoader(swh_storage,
                       url=fork_url,
                       directory=str(tmp_path / archive_name))

    assert loader2.load() == {"status": "uneventful"}

    stats = get_stats(loader.storage)
    expected_stats2 = expected_stats.copy()
    expected_stats2.update({
        "origin": 1 + 1,
        "origin_visit": 1 + 1,
    })
    assert stats == expected_stats2

    visit_status2 = assert_last_visit_matches(
        loader.storage,
        fork_url,
        status="full",
        type="hg",
    )
    assert visit_status.snapshot is not None
    assert visit_status2.snapshot == visit_status.snapshot
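
# Sketch of the `assert_last_visit_matches` helper used throughout these
# tests: it fetches the latest visit status of an origin and checks its
# type/status/snapshot, returning it for further assertions (modelled on
# swh.loader.tests; details are assumptions):
from swh.storage.algos.origin import origin_get_latest_visit_status


def assert_last_visit_matches(storage, url, status, type=None, snapshot=None):
    visit_status = origin_get_latest_visit_status(storage, url)
    assert visit_status is not None, f"no visit found for origin {url}"
    assert visit_status.status == status
    if type is not None:
        assert visit_status.type == type
    if snapshot is not None:
        assert visit_status.snapshot == snapshot
    return visit_status
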
def test_load_unchanged_repo_should_be_uneventful(
    swh_storage,
    datadir,
    tmp_path,
):
    """Checks the loader can find which revisions it already loaded, using ExtIDs."""
    archive_name = "hello"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)
    repo_path = repo_url.replace("file://", "")

    loader = HgLoader(swh_storage, repo_path)

    assert loader.load() == {"status": "eventful"}
    assert get_stats(loader.storage) == {
        "content": 3,
        "directory": 3,
        "origin": 1,
        "origin_visit": 1,
        "release": 1,
        "revision": 3,
        "skipped_content": 0,
        "snapshot": 1,
    }
    visit_status = assert_last_visit_matches(
        loader.storage,
        repo_path,
        type=RevisionType.MERCURIAL.value,
        status="full",
    )
    assert visit_status.snapshot is not None

    # Create a new loader (to start with a clean slate, e.g. remove the caches),
    # on the same storage
    loader2 = HgLoader(swh_storage, repo_path)
    assert loader2.load() == {"status": "uneventful"}

    # Should have all the objects
    assert get_stats(loader.storage) == {
        "content": 3,
        "directory": 3,
        "origin": 1,
        "origin_visit": 2,
        "release": 1,
        "revision": 3,
        "skipped_content": 0,
        "snapshot": 1,
    }
    visit_status2 = assert_last_visit_matches(
        loader2.storage,
        repo_path,
        type=RevisionType.MERCURIAL.value,
        status="full",
    )
    assert visit_status2.snapshot == visit_status.snapshot
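
# Sketch of `prepare_repository_from_archive`: it uncompresses a test
# repository tarball into tmp_path and returns a file:// URL pointing at it
# (modelled on the swh.loader.tests helper):
import os

from swh.core import tarball


def prepare_repository_from_archive(archive_path, filename=None, tmp_path="/tmp"):
    tarball.uncompress(archive_path, dest=str(tmp_path))
    filename = filename or os.path.basename(archive_path)
    return f"file://{tmp_path}/{filename}"
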
    def test_load(self):
        """Loads a simple repository (made available by `setUp()`),
        and checks everything was added in the storage."""
        res = self.loader.load()

        assert res == {"status": "eventful"}

        assert_last_visit_matches(
            self.loader.storage,
            self.repo_url,
            status="full",
            type="git",
            snapshot=SNAPSHOT1.id,
        )

        stats = get_stats(self.loader.storage)
        assert stats == {
            "content": 4,
            "directory": 7,
            "origin": 1,
            "origin_visit": 1,
            "release": 0,
            "revision": 7,
            "skipped_content": 0,
            "snapshot": 1,
        }

        check_snapshot(SNAPSHOT1, self.loader.storage)
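
# `check_snapshot` asserts the expected snapshot was actually stored; a
# simplified sketch (the real swh.loader.tests helper additionally walks the
# branches and checks the targeted objects exist):
from swh.storage.algos.snapshot import snapshot_get_all_branches


def check_snapshot(expected_snapshot, storage):
    snapshot = snapshot_get_all_branches(storage, expected_snapshot.id)
    assert snapshot == expected_snapshot
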
def test_pypi_no_release_artifact(swh_storage, requests_mock_datadir_missing_all):
    """Load a pypi project with all artifacts missing ends up with no snapshot"""
    url = "https://pypi.org/project/0805nexter"
    loader = PyPILoader(swh_storage, url)

    actual_load_status = loader.load()
    assert actual_load_status["status"] == "uneventful"
    assert actual_load_status["snapshot_id"] is not None

    empty_snapshot = Snapshot(branches={})

    assert_last_visit_matches(
        swh_storage, url, status="partial", type="pypi", snapshot=empty_snapshot.id
    )

    stats = get_stats(swh_storage)
    assert {
        "content": 0,
        "directory": 0,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
def test_pypi_release_with_traceback(swh_storage, requests_mock_datadir):
    url = "https://pypi.org/project/0805nexter"
    with patch(
        "swh.loader.package.pypi.loader.PyPILoader.last_snapshot",
        side_effect=ValueError("Fake problem to fail the visit"),
    ):
        loader = PyPILoader(swh_storage, url)

        actual_load_status = loader.load()
        assert actual_load_status == {"status": "failed"}

        assert_last_visit_matches(swh_storage, url, status="failed", type="pypi")

        stats = get_stats(swh_storage)

        assert {
            "content": 0,
            "directory": 0,
            "origin": 1,
            "origin_visit": 1,
            "release": 0,
            "revision": 0,
            "skipped_content": 0,
            "snapshot": 0,
        } == stats
    def test_load_despite_dulwich_exception(self, mocker, failure_exception):
        """Checks repository can still be loaded when dulwich raises exception
        when encountering a repository with dumb transfer protocol.
        """

        fetch_pack_from_origin = mocker.patch(
            "swh.loader.git.loader.GitLoader.fetch_pack_from_origin")

        fetch_pack_from_origin.side_effect = failure_exception("failure")

        res = self.loader.load()

        assert res == {"status": "eventful"}

        stats = get_stats(self.loader.storage)
        assert stats == {
            "content": 4,
            "directory": 7,
            "origin": 1,
            "origin_visit": 1,
            "release": 0,
            "revision": 7,
            "skipped_content": 0,
            "snapshot": 1,
        }
def test_archive_visit_with_no_artifact_found(swh_storage,
                                              requests_mock_datadir):
    url = URL
    unknown_artifact_url = "https://ftp.g.o/unknown/8sync-0.1.0.tar.gz"
    loader = ArchiveLoader(
        swh_storage,
        url,
        artifacts=[{
            "time": 944729610,
            "url": unknown_artifact_url,  # unknown artifact
            "length": 221837,
            "filename": "8sync-0.1.0.tar.gz",
            "version": "0.1.0",
        }],
    )

    actual_load_status = loader.load()
    assert actual_load_status["status"] == "uneventful"
    assert actual_load_status["snapshot_id"] is not None
    stats = get_stats(swh_storage)

    assert {
        "content": 0,
        "directory": 0,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    assert_last_visit_matches(swh_storage, url, status="partial", type="tar")
def test_deposit_loading_unknown_deposit(swh_storage, deposit_client,
                                         requests_mock_datadir):
    """Loading an unknown deposit should fail

    no origin, no visit, no snapshot
    """
    # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/'
    url = "some-url"
    unknown_deposit_id = 667
    loader = DepositLoader(
        swh_storage,
        url,
        unknown_deposit_id,
        deposit_client,
        default_filename="archive.zip",
    )  # does not exist

    actual_load_status = loader.load()
    assert actual_load_status == {"status": "failed"}

    stats = get_stats(loader.storage)

    assert {
        "content": 0,
        "directory": 0,
        "origin": 0,
        "origin_visit": 0,
        "release": 0,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 0,
    } == stats
    def test_load_dangling_symref(self):
        with open(os.path.join(self.destination_path, ".git/HEAD"), "wb") as f:
            f.write(b"ref: refs/heads/dangling-branch\n")

        res = self.loader.load()
        assert res == {"status": "eventful"}

        visit_status = assert_last_visit_matches(
            self.loader.storage, self.repo_url, status="full", type="git"
        )
        snapshot_id = visit_status.snapshot
        assert snapshot_id is not None

        snapshot = snapshot_get_all_branches(self.loader.storage, snapshot_id)
        branches = snapshot.branches

        assert branches[b"HEAD"] == SnapshotBranch(
            target=b"refs/heads/dangling-branch",
            target_type=TargetType.ALIAS,
        )
        assert branches[b"refs/heads/dangling-branch"] is None

        stats = get_stats(self.loader.storage)
        assert stats == {
            "content": 4,
            "directory": 7,
            "origin": 1,
            "origin_visit": 1,
            "release": 0,
            "revision": 7,
            "skipped_content": 0,
            "snapshot": 1,
        }
def test_load_repo_with_new_commits(swh_storage, datadir, tmp_path):
    archive_name = "hello"
    archive_path = Path(datadir, f"{archive_name}.tgz")
    json_path = Path(datadir, f"{archive_name}.json")
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)

    # first load with missing commits
    hg_strip(repo_url.replace("file://", ""), "tip")
    loader = HgLoader(swh_storage, repo_url)
    assert loader.load() == {"status": "eventful"}
    assert get_stats(loader.storage) == {
        "content": 2,
        "directory": 2,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 2,
        "skipped_content": 0,
        "snapshot": 1,
    }

    # second load with all commits
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)
    loader = HgLoader(swh_storage, repo_url)
    checker = LoaderChecker(
        loader=loader,
        expected=ExpectedSwhids.load(json_path),
    )

    checker.check()

    assert get_stats(loader.storage) == {
        "content": 3,
        "directory": 3,
        "origin": 1,
        "origin_visit": 2,
        "release": 1,
        "revision": 3,
        "skipped_content": 0,
        "snapshot": 2,
    }
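
# Sketch of the `hg_strip` helper used above to drop the tip commit before
# the first load; shelling out to the hg CLI like this is an assumption:
import subprocess


def hg_strip(repo: str, revset: str) -> None:
    """Remove the given revset and its descendants from a local repo."""
    subprocess.check_call(
        ["hg", "--config", "extensions.strip=", "strip", revset], cwd=repo
    )
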
def test_deposit_loading_failure_to_retrieve_1_artifact(
        swh_storage, deposit_client, requests_mock_datadir_missing_one):
    """Deposit with missing artifact ends up with an uneventful/partial visit"""
    # private api url form: 'https://deposit.s.o/1/private/hal/666/raw/'
    url = "some-url-2"
    deposit_id = 666
    requests_mock_datadir_missing_one.put(re.compile("https"))
    loader = DepositLoader(swh_storage,
                           url,
                           deposit_id,
                           deposit_client,
                           default_filename="archive.zip")

    actual_load_status = loader.load()
    assert actual_load_status["status"] == "uneventful"
    assert actual_load_status["snapshot_id"] is not None

    assert_last_visit_matches(loader.storage,
                              url,
                              status="partial",
                              type="deposit")

    stats = get_stats(loader.storage)
    assert {
        "content": 0,
        "directory": 0,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    # Retrieve the deposit status update query sent to the deposit server
    urls = [
        m for m in requests_mock_datadir_missing_one.request_history
        if m.url == f"{DEPOSIT_URL}/{deposit_id}/update/"
    ]

    assert len(urls) == 1
    update_query = urls[0]

    body = update_query.json()
    expected_body = {
        "status": "failed",
        "status_detail": {
            "loading": [
                "Failed to load branch HEAD for some-url-2: Fail to query "
                "'https://deposit.softwareheritage.org/1/private/666/raw/'. Reason: 404"
            ]
        },
    }

    assert body == expected_body
    def test_load_unchanged(self):
        """Checks loading a repository a second time does not add
        any extra data."""
        res = self.loader.load()
        assert res == {"status": "eventful"}

        assert_last_visit_matches(
            self.loader.storage,
            self.repo_url,
            status="full",
            type="git",
            snapshot=SNAPSHOT1.id,
        )

        stats0 = get_stats(self.loader.storage)
        assert stats0 == {
            "content": 4,
            "directory": 7,
            "origin": 1,
            "origin_visit": 1,
            "release": 0,
            "revision": 7,
            "skipped_content": 0,
            "snapshot": 1,
        }

        res = self.loader.load()
        assert res == {"status": "uneventful"}
        stats1 = get_stats(self.loader.storage)
        expected_stats = copy.deepcopy(stats0)
        expected_stats["origin_visit"] += 1
        assert stats1 == expected_stats

        check_snapshot(SNAPSHOT1, self.loader.storage)

        assert_last_visit_matches(
            self.loader.storage,
            self.repo_url,
            status="full",
            type="git",
            snapshot=SNAPSHOT1.id,
        )
def test_cran_fail_to_build_or_load_extrinsic_metadata(
    method_name, swh_storage, requests_mock_datadir
):
    """problem during loading: {visit: failed, status: failed, no snapshot}"""
    version = "2.22-6"
    base_url = "https://cran.r-project.org"
    origin_url = f"{base_url}/Packages/Recommended_KernSmooth/index.html"
    artifact_url = (
        f"{base_url}/src_contrib_1.4.0_Recommended_KernSmooth_{version}.tar.gz"  # noqa
    )

    full_method_name = f"swh.loader.package.cran.loader.CRANLoader.{method_name}"
    with patch(
        full_method_name,
        side_effect=ValueError("Fake to fail to build or load extrinsic metadata"),
    ):
        loader = CRANLoader(
            swh_storage,
            origin_url,
            artifacts=[
                {
                    "url": artifact_url,
                    "version": version,
                    "package": "Recommended_KernSmooth",
                }
            ],
        )

        actual_load_status = loader.load()

        assert actual_load_status == {
            "status": "failed",
            "snapshot_id": SNAPSHOT.id.hex(),
        }

        visit_stats = get_stats(swh_storage)
        assert {
            "content": 33,
            "directory": 7,
            "origin": 1,
            "origin_visit": 1,
            "release": 1,
            "revision": 0,
            "skipped_content": 0,
            "snapshot": 1,
        } == visit_stats

        assert_last_visit_matches(
            swh_storage, origin_url, status="partial", type="cran", snapshot=SNAPSHOT.id
        )
def test_pypi_visit_with_missing_artifact(
    swh_storage, requests_mock_datadir_missing_one
):
    """Load a pypi project with some missing artifacts ends up with 1 snapshot"""
    url = "https://pypi.org/project/0805nexter"
    loader = PyPILoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("00785a38479abe5fbfa402df96be26d2ddf89c97")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage,
        url,
        status="partial",
        type="pypi",
        snapshot=expected_snapshot_id,
    )

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"releases/1.2.0": SnapshotBranch(
                target=hash_to_bytes("fbbcb817f01111b06442cdcc93140ab3cc777d68"),
                target_type=TargetType.RELEASE,
            ),
            b"HEAD": SnapshotBranch(
                target=b"releases/1.2.0",
                target_type=TargetType.ALIAS,
            ),
        },
    )
    check_snapshot(expected_snapshot, storage=swh_storage)

    stats = get_stats(swh_storage)

    assert {
        "content": 3,
        "directory": 2,
        "origin": 1,
        "origin_visit": 1,
        "release": 1,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
def test_pypi_visit_with_1_release_artifact(swh_storage, requests_mock_datadir):
    """With no prior visit, load a pypi project ends up with 1 snapshot"""
    url = "https://pypi.org/project/0805nexter"
    loader = PyPILoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("3dd50c1a0e48a7625cf1427e3190a65b787c774e")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id
    )

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"releases/1.1.0": SnapshotBranch(
                target=hash_to_bytes("f8789ff3ed70a5f570c35d885c7bcfda7b23b091"),
                target_type=TargetType.RELEASE,
            ),
            b"releases/1.2.0": SnapshotBranch(
                target=hash_to_bytes("fbbcb817f01111b06442cdcc93140ab3cc777d68"),
                target_type=TargetType.RELEASE,
            ),
            b"HEAD": SnapshotBranch(
                target=b"releases/1.2.0",
                target_type=TargetType.ALIAS,
            ),
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    stats = get_stats(swh_storage)
    assert {
        "content": 6,
        "directory": 4,
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
def test_npm_loader_version_divergence(swh_storage):
    package = "@aller/shared"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("68eed3d3bc852e7f435a84f18ee77e23f6884be2")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }
    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
    )

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"HEAD": SnapshotBranch(
                target_type=TargetType.ALIAS, target=b"releases/0.1.0"
            ),
            b"releases/0.1.0": SnapshotBranch(
                target_type=TargetType.RELEASE,
                target=hash_to_bytes("0c486b50b407f847ef7581f595c2b6c2062f1089"),
            ),
            b"releases/0.1.1-alpha.14": SnapshotBranch(
                target_type=TargetType.RELEASE,
                target=hash_to_bytes("79d80c87c0a8d104a216cc539baad962a454802a"),
            ),
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    stats = get_stats(swh_storage)

    assert {  # 1 new release artifact
        "content": 534,
        "directory": 153,
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
def test_maven_loader_first_visit(swh_storage,
                                  expected_contents_and_directories,
                                  expected_snapshot, expected_releases):
    """With no prior visit, loading a jar ends up with 1 snapshot"""

    loader = MavenLoader(swh_storage, MVN_ORIGIN_URL, artifacts=MVN_ARTIFACTS)

    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"

    actual_snapshot = snapshot_get_all_branches(
        swh_storage, hash_to_bytes(actual_load_status["snapshot_id"]))

    assert actual_load_status["snapshot_id"] == expected_snapshot.id.hex()
    check_snapshot(expected_snapshot, swh_storage)

    stats = get_stats(swh_storage)
    assert_last_visit_matches(swh_storage,
                              MVN_ORIGIN_URL,
                              status="full",
                              type="maven")

    expected_contents, expected_directories = expected_contents_and_directories
    assert list(swh_storage.content_missing_per_sha1(expected_contents)) == []
    assert list(swh_storage.directory_missing(expected_directories)) == []

    rel_id = actual_snapshot.branches[b"releases/0.1.0"].target
    rel2_id = actual_snapshot.branches[b"releases/0.1.1"].target
    releases = swh_storage.release_get([rel_id, rel2_id])

    assert releases == expected_releases

    assert {
        "content": len(expected_contents),
        "directory": len(expected_directories),
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
def test_maven_loader_visit_with_no_artifact_found(swh_storage,
                                                   requests_mock_datadir):
    origin_url = "https://ftp.g.o/unknown"
    unknown_artifact_url = "https://ftp.g.o/unknown/8sync-0.1.0.tar.gz"
    loader = MavenLoader(
        swh_storage,
        origin_url,
        artifacts=[{
            "time": "2021-07-18 08:05:05.187000",
            "url": unknown_artifact_url,  # unknown artifact
            "filename": "8sync-0.1.0.tar.gz",
            "gid": "al/aldi",
            "aid": "sprova4j",
            "version": "0.1.0",
            "base_url": "https://repo1.maven.org/maven2/",
        }],
    )

    actual_load_status = loader.load()
    assert actual_load_status["status"] == "uneventful"
    assert actual_load_status["snapshot_id"] is not None

    expected_snapshot_id = "1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"
    assert actual_load_status["snapshot_id"] == expected_snapshot_id

    stats = get_stats(swh_storage)

    assert_last_visit_matches(swh_storage,
                              origin_url,
                              status="partial",
                              type="maven")

    assert {
        "content": 0,
        "directory": 0,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
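
# The snapshot id asserted above is the well-known id of the empty snapshot:
# snapshot ids are computed intrinsically from their branches, so a visit
# that found no artifact always yields the same id.
from swh.model.model import Snapshot

assert Snapshot(branches={}).id.hex() == "1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"
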
    def test_load_empty_repository(self, mocker):
        class GitObjectsFetcherNoRefs(dumb.GitObjectsFetcher):
            def _get_refs(self):
                return {}

        mocker.patch.object(dumb, "GitObjectsFetcher", GitObjectsFetcherNoRefs)

        res = self.loader.load()

        assert res == {"status": "uneventful"}

        stats = get_stats(self.loader.storage)
        assert stats == {
            "content": 0,
            "directory": 0,
            "origin": 1,
            "origin_visit": 1,
            "release": 0,
            "revision": 0,
            "skipped_content": 0,
            "snapshot": 1,
        }
def test_npm_loader_first_visit(swh_storage, requests_mock_datadir, org_api_info):
    package = "org"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("0996ca28d6280499abcf485b51c4e3941b057249")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
    )

    release_id = "d38cc0b571cd41f3c85513864e049766b42032a7"
    versions = [
        ("0.0.2", release_id),
        ("0.0.3", "62bf7076bae9aa2cb4d6cb3bf7ce0ea4fdd5b295"),
        ("0.0.4", "6e976db82f6c310596b21fb0ed8b11f507631434"),
    ]

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"HEAD": SnapshotBranch(
                target=b"releases/0.0.4", target_type=TargetType.ALIAS
            ),
            **{
                b"releases/"
                + version_name.encode(): SnapshotBranch(
                    target=hash_to_bytes(version_id),
                    target_type=TargetType.RELEASE,
                )
                for (version_name, version_id) in versions
            },
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    assert swh_storage.release_get([hash_to_bytes(release_id)])[0] == Release(
        name=b"0.0.2",
        message=b"Synthetic release for NPM source package org version 0.0.2\n",
        target=hash_to_bytes("42753c0c2ab00c4501b552ac4671c68f3cf5aece"),
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
        author=Person(
            fullname=b"mooz <*****@*****.**>",
            name=b"mooz",
            email=b"*****@*****.**",
        ),
        date=TimestampWithTimezone.from_datetime(
            datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc)
        ),
        id=hash_to_bytes(release_id),
    )

    contents = swh_storage.content_get(_expected_new_contents_first_visit)
    count = sum(0 if content is None else 1 for content in contents)
    assert count == len(_expected_new_contents_first_visit)

    assert (
        list(swh_storage.directory_missing(_expected_new_directories_first_visit)) == []
    )

    assert list(swh_storage.release_missing(_expected_new_releases_first_visit)) == []

    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url="https://npmjs.com/",
    )

    for (version_name, release_id) in versions:
        release = swh_storage.release_get([hash_to_bytes(release_id)])[0]
        assert release.target_type == ModelObjectType.DIRECTORY
        directory_id = release.target
        directory_swhid = ExtendedSWHID(
            object_type=ExtendedObjectType.DIRECTORY,
            object_id=directory_id,
        )
        release_swhid = CoreSWHID(
            object_type=ObjectType.RELEASE,
            object_id=hash_to_bytes(release_id),
        )
        expected_metadata = [
            RawExtrinsicMetadata(
                target=directory_swhid,
                authority=metadata_authority,
                fetcher=MetadataFetcher(
                    name="swh.loader.package.npm.loader.NpmLoader",
                    version=__version__,
                ),
                discovery_date=loader.visit_date,
                format="replicate-npm-package-json",
                metadata=json.dumps(
                    json.loads(org_api_info)["versions"][version_name]
                ).encode(),
                origin="https://www.npmjs.com/package/org",
                release=release_swhid,
            )
        ]
        assert swh_storage.raw_extrinsic_metadata_get(
            directory_swhid,
            metadata_authority,
        ) == PagedResult(
            next_page_token=None,
            results=expected_metadata,
        )

    stats = get_stats(swh_storage)

    assert {
        "content": len(_expected_new_contents_first_visit),
        "directory": len(_expected_new_directories_first_visit),
        "origin": 1,
        "origin_visit": 1,
        "release": len(_expected_new_releases_first_visit),
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
def test_arch_loader_load_one_version(datadir, requests_mock_datadir,
                                      swh_storage):
    loader = ArchLoader(
        swh_storage,
        url=EXPECTED_PACKAGES[1]["url"],
        artifacts=EXPECTED_PACKAGES[1]["artifacts"],
    )
    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"
    assert actual_load_status["snapshot_id"] is not None

    expected_snapshot_id = "4020d0a278027550e336b5481a4159a913c91aa4"
    expected_release_id = "7681098c9e381f9cc8bd1724d57eeee2182982dc"

    assert expected_snapshot_id == actual_load_status["snapshot_id"]

    expected_snapshot = Snapshot(
        id=hash_to_bytes(actual_load_status["snapshot_id"]),
        branches={
            b"releases/1.12-1/gzip-1.12-1-aarch64.pkg.tar.xz":
            SnapshotBranch(
                target=hash_to_bytes(expected_release_id),
                target_type=TargetType.RELEASE,
            ),
            b"HEAD":
            SnapshotBranch(
                target=b"releases/1.12-1/gzip-1.12-1-aarch64.pkg.tar.xz",
                target_type=TargetType.ALIAS,
            ),
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    stats = get_stats(swh_storage)
    assert {
        "content": 1,
        "directory": 1,
        "origin": 1,
        "origin_visit": 1,
        "release": 1,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    assert swh_storage.release_get([
        hash_to_bytes(expected_release_id)
    ])[0] == Release(
        name=b"1.12-1",
        message=b"Synthetic release for Arch Linux source package gzip version "
        b"1.12-1\n\nGNU compression utility\n",
        target=hash_to_bytes("bd742aaf422953a1f7a5e084ec4a7477491d63fb"),
        target_type=ObjectType.DIRECTORY,
        synthetic=True,
        author=Person.from_fullname(
            b"Arch Linux ARM Build System <*****@*****.**>"),
        date=TimestampWithTimezone.from_iso8601("2022-04-07T21:08:14+00:00"),
        id=hash_to_bytes(expected_release_id),
    )

    assert_last_visit_matches(
        swh_storage,
        url=EXPECTED_PACKAGES[1]["url"],
        status="full",
        type="arch",
        snapshot=expected_snapshot.id,
    )
def test_arch_loader_load_n_versions(datadir, requests_mock_datadir,
                                     swh_storage):

    loader = ArchLoader(
        swh_storage,
        url=EXPECTED_PACKAGES[0]["url"],
        artifacts=EXPECTED_PACKAGES[0]["artifacts"],
    )
    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"
    assert actual_load_status["snapshot_id"] is not None

    expected_snapshot_id = "832139d69a91edffcc3a96cca11deaf9255041c3"

    assert expected_snapshot_id == actual_load_status["snapshot_id"]

    expected_snapshot = Snapshot(
        id=hash_to_bytes(actual_load_status["snapshot_id"]),
        branches={
            b"releases/1:1.3_20190211-1/"
            b"dialog-1:1.3_20190211-1-x86_64.pkg.tar.xz":
            SnapshotBranch(
                target=hash_to_bytes(
                    "37efb727ff8bb8fbf92518aa8fe5fff2ad427d06"),
                target_type=TargetType.RELEASE,
            ),
            b"releases/1:1.3_20220414-1/"
            b"dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst":
            SnapshotBranch(
                target=hash_to_bytes(
                    "020d3f5627df7474f257fd04f1ede4415296e265"),
                target_type=TargetType.RELEASE,
            ),
            b"HEAD":
            SnapshotBranch(
                target=
                b"releases/1:1.3_20220414-1/dialog-1:1.3_20220414-1-x86_64.pkg.tar.zst",
                target_type=TargetType.ALIAS,
            ),
        },
    )

    check_snapshot(expected_snapshot, swh_storage)

    stats = get_stats(swh_storage)
    assert {
        "content": 2,
        "directory": 2,
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    assert_last_visit_matches(
        swh_storage,
        url=EXPECTED_PACKAGES[0]["url"],
        status="full",
        type="arch",
        snapshot=expected_snapshot.id,
    )
def test_pypi_incremental_visit(swh_storage, requests_mock_datadir_visits):
    """With prior visit, 2nd load will result with a different snapshot"""
    url = "https://pypi.org/project/0805nexter"
    loader = PyPILoader(swh_storage, url)

    visit1_actual_load_status = loader.load()
    visit1_stats = get_stats(swh_storage)
    expected_snapshot_id = hash_to_bytes("3dd50c1a0e48a7625cf1427e3190a65b787c774e")
    assert visit1_actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id
    )

    assert {
        "content": 6,
        "directory": 4,
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == visit1_stats

    # Reset internal state
    del loader._cached__raw_info
    del loader._cached_info

    visit2_actual_load_status = loader.load()
    visit2_stats = get_stats(swh_storage)

    assert visit2_actual_load_status["status"] == "eventful", visit2_actual_load_status
    expected_snapshot_id2 = hash_to_bytes("77febe6ff0faf6cc00dd015a6c9763579a9fb6c7")
    assert visit2_actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id2.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot_id2
    )

    expected_snapshot = Snapshot(
        id=expected_snapshot_id2,
        branches={
            b"releases/1.1.0": SnapshotBranch(
                target=hash_to_bytes("f8789ff3ed70a5f570c35d885c7bcfda7b23b091"),
                target_type=TargetType.RELEASE,
            ),
            b"releases/1.2.0": SnapshotBranch(
                target=hash_to_bytes("fbbcb817f01111b06442cdcc93140ab3cc777d68"),
                target_type=TargetType.RELEASE,
            ),
            b"releases/1.3.0": SnapshotBranch(
                target=hash_to_bytes("a21b09cbec8e31f47307f196bb1f939effc26e11"),
                target_type=TargetType.RELEASE,
            ),
            b"HEAD": SnapshotBranch(
                target=b"releases/1.3.0",
                target_type=TargetType.ALIAS,
            ),
        },
    )

    assert_last_visit_matches(
        swh_storage, url, status="full", type="pypi", snapshot=expected_snapshot.id
    )

    check_snapshot(expected_snapshot, swh_storage)

    assert {
        "content": 6 + 1,  # 1 more content
        "directory": 4 + 2,  # 2 more directories
        "origin": 1,
        "origin_visit": 1 + 1,
        "release": 2 + 1,  # 1 more release
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1 + 1,  # 1 more snapshot
    } == visit2_stats

    urls = [
        m.url
        for m in requests_mock_datadir_visits.request_history
        if m.url.startswith("https://files.pythonhosted.org")
    ]
    # visited each artifact once across 2 visits
    assert len(urls) == len(set(urls))
def test_pypi_multiple_visits_with_no_change(swh_storage, requests_mock_datadir):
    """Multiple visits with no changes results in 1 same snapshot"""
    url = "https://pypi.org/project/0805nexter"
    loader = PyPILoader(swh_storage, url)

    actual_load_status = loader.load()
    snapshot_id = hash_to_bytes("3dd50c1a0e48a7625cf1427e3190a65b787c774e")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": snapshot_id.hex(),
    }
    assert_last_visit_matches(
        swh_storage, url, status="full", type="pypi", snapshot=snapshot_id
    )

    expected_snapshot = Snapshot(
        id=snapshot_id,
        branches={
            b"releases/1.1.0": SnapshotBranch(
                target=hash_to_bytes("f8789ff3ed70a5f570c35d885c7bcfda7b23b091"),
                target_type=TargetType.RELEASE,
            ),
            b"releases/1.2.0": SnapshotBranch(
                target=hash_to_bytes("fbbcb817f01111b06442cdcc93140ab3cc777d68"),
                target_type=TargetType.RELEASE,
            ),
            b"HEAD": SnapshotBranch(
                target=b"releases/1.2.0",
                target_type=TargetType.ALIAS,
            ),
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    stats = get_stats(swh_storage)

    assert {
        "content": 6,
        "directory": 4,
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    actual_load_status2 = loader.load()
    assert actual_load_status2 == {
        "status": "uneventful",
        "snapshot_id": actual_load_status2["snapshot_id"],
    }

    visit_status2 = assert_last_visit_matches(
        swh_storage, url, status="full", type="pypi"
    )

    stats2 = get_stats(swh_storage)
    expected_stats2 = stats.copy()
    expected_stats2["origin_visit"] = 1 + 1
    assert expected_stats2 == stats2

    # same snapshot
    assert visit_status2.snapshot == snapshot_id
def test_loader_one_visit(swh_storage, requests_mock_datadir, raw_sources):
    loader = NixGuixLoader(swh_storage, sources_url)
    load_status = loader.load()
    expected_snapshot_id = SNAPSHOT1.id
    expected_snapshot_id_hex = expected_snapshot_id.hex()
    assert load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id_hex,
    }

    release_id = SNAPSHOT1.branches[
        b"https://github.com/owner-1/repository-1/revision-1.tgz"].target
    check_snapshot(SNAPSHOT1, storage=swh_storage)

    assert swh_storage.release_get([release_id])[0] == Release(
        id=release_id,
        name=b"https://github.com/owner-1/repository-1/revision-1.tgz",
        message=None,
        target=hash_to_bytes("4de2e07d3742718d928e974b8a4c721b9f7b33bf"),
        target_type=ObjectType.DIRECTORY,
        synthetic=True,
        author=Person.from_fullname(b""),
        date=None,
    )

    stats = get_stats(swh_storage)
    assert {
        "content": 1,
        "directory": 3,
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    # The visit is partial because urls pointing to non-tarball files
    # are not handled yet
    assert_last_visit_matches(swh_storage,
                              sources_url,
                              status="partial",
                              type="nixguix")

    visit_status = origin_get_latest_visit_status(swh_storage, sources_url)
    snapshot_swhid = ExtendedSWHID(object_type=ExtendedObjectType.SNAPSHOT,
                                   object_id=visit_status.snapshot)
    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url=sources_url,
    )
    expected_metadata = [
        RawExtrinsicMetadata(
            target=snapshot_swhid,
            authority=metadata_authority,
            fetcher=MetadataFetcher(
                name="swh.loader.package.nixguix.loader.NixGuixLoader",
                version=__version__,
            ),
            discovery_date=loader.visit_date,
            format="nixguix-sources-json",
            metadata=raw_sources,
            origin=sources_url,
        )
    ]
    assert swh_storage.raw_extrinsic_metadata_get(
        snapshot_swhid,
        metadata_authority,
    ) == PagedResult(
        next_page_token=None,
        results=expected_metadata,
    )
def test_loader_two_visits(swh_storage, requests_mock_datadir_visits):
    """To ensure there is only one origin, but two visits, two revisions
    and two snapshots are created.

    The first visit creates a snapshot containing one tarball. The
    second visit creates a snapshot containing the same tarball and
    another tarball.

    """
    loader = NixGuixLoader(swh_storage, sources_url)
    load_status = loader.load()
    assert load_status == {
        "status": "eventful",
        "snapshot_id": SNAPSHOT1.id.hex()
    }

    assert_last_visit_matches(
        swh_storage,
        sources_url,
        status="partial",
        type="nixguix",
        snapshot=SNAPSHOT1.id,
    )

    check_snapshot(SNAPSHOT1, storage=swh_storage)

    stats = get_stats(swh_storage)
    assert {
        "content": 1,
        "directory": 3,
        "origin": 1,
        "origin_visit": 1,
        "release": 2,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    loader = NixGuixLoader(swh_storage, sources_url)
    load_status = loader.load()
    expected_snapshot_id_hex = "c1983a0a3f647548e1fb92f30339da6848fe9f7a"
    expected_snapshot_id = hash_to_bytes(expected_snapshot_id_hex)
    assert load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id_hex,
    }

    assert_last_visit_matches(
        swh_storage,
        sources_url,
        status="partial",
        type="nixguix",
        snapshot=expected_snapshot_id,
    )

    # This ensures visits are incremental: thanks to the
    # requests_mock_datadir_visits fixture, requesting the same url a
    # second time serves the file suffixed with `_visit1`.
    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"evaluation":
            SnapshotBranch(
                target=hash_to_bytes(
                    "602140776b2ce6c9159bcf52ada73a297c063d5e"),
                target_type=TargetType.REVISION,
            ),
            b"https://github.com/owner-1/repository-1/revision-1.tgz":
            SnapshotBranch(
                target=hash_to_bytes(
                    "df7811b9644ed8ef088e2e7add62ed32b0bab15f"),
                target_type=TargetType.RELEASE,
            ),
            b"https://github.com/owner-2/repository-1/revision-1.tgz":
            SnapshotBranch(
                target=hash_to_bytes(
                    "5cc0115cd643902b837cb6cfbc9f5865bc5a7cb2"),
                target_type=TargetType.RELEASE,
            ),
        },
    )
    check_snapshot(expected_snapshot, storage=swh_storage)

    stats = get_stats(swh_storage)
    assert {
        "content": 2,
        "directory": 5,
        "origin": 1,
        "origin_visit": 2,
        "release": 3,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 2,
    } == stats
    def test_load_changed(self):
        """Loads a repository, makes some changes by adding files, commits,
        and merges, load it again, and check the storage contains everything
        it should."""
        # Initial load
        res = self.loader.load()
        assert res == {"status": "eventful"}

        stats0 = get_stats(self.loader.storage)
        assert stats0 == {
            "content": 4,
            "directory": 7,
            "origin": 1,
            "origin_visit": 1,
            "release": 0,
            "revision": 7,
            "skipped_content": 0,
            "snapshot": 1,
        }

        # Load with a new file + revision
        with open(os.path.join(self.destination_path, "hello.py"), "a") as fd:
            fd.write("print('Hello world')\n")

        self.repo.stage([b"hello.py"])
        new_revision = self.repo.do_commit(b"Hello world\n").decode()
        new_dir = "85dae072a5aa9923ffa7a7568f819ff21bf49858"

        assert self.repo[new_revision.encode()].tree == new_dir.encode()

        revisions = REVISIONS1.copy()
        assert new_revision not in revisions
        revisions[new_revision] = new_dir

        res = self.loader.load()
        assert res == {"status": "eventful"}

        stats1 = get_stats(self.loader.storage)
        expected_stats = copy.deepcopy(stats0)
        # did one new visit
        expected_stats["origin_visit"] += 1
        # with one more of the following objects
        expected_stats["content"] += 1
        expected_stats["directory"] += 1
        expected_stats["revision"] += 1
        # concluding into 1 new snapshot
        expected_stats["snapshot"] += 1

        assert stats1 == expected_stats

        visit_status = assert_last_visit_matches(
            self.loader.storage, self.repo_url, status="full", type="git"
        )
        assert visit_status.snapshot is not None

        snapshot_id = visit_status.snapshot
        snapshot = snapshot_get_all_branches(self.loader.storage, snapshot_id)
        branches = snapshot.branches
        assert branches[b"HEAD"] == SnapshotBranch(
            target=b"refs/heads/master",
            target_type=TargetType.ALIAS,
        )
        assert branches[b"refs/heads/master"] == SnapshotBranch(
            target=hash_to_bytes(new_revision),
            target_type=TargetType.REVISION,
        )

        # Merge branch1 into HEAD.

        current = self.repo[b"HEAD"]
        branch1 = self.repo[b"refs/heads/branch1"]

        merged_tree = dulwich.objects.Tree()
        for item in self.repo[current.tree].items():
            merged_tree.add(*item)
        for item in self.repo[branch1.tree].items():
            merged_tree.add(*item)

        merged_dir_id = "dab8a37df8db8666d4e277bef9a546f585b5bedd"
        assert merged_tree.id.decode() == merged_dir_id
        self.repo.object_store.add_object(merged_tree)

        merge_commit = self.repo.do_commit(
            b"merge.\n", tree=merged_tree.id, merge_heads=[branch1.id]
        )

        assert merge_commit.decode() not in revisions
        revisions[merge_commit.decode()] = merged_tree.id.decode()

        res = self.loader.load()
        assert res == {"status": "eventful"}

        stats2 = get_stats(self.loader.storage)
        expected_stats = copy.deepcopy(stats1)
        # one more visit
        expected_stats["origin_visit"] += 1
        # with 1 new directory and revision
        expected_stats["directory"] += 1
        expected_stats["revision"] += 1
        # concluding into 1 new snapshot
        expected_stats["snapshot"] += 1

        assert stats2 == expected_stats

        visit_status = assert_last_visit_matches(
            self.loader.storage, self.repo_url, status="full", type="git"
        )
        assert visit_status.snapshot is not None

        merge_snapshot_id = visit_status.snapshot
        assert merge_snapshot_id != snapshot_id

        merge_snapshot = snapshot_get_all_branches(
            self.loader.storage, merge_snapshot_id
        )
        merge_branches = merge_snapshot.branches
        assert merge_branches[b"HEAD"] == SnapshotBranch(
            target=b"refs/heads/master",
            target_type=TargetType.ALIAS,
        )
        assert merge_branches[b"refs/heads/master"] == SnapshotBranch(
            target=hash_to_bytes(merge_commit.decode()),
            target_type=TargetType.REVISION,
        )