def test_loader_hg_extid_filtering(swh_storage, datadir, tmp_path):
    """The first visit of a fork should filter already seen revisions (through extids)"""
    archive_name = "the-sandbox"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)

    loader = HgLoader(swh_storage, url=repo_url)

    assert loader.load() == {"status": "eventful"}
    stats = get_stats(loader.storage)
    expected_stats = {
        "content": 2,
        "directory": 3,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 58,
        "skipped_content": 0,
        "snapshot": 1,
    }
    assert stats == expected_stats

    visit_status = assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="hg",
    )

    # Make a fork of the first repository we ingested
    fork_url = prepare_repository_from_archive(archive_path,
                                               "the-sandbox-reloaded",
                                               tmp_path)
    loader2 = HgLoader(swh_storage,
                       url=fork_url,
                       directory=str(tmp_path / archive_name))

    assert loader2.load() == {"status": "uneventful"}

    stats = get_stats(loader.storage)
    expected_stats2 = expected_stats.copy()
    expected_stats2.update({
        "origin": 1 + 1,
        "origin_visit": 1 + 1,
    })
    assert stats == expected_stats2

    visit_status2 = assert_last_visit_matches(
        loader.storage,
        fork_url,
        status="full",
        type="hg",
    )
    assert visit_status.snapshot is not None
    assert visit_status2.snapshot == visit_status.snapshot
def test_load_repo_check_extids_write_version(swh_storage, datadir, tmp_path):
    """ExtIDs should be stored with a given version when loading is done"""
    archive_name = "hello"
    archive_path = Path(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)

    hg_strip(repo_url.replace("file://", ""), "tip")
    loader = HgLoader(swh_storage, repo_url)
    assert loader.load() == {"status": "eventful"}

    # Ensure we write ExtIDs to a specific version.
    snapshot = snapshot_get_latest(swh_storage, repo_url)

    # First, collect the revision targets from that snapshot's branches
    revision_ids = [
        branch.target for branch in snapshot.branches.values()
        if branch.target_type == TargetType.REVISION
    ]

    assert len(revision_ids) > 0

    # Those revisions should have their associated ExtID version set to EXTID_VERSION
    extids = swh_storage.extid_get_from_target(ObjectType.REVISION,
                                               revision_ids)

    assert len(extids) == len(revision_ids)
    for extid in extids:
        assert extid.extid_version == EXTID_VERSION
def test_load_new_extid_should_be_eventful(swh_storage, datadir, tmp_path):
    """Changing the extid version should make loaders ignore existing extids,
    and load the repo again."""
    archive_name = "hello"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)
    repo_path = repo_url.replace("file://", "")

    with unittest.mock.patch("swh.loader.mercurial.loader.EXTID_VERSION", 0):
        loader = HgLoader(swh_storage, repo_path)
        assert loader.load() == {"status": "eventful"}

    loader = HgLoader(swh_storage, repo_path)
    assert loader.load() == {"status": "eventful"}

    loader = HgLoader(swh_storage, repo_path)
    assert loader.load() == {"status": "uneventful"}

    with unittest.mock.patch("swh.loader.mercurial.loader.EXTID_VERSION",
                             10000):
        loader = HgLoader(swh_storage, repo_path)
        assert loader.load() == {"status": "eventful"}

        loader = HgLoader(swh_storage, repo_path)
        assert loader.load() == {"status": "uneventful"}
def test_multiple_open_heads(swh_storage, datadir, tmp_path):
    archive_name = "multiple-heads"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)

    loader = HgLoader(
        storage=swh_storage,
        url=repo_url,
    )

    actual_load_status = loader.load()
    assert actual_load_status == {"status": "eventful"}

    assert_last_visit_matches(swh_storage, repo_url, status="full", type="hg")

    snapshot = snapshot_get_latest(swh_storage, repo_url)
    expected_branches = [
        b"HEAD",
        b"branch-heads/default/0",
        b"branch-heads/default/1",
        b"branch-tip/default",
    ]
    assert sorted(snapshot.branches.keys()) == expected_branches

    # Check that we don't load anything the second time
    loader = HgLoader(
        storage=swh_storage,
        url=repo_url,
    )

    actual_load_status = loader.load()

    assert actual_load_status == {"status": "uneventful"}
def test_closed_branch_incremental(swh_storage, datadir, tmp_path):
    """Test that a repository with a closed branch does not trip an incremental load"""
    archive_name = "example"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)
    repo_path = repo_url.replace("file://", "")

    loader = HgLoader(swh_storage, repo_path)

    # Test 3 loads: full, and two incremental.
    assert loader.load() == {"status": "eventful"}
    expected_stats = {
        "content": 7,
        "directory": 16,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 9,
        "skipped_content": 0,
        "snapshot": 1,
    }
    assert get_stats(loader.storage) == expected_stats
    assert loader.load() == {"status": "uneventful"}
    assert get_stats(loader.storage) == {
        **expected_stats, "origin_visit": 1 + 1
    }
    assert loader.load() == {"status": "uneventful"}
    assert get_stats(loader.storage) == {
        **expected_stats, "origin_visit": 2 + 1
    }
    def init(self, swh_storage, datadir, tmp_path, mocker):
        archive_name = "testrepo"
        archive_path = os.path.join(datadir, f"{archive_name}.tgz")
        tmp_path = str(tmp_path)
        self.repo_url = prepare_repository_from_archive(archive_path,
                                                        archive_name,
                                                        tmp_path=tmp_path)
        self.destination_path = os.path.join(tmp_path, archive_name)

        self.fetcher = MagicMock()
        self.fetcher.get_origin_metadata.return_value = []
        self.fetcher.get_parent_origins.return_value = [
            Origin(url=f"base://{self.repo_url}")
        ]
        self.fetcher_cls = MagicMock(return_value=self.fetcher)
        self.fetcher_cls.SUPPORTED_LISTERS = ["fake-lister"]
        mocker.patch(
            "swh.loader.core.metadata_fetchers._fetchers",
            return_value=[self.fetcher_cls],
        )

        self.loader = GitLoader(
            MagicMock(wraps=swh_storage),
            self.repo_url,
            lister_name="fake-lister",
            lister_instance_name="",
        )
        self.repo = dulwich.repo.Repo(self.destination_path)
def test_load_unchanged_repo__dangling_extid(swh_storage, datadir, tmp_path):
    """Checks the loader will load revisions targeted by an ExtID if the
    revisions are missing from the storage"""
    archive_name = "hello"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)
    repo_path = repo_url.replace("file://", "")

    loader = HgLoader(swh_storage, repo_path)

    assert loader.load() == {"status": "eventful"}
    assert get_stats(loader.storage) == {
        "content": 3,
        "directory": 3,
        "origin": 1,
        "origin_visit": 1,
        "release": 1,
        "revision": 3,
        "skipped_content": 0,
        "snapshot": 1,
    }

    old_storage = swh_storage

    # Create a new storage, and only copy ExtIDs or head revisions to it.
    # This should be enough for the loader to know revisions were already loaded
    new_storage = _partial_copy_storage(old_storage,
                                        repo_path,
                                        mechanism="extid",
                                        copy_revisions=False)

    # Create a new loader (to start with a clean slate, e.g. remove the caches),
    # with the new, partial, storage
    loader = HgLoader(new_storage, repo_path)

    assert get_stats(loader.storage) == {
        "content": 0,
        "directory": 0,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    }

    assert loader.load() == {"status": "eventful"}

    assert get_stats(loader.storage) == {
        "content": 3,
        "directory": 3,
        "origin": 1,
        "origin_visit": 2,
        "release": 1,
        "revision": 3,
        "skipped_content": 0,
        "snapshot": 1,
    }
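
# `_partial_copy_storage` is not shown among these examples. The sketch below is a
# hypothetical reconstruction of what such a helper could look like, assuming the
# swh.storage in-memory backend and the interface methods named here (extid_add,
# revision_add, origin_add, origin_visit_add, origin_visit_status_add, snapshot_add,
# origin_visit_get_latest, origin_visit_status_get); the actual helper used by these
# tests may differ.
def _partial_copy_storage_sketch(old_storage, origin_url, *, mechanism,
                                 copy_revisions):
    """Copy only the ExtIDs (and optionally the head revisions) of an origin, plus
    the origin itself, its latest visit/status and snapshot, into a fresh
    in-memory storage."""
    from swh.storage import get_storage

    new_storage = get_storage(cls="memory")
    snapshot = snapshot_get_latest(old_storage, origin_url)
    heads = [branch.target for branch in snapshot.branches.values()]

    if mechanism == "extid":
        # keep the hg-changeset -> revision mapping so a later load can recognize
        # already-seen changesets even when the revisions themselves are missing
        extids = old_storage.extid_get_from_target(ObjectType.REVISION, heads)
        new_storage.extid_add(extids)

    if copy_revisions:
        revisions = [rev for rev in old_storage.revision_get(heads) if rev is not None]
        new_storage.revision_add(revisions)

    # carry over the origin, its latest visit/status and the snapshot, which is what
    # the "origin": 1, "origin_visit": 1, "snapshot": 1 counts above rely on
    new_storage.origin_add([o for o in old_storage.origin_get([origin_url]) if o])
    visit = old_storage.origin_visit_get_latest(origin_url)
    new_storage.origin_visit_add([visit])
    statuses = old_storage.origin_visit_status_get(origin_url, visit.visit).results
    new_storage.origin_visit_status_add(statuses)
    new_storage.snapshot_add([snapshot])
    return new_storage
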
def test_load_unchanged_repo_should_be_uneventful(
    swh_storage,
    datadir,
    tmp_path,
):
    """Checks the loader can find which revisions it already loaded, using ExtIDs."""
    archive_name = "hello"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)
    repo_path = repo_url.replace("file://", "")

    loader = HgLoader(swh_storage, repo_path)

    assert loader.load() == {"status": "eventful"}
    assert get_stats(loader.storage) == {
        "content": 3,
        "directory": 3,
        "origin": 1,
        "origin_visit": 1,
        "release": 1,
        "revision": 3,
        "skipped_content": 0,
        "snapshot": 1,
    }
    visit_status = assert_last_visit_matches(
        loader.storage,
        repo_path,
        type=RevisionType.MERCURIAL.value,
        status="full",
    )
    assert visit_status.snapshot is not None

    # Create a new loader (to start with a clean slate, e.g. remove the caches),
    # on the same storage
    loader2 = HgLoader(swh_storage, repo_path)
    assert loader2.load() == {"status": "uneventful"}

    # Should have all the objects
    assert get_stats(loader.storage) == {
        "content": 3,
        "directory": 3,
        "origin": 1,
        "origin_visit": 2,
        "release": 1,
        "revision": 3,
        "skipped_content": 0,
        "snapshot": 1,
    }
    visit_status2 = assert_last_visit_matches(
        loader2.storage,
        repo_path,
        type=RevisionType.MERCURIAL.value,
        status="full",
    )
    assert visit_status2.snapshot == visit_status.snapshot
    def init(self, swh_storage, datadir, tmp_path):
        archive_name = "testrepo"
        archive_path = os.path.join(datadir, f"{archive_name}.tgz")
        tmp_path = str(tmp_path)
        self.repo_url = prepare_repository_from_archive(archive_path,
                                                        archive_name,
                                                        tmp_path=tmp_path)
        self.destination_path = os.path.join(tmp_path, archive_name)
        self.loader = GitLoader(swh_storage, self.repo_url)
        self.repo = dulwich.repo.Repo(self.destination_path)
def test_examples(swh_storage, datadir, tmp_path, archive_name):
    archive_path = Path(datadir, f"{archive_name}.tgz")
    json_path = Path(datadir, f"{archive_name}.json")
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)

    LoaderChecker(
        loader=HgLoader(swh_storage, repo_url),
        expected=ExpectedSwhids.load(json_path),
    ).check()
def test_load_repo_with_new_commits(swh_storage, datadir, tmp_path):
    archive_name = "hello"
    archive_path = Path(datadir, f"{archive_name}.tgz")
    json_path = Path(datadir, f"{archive_name}.json")
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)

    # first load with missing commits
    hg_strip(repo_url.replace("file://", ""), "tip")
    loader = HgLoader(swh_storage, repo_url)
    assert loader.load() == {"status": "eventful"}
    assert get_stats(loader.storage) == {
        "content": 2,
        "directory": 2,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 2,
        "skipped_content": 0,
        "snapshot": 1,
    }

    # second load with all commits
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)
    loader = HgLoader(swh_storage, repo_url)
    checker = LoaderChecker(
        loader=loader,
        expected=ExpectedSwhids.load(json_path),
    )

    checker.check()

    assert get_stats(loader.storage) == {
        "content": 3,
        "directory": 3,
        "origin": 1,
        "origin_visit": 2,
        "release": 1,
        "revision": 3,
        "skipped_content": 0,
        "snapshot": 2,
    }
def test_loader_repository_with_bookmark_information(swh_storage, datadir,
                                                     tmp_path):
    """Repository with bookmark information should be ingested correctly"""
    archive_name = "anomad-d"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)

    loader = HgLoader(swh_storage, url=repo_url)

    assert loader.load() == {"status": "eventful"}
def test_prepare_repository_from_archive(datadir, tmp_path):
    archive_name = "0805nexter-1.1.0"
    archive_path = os.path.join(str(datadir), f"{archive_name}.tar.gz")
    assert os.path.exists(archive_path) is True

    tmp_path = str(tmp_path)  # here tmp_path is passed as a plain string
    repo_url = prepare_repository_from_archive(archive_path,
                                               filename=archive_name,
                                               tmp_path=tmp_path)
    expected_uncompressed_archive_path = os.path.join(tmp_path, archive_name)
    assert repo_url == f"file://{expected_uncompressed_archive_path}"
    assert os.path.exists(expected_uncompressed_archive_path)
def test_visit_repository_with_transplant_operations(swh_storage, datadir,
                                                     tmp_path):
    """Visit a mercurial repository visit transplant operations within should yield a
    snapshot as well.

    """

    archive_name = "transplant"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)

    loader = HgLoader(
        swh_storage,
        url=repo_url,
        visit_date=VISIT_DATE,
    )

    # load hg repository
    actual_load_status = loader.load()
    assert actual_load_status == {"status": "eventful"}

    # collect swh revisions
    assert_last_visit_matches(loader.storage,
                              repo_url,
                              type=RevisionType.MERCURIAL.value,
                              status="full")

    revisions = []
    snapshot = snapshot_get_latest(loader.storage, repo_url)
    for branch in snapshot.branches.values():
        if branch.target_type.value != "revision":
            continue
        revisions.append(branch.target)

    # extract original changesets info and the transplant sources
    hg_changesets = set()
    transplant_sources = set()
    for rev in loader.storage.revision_log(revisions):
        extids = list(
            loader.storage.extid_get_from_target(ObjectType.REVISION,
                                                 [rev["id"]]))
        assert len(extids) == 1
        hg_changesets.add(hash_to_hex(extids[0].extid))
        for k, v in rev["extra_headers"]:
            if k == b"transplant_source":
                transplant_sources.add(v.decode("ascii"))

    # check extracted data are valid
    assert len(hg_changesets) > 0
    assert len(transplant_sources) > 0
    assert transplant_sources <= hg_changesets
def test_single_revision(datadir: str, tmp_path: str):
    archive_name = "hello"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)
    directory = urlsplit(repo_url).path

    runner = CliRunner()
    result = runner.invoke(main, [
        "-d", directory, "revision", "0a04b987be5ae354b710cefeba0e2d9de7ad41a9"
    ])

    expected = ("swh:1:rev:93b48d515580522a05f389bec93227fc8e43d940"
                "\t0a04b987be5ae354b710cefeba0e2d9de7ad41a9\n")
    assert result.output == expected
def test_prepare_repository_from_archive_no_filename(datadir, tmp_path):
    archive_name = "0805nexter-1.1.0"
    archive_path = os.path.join(str(datadir), f"{archive_name}.tar.gz")
    assert os.path.exists(archive_path) is True

    # here tmp_path is passed as a Path object rather than a string
    repo_url = prepare_repository_from_archive(archive_path, tmp_path=tmp_path)

    tmp_path = str(tmp_path)
    expected_uncompressed_archive_path = os.path.join(tmp_path, archive_name)
    expected_repo_url = os.path.join(tmp_path, f"{archive_name}.tar.gz")
    assert repo_url == f"file://{expected_repo_url}"

    # the filename argument only influences the repo-url computation,
    # not the on-disk extraction
    assert os.path.exists(expected_uncompressed_archive_path)
def test_all_revisions(datadir: str, tmp_path: str):
    archive_name = "hello"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)
    directory = urlsplit(repo_url).path

    runner = CliRunner()
    result = runner.invoke(main, ["-d", directory, "revision"])

    expected = dedent("""
        swh:1:rev:93b48d515580522a05f389bec93227fc8e43d940\t0a04b987be5ae354b710cefeba0e2d9de7ad41a9
        swh:1:rev:8dd3db5d5519e4947f035d141581d304565372d2\t82e55d328c8ca4ee16520036c0aaace03a5beb65
        swh:1:rev:c3dbe4fbeaaa98dd961834e4007edb3efb0e2a27\tb985ae4a07e12ac662f45a171e2d42b13be5b50c
        """).lstrip()
    assert result.output == expected
    def init(self, swh_storage, datadir, tmp_path):
        archive_name = "testrepo"
        archive_path = os.path.join(datadir, f"{archive_name}.tgz")
        tmp_path = str(tmp_path)
        self.repo_url = prepare_repository_from_archive(
            archive_path, archive_name, tmp_path=tmp_path
        )
        self.destination_path = os.path.join(tmp_path, archive_name)
        self.loader = GitLoaderFromDisk(
            swh_storage,
            url=self.repo_url,
            visit_date=datetime.datetime(
                2016, 5, 3, 15, 16, 32, tzinfo=datetime.timezone.utc
            ),
            directory=self.destination_path,
        )
        self.repo = dulwich.repo.Repo(self.destination_path)
def test_missing_filelog_should_not_crash(swh_storage, datadir, tmp_path):
    archive_name = "missing-filelog"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)
    directory = repo_url.replace("file://", "")

    loader = HgLoader(
        storage=swh_storage,
        url=repo_url,
        directory=directory,  # specify directory to avoid clone
        visit_date=VISIT_DATE,
    )

    actual_load_status = loader.load()
    assert actual_load_status == {"status": "eventful"}

    assert_last_visit_matches(swh_storage,
                              repo_url,
                              status="partial",
                              type="hg")
def test_all(datadir: str, tmp_path: str):
    archive_name = "hello"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)
    directory = urlsplit(repo_url).path

    runner = CliRunner()
    result = runner.invoke(main, ["-d", directory, "all"])

    expected = dedent(f"""
        swh:1:dir:43d727f2f3f2f7cb3b098ddad1d7038464a4cee2\t0a04b987be5ae354b710cefeba0e2d9de7ad41a9
        swh:1:dir:b3f85f210ff86d334575f64cb01c5bf49895b63e\t82e55d328c8ca4ee16520036c0aaace03a5beb65
        swh:1:dir:8f2be433c945384c85920a8e60f2a68d2c0f20fb\tb985ae4a07e12ac662f45a171e2d42b13be5b50c
        swh:1:rev:93b48d515580522a05f389bec93227fc8e43d940\t0a04b987be5ae354b710cefeba0e2d9de7ad41a9
        swh:1:rev:8dd3db5d5519e4947f035d141581d304565372d2\t82e55d328c8ca4ee16520036c0aaace03a5beb65
        swh:1:rev:c3dbe4fbeaaa98dd961834e4007edb3efb0e2a27\tb985ae4a07e12ac662f45a171e2d42b13be5b50c
        swh:1:rel:515c4d72e089404356d0f4b39d60f948b8999140\t0.1
        swh:1:snp:d35668e02e2ba4321dc951cd308cf883786f918a\t{directory}
        """).lstrip()
    assert result.output == expected
def test_loader_hg_new_visit_no_release(swh_storage, datadir, tmp_path):
    """Eventful visit should yield 1 snapshot"""
    archive_name = "the-sandbox"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)

    loader = HgLoader(swh_storage, url=repo_url)

    assert loader.load() == {"status": "eventful"}

    tips = {
        b"branch-tip/default": "70e750bb046101fdced06f428e73fee471509c56",
        b"branch-tip/develop": "a9c4534552df370f43f0ef97146f393ef2f2a08c",
    }
    closed = {
        b"feature/fun_time": "4d640e8064fe69b4c851dfd43915c431e80c7497",
        b"feature/green2_loader": "94be9abcf9558213ff301af0ecd8223451ce991d",
        b"feature/greenloader": "9f82d95bd3edfb7f18b1a21d6171170395ea44ce",
        b"feature/my_test": "dafa445964230e808148db043c126063ea1dc9b6",
        b"feature/read2_loader": "9e912851eb64e3a1e08fbb587de7a4c897ce5a0a",
        b"feature/readloader": "ddecbc16f4c916c39eacfcb2302e15a9e70a231e",
        b"feature/red": "cb36b894129ca7910bb81c457c72d69d5ff111bc",
        b"feature/split5_loader": "3ed4b85d30401fe32ae3b1d650f215a588293a9e",
        b"feature/split_causing": "c346f6ff7f42f2a8ff867f92ab83a6721057d86c",
        b"feature/split_loader": "5f4eba626c3f826820c4475d2d81410759ec911b",
        b"feature/split_loader5": "5017ce0b285351da09a2029ea2cf544f79b593c7",
        b"feature/split_loading": "4e2dc6d6073f0b6d348f84ded52f9143b10344b9",
        b"feature/split_redload": "2d4a801c9a9645fcd3a9f4c06418d8393206b1f3",
        b"feature/splitloading": "88b80615ed8561be74a700b92883ec0374ddacb0",
        b"feature/test": "61d762d65afb3150e2653d6735068241779c1fcf",
        b"feature/test_branch": "be44d5e6cc66580f59c108f8bff5911ee91a22e4",
        b"feature/test_branching": "d2164061453ecb03d4347a05a77db83f706b8e15",
        b"feature/test_dog": "2973e5dc9568ac491b198f6b7f10c44ddc04e0a3",
    }

    mapping = {b"branch-closed-heads/%s/0" % b: n for b, n in closed.items()}
    mapping.update(tips)

    expected_branches = {
        k: SnapshotBranch(target=hash_to_bytes(v),
                          target_type=TargetType.REVISION)
        for k, v in mapping.items()
    }
    expected_branches[b"HEAD"] = SnapshotBranch(target=b"branch-tip/default",
                                                target_type=TargetType.ALIAS)

    expected_snapshot = Snapshot(
        id=hash_to_bytes("cbc609dcdced34dbd9938fe81b555170f1abc96f"),
        branches=expected_branches,
    )

    assert_last_visit_matches(
        loader.storage,
        repo_url,
        status="full",
        type="hg",
        snapshot=expected_snapshot.id,
    )
    check_snapshot(expected_snapshot, loader.storage)

    stats = get_stats(loader.storage)
    expected_stats = {
        "content": 2,
        "directory": 3,
        "origin": 1,
        "origin_visit": 1,
        "release": 0,
        "revision": 58,
        "skipped_content": 0,
        "snapshot": 1,
    }
    assert stats == expected_stats
    loader2 = HgLoader(swh_storage, url=repo_url)

    assert loader2.load() == {"status": "uneventful"}  # nothing new happened

    stats2 = get_stats(loader2.storage)
    expected_stats2 = expected_stats.copy()
    expected_stats2["origin_visit"] = 2  # one new visit recorded
    assert stats2 == expected_stats2
    assert_last_visit_matches(
        loader2.storage,
        repo_url,
        status="full",
        type="hg",
        snapshot=expected_snapshot.id,
    )  # but we got a snapshot nonetheless
def test_prepare_repository_from_archive_failure():
    # a nonexistent archive cannot be handled, so this raises
    assert os.path.exists("unknown-archive") is False
    with pytest.raises(subprocess.CalledProcessError, match="exit status 2"):
        prepare_repository_from_archive("unknown-archive")
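
# The three tests above pin down the contract of `prepare_repository_from_archive`:
# extract the tarball into tmp_path, return a file:// URL, and use the optional
# filename (or else the archive's basename) only to build that URL. A minimal
# sketch consistent with that behaviour follows; it is an illustration, not
# necessarily the actual swh.loader.tests implementation.
def prepare_repository_from_archive_sketch(archive_path, filename=None,
                                           tmp_path="/tmp"):
    import os
    import subprocess

    tmp_path = str(tmp_path)  # accept both str and Path
    # a missing archive makes tar exit non-zero, hence the CalledProcessError
    # expected by test_prepare_repository_from_archive_failure above
    subprocess.run(["tar", "xf", archive_path, "-C", tmp_path], check=True)
    # the filename only influences the returned URL, not the extraction itself
    name = filename if filename else os.path.basename(archive_path)
    return f"file://{os.path.join(tmp_path, name)}"
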
def test_loader_hg_new_visit_with_release(swh_storage, datadir, tmp_path):
    """Eventful visit with release should yield 1 snapshot"""

    archive_name = "hello"
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    repo_url = prepare_repository_from_archive(archive_path, archive_name,
                                               tmp_path)

    loader = HgLoader(
        swh_storage,
        url=repo_url,
        visit_date=VISIT_DATE,
    )

    actual_load_status = loader.load()
    assert actual_load_status == {"status": "eventful"}

    # then
    stats = get_stats(loader.storage)
    assert stats == {
        "content": 3,
        "directory": 3,
        "origin": 1,
        "origin_visit": 1,
        "release": 1,
        "revision": 3,
        "skipped_content": 0,
        "snapshot": 1,
    }

    # cf. test_loader.org for an explanation of where those hashes come from
    tip_release = hash_to_bytes("515c4d72e089404356d0f4b39d60f948b8999140")
    release = loader.storage.release_get([tip_release])[0]
    assert release is not None

    tip_revision_default = hash_to_bytes(
        "c3dbe4fbeaaa98dd961834e4007edb3efb0e2a27")
    revision = loader.storage.revision_get([tip_revision_default])[0]
    assert revision is not None

    expected_snapshot = Snapshot(
        id=hash_to_bytes("7ef082aa8b53136b1bed97f734504be32679bbec"),
        branches={
            b"branch-tip/default":
            SnapshotBranch(
                target=tip_revision_default,
                target_type=TargetType.REVISION,
            ),
            b"tags/0.1":
            SnapshotBranch(
                target=tip_release,
                target_type=TargetType.RELEASE,
            ),
            b"HEAD":
            SnapshotBranch(
                target=b"branch-tip/default",
                target_type=TargetType.ALIAS,
            ),
        },
    )

    check_snapshot(expected_snapshot, loader.storage)
    assert_last_visit_matches(
        loader.storage,
        repo_url,
        type=RevisionType.MERCURIAL.value,
        status="full",
        snapshot=expected_snapshot.id,
    )
def _get_repo_url(archive_name, datadir, tmp_path):
    archive_path = os.path.join(datadir, f"{archive_name}.tgz")
    return prepare_repository_from_archive(archive_path, "pkg-gourmet", tmp_path)
    def init(self, swh_storage, datadir, tmp_path):
        # remove any proxy settings in order to successfully spawn a local HTTP server
        http_proxy = os.environ.get("http_proxy")
        https_proxy = os.environ.get("https_proxy")
        if http_proxy:
            del os.environ["http_proxy"]
        if https_proxy:
            del os.environ["https_proxy"]

        # prepare test base repository using smart transfer protocol
        archive_name = "testrepo"
        archive_path = os.path.join(datadir, f"{archive_name}.tgz")
        tmp_path = str(tmp_path)
        base_repo_url = prepare_repository_from_archive(archive_path,
                                                        archive_name,
                                                        tmp_path=tmp_path)
        destination_path = os.path.join(tmp_path, archive_name)
        self.destination_path = destination_path
        with_pack_files = self.with_pack_files

        if with_pack_files:
            # create a bare clone of that repository in another folder,
            # all objects will be contained in one or two pack files in that case
            http_root_dir = tmp_path
            repo_name = archive_name + "_bare"
            bare_repo_path = os.path.join(http_root_dir, repo_name)
            subprocess.run(
                ["git", "clone", "--bare", base_repo_url, bare_repo_path],
                check=True,
            )
        else:
            # otherwise serve objects from the bare repository located in
            # the .git folder of the base repository
            http_root_dir = destination_path
            repo_name = ".git"
            bare_repo_path = os.path.join(http_root_dir, repo_name)

        # spawn local HTTP server that will serve the bare repository files
        hostname = "localhost"
        handler = partial(SimpleHTTPRequestHandler, directory=http_root_dir)
        httpd = HTTPServer((hostname, 0), handler, bind_and_activate=True)

        def serve_forever(httpd):
            with httpd:
                httpd.serve_forever()

        thread = Thread(target=serve_forever, args=(httpd, ))
        thread.start()

        repo = dulwich.repo.Repo(self.destination_path)

        class DumbGitLoaderTest(GitLoader):
            def load(self):
                """
                Override load method to ensure the bare repository will be synchronized
                with the base one as tests can modify its content.
                """
                if with_pack_files:
                    # ensure HEAD ref will be the same for both repositories
                    with open(os.path.join(bare_repo_path, "HEAD"),
                              "wb") as fw:
                        with open(os.path.join(destination_path, ".git/HEAD"),
                                  "rb") as fr:
                            head_ref = fr.read()
                            fw.write(head_ref)

                    # push possibly modified refs in the base repository to the bare one
                    for ref in repo.refs.allkeys():
                        if ref != b"HEAD" or head_ref in repo.refs:
                            push(
                                repo,
                                remote_location=f"file://{bare_repo_path}",
                                refspecs=ref,
                            )

                # generate or update the info/refs file used in dumb protocol
                subprocess.run(
                    ["git", "-C", bare_repo_path, "update-server-info"],
                    check=True,
                )

                return super().load()

        # bare repository with dumb protocol only URL
        self.repo_url = f"http://{httpd.server_name}:{httpd.server_port}/{repo_name}"
        self.loader = DumbGitLoaderTest(swh_storage, self.repo_url)
        self.repo = repo

        yield

        # shutdown HTTP server
        httpd.shutdown()
        thread.join()

        # restore HTTP proxy settings if any
        if http_proxy:
            os.environ["http_proxy"] = http_proxy
        if https_proxy:
            os.environ["https_proxy"] = https_proxy