Exemple #1
0
def fake_cook(backend, bundle_type, result_content, sticky=False):
    """Pretend that a bundle for ``result_content`` was cooked on ``backend``.

    A task is created while cooking is mocked, a placeholder bundle
    (the literal ``b"content"``) is stored in the backend cache, and the
    task status is set to ``"done"``.

    Returns:
        A ``(swhid, content)`` pair: the SWHID computed from
        ``result_content`` and the first item returned by
        ``hash_content(result_content)``.
    """
    swhid = Content.from_data(result_content).swhid()
    # Only the first item of the pair is needed by callers.
    content, _ = hash_content(result_content)

    bundle_key = (bundle_type, swhid)
    with mock_cooking(backend):
        backend.create_task(*bundle_key, sticky)
    backend.cache.add(*bundle_key, b"content")
    backend.set_status(*bundle_key, "done")
    return swhid, content
Exemple #2
0
def empty_content():
    """
    Hypothesis strategy returning the empty content ingested
    into the test archive.

    The value of every ``DEFAULT_ALGORITHMS`` key of the content dict is
    passed through ``hash_to_hex``.
    """
    content_dict = Content.from_data(data=b"").to_dict()
    content_dict.update({
        algo: hash_to_hex(content_dict[algo])
        for algo in DEFAULT_ALGORITHMS
    })
    return just(content_dict)
Exemple #3
0
def test_client_batch_size(
    kafka_prefix: str,
    kafka_consumer_group: str,
    kafka_server: str,
    batch_size: int,
):
    """Check that the journal client respects ``batch_size``.

    ``2 * batch_size + 1`` contents are produced to Kafka; the worker
    function asserts that it never receives more than ``batch_size``
    contents at once, and the test then checks that every produced content
    was received.
    """
    num_objects = 2 * batch_size + 1
    # Each content is built from a single byte value, hence the 256 limit.
    assert num_objects < 256, "Too many objects, generation will fail"

    producer = Producer(
        {
            "bootstrap.servers": kafka_server,
            "client.id": "test producer",
            "acks": "all",
        }
    )

    contents = [
        Content.from_data(bytes([value])) for value in range(num_objects)
    ]

    # Fill Kafka
    topic = kafka_prefix + ".content"
    for cnt in contents:
        producer.produce(
            topic=topic,
            key=key_to_kafka(cnt.sha1),
            value=value_to_kafka(cnt.to_dict()),
        )
    producer.flush()

    client = JournalClient(
        brokers=[kafka_server],
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=True,
        batch_size=batch_size,
    )

    received_objects: List[Dict] = []

    def worker_fn(objects):
        batch = objects["content"]
        assert len(batch) <= batch_size
        received_objects.extend(batch)

    client.process(worker_fn)

    expected_objects = [cnt.to_dict() for cnt in contents]
    assert len(received_objects) == len(expected_objects)
    # Delivery order is not checked, only membership.
    assert all(obj in expected_objects for obj in received_objects)
Exemple #4
0
def _add_extra_contents(storage, contents):
    """Add a small PBM image to ``storage`` and record its sha1.

    Args:
        storage: object whose ``content_add`` method receives the new content
        contents: collection (with an ``add`` method) receiving the sha1 of
            the added content
    """
    pbm_bytes = b"""P1
# PBM example
24 7
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
0 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1 1 0
0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0
0 1 1 1 0 0 0 1 1 1 0 0 0 1 1 1 0 0 0 1 1 1 1 0
0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0
0 1 0 0 0 0 0 1 1 1 1 0 0 1 1 1 1 0 0 1 0 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"""

    # add file with mimetype image/x-portable-bitmap in the archive content
    pbm = Content.from_data(pbm_bytes)
    storage.content_add([pbm])
    contents.add(pbm.sha1)
def test_get_filtered_files_content(swh_storage):
    """Visible files get their data attached; absent ones get a fixed
    placeholder message instead."""
    content = Content.from_data(b"foo bar")
    swh_storage.content_add([content])

    skipped_content = SkippedContent(
        sha1=None,
        sha1_git=b"c" * 20,
        sha256=None,
        blake2s256=None,
        length=42,
        status="absent",
        reason="for some reason",
    )
    swh_storage.skipped_content_add([skipped_content])

    visible_file = {
        "status": "visible",
        "sha1": content.sha1,
        "sha1_git": content.sha1_git,
        "target": content.sha1_git,
    }
    absent_file = {
        "status": "absent",
        "target": skipped_content.sha1_git,
    }

    # The expected entries are separate dicts built before the call, so they
    # do not alias the input entries.
    expected = [
        {
            "content": content.data,
            "status": "visible",
            "sha1": content.sha1,
            "sha1_git": content.sha1_git,
            "target": content.sha1_git,
        },
        {
            "content": (b"This content has not been retrieved in the "
                        b"Software Heritage archive due to its size."),
            "status": "absent",
            "target": skipped_content.sha1_git,
        },
    ]

    res = list(
        get_filtered_files_content(swh_storage, [visible_file, absent_file]))

    assert res == expected
def test_get_filtered_files_content__unknown_status(swh_storage):
    """A file entry whose status is ``None`` triggers an ``AssertionError``."""
    content = Content.from_data(b"foo bar")
    swh_storage.content_add([content])

    visible_file = {
        "status": "visible",
        "sha1": content.sha1,
        "sha1_git": content.sha1_git,
        "target": content.sha1_git,
    }
    unknown_status_file = {
        "status": None,
        "target": b"c" * 20,
    }
    files_data = [visible_file, unknown_status_file]

    with pytest.raises(AssertionError, match="unexpected status None"):
        # Consume the whole iterable so that every entry gets processed.
        for _ in get_filtered_files_content(swh_storage, files_data):
            pass
Exemple #7
0
def _init_tests_data():
    """Populate in-memory storage, search and indexer storage with test data.

    The git archives listed for each entry of ``_TEST_ORIGINS`` are loaded
    into a memory storage, 250 extra ``tar`` origins and one fake git origin
    with 300 pull-request branches are added, and the objects reachable from
    the latest snapshot of each test origin are collected.

    Module-level state is updated along the way: ``INDEXER_TOOL["id"]`` is
    set, each ``_TEST_ORIGINS`` entry is updated in place with its stored
    origin data, and ``ORIGIN_MASTER_REVISION`` and ``_contents`` are filled.

    Returns:
        A dict holding the ``search``, ``storage`` and ``idx_storage``
        instances, the ``origins`` list, the ``contents`` metadata dicts,
        the hexadecimal identifiers of the ``directories``, ``releases``,
        ``revisions`` and ``snapshots`` that were found, and an empty
        ``generated_checksums`` set.
    """
    # To hold reference to the memory storage
    storage = get_storage("memory")

    # Create search instance
    search = get_search("memory")
    search.initialize()
    search.origin_update({"url": origin["url"]} for origin in _TEST_ORIGINS)

    # Create indexer storage instance that will be shared by indexers
    idx_storage = get_indexer_storage("memory")

    # Declare a test tool for origin intrinsic metadata tests
    idx_tool = idx_storage.indexer_configuration_add([INDEXER_TOOL])[0]
    INDEXER_TOOL["id"] = idx_tool["id"]

    # Load git repositories from archives
    for origin in _TEST_ORIGINS:
        for i, archive_ in enumerate(origin["archives"]):
            if i > 0:
                # ensure visit dates will be different when simulating
                # multiple visits of an origin
                time.sleep(1)
            origin_repo_archive = os.path.join(os.path.dirname(__file__),
                                               "resources/repos/%s" % archive_)
            loader = GitLoaderFromArchive(
                storage,
                origin["url"],
                archive_path=origin_repo_archive,
            )

            result = loader.load()
            assert result["status"] == "eventful"

        ori = storage.origin_get([origin["url"]])[0]
        origin.update(ori.to_dict())  # add an 'id' key if enabled
        search.origin_update([{
            "url": origin["url"],
            "has_visits": True,
            "visit_types": ["git"]
        }])

    for i in range(250):
        _add_origin(storage,
                    search,
                    origin_url=f"https://many.origins/{i+1}",
                    visit_type="tar")

    sha1s: Set[Sha1] = set()
    directories = set()
    revisions = set()
    releases = set()
    snapshots = set()

    # Maps a content sha1 to "<hex id of a root directory>/<entry path>"
    content_path = {}

    # Get all objects loaded into the test archive
    common_metadata = {ORIGIN_METADATA_KEY: ORIGIN_METADATA_VALUE}
    for origin in _TEST_ORIGINS:
        snp = snapshot_get_latest(storage, origin["url"])
        snapshots.add(hash_to_hex(snp.id))
        for branch_name, branch_data in snp.branches.items():
            target_type = branch_data.target_type.value
            if target_type == "revision":
                revisions.add(branch_data.target)
                if b"master" in branch_name:
                    # Add some origin intrinsic metadata for tests.
                    # A fresh dict is built for every origin: updating
                    # common_metadata in place would leak the metadata of
                    # one origin into the following ones, and into the rows
                    # already handed to the indexer storage and search.
                    metadata = {
                        **common_metadata,
                        **origin.get("metadata", {}),
                    }
                    origin_metadata = OriginIntrinsicMetadataRow(
                        id=origin["url"],
                        from_revision=branch_data.target,
                        indexer_configuration_id=idx_tool["id"],
                        metadata=metadata,
                        mappings=[],
                    )
                    idx_storage.origin_intrinsic_metadata_add(
                        [origin_metadata])
                    search.origin_update([{
                        "url": origin["url"],
                        "intrinsic_metadata": metadata
                    }])

                    ORIGIN_MASTER_REVISION[origin["url"]] = hash_to_hex(
                        branch_data.target)
            elif target_type == "release":
                release = storage.release_get([branch_data.target])[0]
                revisions.add(release.target)
                releases.add(hash_to_hex(branch_data.target))

        # Iterate over a copy, as the set is extended inside the loop
        for rev_log in storage.revision_shortlog(set(revisions)):
            rev_id = rev_log[0]
            revisions.add(rev_id)

        for rev in storage.revision_get(revisions):
            if rev is None:
                continue
            dir_id = rev.directory
            directories.add(hash_to_hex(dir_id))
            for entry in dir_iterator(storage, dir_id):
                if entry["type"] == "file":
                    sha1s.add(entry["sha1"])
                    content_path[entry["sha1"]] = "/".join(
                        [hash_to_hex(dir_id), entry["path"].decode("utf-8")])
                elif entry["type"] == "dir":
                    directories.add(hash_to_hex(entry["target"]))

    _add_extra_contents(storage, sha1s)

    # Get all checksums for each content
    result: List[Optional[Content]] = storage.content_get(list(sha1s))

    contents: List[Dict] = []
    for content in result:
        assert content is not None
        sha1 = hash_to_hex(content.sha1)
        content_metadata = {
            algo: hash_to_hex(getattr(content, algo))
            for algo in DEFAULT_ALGORITHMS
        }

        # Contents that were not reached through a directory have no path
        path = content_path.get(content.sha1, "")

        cnt_data = storage.content_get_data(content.sha1)
        assert cnt_data is not None
        mimetype, encoding = get_mimetype_and_encoding_for_content(cnt_data)
        _, _, cnt_data = _re_encode_content(mimetype, encoding, cnt_data)
        content_display_data = prepare_content_for_display(
            cnt_data, mimetype, path)

        content_metadata.update({
            "path": path,
            "mimetype": mimetype,
            "encoding": encoding,
            "hljs_language": content_display_data["language"],
            "data": content_display_data["content_data"],
        })
        _contents[sha1] = content_metadata
        contents.append(content_metadata)

    # Add the empty directory to the test archive
    storage.directory_add([Directory(entries=())])

    # Add empty content to the test archive
    storage.content_add([Content.from_data(data=b"")])

    # Add fake git origin with pull request branches; every branch targets
    # the same revision, which is looked up only once.
    branch_target = next(iter(revisions))
    _add_origin(
        storage,
        search,
        origin_url="https://git.example.org/project",
        snapshot_branches={
            b"refs/heads/master": {
                "target_type": "revision",
                "target": branch_target,
            },
            **{
                f"refs/pull/{i}".encode(): {
                    "target_type": "revision",
                    "target": branch_target,
                }
                for i in range(300)
            },
        },
    )

    # Return tests data
    return {
        "search": search,
        "storage": storage,
        "idx_storage": idx_storage,
        "origins": _TEST_ORIGINS,
        "contents": contents,
        "directories": list(directories),
        "releases": list(releases),
        "revisions": list(map(hash_to_hex, revisions)),
        "snapshots": list(snapshots),
        "generated_checksums": set(),
    }
Exemple #8
0
def test_replay_statsd(kafka_server, kafka_prefix, kafka_consumer_group,
                       statsd):
    """Replay contents from one objstorage to another and check the statsd
    messages that were sent.

    Messages are only counted per pattern, as no order can be expected from
    replayed objects.
    """
    src_objstorage = get_objstorage(cls="memory")
    dst_objstorage = get_objstorage(cls="memory")

    writer = get_journal_writer(
        cls="kafka",
        brokers=[kafka_server],
        client_id="kafka_writer",
        prefix=kafka_prefix,
        anonymize=False,
    )

    # Fill the source objstorage with eight content objects, two for each
    # replaying decision exercised by this test:
    # contents[0:2] are properly copied
    # contents[2:4] are excluded
    # contents[4:6] are in dst
    # contents[6:8] are hidden
    contents = []
    for i in range(8):
        status = "hidden" if i in (6, 7) else "visible"
        contents.append(Content.from_data(f"foo{i}".encode(), status=status))

    for content in contents:
        src_objstorage.add(content.data)
        writer.write_addition("content", content)

    excluded = [cnt.sha1 for cnt in contents[2:4]]

    def exclude_fn(cnt_d):
        return cnt_d["sha1"] in excluded

    # Contents already present in the destination
    for content in contents[4:6]:
        dst_objstorage.add(content.data)

    replayer = JournalClient(
        brokers=kafka_server,
        group_id=kafka_consumer_group,
        prefix=kafka_prefix,
        stop_on_eof=True,
    )
    replayer.process(
        functools.partial(
            process_replay_objects_content,
            src=src_objstorage,
            dst=dst_objstorage,
            exclude_fn=exclude_fn,
        ))

    # We cannot expect any order from replayed objects, so statsd reports won't
    # be sorted according to contents, so we just count the expected occurrence
    # of each statsd message.
    prefix = "swh_content_replayer"
    expected_reports = {
        # 4 because 2 for the copied objects + 2 for the in_dst ones
        f"^{prefix}_retries_total:1[|]c[|]#attempt:1,operation:obj_in_objstorage$": 4,
        f"^{prefix}_retries_total:1[|]c[|]#attempt:1,operation:get_object$": 2,
        f"^{prefix}_retries_total:1[|]c[|]#attempt:1,operation:put_object$": 2,
        f"^{prefix}_duration_seconds:[0-9]+[.][0-9]+[|]ms[|]#request:get$": 2,
        f"^{prefix}_duration_seconds:[0-9]+[.][0-9]+[|]ms[|]#request:put$": 2,
        f"^{prefix}_bytes:4[|]c$": 2,
    }
    decisions = ("copied", "skipped", "excluded", "in_dst", "not_in_src",
                 "failed")
    decision_alternatives = "|".join(decisions)
    decision_re = (
        "^swh_content_replayer_operations_total:1[|]c[|]#decision:(?P<decision>"
        + decision_alternatives + ")(?P<extras>,.+)?$")

    operations = dict.fromkeys(decisions, 0)
    reports = dict.fromkeys(expected_reports, 0)

    for payload in statsd.socket.payloads:
        report = payload.decode()
        decision_match = re.match(decision_re, report)
        if decision_match:
            operations[decision_match.group("decision")] += 1
            continue
        # Any other report is counted against every pattern it matches
        for pattern in expected_reports:
            if re.match(pattern, report):
                reports[pattern] += 1

    assert reports == expected_reports

    # TODO: the "not_in_src" and "failed" decisions are not exercised here
    assert operations == {
        "skipped": 2,
        "excluded": 2,
        "in_dst": 2,
        "copied": 2,
        "not_in_src": 0,
        "failed": 0,
    }
def test_ignore_displayname(swh_storage, use_graph):
    """Tests the original authorship information is used instead of
    configured display names; otherwise objects would not match their hash,
    and git-fsck/git-clone would fail.

    This tests both with and without swh-graph, as both configurations use different
    code paths to fetch revisions.
    """

    date = TimestampWithTimezone.from_numeric_offset(
        Timestamp(1643882820, 0), 0, False)
    legacy_person = Person.from_fullname(b"old me <*****@*****.**>")
    current_person = Person.from_fullname(b"me <*****@*****.**>")

    # Build a minimal content -> directory -> revision -> release -> snapshot
    # chain, adding every object to the storage as soon as it is created.
    content = Content.from_data(b"foo")
    swh_storage.content_add([content])

    file_entry = DirectoryEntry(
        name=b"file1",
        type="file",
        perms=0o100644,
        target=content.sha1_git,
    )
    directory = Directory(entries=(file_entry, ))
    swh_storage.directory_add([directory])

    revision = Revision(
        message=b"rev",
        author=legacy_person,
        date=date,
        committer=legacy_person,
        committer_date=date,
        parents=(),
        type=RevisionType.GIT,
        directory=directory.id,
        synthetic=True,
    )
    swh_storage.revision_add([revision])

    release = Release(
        name=b"v1.1.0",
        message=None,
        author=legacy_person,
        date=date,
        target=revision.id,
        target_type=ObjectType.REVISION,
        synthetic=True,
    )
    swh_storage.release_add([release])

    tag_branch = SnapshotBranch(target=release.id,
                                target_type=TargetType.RELEASE)
    head_branch = SnapshotBranch(target=revision.id,
                                 target_type=TargetType.REVISION)
    snapshot = Snapshot(branches={
        b"refs/tags/v1.1.0": tag_branch,
        b"HEAD": head_branch,
    })
    swh_storage.snapshot_add([snapshot])

    # Add all objects to graph
    swh_graph = None
    if use_graph:
        from swh.graph.naive_client import NaiveClient as GraphClient

        graph_objects = (content, directory, revision, release, snapshot)
        graph_links = (
            (directory, content),
            (revision, directory),
            (release, revision),
            (snapshot, release),
            (snapshot, revision),
        )
        nodes = [str(obj.swhid()) for obj in graph_objects]
        edges = [(str(src.swhid()), str(dst.swhid()))
                 for (src, dst) in graph_links]
        swh_graph = unittest.mock.Mock(
            wraps=GraphClient(nodes=nodes, edges=edges))

    # Set a display name
    with swh_storage.db() as db, db.transaction() as cur:
        cur.execute(
            "UPDATE person set displayname = %s where fullname = %s",
            (current_person.fullname, legacy_person.fullname),
        )

    # Check the display name did apply in the storage
    displayed_revision = attr.evolve(
        revision,
        author=current_person,
        committer=current_person,
    )
    assert swh_storage.revision_get([revision.id])[0] == displayed_revision

    # Cook
    cooked_swhid = snapshot.swhid()
    backend = InMemoryVaultBackend()
    GitBareCooker(
        cooked_swhid,
        backend=backend,
        storage=swh_storage,
        graph=swh_graph,
    ).cook()

    # Get bundle
    bundle = backend.fetch("git_bare", cooked_swhid)

    # Extract the bundle and check the authorship stored in the git objects
    with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
            tf.extractall(tempdir)

        # If we are here, it means git-fsck succeeded when called by cooker.cook(),
        # so we already know the original person was used. Let's double-check.

        repo = dulwich.repo.Repo(f"{tempdir}/{cooked_swhid}.git")

        tag = repo[b"refs/tags/v1.1.0"]
        assert tag.tagger == legacy_person.fullname

        tagged_commit = repo[tag.object[1]]
        assert tagged_commit.author == legacy_person.fullname
def test_checksum_mismatch(swh_storage, mismatch_on):
    """Cook a two-revision history in which the object selected by
    ``mismatch_on`` carries an identifier that does not match its content.

    Unless the head revision itself is the corrupted object, ``git log``
    must still list both revisions.
    """
    date = TimestampWithTimezone.from_datetime(
        datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc))
    author = Person.from_fullname(b"Foo <*****@*****.**>")

    wrong_hash = b"\x12\x34" * 10

    # Each object is (optionally) corrupted right after its creation, so
    # that the objects built afterwards refer to the corrupted identifier.
    cnt1 = Content.from_data(b"Tr0ub4dor&3")
    if mismatch_on == "content":
        cnt1 = attr.evolve(cnt1, sha1_git=wrong_hash)

    file_entry = DirectoryEntry(
        name=b"file1",
        type="file",
        perms=DentryPerms.content,
        target=cnt1.sha1_git,
    )
    dir1 = Directory(entries=(file_entry, ))
    if mismatch_on == "directory":
        dir1 = attr.evolve(dir1, id=wrong_hash)

    rev1 = Revision(
        message=b"msg1",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir1.id,
        type=RevisionType.GIT,
        synthetic=True,
    )
    if mismatch_on == "revision1":
        rev1 = attr.evolve(rev1, id=wrong_hash)

    rev2 = Revision(
        message=b"msg2",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir1.id,
        parents=(rev1.id, ),
        type=RevisionType.GIT,
        synthetic=True,
    )
    if mismatch_on == "revision2":
        rev2 = attr.evolve(rev2, id=wrong_hash)

    cooked_swhid = rev2.swhid()

    swh_storage.content_add([cnt1])
    swh_storage.directory_add([dir1])
    swh_storage.revision_add([rev1, rev2])

    backend = InMemoryVaultBackend()
    GitBareCooker(
        cooked_swhid,
        backend=backend,
        storage=swh_storage,
        graph=None,
    ).cook()

    # Get bundle
    bundle = backend.fetch("git_bare", cooked_swhid)

    # Extract bundle and make sure both revisions are in it
    with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
            tf.extractall(tempdir)

        if mismatch_on != "revision2":
            # git-log fails if the head revision is corrupted
            # TODO: we need to find a way to make this somewhat usable
            git_log_command = [
                "git",
                "-C",
                f"{tempdir}/{cooked_swhid}.git",
                "log",
                "--format=oneline",
                "--decorate=",
            ]
            expected_log = f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n"

            output = subprocess.check_output(git_log_command)
            assert output.decode() == expected_log
def test_graph_revisions(swh_storage, up_to_date_graph, root_object, tag,
                         weird_branches):
    r"""
    Cook a git-bare bundle with a (possibly incomplete) swh-graph, then check
    the resulting ``git log`` and which graph/storage methods were called.

    Build objects::

                                     snp
                                    /|||\
                                   / ||| \
                        rel2 <----°  /|\  \----> rel4
                         |          / | \         |
                         v         /  v  \        v
          rev1  <------ rev2 <----°  dir4 \      rel3
           |             |            |    \      |
           v             v            v     \     |
          dir1          dir2         dir3   |     |
           |           /   |          |     |     |
           v          /    v          v     v     v
          cnt1  <----°    cnt2       cnt3  cnt4  cnt5

    If up_to_date_graph is true, then swh-graph contains all objects.
    Else, cnt4, cnt5, dir4, rev2, rel2, rel3, and snp are missing from the graph.

    If tag is False, rel2 is excluded.

    If weird_branches is False, dir4, cnt4, rel3, rel4, and cnt5 are excluded.
    """
    from swh.graph.naive_client import NaiveClient as GraphClient

    # Create objects:

    date = TimestampWithTimezone.from_datetime(
        datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc))
    author = Person.from_fullname(b"Foo <*****@*****.**>")
    cnt1 = Content.from_data(b"correct")
    cnt2 = Content.from_data(b"horse")
    cnt3 = Content.from_data(b"battery")
    cnt4 = Content.from_data(b"staple")
    cnt5 = Content.from_data(b"Tr0ub4dor&3")
    dir1 = Directory(entries=(DirectoryEntry(
        name=b"file1",
        type="file",
        perms=DentryPerms.content,
        target=cnt1.sha1_git,
    ), ))
    dir2 = Directory(entries=(
        DirectoryEntry(
            name=b"file1",
            type="file",
            perms=DentryPerms.content,
            target=cnt1.sha1_git,
        ),
        DirectoryEntry(
            name=b"file2",
            type="file",
            perms=DentryPerms.content,
            target=cnt2.sha1_git,
        ),
    ))
    dir3 = Directory(entries=(DirectoryEntry(
        name=b"file3",
        type="file",
        perms=DentryPerms.content,
        target=cnt3.sha1_git,
    ), ))
    dir4 = Directory(entries=(DirectoryEntry(
        name=b"directory3",
        type="dir",
        perms=DentryPerms.directory,
        target=dir3.id,
    ), ))
    rev1 = Revision(
        message=b"msg1",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir1.id,
        type=RevisionType.GIT,
        synthetic=True,
    )
    # rev2 is the child of rev1
    rev2 = Revision(
        message=b"msg2",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir2.id,
        parents=(rev1.id, ),
        type=RevisionType.GIT,
        synthetic=True,
    )

    # rel2 targets a revision, rel3 a content, rel4 another release (rel3)
    rel2 = Release(
        name=b"1.0.0",
        message=b"tag2",
        target_type=ObjectType.REVISION,
        target=rev2.id,
        synthetic=True,
    )
    rel3 = Release(
        name=b"1.0.0-blob",
        message=b"tagged-blob",
        target_type=ObjectType.CONTENT,
        target=cnt5.sha1_git,
        synthetic=True,
    )
    rel4 = Release(
        name=b"1.0.0-weird",
        message=b"weird release",
        target_type=ObjectType.RELEASE,
        target=rel3.id,
        synthetic=True,
    )
    # rel5 is not part of the snapshot; it is only used as cooking root for
    # RootObjects.WEIRD_RELEASE. It targets rel2, and its name holds a colon.
    rel5 = Release(
        name=b"1.0.0:weirdname",
        message=b"weird release",
        target_type=ObjectType.RELEASE,
        target=rel2.id,
        synthetic=True,
    )

    # Create snapshot:

    branches = {
        b"refs/heads/master":
        SnapshotBranch(target=rev2.id, target_type=TargetType.REVISION),
    }
    if tag:
        branches[b"refs/tags/1.0.0"] = SnapshotBranch(
            target=rel2.id, target_type=TargetType.RELEASE)
    if weird_branches:
        # Branches pointing to a directory, a content, and a release that
        # targets another release
        branches[b"refs/heads/tree-ref"] = SnapshotBranch(
            target=dir4.id, target_type=TargetType.DIRECTORY)
        branches[b"refs/heads/blob-ref"] = SnapshotBranch(
            target=cnt4.sha1_git, target_type=TargetType.CONTENT)
        branches[b"refs/tags/1.0.0-weird"] = SnapshotBranch(
            target=rel4.id, target_type=TargetType.RELEASE)
    snp = Snapshot(branches=branches)

    # "Fill" swh-graph

    if up_to_date_graph:
        nodes = [cnt1, cnt2, dir1, dir2, rev1, rev2, snp]
        edges = [
            (dir1, cnt1),
            (dir2, cnt1),
            (dir2, cnt2),
            (rev1, dir1),
            (rev2, dir2),
            (rev2, rev1),
            (snp, rev2),
        ]
        if tag:
            nodes.append(rel2)
            edges.append((rel2, rev2))
            edges.append((snp, rel2))
        if weird_branches:
            nodes.extend([cnt3, cnt4, cnt5, dir3, dir4, rel3, rel4, rel5])
            # NOTE(review): rel5 targets rel2 in the model above, but the
            # edge declared here goes from rel5 to rev2 -- confirm intent.
            edges.extend([
                (dir3, cnt3),
                (dir4, dir3),
                (snp, dir4),
                (snp, cnt4),
                (snp, rel4),
                (rel4, rel3),
                (rel3, cnt5),
                (rel5, rev2),
            ])
    else:
        # Outdated graph: only part of the objects are declared
        nodes = [cnt1, cnt2, cnt3, dir1, dir2, dir3, rev1]
        edges = [
            (dir1, cnt1),
            (dir2, cnt1),
            (dir2, cnt2),
            (dir3, cnt3),
            (rev1, dir1),
        ]
        if tag:
            # NOTE(review): the docstring lists rel2 as missing from the
            # outdated graph, yet the node is added here (without any
            # edge) -- confirm which one is intended.
            nodes.append(rel2)
        if weird_branches:
            # NOTE(review): cnt3, dir3 and the (dir3, cnt3) edge are already
            # in the lists above, so these lines only add duplicates.
            nodes.extend([cnt3, dir3])
            edges.extend([(dir3, cnt3)])

    # Turn the model objects into the string SWHIDs given to the graph client
    nodes = [str(n.swhid()) for n in nodes]
    edges = [(str(s.swhid()), str(d.swhid())) for (s, d) in edges]

    # Add all objects to storage
    swh_storage.content_add([cnt1, cnt2, cnt3, cnt4, cnt5])
    swh_storage.directory_add([dir1, dir2, dir3, dir4])
    swh_storage.revision_add([rev1, rev2])
    swh_storage.release_add([rel2, rel3, rel4, rel5])
    swh_storage.snapshot_add([snp])

    # Add spy on swh_storage, to make sure revision_log is not called
    # (the graph must be used instead)
    swh_storage = unittest.mock.MagicMock(wraps=swh_storage)

    # Add all objects to graph
    swh_graph = unittest.mock.Mock(wraps=GraphClient(nodes=nodes, edges=edges))

    # Cook
    backend = InMemoryVaultBackend()
    # Pick the SWHID to cook according to the requested kind of root object
    cooked_swhid = {
        RootObjects.SNAPSHOT: snp.swhid(),
        RootObjects.REVISION: rev2.swhid(),
        RootObjects.RELEASE: rel2.swhid(),
        RootObjects.WEIRD_RELEASE: rel5.swhid(),
    }[root_object]
    cooker = GitBareCooker(
        cooked_swhid,
        backend=backend,
        storage=swh_storage,
        graph=swh_graph,
    )

    if weird_branches:
        # git-fsck now rejects refs pointing to trees and blobs,
        # but some old git repos have them.
        cooker.use_fsck = False

    cooker.cook()

    # Get bundle
    bundle = backend.fetch("git_bare", cooked_swhid)

    # Extract bundle and make sure both revisions are in it
    with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
            tf.extractall(tempdir)

        # Name of the ref given to `git log`; it depends on the cooked root
        if root_object in (RootObjects.SNAPSHOT, RootObjects.REVISION):
            log_head = "master"
        elif root_object == RootObjects.RELEASE:
            log_head = "1.0.0"
        elif root_object == RootObjects.WEIRD_RELEASE:
            log_head = "release"
        else:
            assert False, root_object

        output = subprocess.check_output([
            "git",
            "-C",
            f"{tempdir}/{cooked_swhid}.git",
            "log",
            "--format=oneline",
            "--decorate=",
            log_head,
        ])

        # Whatever the root object, the log must list rev2 then rev1
        assert output.decode(
        ) == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n"

    # Make sure the graph was used instead of swh_storage.revision_log
    if root_object == RootObjects.SNAPSHOT:
        if up_to_date_graph:
            # The graph has everything, so the first call succeeds and returns
            # all objects transitively pointed by the snapshot
            swh_graph.visit_nodes.assert_has_calls([
                unittest.mock.call(str(snp.swhid()),
                                   edges="snp:*,rel:*,rev:rev"),
            ])
        else:
            # The graph does not have everything, so the first call returns nothing.
            # However, the second call (on the top rev) succeeds and returns
            # all objects but the rev and the rel
            swh_graph.visit_nodes.assert_has_calls([
                unittest.mock.call(str(snp.swhid()),
                                   edges="snp:*,rel:*,rev:rev"),
                unittest.mock.call(str(rev2.swhid()), edges="rev:rev"),
            ])
    elif root_object in (
            RootObjects.REVISION,
            RootObjects.RELEASE,
            RootObjects.WEIRD_RELEASE,
    ):
        swh_graph.visit_nodes.assert_has_calls(
            [unittest.mock.call(str(rev2.swhid()), edges="rev:rev")])
    else:
        assert False, root_object

    if up_to_date_graph:
        # With a complete graph, the storage must not be asked for any log
        swh_storage.revision_log.assert_not_called()
        swh_storage.revision_shortlog.assert_not_called()
    else:
        # With an outdated graph, the storage's revision_log is called too
        swh_storage.revision_log.assert_called()
Exemple #12
0
    def test_original_malformed_objects(self, swh_storage,
                                        cook_extract_snapshot):
        """Tests that objects that were originally malformed:

        * are still interpreted somewhat correctly (if the loader could make sense of
          them), especially that they still have links to children
        * have their original manifest in the bundle
        """
        date = TimestampWithTimezone.from_numeric_offset(
            Timestamp(1643819927, 0), 0, False)
        me = Person.from_fullname(b"me <*****@*****.**>")

        content = Content.from_data(b"foo")
        swh_storage.content_add([content])

        # Directory whose entries are disordered in the original manifest
        # fmt: off
        malformed_dir_manifest = (
            b""
            + b"100644 file2\x00" + content.sha1_git
            + b"100644 file1\x00" + content.sha1_git
        )
        # fmt: on
        dir_raw_manifest = (
            f"tree {len(malformed_dir_manifest)}\x00".encode() +
            malformed_dir_manifest)
        dir_entries = tuple(
            DirectoryEntry(name=entry_name,
                           type="file",
                           perms=0o100644,
                           target=content.sha1_git)
            for entry_name in (b"file1", b"file2"))
        directory = Directory(entries=dir_entries,
                              raw_manifest=dir_raw_manifest)
        swh_storage.directory_add([directory])

        # Revision with 'committer' and 'author' swapped
        # fmt: off
        malformed_rev_manifest = (
            b"tree " + hashutil.hash_to_bytehex(directory.id) + b"\n"
            + b"committer me <*****@*****.**> 1643819927 +0000\n"
            + b"author me <*****@*****.**> 1643819927 +0000\n"
            + b"\n"
            + b"rev"
        )
        # fmt: on
        rev_raw_manifest = (
            f"commit {len(malformed_rev_manifest)}\x00".encode() +
            malformed_rev_manifest)
        revision = Revision(
            message=b"rev",
            author=me,
            date=date,
            committer=me,
            committer_date=date,
            parents=(),
            type=RevisionType.GIT,
            directory=directory.id,
            synthetic=True,
            raw_manifest=rev_raw_manifest,
        )
        swh_storage.revision_add([revision])

        # Release with 'tag' and 'tagger' swapped
        # fmt: off
        malformed_rel_manifest = (
            b"object " + hashutil.hash_to_bytehex(revision.id) + b"\n"
            + b"type commit\n"
            + b"tagger me <*****@*****.**> 1643819927 +0000\n"
            + b"tag v1.1.0\n"
        )
        # fmt: on
        rel_raw_manifest = (
            f"tag {len(malformed_rel_manifest)}\x00".encode() +
            malformed_rel_manifest)
        release = Release(
            name=b"v1.1.0",
            message=None,
            author=me,
            date=date,
            target=revision.id,
            target_type=ModelObjectType.REVISION,
            synthetic=True,
            raw_manifest=rel_raw_manifest,
        )
        swh_storage.release_add([release])

        snapshot = Snapshot(branches={
            b"refs/tags/v1.1.0":
            SnapshotBranch(target=release.id, target_type=TargetType.RELEASE),
            b"HEAD":
            SnapshotBranch(target=revision.id,
                           target_type=TargetType.REVISION),
        })
        swh_storage.snapshot_add([snapshot])

        # Walk tag -> commit -> tree in the cooked repository; each object
        # must expose the original (malformed) manifest.
        with cook_extract_snapshot(swh_storage, snapshot.swhid()) as (ert, p):
            git_tag = ert.repo[b"refs/tags/v1.1.0"]
            assert git_tag.as_raw_string() == malformed_rel_manifest

            git_commit = ert.repo[git_tag.object[1]]
            assert git_commit.as_raw_string() == malformed_rev_manifest

            git_tree = ert.repo[git_commit.tree]
            assert git_tree.as_raw_string() == malformed_dir_manifest
import functools

from hypothesis import given, settings
from hypothesis.strategies import sets

from swh.journal.client import JournalClient
from swh.journal.writer import get_journal_writer
from swh.model.hypothesis_strategies import sha1
from swh.model.model import Content
from swh.objstorage.factory import get_objstorage
from swh.objstorage.replayer.replay import (
    is_hash_in_bytearray,
    process_replay_objects_content,
)

# Ten contents with the default status, followed by ten "hidden" ones.
CONTENTS = [
    *(Content.from_data(f"foo{i}".encode()) for i in range(10)),
    *(
        Content.from_data(f"forbidden foo{i}".encode(), status="hidden")
        for i in range(10)
    ),
]


@settings(max_examples=500)
@given(
    sets(sha1(), min_size=0, max_size=500),
    sets(sha1(), min_size=10),
)
def test_is_hash_in_bytearray(haystack, needles):
    array = b"".join(sorted(haystack))
    needles |= haystack  # Exhaustively test for all objects in the array
    for needle in needles:
        assert is_hash_in_bytearray(needle, array,
Exemple #14
0
 def test_content_identifier(self):
     """Hashes of a content built from the example data equal
     ``self.content_id``."""
     computed_hashes = Content.from_data(content_example["data"]).hashes()
     self.assertEqual(computed_hashes, self.content_id)