Example #1

def test_revision_metadata_display(archive_data, client, directory, person,
                                   date):
    metadata = {"foo": "bar"}
    revision = Revision(
        directory=hash_to_bytes(directory),
        author=person,
        committer=person,
        message=b"commit message",
        date=TimestampWithTimezone.from_datetime(date),
        committer_date=TimestampWithTimezone.from_datetime(date),
        synthetic=False,
        type=RevisionType.GIT,
        metadata=metadata,
    )
    archive_data.revision_add([revision])

    url = reverse("browse-revision",
                  url_args={"sha1_git": hash_to_hex(revision.id)})

    resp = check_html_get_response(client,
                                   url,
                                   status_code=200,
                                   template_used="browse/revision.html")
    assert_contains(resp, "swh-metadata-popover")
    assert_contains(resp, escape(json.dumps(metadata, indent=4)))
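
All of these examples share the same basic pattern: build a timezone-aware datetime, convert it with TimestampWithTimezone.from_datetime, and attach the result to a model object. A minimal, self-contained sketch of that pattern (the swh.model import paths and the placeholder directory hash are assumptions, since the snippets in this collection omit their import blocks):

import datetime

from swh.model.model import (
    Person,
    Revision,
    RevisionType,
    TimestampWithTimezone,
)

# Convert a timezone-aware datetime into the swh model representation
dt = datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc)
date = TimestampWithTimezone.from_datetime(dt)

# Attach it to a Revision, as the test snippets in this collection do
author = Person.from_fullname(b"Jane Doe <jane@example.org>")
revision = Revision(
    message=b"example commit",
    author=author,
    committer=author,
    date=date,
    committer_date=date,
    directory=b"\x00" * 20,  # placeholder sha1_git of the root directory
    type=RevisionType.GIT,
    synthetic=True,
)
print(revision.swhid())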
Example #2

@contextlib.contextmanager  # the function yields, so it needs wrapping to act as a context manager
def cook_extract_directory_gitfast(storage, swhid, fsck=True):
    """Context manager that cooks a revision containing a directory and
    extracts it, using RevisionGitfastCooker"""
    test_repo = TestRepo()
    with test_repo as p:
        date = TimestampWithTimezone.from_datetime(
            datetime.datetime.now(datetime.timezone.utc))
        revision = Revision(
            directory=swhid.object_id,
            message=b"dummy message",
            author=Person.from_fullname(b"someone"),
            committer=Person.from_fullname(b"someone"),
            date=date,
            committer_date=date,
            type=RevisionType.GIT,
            synthetic=False,
        )
        storage.revision_add([revision])

    with cook_stream_revision_gitfast(
            storage, revision.swhid()) as stream, test_repo as p:
        processor = dulwich.fastexport.GitImportProcessor(test_repo.repo)
        processor.import_stream(stream)
        test_repo.checkout(b"HEAD")
        shutil.rmtree(p / ".git")
        yield p
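
A hedged usage sketch of the helper above; swh_storage and dir_swhid (a CoreSWHID of a directory already ingested into that storage) are assumed to exist in the surrounding test module:

# p is a pathlib.Path to the extracted working copy, with its .git removed
with cook_extract_directory_gitfast(swh_storage, dir_swhid) as p:
    print(sorted(entry.name for entry in p.iterdir()))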
Example #3

def parse_date(date: Optional[str]) -> Optional[TimestampWithTimezone]:
    """Parse a date into a datetime"""
    assert not date or isinstance(date, str)
    dt: Optional[datetime.datetime] = None
    if not date:
        return None
    try:
        specific_date = DATE_PATTERN.match(date)
        if specific_date:
            year = int(specific_date.group("year"))
            month = int(specific_date.group("month"))
            dt = datetime.datetime(year, month, 1)
        else:
            dt = dateutil.parser.parse(date)

        if not dt.tzinfo:
            # the timezone must be set, otherwise normalize_timestamp complains:
            # ValueError: normalize_timestamp received datetime without timezone:
            # 2001-06-08 00:00:00
            dt = dt.replace(tzinfo=timezone.utc)
    except Exception as e:
        logger.warning("Fail to parse date %s. Reason: %s", date, e)
    if dt:
        return TimestampWithTimezone.from_datetime(dt)
    else:
        return None
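
The snippet above omits its imports and the DATE_PATTERN constant; a hedged usage sketch, assuming a simple year/month regex for DATE_PATTERN and the usual dateutil / swh.model imports:

import datetime
import logging
import re
from datetime import timezone
from typing import Optional

import dateutil.parser

from swh.model.model import TimestampWithTimezone

logger = logging.getLogger(__name__)
# Assumption: DATE_PATTERN matches partial "YYYY-mm" dates, as in the branch above
DATE_PATTERN = re.compile(r"^(?P<year>\d{4})-(?P<month>\d{2})$")

# "2011-01" hits DATE_PATTERN and becomes 2011-01-01T00:00:00+00:00
assert parse_date("2011-01") == TimestampWithTimezone.from_datetime(
    datetime.datetime(2011, 1, 1, tzinfo=timezone.utc))
# Free-form dates fall back to dateutil; naive results get UTC attached
assert parse_date("Aug 23, 2013") is not None
# Unparsable input logs a warning and returns None
assert parse_date("$Date$") is None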
Example #4

def new_revision(draw):
    """
    Hypothesis strategy returning random raw swh revision data
    not ingested into the test archive.
    """
    return Revision(
        directory=draw(sha1().map(hash_to_bytes)),
        author=draw(new_person()),
        committer=draw(new_person()),
        message=draw(
            text(min_size=20, max_size=100).map(lambda t: t.encode())),
        date=TimestampWithTimezone.from_datetime(draw(new_swh_date())),
        committer_date=TimestampWithTimezone.from_datetime(draw(
            new_swh_date())),
        synthetic=False,
        type=RevisionType.GIT,
    )
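
A usage sketch for the strategy above. The @composite decorator and the sha1/new_person/new_swh_date helper strategies come from the surrounding test module and are assumed here; hypothesis then feeds random Revision objects into a test:

from hypothesis import given, settings

from swh.model.model import RevisionType


@given(new_revision())
@settings(max_examples=5)
def test_random_revision_is_consistent(revision):
    # Every drawn Revision is a valid swh model object whose SWHID wraps its id
    assert revision.swhid().object_id == revision.id
    assert revision.type == RevisionType.GIT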
Example #5

def test_api_revision_directory_ok_returns_revision(api_client, archive_data,
                                                    revision, person, date):
    rev_path = "foo"
    _dir = Directory(entries=(DirectoryEntry(
        name=rev_path.encode(),
        type="rev",
        target=hash_to_bytes(revision),
        perms=DentryPerms.revision,
    ), ))
    archive_data.directory_add([_dir])

    rev = Revision(
        directory=_dir.id,
        author=person,
        committer=person,
        message=b"commit message",
        date=TimestampWithTimezone.from_datetime(date),
        committer_date=TimestampWithTimezone.from_datetime(date),
        synthetic=False,
        type=RevisionType.GIT,
    )
    archive_data.revision_add([rev])

    revision_id = hash_to_hex(rev.id)
    rev_data = archive_data.revision_get(revision)
    url = reverse(
        "api-1-revision-directory",
        {
            "sha1_git": revision_id,
            "dir_path": rev_path
        },
    )
    rv = check_api_get_responses(api_client, url, status_code=200)

    assert rv.data == {
        "content": enrich_revision(rev_data, request=rv.wsgi_request),
        "path": rev_path,
        "type": "rev",
        "revision": revision_id,
    }
Example #6

 def build_release(self, p_info: MavenPackageInfo, uncompressed_path: str,
                   directory: Sha1Git) -> Optional[Release]:
     msg = f"Synthetic release for archive at {p_info.url}\n".encode(
         "utf-8")
     normalized_time = TimestampWithTimezone.from_datetime(p_info.time)
     return Release(
         name=p_info.version.encode(),
         message=msg,
         date=normalized_time,
         author=EMPTY_AUTHOR,
         target=directory,
         target_type=ObjectType.DIRECTORY,
         synthetic=True,
     )
Example #7

    def test_revision_submodule(self, swh_storage, cook_extract_revision,
                                ingest_target_revision):
        date = TimestampWithTimezone.from_datetime(
            datetime.datetime.now(
                datetime.timezone.utc).replace(microsecond=0))

        target_rev = Revision(
            message=b"target_rev",
            author=Person.from_fullname(b"me <*****@*****.**>"),
            date=date,
            committer=Person.from_fullname(b"me <*****@*****.**>"),
            committer_date=date,
            parents=(),
            type=RevisionType.GIT,
            directory=bytes.fromhex(
                "3333333333333333333333333333333333333333"),
            metadata={},
            synthetic=True,
        )
        if ingest_target_revision:
            swh_storage.revision_add([target_rev])

        dir = Directory(entries=(DirectoryEntry(
            name=b"submodule",
            type="rev",
            target=target_rev.id,
            perms=0o160000,
        ), ), )
        swh_storage.directory_add([dir])

        rev = Revision(
            message=b"msg",
            author=Person.from_fullname(b"me <*****@*****.**>"),
            date=date,
            committer=Person.from_fullname(b"me <*****@*****.**>"),
            committer_date=date,
            parents=(),
            type=RevisionType.GIT,
            directory=dir.id,
            metadata={},
            synthetic=True,
        )
        swh_storage.revision_add([rev])

        with cook_extract_revision(swh_storage, rev.swhid()) as (ert, p):
            ert.checkout(b"HEAD")
            pattern = b"160000 submodule\x00%s" % target_rev.id
            tree = ert.repo[b"HEAD"].tree
            assert pattern in ert.repo[tree].as_raw_string()
Example #8

def svn_date_to_swh_date(strdate: Optional[bytes]) -> TimestampWithTimezone:
    """Convert a string date to an swh one.

    Args:
        strdate: A string representing a date with format like
        ``b'YYYY-mm-DDTHH:MM:SS.800722Z'``

    Returns:
        An swh date format

    """
    if not strdate:  # either None or empty string
        dt = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
    else:
        dt = iso8601.parse_date(strdate.decode("ascii"))
        assert dt.tzinfo is not None, strdate
    return TimestampWithTimezone.from_datetime(dt)
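
A small usage sketch for the converter above, with the imports the snippet relies on but does not show (iso8601 and the swh model class):

import datetime
from typing import Optional

import iso8601

from swh.model.model import TimestampWithTimezone

# A regular SVN date string keeps its parsed UTC value
print(svn_date_to_swh_date(b"2011-05-31T06:04:39.800722Z"))
# None or an empty value falls back to the Unix epoch
print(svn_date_to_swh_date(None))
print(svn_date_to_swh_date(b""))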
Example #9

    def _make_stub_directory_revision(self, dir_id: Sha1Git) -> Sha1Git:
        author = Person.from_fullname(
            b"swh-vault, git-bare cooker <*****@*****.**>")
        dt = datetime.datetime.now(tz=datetime.timezone.utc)
        dt = dt.replace(microsecond=0)  # not supported by git
        date = TimestampWithTimezone.from_datetime(dt)

        revision = Revision(
            author=author,
            committer=author,
            date=date,
            committer_date=date,
            message=b"Initial commit",
            type=RevisionType.GIT,
            directory=self.obj_id,
            synthetic=True,
        )
        self.write_revision_node(revision)

        return revision.id
Example #10

def test_load_upgrade_from_revision_extids(caplog):
    """Tests that, when loading incrementally based on a snapshot made by an old
    version of the loader, the loader will convert revisions to releases
    and add them to the storage.

    Also checks that, if an extid exists pointing to a non-existent revision
    (which should never happen, but you never know...), the release is loaded from
    scratch."""

    storage = get_storage("memory")

    origin = "http://example.org"
    dir1_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                           object_id=b"d" * 20)
    dir2_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                           object_id=b"e" * 20)

    date = TimestampWithTimezone.from_datetime(
        datetime.datetime.now(tz=datetime.timezone.utc))
    person = Person.from_fullname(b"Jane Doe <*****@*****.**>")

    rev1 = Revision(
        message=b"blah",
        author=person,
        date=date,
        committer=person,
        committer_date=date,
        directory=dir1_swhid.object_id,
        type=RevisionType.TAR,
        synthetic=True,
    )

    rel1 = Release(
        name=b"v1.0",
        message=b"blah\n",
        author=person,
        date=date,
        target=dir1_swhid.object_id,
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
    )

    rev1_swhid = rev1.swhid()
    rel1_swhid = rel1.swhid()
    rev2_swhid = CoreSWHID(object_type=ObjectType.REVISION,
                           object_id=b"b" * 20)
    rel2_swhid = CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"c" * 20)

    # Results of a previous load
    storage.extid_add([
        ExtID("extid-type1", b"extid-of-v1.0", rev1_swhid, 0),
        ExtID("extid-type1", b"extid-of-v2.0", rev2_swhid, 0),
    ])
    storage.revision_add([rev1])
    last_snapshot = Snapshot(
        branches={
            b"v1.0":
            SnapshotBranch(target_type=TargetType.REVISION,
                           target=rev1_swhid.object_id),
            b"v2.0":
            SnapshotBranch(target_type=TargetType.REVISION,
                           target=rev2_swhid.object_id),
        })
    storage.snapshot_add([last_snapshot])
    date = datetime.datetime.now(tz=datetime.timezone.utc)
    storage.origin_add([Origin(url=origin)])
    storage.origin_visit_add([
        OriginVisit(origin="http://example.org",
                    visit=1,
                    date=date,
                    type="tar")
    ])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin,
            visit=1,
            status="full",
            date=date,
            snapshot=last_snapshot.id,
        )
    ])

    loader = StubPackageLoader(storage, "http://example.org")
    patch.object(
        loader,
        "_load_release",
        return_value=(rel2_swhid.object_id, dir2_swhid.object_id),
        autospec=True,
    ).start()
    patch.object(
        loader,
        "get_versions",
        return_value=["v1.0", "v2.0", "v3.0"],
        autospec=True,
    ).start()

    caplog.set_level(logging.ERROR)

    loader.load()

    assert len(caplog.records) == 1
    (record, ) = caplog.records
    assert record.levelname == "ERROR"
    assert "Failed to upgrade branch branch-v2.0" in record.message

    assert loader._load_release.mock_calls == [
        # v1.0: not loaded because there is already a revision matching it
        # v2.0: loaded, as the revision is missing from the storage even though there
        #       is an extid
        call(StubPackageInfo(origin, "example-v2.0.tar", "v2.0"),
             Origin(url=origin)),
        # v3.0: loaded (did not exist yet)
        call(StubPackageInfo(origin, "example-v3.0.tar", "v3.0"),
             Origin(url=origin)),
    ]

    snapshot = Snapshot(
        branches={
            b"branch-v1.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel1_swhid.object_id),
            b"branch-v2.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel2_swhid.object_id),
            b"branch-v3.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel2_swhid.object_id),
        })
    assert snapshot_get_latest(storage, origin) == snapshot

    extids = storage.extid_get_from_target(
        ObjectType.RELEASE,
        [
            rel1_swhid.object_id,
            rel2_swhid.object_id,
        ],
    )

    assert set(extids) == {
        ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid),
        ExtID("extid-type1", b"extid-of-v2.0", rel2_swhid),
        ExtID("extid-type2", b"extid-of-v3.0", rel2_swhid),
    }
Example #11

    ),
    synthetic=True,
)

RELEASE = Release(
    id=hash_to_bytes("3e9050196aa288264f2a9d279d6abab8b158448b"),
    name=b"v0.0.2",
    author=Person(
        name=b"tony",
        email=b"*****@*****.**",
        fullname=b"tony <*****@*****.**>",
    ),
    date=TimestampWithTimezone.from_datetime(
        datetime.datetime(2021,
                          10,
                          15,
                          22,
                          26,
                          53,
                          tzinfo=datetime.timezone.utc)),
    target=REVISION.id,
    target_type=ObjectType.REVISION,
    message=b"yet another synthetic release",
    synthetic=True,
)

SNAPSHOT = Snapshot(
    id=hash_to_bytes("2498dbf535f882bc7f9a18fb16c9ad27fda7bab7"),
    branches={
        b"release/0.1.0":
        SnapshotBranch(
            target=RELEASE.id,

Example #12

def test_graph_revisions(swh_storage, up_to_date_graph, root_object, tag,
                         weird_branches):
    r"""
    Build objects::

                                     snp
                                    /|||\
                                   / ||| \
                        rel2 <----°  /|\  \----> rel4
                         |          / | \         |
                         v         /  v  \        v
          rev1  <------ rev2 <----°  dir4 \      rel3
           |             |            |    \      |
           v             v            v     \     |
          dir1          dir2         dir3   |     |
           |           /   |          |     |     |
           v          /    v          v     v     v
          cnt1  <----°    cnt2       cnt3  cnt4  cnt5

    If up_to_date_graph is true, then swh-graph contains all objects.
    Else, cnt4, cnt5, dir4, rev2, rel2, rel3, and snp are missing from the graph.

    If tag is False, rel2 is excluded.

    If weird_branches is False, dir4, cnt4, rel3, rel4, and cnt5 are excluded.
    """
    from swh.graph.naive_client import NaiveClient as GraphClient

    # Create objects:

    date = TimestampWithTimezone.from_datetime(
        datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc))
    author = Person.from_fullname(b"Foo <*****@*****.**>")
    cnt1 = Content.from_data(b"correct")
    cnt2 = Content.from_data(b"horse")
    cnt3 = Content.from_data(b"battery")
    cnt4 = Content.from_data(b"staple")
    cnt5 = Content.from_data(b"Tr0ub4dor&3")
    dir1 = Directory(entries=(DirectoryEntry(
        name=b"file1",
        type="file",
        perms=DentryPerms.content,
        target=cnt1.sha1_git,
    ), ))
    dir2 = Directory(entries=(
        DirectoryEntry(
            name=b"file1",
            type="file",
            perms=DentryPerms.content,
            target=cnt1.sha1_git,
        ),
        DirectoryEntry(
            name=b"file2",
            type="file",
            perms=DentryPerms.content,
            target=cnt2.sha1_git,
        ),
    ))
    dir3 = Directory(entries=(DirectoryEntry(
        name=b"file3",
        type="file",
        perms=DentryPerms.content,
        target=cnt3.sha1_git,
    ), ))
    dir4 = Directory(entries=(DirectoryEntry(
        name=b"directory3",
        type="dir",
        perms=DentryPerms.directory,
        target=dir3.id,
    ), ))
    rev1 = Revision(
        message=b"msg1",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir1.id,
        type=RevisionType.GIT,
        synthetic=True,
    )
    rev2 = Revision(
        message=b"msg2",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir2.id,
        parents=(rev1.id, ),
        type=RevisionType.GIT,
        synthetic=True,
    )

    rel2 = Release(
        name=b"1.0.0",
        message=b"tag2",
        target_type=ObjectType.REVISION,
        target=rev2.id,
        synthetic=True,
    )
    rel3 = Release(
        name=b"1.0.0-blob",
        message=b"tagged-blob",
        target_type=ObjectType.CONTENT,
        target=cnt5.sha1_git,
        synthetic=True,
    )
    rel4 = Release(
        name=b"1.0.0-weird",
        message=b"weird release",
        target_type=ObjectType.RELEASE,
        target=rel3.id,
        synthetic=True,
    )
    rel5 = Release(
        name=b"1.0.0:weirdname",
        message=b"weird release",
        target_type=ObjectType.RELEASE,
        target=rel2.id,
        synthetic=True,
    )

    # Create snapshot:

    branches = {
        b"refs/heads/master":
        SnapshotBranch(target=rev2.id, target_type=TargetType.REVISION),
    }
    if tag:
        branches[b"refs/tags/1.0.0"] = SnapshotBranch(
            target=rel2.id, target_type=TargetType.RELEASE)
    if weird_branches:
        branches[b"refs/heads/tree-ref"] = SnapshotBranch(
            target=dir4.id, target_type=TargetType.DIRECTORY)
        branches[b"refs/heads/blob-ref"] = SnapshotBranch(
            target=cnt4.sha1_git, target_type=TargetType.CONTENT)
        branches[b"refs/tags/1.0.0-weird"] = SnapshotBranch(
            target=rel4.id, target_type=TargetType.RELEASE)
    snp = Snapshot(branches=branches)

    # "Fill" swh-graph

    if up_to_date_graph:
        nodes = [cnt1, cnt2, dir1, dir2, rev1, rev2, snp]
        edges = [
            (dir1, cnt1),
            (dir2, cnt1),
            (dir2, cnt2),
            (rev1, dir1),
            (rev2, dir2),
            (rev2, rev1),
            (snp, rev2),
        ]
        if tag:
            nodes.append(rel2)
            edges.append((rel2, rev2))
            edges.append((snp, rel2))
        if weird_branches:
            nodes.extend([cnt3, cnt4, cnt5, dir3, dir4, rel3, rel4, rel5])
            edges.extend([
                (dir3, cnt3),
                (dir4, dir3),
                (snp, dir4),
                (snp, cnt4),
                (snp, rel4),
                (rel4, rel3),
                (rel3, cnt5),
                (rel5, rev2),
            ])
    else:
        nodes = [cnt1, cnt2, cnt3, dir1, dir2, dir3, rev1]
        edges = [
            (dir1, cnt1),
            (dir2, cnt1),
            (dir2, cnt2),
            (dir3, cnt3),
            (rev1, dir1),
        ]
        if tag:
            nodes.append(rel2)
        if weird_branches:
            nodes.extend([cnt3, dir3])
            edges.extend([(dir3, cnt3)])

    nodes = [str(n.swhid()) for n in nodes]
    edges = [(str(s.swhid()), str(d.swhid())) for (s, d) in edges]

    # Add all objects to storage
    swh_storage.content_add([cnt1, cnt2, cnt3, cnt4, cnt5])
    swh_storage.directory_add([dir1, dir2, dir3, dir4])
    swh_storage.revision_add([rev1, rev2])
    swh_storage.release_add([rel2, rel3, rel4, rel5])
    swh_storage.snapshot_add([snp])

    # Add spy on swh_storage, to make sure revision_log is not called
    # (the graph must be used instead)
    swh_storage = unittest.mock.MagicMock(wraps=swh_storage)

    # Add all objects to graph
    swh_graph = unittest.mock.Mock(wraps=GraphClient(nodes=nodes, edges=edges))

    # Cook
    backend = InMemoryVaultBackend()
    cooked_swhid = {
        RootObjects.SNAPSHOT: snp.swhid(),
        RootObjects.REVISION: rev2.swhid(),
        RootObjects.RELEASE: rel2.swhid(),
        RootObjects.WEIRD_RELEASE: rel5.swhid(),
    }[root_object]
    cooker = GitBareCooker(
        cooked_swhid,
        backend=backend,
        storage=swh_storage,
        graph=swh_graph,
    )

    if weird_branches:
        # git-fsck now rejects refs pointing to trees and blobs,
        # but some old git repos have them.
        cooker.use_fsck = False

    cooker.cook()

    # Get bundle
    bundle = backend.fetch("git_bare", cooked_swhid)

    # Extract bundle and make sure both revisions are in it
    with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
            tf.extractall(tempdir)

        if root_object in (RootObjects.SNAPSHOT, RootObjects.REVISION):
            log_head = "master"
        elif root_object == RootObjects.RELEASE:
            log_head = "1.0.0"
        elif root_object == RootObjects.WEIRD_RELEASE:
            log_head = "release"
        else:
            assert False, root_object

        output = subprocess.check_output([
            "git",
            "-C",
            f"{tempdir}/{cooked_swhid}.git",
            "log",
            "--format=oneline",
            "--decorate=",
            log_head,
        ])

        assert output.decode(
        ) == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n"

    # Make sure the graph was used instead of swh_storage.revision_log
    if root_object == RootObjects.SNAPSHOT:
        if up_to_date_graph:
            # The graph has everything, so the first call succeeds and returns
            # all objects transitively pointed by the snapshot
            swh_graph.visit_nodes.assert_has_calls([
                unittest.mock.call(str(snp.swhid()),
                                   edges="snp:*,rel:*,rev:rev"),
            ])
        else:
            # The graph does not have everything, so the first call returns nothing.
            # However, the second call (on the top rev) succeeds and returns
            # all objects but the rev and the rel
            swh_graph.visit_nodes.assert_has_calls([
                unittest.mock.call(str(snp.swhid()),
                                   edges="snp:*,rel:*,rev:rev"),
                unittest.mock.call(str(rev2.swhid()), edges="rev:rev"),
            ])
    elif root_object in (
            RootObjects.REVISION,
            RootObjects.RELEASE,
            RootObjects.WEIRD_RELEASE,
    ):
        swh_graph.visit_nodes.assert_has_calls(
            [unittest.mock.call(str(rev2.swhid()), edges="rev:rev")])
    else:
        assert False, root_object

    if up_to_date_graph:
        swh_storage.revision_log.assert_not_called()
        swh_storage.revision_shortlog.assert_not_called()
    else:
        swh_storage.revision_log.assert_called()

Example #13

def test_checksum_mismatch(swh_storage, mismatch_on):
    date = TimestampWithTimezone.from_datetime(
        datetime.datetime(2021, 5, 7, 8, 43, 59, tzinfo=datetime.timezone.utc))
    author = Person.from_fullname(b"Foo <*****@*****.**>")

    wrong_hash = b"\x12\x34" * 10

    cnt1 = Content.from_data(b"Tr0ub4dor&3")
    if mismatch_on == "content":
        cnt1 = attr.evolve(cnt1, sha1_git=wrong_hash)

    dir1 = Directory(entries=(DirectoryEntry(
        name=b"file1",
        type="file",
        perms=DentryPerms.content,
        target=cnt1.sha1_git,
    ), ))

    if mismatch_on == "directory":
        dir1 = attr.evolve(dir1, id=wrong_hash)

    rev1 = Revision(
        message=b"msg1",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir1.id,
        type=RevisionType.GIT,
        synthetic=True,
    )

    if mismatch_on == "revision1":
        rev1 = attr.evolve(rev1, id=wrong_hash)

    rev2 = Revision(
        message=b"msg2",
        date=date,
        committer_date=date,
        author=author,
        committer=author,
        directory=dir1.id,
        parents=(rev1.id, ),
        type=RevisionType.GIT,
        synthetic=True,
    )

    if mismatch_on == "revision2":
        rev2 = attr.evolve(rev2, id=wrong_hash)

    cooked_swhid = rev2.swhid()

    swh_storage.content_add([cnt1])
    swh_storage.directory_add([dir1])
    swh_storage.revision_add([rev1, rev2])

    backend = InMemoryVaultBackend()
    cooker = GitBareCooker(
        cooked_swhid,
        backend=backend,
        storage=swh_storage,
        graph=None,
    )

    cooker.cook()

    # Get bundle
    bundle = backend.fetch("git_bare", cooked_swhid)

    # Extract bundle and make sure both revisions are in it
    with tempfile.TemporaryDirectory("swh-vault-test-bare") as tempdir:
        with tarfile.open(fileobj=io.BytesIO(bundle)) as tf:
            tf.extractall(tempdir)

        if mismatch_on != "revision2":
            # git-log fails if the head revision is corrupted
            # TODO: we need to find a way to make this somewhat usable
            output = subprocess.check_output([
                "git",
                "-C",
                f"{tempdir}/{cooked_swhid}.git",
                "log",
                "--format=oneline",
                "--decorate=",
            ])

            assert output.decode(
            ) == f"{rev2.id.hex()} msg2\n{rev1.id.hex()} msg1\n"
Example #14

    f"{REPO_BASE_URL}al/aldi/sprova4j/0.1.1/sprova4j-0.1.1.pom",
]

REL_MSGS = (
    b"Synthetic release for archive at https://repo1.maven.org/maven2/al/aldi/"
    b"sprova4j/0.1.0/sprova4j-0.1.0-sources.jar\n",
    b"Synthetic release for archive at https://repo1.maven.org/maven2/al/aldi/"
    b"sprova4j/0.1.1/sprova4j-0.1.1-sources.jar\n",
)

REL_DATES = (
    TimestampWithTimezone.from_datetime(
        datetime.datetime(2021,
                          7,
                          12,
                          19,
                          6,
                          59,
                          335000,
                          tzinfo=datetime.timezone.utc)),
    TimestampWithTimezone.from_datetime(
        datetime.datetime(2021,
                          7,
                          12,
                          19,
                          37,
                          5,
                          534000,
                          tzinfo=datetime.timezone.utc)),
)
Example #15

def test_deposit_loading_ok_release_notes(swh_storage, deposit_client,
                                          requests_mock_datadir):
    url = "https://hal-test.archives-ouvertes.fr/some-external-id"
    deposit_id = 999
    loader = DepositLoader(swh_storage,
                           url,
                           deposit_id,
                           deposit_client,
                           default_filename="archive.zip")

    actual_load_status = loader.load()
    expected_snapshot_id = "a307acffb7c29bebb3daf1bcb680bb3f452890a8"
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id,
    }

    assert_last_visit_matches(
        loader.storage,
        url,
        status="full",
        type="deposit",
        snapshot=hash_to_bytes(expected_snapshot_id),
    )

    release_id_hex = "f5e8ec02ede57edbe061afa7fc2a07bb7d14a700"
    release_id = hash_to_bytes(release_id_hex)

    expected_snapshot = Snapshot(
        id=hash_to_bytes(expected_snapshot_id),
        branches={
            b"HEAD":
            SnapshotBranch(
                target=release_id,
                target_type=TargetType.RELEASE,
            ),
        },
    )
    check_snapshot(expected_snapshot, storage=loader.storage)

    release = loader.storage.release_get([release_id])[0]
    date = TimestampWithTimezone.from_datetime(
        datetime.datetime(2017, 10, 7, 15, 17, 8,
                          tzinfo=datetime.timezone.utc))
    person = Person(
        fullname=b"Software Heritage",
        name=b"Software Heritage",
        email=b"*****@*****.**",
    )
    assert release == Release(
        id=release_id,
        name=b"HEAD",
        message=
        (b"hal: Deposit 999 in collection hal\n\nThis release adds this and that.\n"
         ),
        author=person,
        date=date,
        target_type=ModelObjectType.DIRECTORY,
        target=b"\xfd-\xf1-\xc5SL\x1d\xa1\xe9\x18\x0b\x91Q\x02\xfbo`\x1d\x19",
        synthetic=True,
        metadata=None,
    )
Example #16

def test_cran_parse_date():
    data = [
        # parsable, some have debatable results though
        ("2001-June-08", datetime(2001, 6, 8, 0, 0, tzinfo=timezone.utc)),
        (
            "Tue Dec 27 15:06:08 PST 2011",
            datetime(2011, 12, 27, 15, 6, 8, tzinfo=timezone.utc),
        ),
        ("8-14-2013", datetime(2013, 8, 14, 0, 0, tzinfo=timezone.utc)),
        ("2011-01", datetime(2011, 1, 1, 0, 0, tzinfo=timezone.utc)),
        ("201109", datetime(2009, 11, 20, 0, 0, tzinfo=timezone.utc)),
        ("04-12-2014", datetime(2014, 4, 12, 0, 0, tzinfo=timezone.utc)),
        (
            "2018-08-24, 10:40:10",
            datetime(2018, 8, 24, 10, 40, 10, tzinfo=timezone.utc),
        ),
        ("2013-October-16", datetime(2013, 10, 16, 0, 0, tzinfo=timezone.utc)),
        ("Aug 23, 2013", datetime(2013, 8, 23, 0, 0, tzinfo=timezone.utc)),
        ("27-11-2014", datetime(2014, 11, 27, 0, 0, tzinfo=timezone.utc)),
        ("2019-09-26,", datetime(2019, 9, 26, 0, 0, tzinfo=timezone.utc)),
        ("9/25/2014", datetime(2014, 9, 25, 0, 0, tzinfo=timezone.utc)),
        (
            "Fri Jun 27 17:23:53 2014",
            datetime(2014, 6, 27, 17, 23, 53, tzinfo=timezone.utc),
        ),
        ("28-04-2014", datetime(2014, 4, 28, 0, 0, tzinfo=timezone.utc)),
        ("04-14-2014", datetime(2014, 4, 14, 0, 0, tzinfo=timezone.utc)),
        (
            "2019-05-08 14:17:31 UTC",
            datetime(2019, 5, 8, 14, 17, 31, tzinfo=timezone.utc),
        ),
        (
            "Wed May 21 13:50:39 CEST 2014",
            datetime(2014, 5, 21, 13, 50, 39, tzinfo=tzlocal()),
        ),
        (
            "2018-04-10 00:01:04 KST",
            datetime(2018, 4, 10, 0, 1, 4, tzinfo=timezone.utc),
        ),
        ("2019-08-25 10:45", datetime(2019, 8, 25, 10, 45, tzinfo=timezone.utc)),
        ("March 9, 2015", datetime(2015, 3, 9, 0, 0, tzinfo=timezone.utc)),
        ("Aug. 18, 2012", datetime(2012, 8, 18, 0, 0, tzinfo=timezone.utc)),
        ("2014-Dec-17", datetime(2014, 12, 17, 0, 0, tzinfo=timezone.utc)),
        ("March 01, 2013", datetime(2013, 3, 1, 0, 0, tzinfo=timezone.utc)),
        ("2017-04-08.", datetime(2017, 4, 8, 0, 0, tzinfo=timezone.utc)),
        ("2014-Apr-22", datetime(2014, 4, 22, 0, 0, tzinfo=timezone.utc)),
        (
            "Mon Jan 12 19:54:04 2015",
            datetime(2015, 1, 12, 19, 54, 4, tzinfo=timezone.utc),
        ),
        ("May 22, 2014", datetime(2014, 5, 22, 0, 0, tzinfo=timezone.utc)),
        (
            "2014-08-12 09:55:10 EDT",
            datetime(2014, 8, 12, 9, 55, 10, tzinfo=timezone.utc),
        ),
        # unparsable
        ("Fabruary 21, 2012", None),
        ('2019-05-28"', None),
        ("2017-03-01 today", None),
        ("2016-11-0110.1093/icesjms/fsw182", None),
        ("2019-07-010", None),
        ("2015-02.23", None),
        ("20013-12-30", None),
        ("2016-08-017", None),
        ("2019-02-07l", None),
        ("2018-05-010", None),
        ("2019-09-27 KST", None),
        ("$Date$", None),
        ("2019-09-27 KST", None),
        ("2019-06-22 $Date$", None),
        ("$Date: 2013-01-18 12:49:03 -0600 (Fri, 18 Jan 2013) $", None),
        ("2015-7-013", None),
        ("2018-05-023", None),
        ("Check NEWS file for changes: news(package='simSummary')", None),
    ]
    for date, expected_date in data:
        actual_tstz = parse_date(date)
        if expected_date is None:
            assert actual_tstz is None, date
        else:
            expected_tstz = TimestampWithTimezone.from_datetime(expected_date)
            assert actual_tstz == expected_tstz, date
Example #17

def test_deposit_loading_ok(swh_storage, deposit_client,
                            requests_mock_datadir):
    url = "https://hal-test.archives-ouvertes.fr/some-external-id"
    deposit_id = 666
    loader = DepositLoader(swh_storage,
                           url,
                           deposit_id,
                           deposit_client,
                           default_filename="archive.zip")

    actual_load_status = loader.load()
    expected_snapshot_id = "338b45d87e02fb5cbf324694bc4a898623d6a30f"
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id,
    }

    assert_last_visit_matches(
        loader.storage,
        url,
        status="full",
        type="deposit",
        snapshot=hash_to_bytes(expected_snapshot_id),
    )

    release_id_hex = "2566a64a27bc00362e265be9666d7606750530a1"
    release_id = hash_to_bytes(release_id_hex)

    expected_snapshot = Snapshot(
        id=hash_to_bytes(expected_snapshot_id),
        branches={
            b"HEAD":
            SnapshotBranch(
                target=release_id,
                target_type=TargetType.RELEASE,
            ),
        },
    )
    check_snapshot(expected_snapshot, storage=loader.storage)

    release = loader.storage.release_get([release_id])[0]
    date = TimestampWithTimezone.from_datetime(
        datetime.datetime(2017, 10, 7, 15, 17, 8,
                          tzinfo=datetime.timezone.utc))
    person = Person(
        fullname=b"Software Heritage",
        name=b"Software Heritage",
        email=b"*****@*****.**",
    )
    assert release == Release(
        id=release_id,
        name=b"HEAD",
        message=b"hal: Deposit 666 in collection hal\n",
        author=person,
        date=date,
        target_type=ModelObjectType.DIRECTORY,
        target=b"\xfd-\xf1-\xc5SL\x1d\xa1\xe9\x18\x0b\x91Q\x02\xfbo`\x1d\x19",
        synthetic=True,
        metadata=None,
    )

    # check metadata

    fetcher = MetadataFetcher(
        name="swh-deposit",
        version="0.0.1",
    )

    authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url="https://hal-test.archives-ouvertes.fr/",
    )

    # Check origin metadata
    orig_meta = loader.storage.raw_extrinsic_metadata_get(
        Origin(url).swhid(), authority)
    assert orig_meta.next_page_token is None
    raw_meta = loader.client.metadata_get(deposit_id)
    raw_metadata: str = raw_meta["raw_metadata"]
    # 2 raw metadata xml + 1 json dict
    assert len(orig_meta.results) == 2
    orig_meta0 = orig_meta.results[0]
    assert orig_meta0.authority == authority
    assert orig_meta0.fetcher == fetcher

    # Check directory metadata
    assert release.target_type == ModelObjectType.DIRECTORY
    directory_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                                object_id=release.target)
    actual_dir_meta = loader.storage.raw_extrinsic_metadata_get(
        directory_swhid, authority)
    assert actual_dir_meta.next_page_token is None
    assert len(actual_dir_meta.results) == 1
    dir_meta = actual_dir_meta.results[0]
    assert dir_meta.authority == authority
    assert dir_meta.fetcher == fetcher
    assert dir_meta.metadata.decode() == raw_metadata

    # Retrieve the information for deposit status update query to the deposit
    urls = [
        m for m in requests_mock_datadir.request_history
        if m.url == f"{DEPOSIT_URL}/{deposit_id}/update/"
    ]

    assert len(urls) == 1
    update_query = urls[0]

    body = update_query.json()
    expected_body = {
        "status": "done",
        "release_id": release_id_hex,
        "directory_id": hash_to_hex(release.target),
        "snapshot_id": expected_snapshot_id,
        "origin_url": url,
    }

    assert body == expected_body

    stats = get_stats(loader.storage)
    assert {
        "content": 303,
        "directory": 12,
        "origin": 1,
        "origin_visit": 1,
        "release": 1,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
Example #18

def test_npm_loader_first_visit(swh_storage, requests_mock_datadir, org_api_info):
    package = "org"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("0996ca28d6280499abcf485b51c4e3941b057249")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
    )

    release_id = "d38cc0b571cd41f3c85513864e049766b42032a7"
    versions = [
        ("0.0.2", release_id),
        ("0.0.3", "62bf7076bae9aa2cb4d6cb3bf7ce0ea4fdd5b295"),
        ("0.0.4", "6e976db82f6c310596b21fb0ed8b11f507631434"),
    ]

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"HEAD": SnapshotBranch(
                target=b"releases/0.0.4", target_type=TargetType.ALIAS
            ),
            **{
                b"releases/"
                + version_name.encode(): SnapshotBranch(
                    target=hash_to_bytes(version_id),
                    target_type=TargetType.RELEASE,
                )
                for (version_name, version_id) in versions
            },
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    assert swh_storage.release_get([hash_to_bytes(release_id)])[0] == Release(
        name=b"0.0.2",
        message=b"Synthetic release for NPM source package org version 0.0.2\n",
        target=hash_to_bytes("42753c0c2ab00c4501b552ac4671c68f3cf5aece"),
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
        author=Person(
            fullname=b"mooz <*****@*****.**>",
            name=b"mooz",
            email=b"*****@*****.**",
        ),
        date=TimestampWithTimezone.from_datetime(
            datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc)
        ),
        id=hash_to_bytes(release_id),
    )

    contents = swh_storage.content_get(_expected_new_contents_first_visit)
    count = sum(0 if content is None else 1 for content in contents)
    assert count == len(_expected_new_contents_first_visit)

    assert (
        list(swh_storage.directory_missing(_expected_new_directories_first_visit)) == []
    )

    assert list(swh_storage.release_missing(_expected_new_releases_first_visit)) == []

    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.FORGE,
        url="https://npmjs.com/",
    )

    for (version_name, release_id) in versions:
        release = swh_storage.release_get([hash_to_bytes(release_id)])[0]
        assert release.target_type == ModelObjectType.DIRECTORY
        directory_id = release.target
        directory_swhid = ExtendedSWHID(
            object_type=ExtendedObjectType.DIRECTORY,
            object_id=directory_id,
        )
        release_swhid = CoreSWHID(
            object_type=ObjectType.RELEASE,
            object_id=hash_to_bytes(release_id),
        )
        expected_metadata = [
            RawExtrinsicMetadata(
                target=directory_swhid,
                authority=metadata_authority,
                fetcher=MetadataFetcher(
                    name="swh.loader.package.npm.loader.NpmLoader",
                    version=__version__,
                ),
                discovery_date=loader.visit_date,
                format="replicate-npm-package-json",
                metadata=json.dumps(
                    json.loads(org_api_info)["versions"][version_name]
                ).encode(),
                origin="https://www.npmjs.com/package/org",
                release=release_swhid,
            )
        ]
        assert swh_storage.raw_extrinsic_metadata_get(
            directory_swhid,
            metadata_authority,
        ) == PagedResult(
            next_page_token=None,
            results=expected_metadata,
        )

    stats = get_stats(swh_storage)

    assert {
        "content": len(_expected_new_contents_first_visit),
        "directory": len(_expected_new_directories_first_visit),
        "origin": 1,
        "origin_visit": 1,
        "release": len(_expected_new_releases_first_visit),
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats
Example #19

def test_sub_directory_view_origin_context(client, archive_data,
                                           empty_directory, person, date):
    origin_url = "test_sub_directory_view_origin_context"
    subdir = Directory(entries=(
        DirectoryEntry(
            name=b"foo",
            type="dir",
            target=hash_to_bytes(empty_directory),
            perms=DentryPerms.directory,
        ),
        DirectoryEntry(
            name=b"bar",
            type="dir",
            target=hash_to_bytes(empty_directory),
            perms=DentryPerms.directory,
        ),
    ))

    parentdir = Directory(entries=(DirectoryEntry(
        name=b"baz",
        type="dir",
        target=subdir.id,
        perms=DentryPerms.directory,
    ), ))
    archive_data.directory_add([subdir, parentdir])

    revision = Revision(
        directory=parentdir.id,
        author=person,
        committer=person,
        message=b"commit message",
        date=TimestampWithTimezone.from_datetime(date),
        committer_date=TimestampWithTimezone.from_datetime(date),
        synthetic=False,
        type=RevisionType.GIT,
    )
    archive_data.revision_add([revision])

    snapshot = Snapshot(
        branches={
            b"HEAD":
            SnapshotBranch(
                target="refs/head/master".encode(),
                target_type=TargetType.ALIAS,
            ),
            b"refs/head/master":
            SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
        })
    archive_data.snapshot_add([snapshot])

    archive_data.origin_add([Origin(url=origin_url)])
    date = now()
    visit = OriginVisit(origin=origin_url, date=date, type="git")
    visit = archive_data.origin_visit_add([visit])[0]
    visit_status = OriginVisitStatus(
        origin=origin_url,
        visit=visit.visit,
        date=date,
        status="full",
        snapshot=snapshot.id,
    )
    archive_data.origin_visit_status_add([visit_status])

    dir_content = archive_data.directory_ls(hash_to_hex(parentdir.id))
    subdir = dir_content[0]
    subdir_content = archive_data.directory_ls(subdir["target"])
    _directory_view_checks(
        client,
        hash_to_hex(parentdir.id),
        subdir_content,
        subdir["name"],
        origin_url,
        hash_to_hex(snapshot.id),
        hash_to_hex(revision.id),
    )
Example #20

 author=Person(
     name=b"Andrew Nesbitt",
     fullname=b"Andrew Nesbitt <*****@*****.**>",
     email=b"*****@*****.**",
 ),
 committer=Person(
     name=b"Andrew Nesbitt",
     fullname=b"Andrew Nesbitt <*****@*****.**>",
     email=b"*****@*****.**",
 ),
 committer_date=TimestampWithTimezone.from_datetime(
     datetime.datetime(
         2013,
         10,
         4,
         12,
         50,
         49,
         tzinfo=datetime.timezone(datetime.timedelta(minutes=120)),
     )
 ),
 type=RevisionType.GIT,
 synthetic=False,
 date=TimestampWithTimezone.from_datetime(
     datetime.datetime(
         2017,
         2,
         20,
         16,
         14,
         16,
Example #21

def test_debian_first_visit(swh_storage, requests_mock_datadir):
    """With no prior visit, load a gnu project ends up with 1 snapshot"""
    loader = DebianLoader(
        swh_storage,
        URL,
        packages=PACKAGE_PER_VERSION,
    )

    actual_load_status = loader.load()
    expected_snapshot_id = "f9e4d0d200433dc998ad2ca40ee1244785fe6ed1"
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id,
    }

    assert_last_visit_matches(
        swh_storage,
        URL,
        status="full",
        type="deb",
        snapshot=hash_to_bytes(expected_snapshot_id),
    )

    release_id = hash_to_bytes("de96ae3d3e136f5c1709117059e2a2c05b8ee5ae")

    expected_snapshot = Snapshot(
        id=hash_to_bytes(expected_snapshot_id),
        branches={
            b"releases/stretch/contrib/0.7.2-3":
            SnapshotBranch(
                target_type=TargetType.RELEASE,
                target=release_id,
            )
        },
    )  # different than the previous loader as no release is done

    check_snapshot(expected_snapshot, swh_storage)

    assert swh_storage.release_get([release_id])[0] == Release(
        id=release_id,
        name=b"0.7.2-3",
        message=
        b"Synthetic release for Debian source package cicero version 0.7.2-3\n",
        target=hash_to_bytes("798df511408c53bf842a8e54d4d335537836bdc3"),
        target_type=ObjectType.DIRECTORY,
        synthetic=True,
        author=Person(
            fullname=b"Samuel Thibault <*****@*****.**>",
            name=b"Samuel Thibault",
            email=b"*****@*****.**",
        ),
        date=TimestampWithTimezone.from_datetime(
            datetime.datetime(
                2014,
                10,
                19,
                16,
                52,
                35,
                tzinfo=datetime.timezone(datetime.timedelta(seconds=7200)),
            )),
    )

    stats = get_stats(swh_storage)
    assert {
        "content": 42,
        "directory": 2,
        "origin": 1,
        "origin_visit": 1,
        "release": 1,  # all artifacts under 1 release
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

Example #22

def test_archive_visit_with_release_artifact_no_prior_visit(
        swh_storage, requests_mock_datadir):
    """With no prior visit, load a gnu project ends up with 1 snapshot"""
    loader = ArchiveLoader(swh_storage, URL, artifacts=GNU_ARTIFACTS[:1])

    actual_load_status = loader.load()
    assert actual_load_status["status"] == "eventful"

    expected_snapshot_first_visit_id = hash_to_bytes(
        "9efecc835e8f99254934f256b5301b94f348fd17")

    assert actual_load_status["snapshot_id"] == hash_to_hex(
        expected_snapshot_first_visit_id)

    assert_last_visit_matches(swh_storage, URL, status="full", type="tar")

    stats = get_stats(swh_storage)
    assert {
        "content": len(_expected_new_contents_first_visit),
        "directory": len(_expected_new_directories_first_visit),
        "origin": 1,
        "origin_visit": 1,
        "release": len(_expected_new_releases_first_visit),
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == stats

    release_id = hash_to_bytes(list(_expected_new_releases_first_visit)[0])
    expected_snapshot = Snapshot(
        id=expected_snapshot_first_visit_id,
        branches={
            b"HEAD":
            SnapshotBranch(
                target_type=TargetType.ALIAS,
                target=b"releases/0.1.0",
            ),
            b"releases/0.1.0":
            SnapshotBranch(
                target_type=TargetType.RELEASE,
                target=release_id,
            ),
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    assert swh_storage.release_get([release_id])[0] == Release(
        id=release_id,
        name=b"0.1.0",
        message=(b"Synthetic release for archive at "
                 b"https://ftp.gnu.org/gnu/8sync/8sync-0.1.0.tar.gz\n"),
        target=hash_to_bytes("3aebc29ed1fccc4a6f2f2010fb8e57882406b528"),
        target_type=ObjectType.DIRECTORY,
        synthetic=True,
        author=Person.from_fullname(b""),
        date=TimestampWithTimezone.from_datetime(
            datetime.datetime(1999,
                              12,
                              9,
                              8,
                              53,
                              30,
                              tzinfo=datetime.timezone.utc)),
    )

    expected_contents = map(hash_to_bytes, _expected_new_contents_first_visit)
    assert list(swh_storage.content_missing_per_sha1(expected_contents)) == []

    expected_dirs = map(hash_to_bytes, _expected_new_directories_first_visit)
    assert list(swh_storage.directory_missing(expected_dirs)) == []

    expected_rels = map(hash_to_bytes, _expected_new_releases_first_visit)
    assert list(swh_storage.release_missing(expected_rels)) == []
Example #23

def test_npm_loader_duplicate_shasum(swh_storage, requests_mock_datadir):
    """Test with two versions that have exactly the same tarball"""
    package = "org_version_mismatch"
    url = package_url(package)
    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    expected_snapshot_id = hash_to_bytes("ac867a4c22ba4e22a022d319f309714477412a5a")
    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
    )

    beta_release_id = "e6d5490a02ac2a8dcd49702f9ccd5a64c90a46f1"
    release_id = "f6985f437e28db6eb1b7533230e05ed99f2c91f0"
    versions = [
        ("0.0.3-beta", beta_release_id),
        ("0.0.3", release_id),
    ]

    expected_snapshot = Snapshot(
        id=expected_snapshot_id,
        branches={
            b"HEAD": SnapshotBranch(
                target=b"releases/0.0.3", target_type=TargetType.ALIAS
            ),
            **{
                b"releases/"
                + version_name.encode(): SnapshotBranch(
                    target=hash_to_bytes(version_id),
                    target_type=TargetType.RELEASE,
                )
                for (version_name, version_id) in versions
            },
        },
    )
    check_snapshot(expected_snapshot, swh_storage)

    assert swh_storage.release_get([hash_to_bytes(beta_release_id)])[0] == Release(
        name=b"0.0.3-beta",
        message=(
            b"Synthetic release for NPM source package org_version_mismatch "
            b"version 0.0.3-beta\n"
        ),
        target=hash_to_bytes("3370d20d6f96dc1c9e50f083e2134881db110f4f"),
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
        author=Person.from_fullname(b"Masafumi Oyamada <*****@*****.**>"),
        date=TimestampWithTimezone.from_datetime(
            datetime.datetime(2014, 1, 1, 15, 40, 33, tzinfo=datetime.timezone.utc)
        ),
        id=hash_to_bytes(beta_release_id),
    )

    assert swh_storage.release_get([hash_to_bytes(release_id)])[0] == Release(
        name=b"0.0.3",
        message=(
            b"Synthetic release for NPM source package org_version_mismatch "
            b"version 0.0.3\n"
        ),
        target=hash_to_bytes("3370d20d6f96dc1c9e50f083e2134881db110f4f"),
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
        author=Person.from_fullname(b"Masafumi Oyamada <*****@*****.**>"),
        date=TimestampWithTimezone.from_datetime(
            datetime.datetime(2014, 1, 1, 15, 55, 45, tzinfo=datetime.timezone.utc)
        ),
        id=hash_to_bytes(release_id),
    )

    # Check incremental re-load keeps it unchanged

    loader = NpmLoader(swh_storage, url)

    actual_load_status = loader.load()
    assert actual_load_status == {
        "status": "uneventful",
        "snapshot_id": expected_snapshot_id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, url, status="full", type="npm", snapshot=expected_snapshot_id
    )
Example #24

def test_cran_one_visit(swh_storage, requests_mock_datadir):
    version = "2.22-6"
    base_url = "https://cran.r-project.org"
    origin_url = f"{base_url}/Packages/Recommended_KernSmooth/index.html"
    artifact_url = (
        f"{base_url}/src_contrib_1.4.0_Recommended_KernSmooth_{version}.tar.gz"  # noqa
    )
    loader = CRANLoader(
        swh_storage,
        origin_url,
        artifacts=[
            {
                "url": artifact_url,
                "version": version,
                "package": "Recommended_KernSmooth",
            }
        ],
    )

    actual_load_status = loader.load()

    assert actual_load_status == {
        "status": "eventful",
        "snapshot_id": SNAPSHOT.id.hex(),
    }

    assert_last_visit_matches(
        swh_storage, origin_url, status="full", type="cran", snapshot=SNAPSHOT.id
    )

    check_snapshot(SNAPSHOT, swh_storage)

    assert swh_storage.release_get([RELEASE_ID])[0] == Release(
        id=RELEASE_ID,
        name=b"2.22-6",
        message=(
            b"Synthetic release for CRAN source package "
            b"Recommended_KernSmooth version 2.22-6\n"
        ),
        target=hash_to_bytes("ff64177fea3f4a5136b9caf7581a4f7d4cf65296"),
        target_type=ObjectType.DIRECTORY,
        synthetic=True,
        author=Person(
            fullname=b"Brian Ripley <*****@*****.**>",
            name=b"Brian Ripley",
            email=b"*****@*****.**",
        ),
        date=TimestampWithTimezone.from_datetime(
            datetime(2001, 6, 8, 0, 0, tzinfo=timezone.utc)
        ),
    )

    visit_stats = get_stats(swh_storage)
    assert {
        "content": 33,
        "directory": 7,
        "origin": 1,
        "origin_visit": 1,
        "release": 1,
        "revision": 0,
        "skipped_content": 0,
        "snapshot": 1,
    } == visit_stats

    urls = [
        m.url
        for m in requests_mock_datadir.request_history
        if m.url.startswith(base_url)
    ]
    # the artifact tarball was fetched exactly once during the visit
    assert len(urls) == 1