Example #1
def test_origin_snapshot_invalid_branch(
    client, archive_data, new_origin, new_snapshot, visit_dates, revisions
):
    snp_dict = new_snapshot.to_dict()
    archive_data.origin_add([new_origin])
    for i, branch in enumerate(snp_dict["branches"].keys()):
        snp_dict["branches"][branch] = {
            "target_type": "revision",
            "target": hash_to_bytes(revisions[i]),
        }

    archive_data.snapshot_add([Snapshot.from_dict(snp_dict)])
    visit = archive_data.origin_visit_add(
        [OriginVisit(origin=new_origin.url, date=visit_dates[0], type="git",)]
    )[0]
    visit_status = OriginVisitStatus(
        origin=new_origin.url,
        visit=visit.visit,
        date=now(),
        status="full",
        snapshot=snp_dict["id"],
    )
    archive_data.origin_visit_status_add([visit_status])

    url = reverse(
        "browse-origin-directory",
        query_params={"origin_url": new_origin.url, "branch": "invalid_branch"},
    )

    check_html_get_response(client, url, status_code=404, template_used="error.html")
Example #2
def test_origin_empty_snapshot_null_revision(client, archive_data, new_origin):
    snapshot = Snapshot(
        branches={
            b"HEAD": SnapshotBranch(
                target="refs/head/master".encode(), target_type=TargetType.ALIAS,
            ),
            b"refs/head/master": None,
        }
    )
    archive_data.origin_add([new_origin])
    archive_data.snapshot_add([snapshot])
    visit = archive_data.origin_visit_add(
        [OriginVisit(origin=new_origin.url, date=now(), type="git",)]
    )[0]
    visit_status = OriginVisitStatus(
        origin=new_origin.url,
        visit=visit.visit,
        date=now(),
        status="partial",
        snapshot=snapshot.id,
    )
    archive_data.origin_visit_status_add([visit_status])

    url = reverse(
        "browse-origin-directory", query_params={"origin_url": new_origin.url},
    )

    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/directory.html"
    )
    resp_content = resp.content.decode("utf-8")
    assert re.search("snapshot.*is empty", resp_content)
    assert not re.search("swh-tr-link", resp_content)
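The test above relies on swh.model computing a Snapshot's intrinsic id from its branches when no id is passed, which is why snapshot.id can be handed to the visit status. A minimal sketch of that behaviour:

from swh.model.model import Snapshot, SnapshotBranch, TargetType

snapshot = Snapshot(
    branches={
        b"HEAD": SnapshotBranch(
            target=b"refs/head/master", target_type=TargetType.ALIAS
        ),
        # dangling branch: the alias target resolves to nothing
        b"refs/head/master": None,
    }
)
# The id is derived from the branches (an intrinsic, git-like hash), 20 bytes long.
assert len(snapshot.id) == 20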
Example #3
def test_get_origin_visit_return_first_valid_partial_visit(
        archive_data, new_origin, new_snapshots):
    visits = []

    archive_data.origin_add([new_origin])
    # create 6 visits; a status is built for each (full with a null snapshot
    # for the first three, partial with a valid snapshot for the last three),
    # but only the last three statuses are actually stored
    for i, snp in enumerate(new_snapshots):
        visit_date = now() + timedelta(days=i * 10)
        visit = archive_data.origin_visit_add([
            OriginVisit(
                origin=new_origin.url,
                date=visit_date,
                type="git",
            )
        ])[0]
        archive_data.snapshot_add([new_snapshots[i]])
        visit_status = OriginVisitStatus(
            origin=new_origin.url,
            visit=visit.visit,
            date=visit_date + timedelta(minutes=5),
            status="full" if i < 3 else "partial",
            snapshot=new_snapshots[i].id if i > 2 else None,
        )
        if i > 2:
            archive_data.origin_visit_status_add([visit_status])

        visits.append(visit.visit)

    # should return the last visit
    expected_visit = archive_data.origin_visit_get_by(new_origin.url,
                                                      visits[-1])
    assert get_origin_visit(OriginInfo(url=new_origin.url)) == expected_visit
Example #4
def test_get_origin_visit_non_resolvable_snapshots(archive_data, new_origin,
                                                   new_snapshots):
    visits = []
    archive_data.origin_add([new_origin])
    # create 6 full visits; only the first three get a stored "full" status
    # referencing their snapshot, so the last three visits have no
    # resolvable snapshot
    for i, snp in enumerate(new_snapshots):
        visit_date = now() + timedelta(days=i * 10)
        visit = archive_data.origin_visit_add([
            OriginVisit(
                origin=new_origin.url,
                date=visit_date,
                type="git",
            )
        ])[0]
        archive_data.snapshot_add([new_snapshots[i]])
        visit_status = OriginVisitStatus(
            origin=new_origin.url,
            visit=visit.visit,
            date=visit_date + timedelta(minutes=5),
            status="full",
            snapshot=new_snapshots[i].id,
        )
        if i < 3:
            archive_data.origin_visit_status_add([visit_status])
        visits.append(visit.visit)

    # should return the third visit
    expected_visit = archive_data.origin_visit_get_by(new_origin.url,
                                                      visits[2])
    assert get_origin_visit(OriginInfo(url=new_origin.url)) == expected_visit
Example #5
def test_iter_origin_visits(swh_storage, sample_data):
    """Iter over origin visits for an origin returns all visits"""
    origin1, origin2 = sample_data.origins[:2]
    swh_storage.origin_add([origin1, origin2])

    date_past = now() - datetime.timedelta(weeks=20)

    new_visits = []
    for visit_id in range(20):
        new_visits.append(
            OriginVisit(
                origin=origin1.url,
                date=date_past + datetime.timedelta(days=visit_id),
                type="git",
            ))

    visits = swh_storage.origin_visit_add(new_visits)
    reversed_visits = list(reversed(visits))

    # no limit, order asc
    actual_visits = list(iter_origin_visits(swh_storage, origin1.url))
    assert actual_visits == visits

    # no limit, order desc
    actual_visits = list(
        iter_origin_visits(swh_storage, origin1.url, order=ListOrder.DESC))
    assert actual_visits == reversed_visits

    # no result
    actual_visits = list(iter_origin_visits(swh_storage, origin2.url))
    assert actual_visits == []
Example #6
def _add_origin(storage,
                search,
                origin_url,
                visit_type="git",
                snapshot_branches={}):
    storage.origin_add([Origin(url=origin_url)])
    search.origin_update([{
        "url": origin_url,
        "has_visits": True,
        "visit_types": [visit_type]
    }])
    date = now()
    visit = OriginVisit(origin=origin_url, date=date, type=visit_type)
    visit = storage.origin_visit_add([visit])[0]
    snapshot = Snapshot.from_dict({"branches": snapshot_branches})
    storage.snapshot_add([snapshot])
    visit_status = OriginVisitStatus(
        origin=origin_url,
        visit=visit.visit,
        date=date + timedelta(minutes=1),
        type=visit.type,
        status="full",
        snapshot=snapshot.id,
    )
    storage.origin_visit_status_add([visit_status])
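A hedged usage sketch for the helper above; the in-memory storage is real, while the mocked search backend is a stand-in for illustration only:

from unittest.mock import MagicMock

from swh.storage import get_storage

storage = get_storage("memory")
search = MagicMock()  # stand-in for the search backend the helper updates

_add_origin(storage, search, "https://example.org/repo.git")
search.origin_update.assert_called_once()

Note the snapshot_branches={} mutable default argument: it is safe here because the helper only reads it, but snapshot_branches=None with a snapshot_branches or {} fallback is the more defensive idiom.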
Example #7
def ovgl(origin_url, allowed_statuses, require_snapshot, type):
    if origin_url == f"base://{self.repo_url}":
        return OriginVisit(origin=origin_url,
                           visit=42,
                           date=now(),
                           type="git")
    else:
        return None
Example #8
def row_to_visit(row: OriginVisitRow) -> OriginVisit:
    """Format a row representing an origin_visit to an actual OriginVisit."""
    return OriginVisit(
        origin=row.origin,
        visit=row.visit,
        date=row.date.replace(tzinfo=datetime.timezone.utc),
        type=row.type,
    )
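For context, a minimal usage sketch; the namedtuple below is a hypothetical stand-in for the real database row class, modelling only the four fields that row_to_visit reads:

import datetime
from collections import namedtuple

# Hypothetical stand-in row type for illustration.
OriginVisitRow = namedtuple("OriginVisitRow", ["origin", "visit", "date", "type"])

row = OriginVisitRow(
    origin="https://example.org/repo.git",
    visit=1,
    date=datetime.datetime(2020, 1, 1, 12, 0, 0),  # stored naive, without tzinfo
    type="git",
)

visit = row_to_visit(row)
# row_to_visit reattaches UTC to the naive timestamp read from the database.
assert visit.date.tzinfo == datetime.timezone.utc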
Example #9
def test_pypi_origin_from_project_name(mocker):
    origin_url = "https://pypi.org/project/ProjectName/"

    storage = get_storage("memory")

    revision_id = b"41" * 10
    snapshot_id = b"42" * 10
    storage.origin_add([Origin(url=origin_url)])
    storage.origin_visit_add(
        [OriginVisit(origin=origin_url, visit=1, date=now(), type="pypi")])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin_url,
            visit=1,
            date=now(),
            status="partial",
            snapshot=snapshot_id,
        )
    ])
    storage.snapshot_add([
        Snapshot(
            id=snapshot_id,
            branches={
                b"foo":
                SnapshotBranch(
                    target_type=TargetType.REVISION,
                    target=revision_id,
                )
            },
        )
    ])

    class response:
        code = 200

        def read(self):
            return b'{"info": {"name": "ProjectName"}}'

    mock_urlopen = mocker.patch(
        "swh.storage.migrate_extrinsic_metadata.urlopen",
        return_value=response(),
    )

    assert (pypi_origin_from_filename(
        storage, revision_id, "ProjectName-1.0.0.tar.gz") == origin_url)
    mock_urlopen.assert_not_called()
    assert (pypi_origin_from_filename(
        storage, revision_id, "projectname-1.0.0.tar.gz") == origin_url)
    mock_urlopen.assert_called_once_with(
        "https://pypi.org/pypi/projectname/json/")
Example #10
    def _store_origin_visit(self) -> None:
        """Store origin and visit references. Sets the self.visit references."""
        assert self.origin
        self.storage.origin_add([self.origin])

        assert isinstance(self.visit_type, str)
        self.visit = list(
            self.storage.origin_visit_add([
                OriginVisit(
                    origin=self.origin.url,
                    date=self.visit_date,
                    type=self.visit_type,
                )
            ]))[0]
Example #11
def test_api_lookup_origin_visits(api_client, archive_data, new_origin,
                                  visit_dates, new_snapshots):

    archive_data.origin_add([new_origin])
    for i, visit_date in enumerate(visit_dates):
        origin_visit = archive_data.origin_visit_add([
            OriginVisit(
                origin=new_origin.url,
                date=visit_date,
                type="git",
            )
        ])[0]
        archive_data.snapshot_add([new_snapshots[i]])
        visit_status = OriginVisitStatus(
            origin=new_origin.url,
            visit=origin_visit.visit,
            date=now(),
            status="full",
            snapshot=new_snapshots[i].id,
        )
        archive_data.origin_visit_status_add([visit_status])

    all_visits = list(reversed(get_origin_visits(new_origin.to_dict())))

    for last_visit, expected_visits in (
        (None, all_visits[:2]),
        (all_visits[1]["visit"], all_visits[2:]),
    ):

        url = reverse(
            "api-1-origin-visits",
            url_args={"origin_url": new_origin.url},
            query_params={
                "per_page": 2,
                "last_visit": last_visit
            },
        )

        rv = check_api_get_responses(api_client, url, status_code=200)

        for i in range(len(expected_visits)):
            expected_visits[i] = enrich_origin_visit(
                expected_visits[i],
                with_origin_link=False,
                with_origin_visit_link=True,
                request=rv.wsgi_request,
            )

        assert rv.data == expected_visits
Example #12
def test_api_lookup_origin_visit_latest_with_snapshot(api_client, archive_data,
                                                      new_origin, visit_dates,
                                                      new_snapshots):
    archive_data.origin_add([new_origin])
    visit_dates.sort()
    visit_ids = []
    for i, visit_date in enumerate(visit_dates):
        origin_visit = archive_data.origin_visit_add([
            OriginVisit(
                origin=new_origin.url,
                date=visit_date,
                type="git",
            )
        ])[0]
        visit_ids.append(origin_visit.visit)

    archive_data.snapshot_add([new_snapshots[0]])

    # Add snapshot to the latest visit
    visit_id = visit_ids[-1]
    visit_status = OriginVisitStatus(
        origin=new_origin.url,
        visit=visit_id,
        date=now(),
        status="full",
        snapshot=new_snapshots[0].id,
    )
    archive_data.origin_visit_status_add([visit_status])

    url = reverse(
        "api-1-origin-visit-latest",
        url_args={"origin_url": new_origin.url},
        query_params={"require_snapshot": True},
    )

    rv = check_api_get_responses(api_client, url, status_code=200)

    expected_visit = archive_data.origin_visit_status_get_latest(
        new_origin.url, type="git", require_snapshot=True)

    expected_visit = enrich_origin_visit(
        expected_visit,
        with_origin_link=True,
        with_origin_visit_link=False,
        request=rv.wsgi_request,
    )

    assert rv.data == expected_visit
Example #13
def test_lookup_origin_visit(archive_data, new_origin, visit_dates):
    archive_data.origin_add([new_origin])
    visits = archive_data.origin_visit_add([
        OriginVisit(
            origin=new_origin.url,
            date=ts,
            type="git",
        ) for ts in visit_dates
    ])

    visit = random.choice(visits).visit
    actual_origin_visit = archive.lookup_origin_visit(new_origin.url, visit)

    expected_visit = dict(
        archive_data.origin_visit_get_by(new_origin.url, visit))

    assert actual_origin_visit == expected_visit
Example #14
def test_origin_branches_pagination_with_alias(
    client, archive_data, mocker, new_origin, visit_dates, revisions, existing_release
):
    """
    When a snapshot contains a branch or a release alias, pagination links
    in the branches / releases view should be displayed.
    """
    mocker.patch("swh.web.browse.snapshot_context.PER_PAGE", len(revisions) / 2)
    snp_dict = {"branches": {}, "id": hash_to_bytes(random_sha1())}
    for i in range(len(revisions)):
        branch = "".join(random.choices(string.ascii_lowercase, k=8))
        snp_dict["branches"][branch.encode()] = {
            "target_type": "revision",
            "target": hash_to_bytes(revisions[i]),
        }
    release = "".join(random.choices(string.ascii_lowercase, k=8))
    snp_dict["branches"][b"RELEASE_ALIAS"] = {
        "target_type": "alias",
        "target": release.encode(),
    }
    snp_dict["branches"][release.encode()] = {
        "target_type": "release",
        "target": hash_to_bytes(existing_release),
    }
    archive_data.origin_add([new_origin])
    archive_data.snapshot_add([Snapshot.from_dict(snp_dict)])
    visit = archive_data.origin_visit_add(
        [OriginVisit(origin=new_origin.url, date=visit_dates[0], type="git",)]
    )[0]
    visit_status = OriginVisitStatus(
        origin=new_origin.url,
        visit=visit.visit,
        date=now(),
        status="full",
        snapshot=snp_dict["id"],
    )
    archive_data.origin_visit_status_add([visit_status])

    url = reverse("browse-origin-branches", query_params={"origin_url": new_origin.url})

    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/branches.html"
    )
    assert_contains(resp, '<ul class="pagination')
Example #15
def test_lookup_origin_visits(archive_data, new_origin, visit_dates):
    archive_data.origin_add([new_origin])

    archive_data.origin_visit_add([
        OriginVisit(
            origin=new_origin.url,
            date=ts,
            type="git",
        ) for ts in visit_dates
    ])

    actual_origin_visits = list(
        archive.lookup_origin_visits(new_origin.url, per_page=100))

    expected_visits = archive_data.origin_visit_get(new_origin.url)
    for expected_visit in expected_visits:
        expected_visit["origin"] = new_origin.url

    assert actual_origin_visits == expected_visits
Example #16
def test_api_lookup_origin_visit(api_client, archive_data, new_origin,
                                 visit_dates, new_snapshots):
    archive_data.origin_add([new_origin])
    for i, visit_date in enumerate(visit_dates):
        origin_visit = archive_data.origin_visit_add([
            OriginVisit(
                origin=new_origin.url,
                date=visit_date,
                type="git",
            )
        ])[0]
        visit_id = origin_visit.visit
        archive_data.snapshot_add([new_snapshots[i]])
        visit_status = OriginVisitStatus(
            origin=new_origin.url,
            visit=origin_visit.visit,
            date=visit_date + timedelta(minutes=5),
            status="full",
            snapshot=new_snapshots[i].id,
        )
        archive_data.origin_visit_status_add([visit_status])
        url = reverse(
            "api-1-origin-visit",
            url_args={
                "origin_url": new_origin.url,
                "visit_id": visit_id
            },
        )

        rv = check_api_get_responses(api_client, url, status_code=200)

        expected_visit = archive_data.origin_visit_get_by(
            new_origin.url, visit_id)

        expected_visit = enrich_origin_visit(
            expected_visit,
            with_origin_link=True,
            with_origin_visit_link=False,
            request=rv.wsgi_request,
        )

        assert rv.data == expected_visit
Example #17
def test_pypi_missing_branch(self):
    origin_url = "https://pypi.org/project/abcdef/"
    self.indexer.storage.origin_add([Origin(url=origin_url)])
    visit = self.indexer.storage.origin_visit_add([
        OriginVisit(
            origin=origin_url,
            date=datetime(2019, 2, 27, tzinfo=timezone.utc),
            type="pypi",
        )
    ])[0]
    self.indexer.storage.snapshot_add([SAMPLE_SNAPSHOT])
    visit_status = OriginVisitStatus(
        origin=origin_url,
        visit=visit.visit,
        date=now(),
        status="full",
        snapshot=SAMPLE_SNAPSHOT.id,
    )
    self.indexer.storage.origin_visit_status_add([visit_status])
    self.indexer.run(["https://pypi.org/project/abcdef/"])
    self.assertEqual(self.indexer.results, [])
Example #18
def test_origin_get_latest_visit_status_filter_snapshot(
        swh_storage, sample_data):
    objects = init_storage_with_origin_visits(swh_storage, sample_data)
    origin1, origin2 = objects["origin"]
    _, ov2 = objects["origin_visit"]
    _, _, _, ovs22 = objects["origin_visit_status"]

    # there is no visit with a snapshot yet for that origin
    assert (origin_get_latest_visit_status(
        swh_storage, origin1.url, require_snapshot=True) is None)

    # the visit with "partial" status is selected
    actual_ovs22 = origin_get_latest_visit_status(swh_storage,
                                                  origin2.url,
                                                  require_snapshot=True)
    assert actual_ovs22 == ovs22
    assert actual_ovs22.origin == ov2.origin
    assert actual_ovs22.visit == ov2.visit
    assert actual_ovs22.type == ov2.type

    date_now = now()

    # Add another visit
    swh_storage.origin_visit_add([
        OriginVisit(
            origin=origin2.url,
            date=date_now,
            type=sample_data.type_visit2,
        ),
    ])

    # Requiring the latest visit with a snapshot, we still find the previous visit
    ovs22 = origin_get_latest_visit_status(swh_storage,
                                           origin2.url,
                                           require_snapshot=True)
    assert actual_ovs22 == ovs22
    assert actual_ovs22.origin == ov2.origin
    assert actual_ovs22.visit == ov2.visit
    assert actual_ovs22.type == ov2.type
Example #19
def test_git_partial_snapshot(self):
    """Checks partial snapshots are ignored."""
    origin_url = "https://github.com/SoftwareHeritage/swh-core"
    self.indexer.storage.origin_add([Origin(url=origin_url)])
    visit = self.indexer.storage.origin_visit_add([
        OriginVisit(
            origin=origin_url,
            date=datetime(2019, 2, 27, tzinfo=timezone.utc),
            type="git",
        )
    ])[0]
    self.indexer.storage.snapshot_add([SAMPLE_SNAPSHOT])
    visit_status = OriginVisitStatus(
        origin=origin_url,
        visit=visit.visit,
        date=now(),
        status="partial",
        snapshot=SAMPLE_SNAPSHOT.id,
    )
    self.indexer.storage.origin_visit_status_add([visit_status])
    self.indexer.run([origin_url])
    self.assertEqual(self.indexer.results, [])
Example #20
def test_origin_browse_directory_branch_with_non_resolvable_revision(
    client, archive_data, new_origin, unknown_revision
):
    branch_name = "master"
    snapshot = Snapshot(
        branches={
            branch_name.encode(): SnapshotBranch(
                target=hash_to_bytes(unknown_revision), target_type=TargetType.REVISION,
            )
        }
    )
    archive_data.origin_add([new_origin])
    archive_data.snapshot_add([snapshot])
    visit = archive_data.origin_visit_add(
        [OriginVisit(origin=new_origin.url, date=now(), type="git",)]
    )[0]
    visit_status = OriginVisitStatus(
        origin=new_origin.url,
        visit=visit.visit,
        date=now(),
        status="partial",
        snapshot=snapshot.id,
    )
    archive_data.origin_visit_status_add([visit_status])

    url = reverse(
        "browse-origin-directory",
        query_params={"origin_url": new_origin.url, "branch": branch_name},
    )

    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/directory.html"
    )
    assert_contains(
        resp, f"Revision {unknown_revision } could not be found in the archive."
    )
Example #21
def fill_storage(storage):
    storage.origin_add(ORIGINS)
    storage.directory_add([DIRECTORY, DIRECTORY2])
    storage.revision_add(REVISIONS)
    storage.snapshot_add(SNAPSHOTS)

    for visit, snapshot in zip(ORIGIN_VISITS, SNAPSHOTS):
        assert snapshot.id is not None

        visit = storage.origin_visit_add(
            [OriginVisit(origin=visit["origin"], date=now(), type=visit["type"])]
        )[0]
        visit_status = OriginVisitStatus(
            origin=visit.origin,
            visit=visit.visit,
            date=now(),
            status="full",
            snapshot=snapshot.id,
        )
        storage.origin_visit_status_add([visit_status])

    contents = []
    for (obj_id, content) in OBJ_STORAGE_DATA.items():
        content_hashes = hashutil.MultiHash.from_data(content).digest()
        contents.append(
            Content(
                data=content,
                length=len(content),
                status="visible",
                sha1=hash_to_bytes(obj_id),
                sha1_git=hash_to_bytes(obj_id),
                sha256=content_hashes["sha256"],
                blake2s256=content_hashes["blake2s256"],
            )
        )
    storage.content_add(contents)
Example #22
class StorageData:
    """Data model objects to use within tests."""

    content = Content(
        data=b"42\n",
        length=3,
        sha1=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"),
        sha1_git=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
        sha256=hash_to_bytes(
            "084c799cd551dd1d8d5c5f9a5d593b2e931f5e36122ee5c793c1d08a19839cc0"
        ),
        blake2s256=hash_to_bytes(
            "d5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d"
        ),
        status="visible",
    )
    content2 = Content(
        data=b"4242\n",
        length=5,
        sha1=hash_to_bytes("61c2b3a30496d329e21af70dd2d7e097046d07b7"),
        sha1_git=hash_to_bytes("36fade77193cb6d2bd826161a0979d64c28ab4fa"),
        sha256=hash_to_bytes(
            "859f0b154fdb2d630f45e1ecae4a862915435e663248bb8461d914696fc047cd"
        ),
        blake2s256=hash_to_bytes(
            "849c20fad132b7c2d62c15de310adfe87be94a379941bed295e8141c6219810d"
        ),
        status="visible",
    )
    content3 = Content(
        data=b"424242\n",
        length=7,
        sha1=hash_to_bytes("3e21cc4942a4234c9e5edd8a9cacd1670fe59f13"),
        sha1_git=hash_to_bytes("c932c7649c6dfa4b82327d121215116909eb3bea"),
        sha256=hash_to_bytes(
            "92fb72daf8c6818288a35137b72155f507e5de8d892712ab96277aaed8cf8a36"
        ),
        blake2s256=hash_to_bytes(
            "76d0346f44e5a27f6bafdd9c2befd304aff83780f93121d801ab6a1d4769db11"
        ),
        status="visible",
        ctime=datetime.datetime(2019, 12, 1, tzinfo=datetime.timezone.utc),
    )
    contents: Tuple[Content, ...] = (content, content2, content3)

    skipped_content = SkippedContent(
        length=1024 * 1024 * 200,
        sha1_git=hash_to_bytes("33e45d56f88993aae6a0198013efa80716fd8920"),
        sha1=hash_to_bytes("43e45d56f88993aae6a0198013efa80716fd8920"),
        sha256=hash_to_bytes(
            "7bbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a"
        ),
        blake2s256=hash_to_bytes(
            "ade18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b"
        ),
        reason="Content too long",
        status="absent",
        origin="file:///dev/zero",
    )
    skipped_content2 = SkippedContent(
        length=1024 * 1024 * 300,
        sha1_git=hash_to_bytes("44e45d56f88993aae6a0198013efa80716fd8921"),
        sha1=hash_to_bytes("54e45d56f88993aae6a0198013efa80716fd8920"),
        sha256=hash_to_bytes(
            "8cbd052ab054ef222c1c87be60cd191addedd24cc882d1f5f7f7be61dc61bb3a"
        ),
        blake2s256=hash_to_bytes(
            "9ce18b1adecb33f891ca36664da676e12c772cc193778aac9a137b8dc5834b9b"
        ),
        reason="Content too long",
        status="absent",
    )
    skipped_contents: Tuple[SkippedContent,
                            ...] = (skipped_content, skipped_content2)

    directory5 = Directory(
        id=hash_to_bytes("4b825dc642cb6eb9a060e54bf8d69288fbee4904"),
        entries=(),
    )
    directory = Directory(
        id=hash_to_bytes("5256e856a0a0898966d6ba14feb4388b8b82d302"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=content.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"bar\xc3",
                type="dir",
                target=directory5.id,
                perms=from_disk.DentryPerms.directory,
            ),
        ], ),
    )
    directory2 = Directory(
        id=hash_to_bytes("8505808532953da7d2581741f01b29c04b1cb9ab"),
        entries=tuple([
            DirectoryEntry(
                name=b"oof",
                type="file",
                target=content2.sha1_git,
                perms=from_disk.DentryPerms.content,
            )
        ], ),
    )
    directory3 = Directory(
        id=hash_to_bytes("13089e6e544f78df7c9a40a3059050d10dee686a"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=content.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"subdir",
                type="dir",
                target=directory.id,
                perms=from_disk.DentryPerms.directory,
            ),
            DirectoryEntry(
                name=b"hello",
                type="file",
                target=content2.sha1_git,
                perms=from_disk.DentryPerms.content,
            ),
        ], ),
    )
    directory4 = Directory(
        id=hash_to_bytes("cd5dfd9c09d9e99ed123bc7937a0d5fddc3cd531"),
        entries=tuple([
            DirectoryEntry(
                name=b"subdir1",
                type="dir",
                target=directory3.id,
                perms=from_disk.DentryPerms.directory,
            )
        ], ),
    )

    directory6 = Directory(
        id=hash_to_bytes("afa0105cfcaa14fdbacee344e96659170bb1bda5"),
        entries=tuple([
            DirectoryEntry(
                name=b"foo",
                type="file",
                target=b"\x00" * 20,
                perms=from_disk.DentryPerms.content,
            ),
            DirectoryEntry(
                name=b"bar",
                type="dir",
                target=b"\x01" * 20,
                perms=from_disk.DentryPerms.directory,
            ),
        ], ),
        raw_manifest=(
            b"tree 61\x00"
            b"100644 foo\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"  # noqa
            b"40000 bar\x00\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"  # noqa
        ),
    )

    directories: Tuple[Directory, ...] = (
        directory2,
        directory,
        directory3,
        directory4,
        directory5,
        directory6,
    )

    revision = Revision(
        id=hash_to_bytes("01a7114f36fddd5ef2511b2cadda237a68adbb12"),
        message=b"hello",
        author=Person(
            name=b"Nicolas Dandrimont",
            email=b"*****@*****.**",
            fullname=b"Nicolas Dandrimont <*****@*****.**> ",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0200",
        ),
        committer=Person(
            name=b"St\xc3fano Zacchiroli",
            email=b"*****@*****.**",
            fullname=b"St\xc3fano Zacchiroli <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1123456789, microseconds=0),
            offset_bytes=b"+0200",
        ),
        parents=(),
        type=RevisionType.GIT,
        directory=directory.id,
        metadata={
            "checksums": {
                "sha1": "tarball-sha1",
                "sha256": "tarball-sha256",
            },
            "signed-off-by": "some-dude",
        },
        extra_headers=(
            (b"gpgsig", b"test123"),
            (b"mergetag", b"foo\\bar"),
            (b"mergetag", b"\x22\xaf\x89\x80\x01\x00"),
        ),
        synthetic=True,
    )
    revision2 = Revision(
        id=hash_to_bytes("a646dd94c912829659b22a1e7e143d2fa5ebde1b"),
        message=b"hello again",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1123456789,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([revision.id]),
        type=RevisionType.GIT,
        directory=directory2.id,
        metadata=None,
        extra_headers=(),
        synthetic=False,
    )
    revision3 = Revision(
        id=hash_to_bytes("beb2844dff30658e27573cb46eb55980e974b391"),
        message=b"a simple revision with no parents this time",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1127351742,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([revision.id, revision2.id]),
        type=RevisionType.GIT,
        directory=directory2.id,
        metadata=None,
        extra_headers=(),
        synthetic=True,
    )
    revision4 = Revision(
        id=hash_to_bytes("ae860aec43700c7f5a295e2ef47e2ae41b535dfe"),
        message=b"parent of self.revision2",
        author=Person(
            name=b"me",
            email=b"*****@*****.**",
            fullname=b"me <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"committer-dude",
            email=b"*****@*****.**",
            fullname=b"committer-dude <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1244567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        parents=tuple([revision3.id]),
        type=RevisionType.GIT,
        directory=directory.id,
        metadata=None,
        extra_headers=(),
        synthetic=False,
    )
    git_revisions: Tuple[Revision,
                         ...] = (revision, revision2, revision3, revision4)

    hg_revision = Revision(
        id=hash_to_bytes("951c9503541e7beaf002d7aebf2abd1629084c68"),
        message=b"hello",
        author=Person(
            name=b"Nicolas Dandrimont",
            email=b"*****@*****.**",
            fullname=b"Nicolas Dandrimont <*****@*****.**> ",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0200",
        ),
        committer=Person(
            name=b"St\xc3fano Zacchiroli",
            email=b"*****@*****.**",
            fullname=b"St\xc3fano Zacchiroli <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1123456789, microseconds=0),
            offset_bytes=b"+0200",
        ),
        parents=(),
        type=RevisionType.MERCURIAL,
        directory=directory.id,
        metadata={
            "checksums": {
                "sha1": "tarball-sha1",
                "sha256": "tarball-sha256",
            },
            "signed-off-by": "some-dude",
            "node": "a316dfb434af2b451c1f393496b7eaeda343f543",
        },
        extra_headers=(),
        synthetic=True,
    )
    hg_revision2 = Revision(
        id=hash_to_bytes("df4afb063236300eb13b96a0d7fff03f7b7cbbaf"),
        message=b"hello again",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1123456789,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([hg_revision.id]),
        type=RevisionType.MERCURIAL,
        directory=directory2.id,
        metadata=None,
        extra_headers=(
            (b"node",
             hash_to_bytes("fa1b7c84a9b40605b67653700f268349a6d6aca1")), ),
        synthetic=False,
    )
    hg_revision3 = Revision(
        id=hash_to_bytes("84d8e7081b47ebb88cad9fa1f25de5f330872a37"),
        message=b"a simple revision with no parents this time",
        author=Person(
            name=b"Roberto Dicosmo",
            email=b"*****@*****.**",
            fullname=b"Roberto Dicosmo <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1127351742,
                microseconds=220000,
            ),
            offset_bytes=b"+0000",
        ),
        parents=tuple([hg_revision.id, hg_revision2.id]),
        type=RevisionType.MERCURIAL,
        directory=directory2.id,
        metadata=None,
        extra_headers=(
            (b"node",
             hash_to_bytes("7f294a01c49065a90b3fe8b4ad49f08ce9656ef6")), ),
        synthetic=True,
    )
    hg_revision4 = Revision(
        id=hash_to_bytes("4683324ba26dfe941a72cc7552e86eaaf7c27fe3"),
        message=b"parent of self.revision2",
        author=Person(
            name=b"me",
            email=b"*****@*****.**",
            fullname=b"me <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1234567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        committer=Person(
            name=b"committer-dude",
            email=b"*****@*****.**",
            fullname=b"committer-dude <*****@*****.**>",
        ),
        committer_date=TimestampWithTimezone(
            timestamp=Timestamp(
                seconds=1244567843,
                microseconds=220000,
            ),
            offset_bytes=b"-1200",
        ),
        parents=tuple([hg_revision3.id]),
        type=RevisionType.MERCURIAL,
        directory=directory.id,
        metadata=None,
        extra_headers=(
            (b"node",
             hash_to_bytes("f4160af0485c85823d9e829bae2c00b00a2e6297")), ),
        synthetic=False,
    )
    hg_revisions: Tuple[Revision, ...] = (
        hg_revision,
        hg_revision2,
        hg_revision3,
        hg_revision4,
    )
    revisions: Tuple[Revision, ...] = git_revisions + hg_revisions

    origins: Tuple[Origin, ...] = (
        Origin(url="https://github.com/user1/repo1"),
        Origin(url="https://github.com/user2/repo1"),
        Origin(url="https://github.com/user3/repo1"),
        Origin(url="https://gitlab.com/user1/repo1"),
        Origin(url="https://gitlab.com/user2/repo1"),
        Origin(url="https://forge.softwareheritage.org/source/repo1"),
        Origin(url="https://example.рф/🏛️.txt"),
    )
    origin, origin2 = origins[:2]

    metadata_authority = MetadataAuthority(
        type=MetadataAuthorityType.DEPOSIT_CLIENT,
        url="http://hal.inria.example.com/",
    )
    metadata_authority2 = MetadataAuthority(
        type=MetadataAuthorityType.REGISTRY,
        url="http://wikidata.example.com/",
    )
    authorities: Tuple[MetadataAuthority, ...] = (
        metadata_authority,
        metadata_authority2,
    )

    metadata_fetcher = MetadataFetcher(
        name="swh-deposit",
        version="0.0.1",
    )
    metadata_fetcher2 = MetadataFetcher(
        name="swh-example",
        version="0.0.1",
    )
    fetchers: Tuple[MetadataFetcher,
                    ...] = (metadata_fetcher, metadata_fetcher2)

    date_visit1 = datetime.datetime(2015,
                                    1,
                                    1,
                                    23,
                                    0,
                                    0,
                                    tzinfo=datetime.timezone.utc)
    date_visit2 = datetime.datetime(2017,
                                    1,
                                    1,
                                    23,
                                    0,
                                    0,
                                    tzinfo=datetime.timezone.utc)
    date_visit3 = datetime.datetime(2018,
                                    1,
                                    1,
                                    23,
                                    0,
                                    0,
                                    tzinfo=datetime.timezone.utc)

    type_visit1 = "git"
    type_visit2 = "hg"
    type_visit3 = "deb"

    origin_visit = OriginVisit(
        origin=origin.url,
        visit=1,
        date=date_visit1,
        type=type_visit1,
    )
    origin_visit2 = OriginVisit(
        origin=origin.url,
        visit=2,
        date=date_visit2,
        type=type_visit1,
    )
    origin_visit3 = OriginVisit(
        origin=origin2.url,
        visit=1,
        date=date_visit1,
        type=type_visit2,
    )
    origin_visits: Tuple[OriginVisit, ...] = (
        origin_visit,
        origin_visit2,
        origin_visit3,
    )

    release = Release(
        id=hash_to_bytes("f7f222093a18ec60d781070abec4a630c850b837"),
        name=b"v0.0.1",
        author=Person(
            name=b"olasd",
            email=b"*****@*****.**",
            fullname=b"olasd <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1234567890, microseconds=0),
            offset_bytes=b"+0042",
        ),
        target=revision.id,
        target_type=ObjectType.REVISION,
        message=b"synthetic release",
        synthetic=True,
    )
    release2 = Release(
        id=hash_to_bytes("db81a26783a3f4a9db07b4759ffc37621f159bb2"),
        name=b"v0.0.2",
        author=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1634366813, microseconds=0),
            offset_bytes=b"-0200",
        ),
        target=revision2.id,
        target_type=ObjectType.REVISION,
        message=b"v0.0.2\nMisc performance improvements + bug fixes",
        synthetic=False,
    )
    release3 = Release(
        id=hash_to_bytes("1c5d42e603ce2eea44917fadca76c78bad76aeb9"),
        name=b"v0.0.2",
        author=Person(
            name=b"tony",
            email=b"*****@*****.**",
            fullname=b"tony <*****@*****.**>",
        ),
        date=TimestampWithTimezone(
            timestamp=Timestamp(seconds=1634366813, microseconds=0),
            offset_bytes=b"-0200",
        ),
        target=revision3.id,
        target_type=ObjectType.REVISION,
        message=b"yet another synthetic release",
        synthetic=True,
    )

    releases: Tuple[Release, ...] = (release, release2, release3)

    snapshot = Snapshot(
        id=hash_to_bytes("9b922e6d8d5b803c1582aabe5525b7b91150788e"),
        branches={
            b"master":
            SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
        },
    )
    empty_snapshot = Snapshot(
        id=hash_to_bytes("1a8893e6a86f444e8be8e7bda6cb34fb1735a00e"),
        branches={},
    )
    complete_snapshot = Snapshot(
        id=hash_to_bytes("db99fda25b43dc5cd90625ee4b0744751799c917"),
        branches={
            b"directory":
            SnapshotBranch(
                target=directory.id,
                target_type=TargetType.DIRECTORY,
            ),
            b"directory2":
            SnapshotBranch(
                target=directory2.id,
                target_type=TargetType.DIRECTORY,
            ),
            b"content":
            SnapshotBranch(
                target=content.sha1_git,
                target_type=TargetType.CONTENT,
            ),
            b"alias":
            SnapshotBranch(
                target=b"revision",
                target_type=TargetType.ALIAS,
            ),
            b"revision":
            SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
            b"release":
            SnapshotBranch(
                target=release.id,
                target_type=TargetType.RELEASE,
            ),
            b"snapshot":
            SnapshotBranch(
                target=empty_snapshot.id,
                target_type=TargetType.SNAPSHOT,
            ),
            b"dangling":
            None,
        },
    )

    snapshots: Tuple[Snapshot,
                     ...] = (snapshot, empty_snapshot, complete_snapshot)

    content_metadata1 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        origin=origin.url,
        discovery_date=datetime.datetime(2015,
                                         1,
                                         1,
                                         21,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="json",
        metadata=b'{"foo": "bar"}',
    )
    content_metadata2 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        origin=origin2.url,
        discovery_date=datetime.datetime(2017,
                                         1,
                                         1,
                                         22,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=metadata_authority,
        fetcher=metadata_fetcher,
        format="yaml",
        metadata=b"foo: bar",
    )
    content_metadata3 = RawExtrinsicMetadata(
        target=ExtendedSWHID(object_type=ExtendedObjectType.CONTENT,
                             object_id=content.sha1_git),
        discovery_date=datetime.datetime(2017,
                                         1,
                                         1,
                                         22,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority2, metadata=None),
        fetcher=attr.evolve(metadata_fetcher2, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
        origin=origin.url,
        visit=42,
        snapshot=snapshot.swhid(),
        release=release.swhid(),
        revision=revision.swhid(),
        directory=directory.swhid(),
        path=b"/foo/bar",
    )

    content_metadata: Tuple[RawExtrinsicMetadata, ...] = (
        content_metadata1,
        content_metadata2,
        content_metadata3,
    )

    origin_metadata1 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(2015,
                                         1,
                                         1,
                                         21,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority, metadata=None),
        fetcher=attr.evolve(metadata_fetcher, metadata=None),
        format="json",
        metadata=b'{"foo": "bar"}',
    )
    origin_metadata2 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(2017,
                                         1,
                                         1,
                                         22,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority, metadata=None),
        fetcher=attr.evolve(metadata_fetcher, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
    )
    origin_metadata3 = RawExtrinsicMetadata(
        target=Origin(origin.url).swhid(),
        discovery_date=datetime.datetime(2017,
                                         1,
                                         1,
                                         22,
                                         0,
                                         0,
                                         tzinfo=datetime.timezone.utc),
        authority=attr.evolve(metadata_authority2, metadata=None),
        fetcher=attr.evolve(metadata_fetcher2, metadata=None),
        format="yaml",
        metadata=b"foo: bar",
    )

    origin_metadata: Tuple[RawExtrinsicMetadata, ...] = (
        origin_metadata1,
        origin_metadata2,
        origin_metadata3,
    )

    extid1 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.REVISION,
                         object_id=revision.id),
        extid_type="git",
        extid=revision.id,
    )

    extid2 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.REVISION,
                         object_id=hg_revision.id),
        extid_type="mercurial",
        extid=hash_to_bytes("a316dfb434af2b451c1f393496b7eaeda343f543"),
    )

    extid3 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.DIRECTORY,
                         object_id=directory.id),
        extid_type="directory",
        extid=b"something",
    )
    extid4 = ExtID(
        target=CoreSWHID(object_type=SwhidObjectType.DIRECTORY,
                         object_id=directory2.id),
        extid_type="directory",
        extid=b"something",
        extid_version=2,
    )

    extids: Tuple[ExtID, ...] = (
        extid1,
        extid2,
        extid3,
        extid4,
    )
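A short usage sketch for the fixture class above, assuming an in-memory storage; the model objects it carries can be inserted directly:

from swh.storage import get_storage

storage = get_storage("memory")
data = StorageData()

storage.content_add(list(data.contents))
storage.origin_add(list(data.origins))
storage.revision_add(list(data.revisions))
storage.release_add(list(data.releases))
storage.snapshot_add(list(data.snapshots))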
Example #23
    Person,
    Release,
    Revision,
    RevisionType,
    Snapshot,
    SnapshotBranch,
    TargetType,
    Timestamp,
    TimestampWithTimezone,
)

hash_hex = "43e45d56f88993aae6a0198013efa80716fd8920"

ORIGIN_VISIT = OriginVisit(
    origin="some-url",
    visit=1,
    date=datetime.datetime.now(tz=datetime.timezone.utc),
    type="archive",
)

ORIGIN_VISIT_STATUS = OriginVisitStatus(
    origin="some-url",
    visit=1,
    type="archive",
    date=datetime.datetime.now(tz=datetime.timezone.utc),
    status="full",
    snapshot=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
    metadata=None,
)

CONTENT = Content(
    data=b"42\n",
Example #24
def test_load_upgrade_from_revision_extids(caplog):
    """Tests that, when loading incrementally based on a snapshot made by an old
    version of the loader, the loader will convert revisions to releases
    and add them to the storage.

    Also checks that, if an extid exists pointing to a non-existent revision
    (which should never happen, but you never know...), the release is loaded from
    scratch."""

    storage = get_storage("memory")

    origin = "http://example.org"
    dir1_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                           object_id=b"d" * 20)
    dir2_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                           object_id=b"e" * 20)

    date = TimestampWithTimezone.from_datetime(
        datetime.datetime.now(tz=datetime.timezone.utc))
    person = Person.from_fullname(b"Jane Doe <*****@*****.**>")

    rev1 = Revision(
        message=b"blah",
        author=person,
        date=date,
        committer=person,
        committer_date=date,
        directory=dir1_swhid.object_id,
        type=RevisionType.TAR,
        synthetic=True,
    )

    rel1 = Release(
        name=b"v1.0",
        message=b"blah\n",
        author=person,
        date=date,
        target=dir1_swhid.object_id,
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
    )

    rev1_swhid = rev1.swhid()
    rel1_swhid = rel1.swhid()
    rev2_swhid = CoreSWHID(object_type=ObjectType.REVISION,
                           object_id=b"b" * 20)
    rel2_swhid = CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"c" * 20)

    # Results of a previous load
    storage.extid_add([
        ExtID("extid-type1", b"extid-of-v1.0", rev1_swhid, 0),
        ExtID("extid-type1", b"extid-of-v2.0", rev2_swhid, 0),
    ])
    storage.revision_add([rev1])
    last_snapshot = Snapshot(
        branches={
            b"v1.0":
            SnapshotBranch(target_type=TargetType.REVISION,
                           target=rev1_swhid.object_id),
            b"v2.0":
            SnapshotBranch(target_type=TargetType.REVISION,
                           target=rev2_swhid.object_id),
        })
    storage.snapshot_add([last_snapshot])
    date = datetime.datetime.now(tz=datetime.timezone.utc)
    storage.origin_add([Origin(url=origin)])
    storage.origin_visit_add([
        OriginVisit(origin="http://example.org",
                    visit=1,
                    date=date,
                    type="tar")
    ])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin,
            visit=1,
            status="full",
            date=date,
            snapshot=last_snapshot.id,
        )
    ])

    loader = StubPackageLoader(storage, "http://example.org")
    patch.object(
        loader,
        "_load_release",
        return_value=(rel2_swhid.object_id, dir2_swhid.object_id),
        autospec=True,
    ).start()
    patch.object(
        loader,
        "get_versions",
        return_value=["v1.0", "v2.0", "v3.0"],
        autospec=True,
    ).start()

    caplog.set_level(logging.ERROR)

    loader.load()

    assert len(caplog.records) == 1
    (record, ) = caplog.records
    assert record.levelname == "ERROR"
    assert "Failed to upgrade branch branch-v2.0" in record.message

    assert loader._load_release.mock_calls == [
        # v1.0: not loaded because there is already a revision matching it
        # v2.0: loaded, as the revision is missing from the storage even though there
        #       is an extid
        call(StubPackageInfo(origin, "example-v2.0.tar", "v2.0"),
             Origin(url=origin)),
        # v3.0: loaded (did not exist yet)
        call(StubPackageInfo(origin, "example-v3.0.tar", "v3.0"),
             Origin(url=origin)),
    ]

    snapshot = Snapshot(
        branches={
            b"branch-v1.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel1_swhid.object_id),
            b"branch-v2.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel2_swhid.object_id),
            b"branch-v3.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel2_swhid.object_id),
        })
    assert snapshot_get_latest(storage, origin) == snapshot

    extids = storage.extid_get_from_target(
        ObjectType.RELEASE,
        [
            rel1_swhid.object_id,
            rel2_swhid.object_id,
        ],
    )

    assert set(extids) == {
        ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid),
        ExtID("extid-type1", b"extid-of-v2.0", rel2_swhid),
        ExtID("extid-type2", b"extid-of-v3.0", rel2_swhid),
    }
Example #25
def test_load_extids() -> None:
    """Checks PackageLoader.load() skips iff it should, and writes (only)
    the new ExtIDs"""
    storage = get_storage("memory")

    dir_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                          object_id=b"e" * 20)

    rels = [
        Release(
            name=f"v{i}.0".encode(),
            message=b"blah\n",
            target=dir_swhid.object_id,
            target_type=ModelObjectType.DIRECTORY,
            synthetic=True,
        ) for i in (1, 2, 3, 4)
    ]
    storage.release_add(rels[0:3])

    origin = "http://example.org"
    rel1_swhid = rels[0].swhid()
    rel2_swhid = rels[1].swhid()
    rel3_swhid = rels[2].swhid()
    rel4_swhid = rels[3].swhid()

    # Results of a previous load
    storage.extid_add([
        ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid),
        ExtID("extid-type2", b"extid-of-v2.0", rel2_swhid),
    ])
    last_snapshot = Snapshot(
        branches={
            b"v1.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel1_swhid.object_id),
            b"v2.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel2_swhid.object_id),
            b"v3.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel3_swhid.object_id),
        })
    storage.snapshot_add([last_snapshot])
    date = datetime.datetime.now(tz=datetime.timezone.utc)
    storage.origin_add([Origin(url=origin)])
    storage.origin_visit_add([
        OriginVisit(origin="http://example.org",
                    visit=1,
                    date=date,
                    type="tar")
    ])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin,
            visit=1,
            status="full",
            date=date,
            snapshot=last_snapshot.id,
        )
    ])

    loader = StubPackageLoader(storage, origin)
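    # Any version the loader decides to (re)load will resolve to rel4 / dir_swhid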
    patch.object(
        loader,
        "_load_release",
        return_value=(rel4_swhid.object_id, dir_swhid.object_id),
        autospec=True,
    ).start()

    loader.load()

    assert loader._load_release.mock_calls == [  # type: ignore
        # v1.0: not loaded, because its (extid_type, extid, release) triple is
        #       already in the storage.
        # v2.0: loaded, because the stored extid has the same value but a
        #       different type, so it does not match
        call(
            StubPackageInfo(origin, "example-v2.0.tar", "v2.0"),
            Origin(url=origin),
        ),
        # v3.0: loaded despite having an (extid_type, extid) in storage, because
        #       the target of the extid is not in the previous snapshot
        call(
            StubPackageInfo(origin, "example-v3.0.tar", "v3.0"),
            Origin(url=origin),
        ),
        # v4.0: loaded, because its extid is not in the storage at all
        call(
            StubPackageInfo(origin, "example-v4.0.tar", "v4.0"),
            Origin(url=origin),
        ),
    ]

    # then check the snapshot has all the branches.
    # versions 2.0 to 4.0 all point to rel4_swhid (instead of the value of the last
    # snapshot), because they had to be loaded (mismatched extid), and the mocked
    # _load_release always returns rel4_swhid.
    snapshot = Snapshot(
        branches={
            b"branch-v1.0": SnapshotBranch(
                target_type=TargetType.RELEASE, target=rel1_swhid.object_id
            ),
            b"branch-v2.0": SnapshotBranch(
                target_type=TargetType.RELEASE, target=rel4_swhid.object_id
            ),
            b"branch-v3.0": SnapshotBranch(
                target_type=TargetType.RELEASE, target=rel4_swhid.object_id
            ),
            b"branch-v4.0": SnapshotBranch(
                target_type=TargetType.RELEASE, target=rel4_swhid.object_id
            ),
        }
    )
    assert snapshot_get_latest(storage, origin) == snapshot

    extids = storage.extid_get_from_target(
        ObjectType.RELEASE,
        [
            rel1_swhid.object_id,
            rel2_swhid.object_id,
            rel3_swhid.object_id,
            rel4_swhid.object_id,
        ],
    )

    assert set(extids) == {
        # What we inserted at the beginning of the test:
        ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid),
        ExtID("extid-type2", b"extid-of-v2.0", rel2_swhid),
        # Added by the loader:
        ExtID("extid-type1", b"extid-of-v2.0", rel4_swhid),
        ExtID("extid-type2", b"extid-of-v3.0", rel4_swhid),
        ExtID("extid-type2", b"extid-of-v4.0", rel4_swhid),
    }
Example #26
0
def test_snapshot_get_latest(swh_storage, sample_data):
    origin = sample_data.origin
    swh_storage.origin_add([origin])

    visit1, visit2 = sample_data.origin_visits[:2]
    assert visit1.origin == origin.url

    swh_storage.origin_visit_add([visit1])
    ov1 = swh_storage.origin_visit_get_latest(origin.url)

    # Add snapshot to visit1, latest snapshot = visit 1 snapshot
    complete_snapshot = sample_data.snapshots[2]
    swh_storage.snapshot_add([complete_snapshot])

    swh_storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin.url,
            visit=ov1.visit,
            date=visit2.date,
            status="partial",
            snapshot=None,
        )
    ])
    assert visit1.date < visit2.date

    # no snapshot associated to the visit, so None
    actual_snapshot = snapshot_get_latest(swh_storage,
                                          origin.url,
                                          allowed_statuses=["partial"])
    assert actual_snapshot is None

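    # Record a later, "full" status that does reference the snapshot;
    # it should now be returned as the latest one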
    date_now = now()
    assert visit2.date < date_now
    swh_storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=ov1.origin,
            visit=ov1.visit,
            date=date_now,
            type=ov1.type,
            status="full",
            snapshot=complete_snapshot.id,
        )
    ])

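    # A brand new visit with no status yet must not hide the snapshot above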
    swh_storage.origin_visit_add(
        [OriginVisit(
            origin=origin.url,
            date=now(),
            type=visit1.type,
        )])

    actual_snapshot = snapshot_get_latest(swh_storage, origin.url)
    assert actual_snapshot is not None
    assert actual_snapshot == complete_snapshot

    actual_snapshot = snapshot_get_latest(swh_storage,
                                          origin.url,
                                          branches_count=1)
    assert actual_snapshot is not None
    assert actual_snapshot.id == complete_snapshot.id
    assert len(actual_snapshot.branches.values()) == 1

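    # branches_count must be a positive integer; anything else is rejected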
    with pytest.raises(ValueError,
                       match="branches_count must be a positive integer"):
        snapshot_get_latest(swh_storage,
                            origin.url,
                            branches_count="something-wrong")
Example #27
0
def test_pypi_good_origin():
    """Tests loading a revision whose origin we can find"""

    source_original_artifact = {
        "url": "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz",
        "date": "2014-05-07T22:03:00",
        "sha1": "3289269f75b4111dd00eaea53e00330db9a1db12",
        "size": 46644,
        "sha256": "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824",
        "filename": "PyPDFLite-0.1.32.tar.gz",
        "sha1_git": "1e5c38014731242cfa8594839bcba8a0c4e158c5",
        "blake2s256": "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385",
        "archive_type": "tar",
    }

    dest_original_artifacts = [
        {
            "url": "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz",
            "filename": "PyPDFLite-0.1.32.tar.gz",
            "archive_type": "tar",
            "length": 46644,
            "checksums": {
                "sha1": "3289269f75b4111dd00eaea53e00330db9a1db12",
                "sha256": "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824",
                "sha1_git": "1e5c38014731242cfa8594839bcba8a0c4e158c5",
                "blake2s256": "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385",
            },
        }
    ]

    revision_id = b"N\xa9\x91|\xdfS\xcd\x13SJ\x04.N\xb3x{\x86\xc84\xd2"
    row = {
        "id": revision_id,
        "directory": DIRECTORY_ID,
        "date": datetime.datetime(2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc),
        "committer_date": datetime.datetime(2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc),
        "type": "tar",
        "message": b"0.1.32",
        "metadata": {"original_artifact": source_original_artifact},
    }

    origin_url = "https://pypi.org/project/PyPDFLite/"

    storage = get_storage("memory")

    snapshot_id = b"42" * 10
    storage.origin_add([Origin(url=origin_url)])
    storage.origin_visit_add(
        [OriginVisit(origin=origin_url, visit=1, date=now(), type="pypi")])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin_url,
            visit=1,
            date=now(),
            status="partial",
            snapshot=snapshot_id,
        )
    ])
    storage.snapshot_add([
        Snapshot(
            id=snapshot_id,
            branches={
                b"foo": SnapshotBranch(
                    target_type=TargetType.REVISION,
                    target=revision_id,
                )
            },
        )
    ])
    storage.metadata_authority_add([
        attr.evolve(PYPI_AUTHORITY, metadata={}),
        attr.evolve(SWH_AUTHORITY, metadata={}),
    ])
    storage.metadata_fetcher_add([FETCHER])
    deposit_cur = None
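    # Replay the revision row: its original_artifact metadata should be
    # reattached to the directory, under the SWH authority only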
    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)

    revision_swhid = CoreSWHID.from_string(
        "swh:1:rev:4ea9917cdf53cd13534a042e4eb3787b86c834d2")

    assert storage.raw_extrinsic_metadata_get(
        DIRECTORY_SWHID,
        authority=PYPI_AUTHORITY,
    ) == PagedResult(
        results=[],
        next_page_token=None,
    )
    assert storage.raw_extrinsic_metadata_get(
        DIRECTORY_SWHID,
        authority=SWH_AUTHORITY,
    ) == PagedResult(
        results=[
            RawExtrinsicMetadata(
                target=DIRECTORY_SWHID,
                discovery_date=datetime.datetime(
                    2014,
                    5,
                    7,
                    22,
                    3,
                    tzinfo=datetime.timezone.utc,
                ),
                authority=SWH_AUTHORITY,
                fetcher=FETCHER,
                format="original-artifacts-json",
                metadata=json.dumps(dest_original_artifacts).encode(),
                origin=origin_url,
                revision=revision_swhid,
            ),
        ],
        next_page_token=None,
    )
Example #28
0
def test_sub_directory_view_origin_context(client, archive_data,
                                           empty_directory, person, date):
    origin_url = "test_sub_directory_view_origin_context"
    subdir = Directory(entries=(
        DirectoryEntry(
            name=b"foo",
            type="dir",
            target=hash_to_bytes(empty_directory),
            perms=DentryPerms.directory,
        ),
        DirectoryEntry(
            name=b"bar",
            type="dir",
            target=hash_to_bytes(empty_directory),
            perms=DentryPerms.directory,
        ),
    ))

    parentdir = Directory(
        entries=(
            DirectoryEntry(
                name=b"baz",
                type="dir",
                target=subdir.id,
                perms=DentryPerms.directory,
            ),
        )
    )
    archive_data.directory_add([subdir, parentdir])

    revision = Revision(
        directory=parentdir.id,
        author=person,
        committer=person,
        message=b"commit message",
        date=TimestampWithTimezone.from_datetime(date),
        committer_date=TimestampWithTimezone.from_datetime(date),
        synthetic=False,
        type=RevisionType.GIT,
    )
    archive_data.revision_add([revision])

    snapshot = Snapshot(
        branches={
            b"HEAD": SnapshotBranch(
                target="refs/head/master".encode(),
                target_type=TargetType.ALIAS,
            ),
            b"refs/head/master": SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
        }
    )
    archive_data.snapshot_add([snapshot])

    archive_data.origin_add([Origin(url=origin_url)])
    visit_date = now()
    visit = OriginVisit(origin=origin_url, date=visit_date, type="git")
    visit = archive_data.origin_visit_add([visit])[0]
    visit_status = OriginVisitStatus(
        origin=origin_url,
        visit=visit.visit,
        date=visit_date,
        status="full",
        snapshot=snapshot.id,
    )
    archive_data.origin_visit_status_add([visit_status])

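    # parentdir has a single entry ("baz"); check its view in the origin context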
    dir_content = archive_data.directory_ls(hash_to_hex(parentdir.id))
    subdir = dir_content[0]
    subdir_content = archive_data.directory_ls(subdir["target"])
    _directory_view_checks(
        client,
        hash_to_hex(parentdir.id),
        subdir_content,
        subdir["name"],
        origin_url,
        hash_to_hex(snapshot.id),
        hash_to_hex(revision.id),
    )
Example #29
0
    def test_load_incremental_from(
        self,
        parent_snapshot,
        previous_snapshot,
        expected_git_known_refs_percent,
        mocker,
    ):
        """Snapshot of parent origin has all branches, but previous snapshot was
        empty."""
        statsd_report = mocker.patch.object(self.loader.statsd, "_report")

        now = datetime.datetime.now(tz=datetime.timezone.utc)

        self.loader.storage.snapshot_add([parent_snapshot, previous_snapshot])
        self.loader.storage.origin_add(
            [Origin(url=f"base://{self.repo_url}"),
             Origin(url=self.repo_url)])
        self.loader.storage.origin_visit_add([
            OriginVisit(
                origin=f"base://{self.repo_url}",
                visit=42,
                date=now + datetime.timedelta(seconds=1),
                type="git",
            ),
            OriginVisit(
                origin=self.repo_url,
                visit=42,
                date=now + datetime.timedelta(seconds=1),
                type="git",
            ),
        ])
        self.loader.storage.origin_visit_status_add([
            OriginVisitStatus(
                origin=f"base://{self.repo_url}",
                visit=42,
                type="git",
                snapshot=parent_snapshot.id,
                date=now,
                status="full",
            ),
            OriginVisitStatus(
                origin=self.repo_url,
                visit=42,
                type="git",
                snapshot=previous_snapshot.id,
                date=now,
                status="full",
            ),
        ])
        self.loader.storage.flush()

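        # Run the incremental load: with an empty previous snapshot, the known
        # refs are expected to come from the parent origin's snapshot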
        res = self.loader.load()
        assert res == {"status": "eventful"}

        self.fetcher_cls.assert_called_once_with(
            credentials={},
            lister_name="fake-lister",
            lister_instance_name="",
            origin=Origin(url=self.repo_url),
        )
        self.fetcher.get_parent_origins.assert_called_once_with()

        # First tries the same origin
        assert self.loader.storage.origin_visit_get_latest.mock_calls == [
            call(
                self.repo_url,
                allowed_statuses=None,
                require_snapshot=True,
                type=None,
            ),
            # As it does not already have a snapshot, fall back to the parent origin
            call(
                f"base://{self.repo_url}",
                allowed_statuses=None,
                require_snapshot=True,
                type=None,
            ),
        ]

        assert self.loader.statsd.constant_tags == {
            "visit_type": "git",
            "incremental_enabled": True,
            "has_parent_snapshot": True,
            "has_previous_snapshot": True,
            "has_parent_origins": True,
        }
        assert [
            c for c in statsd_report.mock_calls if c[1][0].startswith("git_")
        ] == [
            call("git_total", "c", 1, {}, 1),
            call("git_ignored_refs_percent", "h", 0.0, {}, 1),
            call("git_known_refs_percent", "h",
                 expected_git_known_refs_percent, {}, 1),
        ]
Example #30
0
            b"type commit\n"
            b"tag v0.0.1\n"
            b"tagger foo 1234567890 +200"  # missing leading 0 for timezone
            b"\n\nfoo"),
    ),
]

ORIGINS = [
    Origin(url="https://somewhere.org/den/fox"),
    Origin(url="https://overtherainbow.org/fox/den"),
]

ORIGIN_VISITS = [
    OriginVisit(
        origin=ORIGINS[0].url,
        date=datetime.datetime(2013, 5, 7, 4, 20, 39, 369271, tzinfo=UTC),
        visit=1,
        type="git",
    ),
    OriginVisit(
        origin=ORIGINS[1].url,
        date=datetime.datetime(2014, 11, 27, 17, 20, 39, tzinfo=UTC),
        visit=1,
        type="hg",
    ),
    OriginVisit(
        origin=ORIGINS[0].url,
        date=datetime.datetime(2018, 11, 27, 17, 20, 39, tzinfo=UTC),
        visit=2,
        type="git",
    ),
    OriginVisit(