def test_visit_and_snapshot_get_from_revision(swh_storage, sample_data):
    origin = sample_data.origin
    swh_storage.origin_add([origin])

    date_visit2 = now()
    visit1, visit2 = sample_data.origin_visits[:2]
    assert visit1.origin == origin.url

    ov1, ov2 = swh_storage.origin_visit_add([visit1, visit2])

    revision1, revision2, revision3 = sample_data.revisions[:3]
    swh_storage.revision_add([revision1, revision2])

    empty_snapshot, complete_snapshot = sample_data.snapshots[1:3]
    swh_storage.snapshot_add([complete_snapshot])

    # Add complete_snapshot to visit1 which targets revision1
    ovs1, ovs2 = [
        OriginVisitStatus(
            origin=ov1.origin,
            visit=ov1.visit,
            date=date_visit2,
            type=ov1.type,
            status="partial",
            snapshot=complete_snapshot.id,
        ),
        OriginVisitStatus(
            origin=ov2.origin,
            visit=ov2.visit,
            date=now(),
            type=ov2.type,
            status="full",
            snapshot=empty_snapshot.id,
        ),
    ]

    swh_storage.origin_visit_status_add([ovs1, ovs2])
    assert ov1.date < ov2.date
    assert ov2.date < ovs1.date
    assert ovs1.date < ovs2.date

    # revision3 does not exist so result is None
    actual_snapshot_id = snapshot_id_get_from_revision(swh_storage, origin.url,
                                                       revision3.id)
    assert actual_snapshot_id is None

    # no snapshot targets revision2 for origin.url, so the result is empty
    res = list(
        visits_and_snapshots_get_from_revision(swh_storage, origin.url,
                                               revision2.id))
    assert res == []

    # complete_snapshot targets at least revision1
    res = list(
        visits_and_snapshots_get_from_revision(swh_storage, origin.url,
                                               revision1.id))
    assert res == [(ov1, ovs1, complete_snapshot)]
Example #2
def test_iter_origin_visit_status(swh_storage, sample_data):
    origin1, origin2 = sample_data.origins[:2]
    swh_storage.origin_add([origin1])

    ov1 = swh_storage.origin_visit_add([sample_data.origin_visit])[0]
    assert ov1.origin == origin1.url

    date_past = now() - datetime.timedelta(weeks=20)

    ovs1 = OriginVisitStatus(
        origin=ov1.origin,
        visit=ov1.visit,
        date=ov1.date,
        type=ov1.type,
        status="created",
        snapshot=None,
    )
    new_visit_statuses = [ovs1]
    for i in range(20):
        status_date = date_past + datetime.timedelta(days=i)

        new_visit_statuses.append(
            OriginVisitStatus(
                origin=ov1.origin,
                visit=ov1.visit,
                date=status_date,
                type=ov1.type,
                status="created",
                snapshot=None,
            ))

    swh_storage.origin_visit_status_add(new_visit_statuses)
    reversed_visit_statuses = list(reversed(new_visit_statuses))

    # order asc
    actual_visit_statuses = list(
        iter_origin_visit_statuses(swh_storage, ov1.origin, ov1.visit))
    assert actual_visit_statuses == new_visit_statuses

    # order desc
    actual_visit_statuses = list(
        iter_origin_visit_statuses(swh_storage,
                                   ov1.origin,
                                   ov1.visit,
                                   order=ListOrder.DESC))
    assert actual_visit_statuses == reversed_visit_statuses

    # no result
    actual_visit_statuses = list(
        iter_origin_visit_statuses(swh_storage, origin2.url, ov1.visit))
    assert actual_visit_statuses == []
Example #3
def test_get_origin_visit_non_resolvable_snapshots(archive_data, new_origin,
                                                   new_snapshots):
    visits = []
    archive_data.origin_add([new_origin])
    # create 6 full visits; the first three have resolvable snapshots
    # while the last three have non-resolvable snapshots
    for i, snp in enumerate(new_snapshots):
        visit_date = now() + timedelta(days=i * 10)
        visit = archive_data.origin_visit_add([
            OriginVisit(
                origin=new_origin.url,
                date=visit_date,
                type="git",
            )
        ])[0]
        # only the first three snapshots exist in the archive, so the last
        # three visit statuses reference snapshots that cannot be resolved
        if i < 3:
            archive_data.snapshot_add([new_snapshots[i]])
        visit_status = OriginVisitStatus(
            origin=new_origin.url,
            visit=visit.visit,
            date=visit_date + timedelta(minutes=5),
            status="full",
            snapshot=new_snapshots[i].id,
        )
        archive_data.origin_visit_status_add([visit_status])
        visits.append(visit.visit)

    # should return the third visit
    expected_visit = archive_data.origin_visit_get_by(new_origin.url,
                                                      visits[2])
    assert get_origin_visit(OriginInfo(url=new_origin.url)) == expected_visit
Example #4
def test_get_origin_visit_return_first_valid_partial_visit(
        archive_data, new_origin, new_snapshots):
    visits = []

    archive_data.origin_add([new_origin])
    # create 6 visits; the first three have full status but a null snapshot
    # while the last three have partial status with a valid snapshot
    for i, snp in enumerate(new_snapshots):
        visit_date = now() + timedelta(days=i * 10)
        visit = archive_data.origin_visit_add([
            OriginVisit(
                origin=new_origin.url,
                date=visit_date,
                type="git",
            )
        ])[0]
        # only the last three statuses reference a snapshot, so only those
        # snapshots need to exist in the archive
        if i > 2:
            archive_data.snapshot_add([new_snapshots[i]])
        visit_status = OriginVisitStatus(
            origin=new_origin.url,
            visit=visit.visit,
            date=visit_date + timedelta(minutes=5),
            status="full" if i < 3 else "partial",
            snapshot=new_snapshots[i].id if i > 2 else None,
        )
        archive_data.origin_visit_status_add([visit_status])

        visits.append(visit.visit)

    # should return the last visit
    expected_visit = archive_data.origin_visit_get_by(new_origin.url,
                                                      visits[-1])
    assert get_origin_visit(OriginInfo(url=new_origin.url)) == expected_visit
Example #5
def _add_origin(storage,
                search,
                origin_url,
                visit_type="git",
                snapshot_branches={}):
    storage.origin_add([Origin(url=origin_url)])
    search.origin_update([{
        "url": origin_url,
        "has_visits": True,
        "visit_types": [visit_type]
    }])
    date = now()
    visit = OriginVisit(origin=origin_url, date=date, type=visit_type)
    visit = storage.origin_visit_add([visit])[0]
    snapshot = Snapshot.from_dict({"branches": snapshot_branches})
    storage.snapshot_add([snapshot])
    visit_status = OriginVisitStatus(
        origin=origin_url,
        visit=visit.visit,
        date=date + timedelta(minutes=1),
        type=visit.type,
        status="full",
        snapshot=snapshot.id,
    )
    storage.origin_visit_status_add([visit_status])
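
This helper wires up everything an origin needs to be findable and browsable in
one call: the origin itself, its search-index entry, a visit, a snapshot, and a
full visit status. A sketch of a typical call (the URL and branch dict below
are illustrative, and `storage`/`search` are assumed to be the usual test
fixtures):

    _add_origin(
        storage,
        search,
        origin_url="https://example.com/user/repo",
        snapshot_branches={
            b"HEAD": {
                "target_type": "alias",
                "target": b"refs/heads/master",
            },
        },
    )

One design caveat: the mutable default `snapshot_branches={}` is shared across
calls. It is harmless here because the helper never mutates it, but passing the
branches explicitly, as above, avoids the classic pitfall.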
Example #6
def test_origin_snapshot_invalid_branch(
    client, archive_data, new_origin, new_snapshot, visit_dates, revisions
):
    snp_dict = new_snapshot.to_dict()
    archive_data.origin_add([new_origin])
    for i, branch in enumerate(snp_dict["branches"].keys()):
        snp_dict["branches"][branch] = {
            "target_type": "revision",
            "target": hash_to_bytes(revisions[i]),
        }

    archive_data.snapshot_add([Snapshot.from_dict(snp_dict)])
    visit = archive_data.origin_visit_add(
        [OriginVisit(origin=new_origin.url, date=visit_dates[0], type="git",)]
    )[0]
    visit_status = OriginVisitStatus(
        origin=new_origin.url,
        visit=visit.visit,
        date=now(),
        status="full",
        snapshot=snp_dict["id"],
    )
    archive_data.origin_visit_status_add([visit_status])

    url = reverse(
        "browse-origin-directory",
        query_params={"origin_url": new_origin.url, "branch": "invalid_branch"},
    )

    check_html_get_response(client, url, status_code=404, template_used="error.html")
Example #7
def test_origin_empty_snapshot_null_revision(client, archive_data, new_origin):
    snapshot = Snapshot(
        branches={
            b"HEAD": SnapshotBranch(
                target="refs/head/master".encode(), target_type=TargetType.ALIAS,
            ),
            b"refs/head/master": None,
        }
    )
    archive_data.origin_add([new_origin])
    archive_data.snapshot_add([snapshot])
    visit = archive_data.origin_visit_add(
        [OriginVisit(origin=new_origin.url, date=now(), type="git",)]
    )[0]
    visit_status = OriginVisitStatus(
        origin=new_origin.url,
        visit=visit.visit,
        date=now(),
        status="partial",
        snapshot=snapshot.id,
    )
    archive_data.origin_visit_status_add([visit_status])

    url = reverse(
        "browse-origin-directory", query_params={"origin_url": new_origin.url},
    )

    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/directory.html"
    )
    resp_content = resp.content.decode("utf-8")
    assert re.search("snapshot.*is empty", resp_content)
    assert not re.search("swh-tr-link", resp_content)
Example #8
def row_to_visit_status(row: OriginVisitStatusRow) -> OriginVisitStatus:
    """Format a row representing a visit_status to an actual OriginVisitStatus."""
    return OriginVisitStatus.from_dict({
        **row.to_dict(),
        "date":
        row.date.replace(tzinfo=datetime.timezone.utc),
        "metadata": (json.loads(row.metadata) if row.metadata else None),
    })
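
A quick illustration of what the conversion does (a sketch only: the
`OriginVisitStatusRow` constructor fields shown here are assumed from context,
not taken from the original):

    import datetime

    row = OriginVisitStatusRow(
        origin="https://example.com/repo",
        visit=1,
        date=datetime.datetime(2021, 1, 1),  # naive; the backend stores UTC
        type="git",
        status="full",
        snapshot=None,
        metadata='{"key": "value"}',
    )
    status = row_to_visit_status(row)
    assert status.date.tzinfo == datetime.timezone.utc  # tzinfo re-attached
    assert status.metadata == {"key": "value"}  # JSON decoded back to a dict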
Example #9
def test_pypi_origin_from_project_name(mocker):
    origin_url = "https://pypi.org/project/ProjectName/"

    storage = get_storage("memory")

    revision_id = b"41" * 10
    snapshot_id = b"42" * 10
    storage.origin_add([Origin(url=origin_url)])
    storage.origin_visit_add(
        [OriginVisit(origin=origin_url, visit=1, date=now(), type="pypi")])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin_url,
            visit=1,
            date=now(),
            status="partial",
            snapshot=snapshot_id,
        )
    ])
    storage.snapshot_add([
        Snapshot(
            id=snapshot_id,
            branches={
                b"foo":
                SnapshotBranch(
                    target_type=TargetType.REVISION,
                    target=revision_id,
                )
            },
        )
    ])

    class response:
        code = 200

        def read(self):
            return b'{"info": {"name": "ProjectName"}}'

    mock_urlopen = mocker.patch(
        "swh.storage.migrate_extrinsic_metadata.urlopen",
        return_value=response(),
    )

    assert (pypi_origin_from_filename(
        storage, revision_id, "ProjectName-1.0.0.tar.gz") == origin_url)
    mock_urlopen.assert_not_called()
    assert (pypi_origin_from_filename(
        storage, revision_id, "projectname-1.0.0.tar.gz") == origin_url)
    mock_urlopen.assert_called_once_with(
        "https://pypi.org/pypi/projectname/json/")
Example #10
def test_api_lookup_origin_visits(api_client, archive_data, new_origin,
                                  visit_dates, new_snapshots):

    archive_data.origin_add([new_origin])
    for i, visit_date in enumerate(visit_dates):
        origin_visit = archive_data.origin_visit_add([
            OriginVisit(
                origin=new_origin.url,
                date=visit_date,
                type="git",
            )
        ])[0]
        archive_data.snapshot_add([new_snapshots[i]])
        visit_status = OriginVisitStatus(
            origin=new_origin.url,
            visit=origin_visit.visit,
            date=now(),
            status="full",
            snapshot=new_snapshots[i].id,
        )
        archive_data.origin_visit_status_add([visit_status])

    all_visits = list(reversed(get_origin_visits(new_origin.to_dict())))

    for last_visit, expected_visits in (
        (None, all_visits[:2]),
        (all_visits[1]["visit"], all_visits[2:]),
    ):

        url = reverse(
            "api-1-origin-visits",
            url_args={"origin_url": new_origin.url},
            query_params={
                "per_page": 2,
                "last_visit": last_visit
            },
        )

        rv = check_api_get_responses(api_client, url, status_code=200)

        for i in range(len(expected_visits)):
            expected_visits[i] = enrich_origin_visit(
                expected_visits[i],
                with_origin_link=False,
                with_origin_visit_link=True,
                request=rv.wsgi_request,
            )

        assert rv.data == expected_visits
Example #11
def test_api_lookup_origin_visit_latest_with_snapshot(api_client, archive_data,
                                                      new_origin, visit_dates,
                                                      new_snapshots):
    archive_data.origin_add([new_origin])
    visit_dates.sort()
    visit_ids = []
    for i, visit_date in enumerate(visit_dates):
        origin_visit = archive_data.origin_visit_add([
            OriginVisit(
                origin=new_origin.url,
                date=visit_date,
                type="git",
            )
        ])[0]
        visit_ids.append(origin_visit.visit)

    archive_data.snapshot_add([new_snapshots[0]])

    # Add snapshot to the latest visit
    visit_id = visit_ids[-1]
    visit_status = OriginVisitStatus(
        origin=new_origin.url,
        visit=visit_id,
        date=now(),
        status="full",
        snapshot=new_snapshots[0].id,
    )
    archive_data.origin_visit_status_add([visit_status])

    url = reverse(
        "api-1-origin-visit-latest",
        url_args={"origin_url": new_origin.url},
        query_params={"require_snapshot": True},
    )

    rv = check_api_get_responses(api_client, url, status_code=200)

    expected_visit = archive_data.origin_visit_status_get_latest(
        new_origin.url, type="git", require_snapshot=True)

    expected_visit = enrich_origin_visit(
        expected_visit,
        with_origin_link=True,
        with_origin_visit_link=False,
        request=rv.wsgi_request,
    )

    assert rv.data == expected_visit
Example #12
def test_origin_branches_pagination_with_alias(
    client, archive_data, mocker, new_origin, visit_dates, revisions, existing_release
):
    """
    When a snapshot contains a branch or a release alias, pagination links
    in the branches / releases view should be displayed.
    """
    mocker.patch("swh.web.browse.snapshot_context.PER_PAGE", len(revisions) / 2)
    snp_dict = {"branches": {}, "id": hash_to_bytes(random_sha1())}
    for i in range(len(revisions)):
        branch = "".join(random.choices(string.ascii_lowercase, k=8))
        snp_dict["branches"][branch.encode()] = {
            "target_type": "revision",
            "target": hash_to_bytes(revisions[i]),
        }
    release = "".join(random.choices(string.ascii_lowercase, k=8))
    snp_dict["branches"][b"RELEASE_ALIAS"] = {
        "target_type": "alias",
        "target": release.encode(),
    }
    snp_dict["branches"][release.encode()] = {
        "target_type": "release",
        "target": hash_to_bytes(existing_release),
    }
    archive_data.origin_add([new_origin])
    archive_data.snapshot_add([Snapshot.from_dict(snp_dict)])
    visit = archive_data.origin_visit_add(
        [OriginVisit(origin=new_origin.url, date=visit_dates[0], type="git",)]
    )[0]
    visit_status = OriginVisitStatus(
        origin=new_origin.url,
        visit=visit.visit,
        date=now(),
        status="full",
        snapshot=snp_dict["id"],
    )
    archive_data.origin_visit_status_add([visit_status])

    url = reverse("browse-origin-branches", query_params={"origin_url": new_origin.url})

    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/branches.html"
    )
    assert_contains(resp, '<ul class="pagination')
Example #13
def test_api_lookup_origin_visit(api_client, archive_data, new_origin,
                                 visit_dates, new_snapshots):
    archive_data.origin_add([new_origin])
    for i, visit_date in enumerate(visit_dates):
        origin_visit = archive_data.origin_visit_add([
            OriginVisit(
                origin=new_origin.url,
                date=visit_date,
                type="git",
            )
        ])[0]
        visit_id = origin_visit.visit
        archive_data.snapshot_add([new_snapshots[i]])
        visit_status = OriginVisitStatus(
            origin=new_origin.url,
            visit=origin_visit.visit,
            date=visit_date + timedelta(minutes=5),
            status="full",
            snapshot=new_snapshots[i].id,
        )
        archive_data.origin_visit_status_add([visit_status])
        url = reverse(
            "api-1-origin-visit",
            url_args={
                "origin_url": new_origin.url,
                "visit_id": visit_id
            },
        )

        rv = check_api_get_responses(api_client, url, status_code=200)

        expected_visit = archive_data.origin_visit_get_by(
            new_origin.url, visit_id)

        expected_visit = enrich_origin_visit(
            expected_visit,
            with_origin_link=True,
            with_origin_visit_link=False,
            request=rv.wsgi_request,
        )

        assert rv.data == expected_visit
Example #14
    def test_pypi_missing_branch(self):
        origin_url = "https://pypi.org/project/abcdef/"
        self.indexer.storage.origin_add([Origin(url=origin_url)])
        visit = self.indexer.storage.origin_visit_add([
            OriginVisit(
                origin=origin_url,
                date=datetime(2019, 2, 27, tzinfo=timezone.utc),
                type="pypi",
            )
        ])[0]
        self.indexer.storage.snapshot_add([SAMPLE_SNAPSHOT])
        visit_status = OriginVisitStatus(
            origin=origin_url,
            visit=visit.visit,
            date=now(),
            status="full",
            snapshot=SAMPLE_SNAPSHOT.id,
        )
        self.indexer.storage.origin_visit_status_add([visit_status])
        self.indexer.run(["https://pypi.org/project/abcdef/"])
        self.assertEqual(self.indexer.results, [])
Example #15
    def test_git_partial_snapshot(self):
        """Checks partial snapshots are ignored."""
        origin_url = "https://github.com/SoftwareHeritage/swh-core"
        self.indexer.storage.origin_add([Origin(url=origin_url)])
        visit = self.indexer.storage.origin_visit_add([
            OriginVisit(
                origin=origin_url,
                date=datetime(2019, 2, 27, tzinfo=timezone.utc),
                type="git",
            )
        ])[0]
        self.indexer.storage.snapshot_add([SAMPLE_SNAPSHOT])
        visit_status = OriginVisitStatus(
            origin=origin_url,
            visit=visit.visit,
            date=now(),
            status="partial",
            snapshot=SAMPLE_SNAPSHOT.id,
        )
        self.indexer.storage.origin_visit_status_add([visit_status])
        self.indexer.run([origin_url])
        self.assertEqual(self.indexer.results, [])
Example #16
def load_task_process(env: Environment, task: Task,
                      status_queue: Queue) -> Iterator[Event]:
    """A loading task. This pushes OriginVisitStatus objects to the
    status_queue to simulate the visible outcomes of the task.

    Uses the `OriginModel.load_task_characteristics` method to determine its
    run time, final status, and snapshot.
    """
    status = OriginVisitStatus(
        origin=task.origin,
        visit=42,
        type=task.visit_type,
        status="created",
        date=env.time,
        snapshot=None,
    )
    logger.debug("%s task %s origin=%s: Start", env.time, task.visit_type,
                 task.origin)
    yield status_queue.put(TaskEvent(task=task, status=status))

    origin_model = OriginModel(task.visit_type, task.origin)
    (run_time, end_status,
     snapshot) = origin_model.load_task_characteristics(env.time)
    yield env.timeout(run_time)

    logger.debug("%s task %s origin=%s: End", env.time, task.visit_type,
                 task.origin)

    yield status_queue.put(
        TaskEvent(
            task=task,
            status=attr.evolve(status,
                               status=end_status,
                               date=env.time,
                               snapshot=snapshot),
        ))

    env.report.record_visit((task.visit_type, task.origin), run_time,
                            end_status, snapshot)
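
A consumer of the status_queue therefore sees two TaskEvent messages per task:
one carrying the initial "created" status, and one carrying the final outcome.
A sketch of the consumption pattern (SimPy-style; the queue is assumed to
behave like a simpy.Store with a `get()` event):

    def consume(env, status_queue):
        while True:
            evt = yield status_queue.get()
            # evt.status is an OriginVisitStatus: first "created", then the
            # final outcome ("full", "partial", ...) with its snapshot id
            print(env.time, evt.task.origin, evt.status.status)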
Example #17
def fill_storage(storage):
    storage.origin_add(ORIGINS)
    storage.directory_add([DIRECTORY, DIRECTORY2])
    storage.revision_add(REVISIONS)
    storage.snapshot_add(SNAPSHOTS)

    for visit, snapshot in zip(ORIGIN_VISITS, SNAPSHOTS):
        assert snapshot.id is not None

        visit = storage.origin_visit_add(
            [OriginVisit(origin=visit["origin"], date=now(), type=visit["type"])]
        )[0]
        visit_status = OriginVisitStatus(
            origin=visit.origin,
            visit=visit.visit,
            date=now(),
            status="full",
            snapshot=snapshot.id,
        )
        storage.origin_visit_status_add([visit_status])

    contents = []
    for (obj_id, content) in OBJ_STORAGE_DATA.items():
        content_hashes = hashutil.MultiHash.from_data(content).digest()
        contents.append(
            Content(
                data=content,
                length=len(content),
                status="visible",
                sha1=hash_to_bytes(obj_id),
                sha1_git=hash_to_bytes(obj_id),
                sha256=content_hashes["sha256"],
                blake2s256=content_hashes["blake2s256"],
            )
        )
    storage.content_add(contents)
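
`MultiHash` computes all of the archive's checksum algorithms in a single pass
over the data, which is why the loop above can fill every `Content` hash field
from one call. A minimal illustration (assuming `swh.model.hashutil` as
imported in the snippet):

    from swh.model import hashutil

    digests = hashutil.MultiHash.from_data(b"hello").digest()
    # digest() returns a dict keyed by algorithm name, e.g.:
    # {"sha1": ..., "sha1_git": ..., "sha256": ..., "blake2s256": ...}
    assert {"sha256", "blake2s256"} <= set(digests)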
Example #18
def test_snapshot_get_latest_none(swh_storage, sample_data):
    """Retrieve latest snapshot on unknown origin or origin without snapshot should
    yield no result

    """
    # unknown origin so None
    assert snapshot_get_latest(swh_storage, "unknown-origin") is None

    # no snapshot on origin visit so None
    origin = sample_data.origin
    swh_storage.origin_add([origin])
    origin_visit, origin_visit2 = sample_data.origin_visits[:2]
    assert origin_visit.origin == origin.url

    swh_storage.origin_visit_add([origin_visit])
    assert snapshot_get_latest(swh_storage, origin.url) is None

    ov1 = swh_storage.origin_visit_get_latest(origin.url)
    assert ov1 is not None

    # visit references a snapshot but the snapshot does not exist in backend for some
    # reason
    complete_snapshot = sample_data.snapshots[2]
    swh_storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin.url,
            visit=ov1.visit,
            date=origin_visit2.date,
            status="partial",
            snapshot=complete_snapshot.id,
        )
    ])
    # so we do not find it
    assert snapshot_get_latest(swh_storage, origin.url) is None
    assert snapshot_get_latest(swh_storage, origin.url,
                               branches_count=1) is None
Example #19
def test_origin_browse_directory_branch_with_non_resolvable_revision(
    client, archive_data, new_origin, unknown_revision
):
    branch_name = "master"
    snapshot = Snapshot(
        branches={
            branch_name.encode(): SnapshotBranch(
                target=hash_to_bytes(unknown_revision), target_type=TargetType.REVISION,
            )
        }
    )
    archive_data.origin_add([new_origin])
    archive_data.snapshot_add([snapshot])
    visit = archive_data.origin_visit_add(
        [OriginVisit(origin=new_origin.url, date=now(), type="git",)]
    )[0]
    visit_status = OriginVisitStatus(
        origin=new_origin.url,
        visit=visit.visit,
        date=now(),
        status="partial",
        snapshot=snapshot.id,
    )
    archive_data.origin_visit_status_add([visit_status])

    url = reverse(
        "browse-origin-directory",
        query_params={"origin_url": new_origin.url, "branch": branch_name},
    )

    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/directory.html"
    )
    assert_contains(
        resp, f"Revision {unknown_revision } could not be found in the archive."
    )
Example #20
def test_pypi_good_origin():
    """Tests loading a revision whose origin we can find"""

    source_original_artifact = {
        "url":
        "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz",
        "date": "2014-05-07T22:03:00",
        "sha1": "3289269f75b4111dd00eaea53e00330db9a1db12",
        "size": 46644,
        "sha256":
        "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824",
        "filename": "PyPDFLite-0.1.32.tar.gz",
        "sha1_git": "1e5c38014731242cfa8594839bcba8a0c4e158c5",
        "blake2s256":
        "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385",
        "archive_type": "tar",
    }

    dest_original_artifacts = [{
        "url":
        "https://files.pythonhosted.org/packages/34/4f/30087f22eaae8ad7077a28ce157342745a2977e264b8a8e4e7f804a8aa5e/PyPDFLite-0.1.32.tar.gz",
        "filename": "PyPDFLite-0.1.32.tar.gz",
        "archive_type": "tar",
        "length": 46644,
        "checksums": {
            "sha1":
            "3289269f75b4111dd00eaea53e00330db9a1db12",
            "sha256":
            "911497d655cf7ef6530c5b57773dad7da97e21cf4d608ad9ad1e38bd7bec7824",
            "sha1_git":
            "1e5c38014731242cfa8594839bcba8a0c4e158c5",
            "blake2s256":
            "45792e57873f56d385c694e36c98a580cbba60d5ea91eb6fd0a2d1c71c1fb385",
        },
    }]

    revision_id = b"N\xa9\x91|\xdfS\xcd\x13SJ\x04.N\xb3x{\x86\xc84\xd2"
    row = {
        "id":
        revision_id,
        "directory":
        DIRECTORY_ID,
        "date":
        datetime.datetime(2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc),
        "committer_date":
        datetime.datetime(2014, 5, 7, 22, 3, tzinfo=datetime.timezone.utc),
        "type":
        "tar",
        "message":
        b"0.1.32",
        "metadata": {
            "original_artifact": source_original_artifact
        },
    }

    origin_url = "https://pypi.org/project/PyPDFLite/"

    storage = get_storage("memory")

    snapshot_id = b"42" * 10
    storage.origin_add([Origin(url=origin_url)])
    storage.origin_visit_add(
        [OriginVisit(origin=origin_url, visit=1, date=now(), type="pypi")])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin_url,
            visit=1,
            date=now(),
            status="partial",
            snapshot=snapshot_id,
        )
    ])
    storage.snapshot_add([
        Snapshot(
            id=snapshot_id,
            branches={
                b"foo":
                SnapshotBranch(
                    target_type=TargetType.REVISION,
                    target=revision_id,
                )
            },
        )
    ])
    storage.metadata_authority_add([
        attr.evolve(PYPI_AUTHORITY, metadata={}),
        attr.evolve(SWH_AUTHORITY, metadata={}),
    ])
    storage.metadata_fetcher_add([FETCHER])
    deposit_cur = None
    handle_row(copy.deepcopy(row), storage, deposit_cur, dry_run=False)

    revision_swhid = CoreSWHID.from_string(
        "swh:1:rev:4ea9917cdf53cd13534a042e4eb3787b86c834d2")

    assert storage.raw_extrinsic_metadata_get(
        DIRECTORY_SWHID,
        authority=PYPI_AUTHORITY,
    ) == PagedResult(
        results=[],
        next_page_token=None,
    )
    assert storage.raw_extrinsic_metadata_get(
        DIRECTORY_SWHID,
        authority=SWH_AUTHORITY,
    ) == PagedResult(
        results=[
            RawExtrinsicMetadata(
                target=DIRECTORY_SWHID,
                discovery_date=datetime.datetime(
                    2014,
                    5,
                    7,
                    22,
                    3,
                    tzinfo=datetime.timezone.utc,
                ),
                authority=SWH_AUTHORITY,
                fetcher=FETCHER,
                format="original-artifacts-json",
                metadata=json.dumps(dest_original_artifacts).encode(),
                origin=origin_url,
                revision=revision_swhid,
            ),
        ],
        next_page_token=None,
    )
Example #21
def test_sub_directory_view_origin_context(client, archive_data,
                                           empty_directory, person, date):
    origin_url = "test_sub_directory_view_origin_context"
    subdir = Directory(entries=(
        DirectoryEntry(
            name=b"foo",
            type="dir",
            target=hash_to_bytes(empty_directory),
            perms=DentryPerms.directory,
        ),
        DirectoryEntry(
            name=b"bar",
            type="dir",
            target=hash_to_bytes(empty_directory),
            perms=DentryPerms.directory,
        ),
    ))

    parentdir = Directory(entries=(DirectoryEntry(
        name=b"baz",
        type="dir",
        target=subdir.id,
        perms=DentryPerms.directory,
    ), ))
    archive_data.directory_add([subdir, parentdir])

    revision = Revision(
        directory=parentdir.id,
        author=person,
        committer=person,
        message=b"commit message",
        date=TimestampWithTimezone.from_datetime(date),
        committer_date=TimestampWithTimezone.from_datetime(date),
        synthetic=False,
        type=RevisionType.GIT,
    )
    archive_data.revision_add([revision])

    snapshot = Snapshot(
        branches={
            b"HEAD":
            SnapshotBranch(
                target="refs/head/master".encode(),
                target_type=TargetType.ALIAS,
            ),
            b"refs/head/master":
            SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
        })
    archive_data.snapshot_add([snapshot])

    archive_data.origin_add([Origin(url=origin_url)])
    date = now()
    visit = OriginVisit(origin=origin_url, date=date, type="git")
    visit = archive_data.origin_visit_add([visit])[0]
    visit_status = OriginVisitStatus(
        origin=origin_url,
        visit=visit.visit,
        date=date,
        status="full",
        snapshot=snapshot.id,
    )
    archive_data.origin_visit_status_add([visit_status])

    dir_content = archive_data.directory_ls(hash_to_hex(parentdir.id))
    subdir = dir_content[0]
    subdir_content = archive_data.directory_ls(subdir["target"])
    _directory_view_checks(
        client,
        hash_to_hex(parentdir.id),
        subdir_content,
        subdir["name"],
        origin_url,
        hash_to_hex(snapshot.id),
        hash_to_hex(revision.id),
    )
Example #22
def test_cli_journal_client(
    cli_runner,
    swh_config,
    indexer_scheduler,
    kafka_prefix: str,
    kafka_server,
    consumer: Consumer,
):
    """Test the 'swh indexer journal-client' cli tool."""
    journal_writer = get_journal_writer(
        "kafka",
        brokers=[kafka_server],
        prefix=kafka_prefix,
        client_id="test producer",
        value_sanitizer=lambda object_type, value: value,
        flush_timeout=3,  # fail early if something is going wrong
    )

    visit_statuses = [
        OriginVisitStatus(
            origin="file:///dev/zero",
            visit=1,
            date=now(),
            status="full",
            snapshot=None,
        ),
        OriginVisitStatus(
            origin="file:///dev/foobar",
            visit=2,
            date=now(),
            status="full",
            snapshot=None,
        ),
        OriginVisitStatus(
            origin="file:///tmp/spamegg",
            visit=3,
            date=now(),
            status="full",
            snapshot=None,
        ),
        OriginVisitStatus(
            origin="file:///dev/0002",
            visit=6,
            date=now(),
            status="full",
            snapshot=None,
        ),
        OriginVisitStatus(  # will be filtered out due to its 'partial' status
            origin="file:///dev/0000",
            visit=4,
            date=now(),
            status="partial",
            snapshot=None,
        ),
        OriginVisitStatus(  # will be filtered out due to its 'ongoing' status
            origin="file:///dev/0001",
            visit=5,
            date=now(),
            status="ongoing",
            snapshot=None,
        ),
    ]

    journal_writer.write_additions("origin_visit_status", visit_statuses)
    visit_statuses_full = [vs for vs in visit_statuses if vs.status == "full"]

    result = cli_runner.invoke(
        indexer_cli_group,
        [
            "-C",
            swh_config,
            "journal-client",
            "--broker",
            kafka_server,
            "--prefix",
            kafka_prefix,
            "--group-id",
            "test-consumer",
            "--stop-after-objects",
            str(len(visit_statuses)),
            "--origin-metadata-task-type",
            "index-origin-metadata",
        ],
        catch_exceptions=False,
    )

    # Check the output
    expected_output = "Done.\n"
    assert result.exit_code == 0, result.output
    assert result.output == expected_output

    # Check scheduled tasks
    tasks = indexer_scheduler.search_tasks(task_type="index-origin-metadata")

    # The scheduling can be split into multiple tasks, but there should be no
    # more tasks than origin-visit-statuses written to the journal
    assert len(tasks) <= len(visit_statuses_full)

    actual_origins = []
    for task in tasks:
        actual_task = dict(task)
        assert actual_task["type"] == "index-origin-metadata"
        scheduled_origins = actual_task["arguments"]["args"][0]
        actual_origins.extend(scheduled_origins)

    assert set(actual_origins) == {vs.origin for vs in visit_statuses_full}
Example #23
def visit_status_to_row(status: OriginVisitStatus) -> OriginVisitStatusRow:
    d = status.to_dict()
    return OriginVisitStatusRow.from_dict({
        **d, "metadata":
        json.dumps(d["metadata"])
    })
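
This is the inverse of `row_to_visit_status` from Example #8: `metadata` is
JSON-encoded on the way into the row and decoded on the way out, and the UTC
tzinfo dropped by the row model is re-attached. Modulo those two conversions
the pair should round-trip; a sketch (not verified against a particular
swh.storage version):

    status = OriginVisitStatus(
        origin="https://example.com/repo",
        visit=1,
        date=datetime.datetime(2021, 1, 1, tzinfo=datetime.timezone.utc),
        type="git",
        status="full",
        snapshot=None,
        metadata={"key": "value"},
    )
    assert row_to_visit_status(visit_status_to_row(status)) == status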
Example #24
def test_load_extids() -> None:
    """Checks PackageLoader.load() skips iff it should, and writes (only)
    the new ExtIDs"""
    storage = get_storage("memory")

    dir_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                          object_id=b"e" * 20)

    rels = [
        Release(
            name=f"v{i}.0".encode(),
            message=b"blah\n",
            target=dir_swhid.object_id,
            target_type=ModelObjectType.DIRECTORY,
            synthetic=True,
        ) for i in (1, 2, 3, 4)
    ]
    storage.release_add(rels[0:3])

    origin = "http://example.org"
    rel1_swhid = rels[0].swhid()
    rel2_swhid = rels[1].swhid()
    rel3_swhid = rels[2].swhid()
    rel4_swhid = rels[3].swhid()

    # Results of a previous load
    storage.extid_add([
        ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid),
        ExtID("extid-type2", b"extid-of-v2.0", rel2_swhid),
    ])
    last_snapshot = Snapshot(
        branches={
            b"v1.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel1_swhid.object_id),
            b"v2.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel2_swhid.object_id),
            b"v3.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel3_swhid.object_id),
        })
    storage.snapshot_add([last_snapshot])
    date = datetime.datetime.now(tz=datetime.timezone.utc)
    storage.origin_add([Origin(url=origin)])
    storage.origin_visit_add([
        OriginVisit(origin="http://example.org",
                    visit=1,
                    date=date,
                    type="tar")
    ])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin,
            visit=1,
            status="full",
            date=date,
            snapshot=last_snapshot.id,
        )
    ])

    loader = StubPackageLoader(storage, "http://example.org")
    patch.object(
        loader,
        "_load_release",
        return_value=(rel4_swhid.object_id, dir_swhid.object_id),
        autospec=True,
    ).start()

    loader.load()

    assert loader._load_release.mock_calls == [  # type: ignore
        # v1.0: not loaded because there is already its (extid_type, extid, rel)
        #       in the storage.
        # v2.0: loaded, because there is already a similar extid, but different type
        call(
            StubPackageInfo(origin, "example-v2.0.tar", "v2.0"),
            Origin(url=origin),
        ),
        # v3.0: loaded despite having an (extid_type, extid) in storage, because
        #       the target of the extid is not in the previous snapshot
        call(
            StubPackageInfo(origin, "example-v3.0.tar", "v3.0"),
            Origin(url=origin),
        ),
        # v4.0: loaded, because there isn't its extid
        call(
            StubPackageInfo(origin, "example-v4.0.tar", "v4.0"),
            Origin(url=origin),
        ),
    ]

    # then check the snapshot has all the branches.
    # versions 2.0 to 4.0 all point to rel4_swhid (instead of the value of the last
    # snapshot), because they had to be loaded (mismatched extid), and the mocked
    # _load_release always returns rel4_swhid.
    snapshot = Snapshot(
        branches={
            b"branch-v1.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel1_swhid.object_id),
            b"branch-v2.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel4_swhid.object_id),
            b"branch-v3.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel4_swhid.object_id),
            b"branch-v4.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel4_swhid.object_id),
        })
    assert snapshot_get_latest(storage, origin) == snapshot

    extids = storage.extid_get_from_target(
        ObjectType.RELEASE,
        [
            rel1_swhid.object_id,
            rel2_swhid.object_id,
            rel3_swhid.object_id,
            rel4_swhid.object_id,
        ],
    )

    assert set(extids) == {
        # What we inserted at the beginning of the test:
        ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid),
        ExtID("extid-type2", b"extid-of-v2.0", rel2_swhid),
        # Added by the loader:
        ExtID("extid-type1", b"extid-of-v2.0", rel4_swhid),
        ExtID("extid-type2", b"extid-of-v3.0", rel4_swhid),
        ExtID("extid-type2", b"extid-of-v4.0", rel4_swhid),
    }
Example #25

hash_hex = "43e45d56f88993aae6a0198013efa80716fd8920"

ORIGIN_VISIT = OriginVisit(
    origin="some-url",
    visit=1,
    date=datetime.datetime.now(tz=datetime.timezone.utc),
    type="archive",
)

ORIGIN_VISIT_STATUS = OriginVisitStatus(
    origin="some-url",
    visit=1,
    type="archive",
    date=datetime.datetime.now(tz=datetime.timezone.utc),
    status="full",
    snapshot=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
    metadata=None,
)

CONTENT = Content(
    data=b"42\n",
    length=3,
    sha1=hash_to_bytes("34973274ccef6ab4dfaaf86599792fa9c3fe4689"),
    sha1_git=hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
    sha256=hash_to_bytes(
        "673650f936cb3b0a2f93ce09d81be10748b1b203c19e8176b4eefc1964a0cf3a"),
    blake2s256=hash_to_bytes(
        "d5fe1939576527e42cfd76a9455a2432fe7f56669564577dd93c4280e76d661d"),
    status="visible",
Example #26
    def load(self) -> Dict[str, str]:
        r"""Loading logic for the loader to follow:

        - Store the actual ``origin_visit`` to storage
        - Call :meth:`prepare` to set up any state the loader needs
        - Call :meth:`get_origin` to get the origin we work with and store it

        - while True:

          - Call :meth:`fetch_data` to fetch the data to store
          - Call :meth:`process_data` to optionally run processing between
            :meth:`fetch_data` and :meth:`store_data`
          - Call :meth:`store_data` to store the data

        - Call :meth:`cleanup` to clean up any state put in place by the
          :meth:`prepare` method.

        """
        try:
            with self.statsd_timed("pre_cleanup"):
                self.pre_cleanup()
        except Exception:
            msg = "Cleaning up dangling data failed! Continue loading."
            self.log.warning(msg)
            sentry_sdk.capture_exception()

        self._store_origin_visit()

        assert (
            self.visit.visit
        ), "The method `_store_origin_visit` should set the visit (OriginVisit)"
        self.log.info("Load origin '%s' with type '%s'", self.origin.url,
                      self.visit.type)

        try:
            with self.statsd_timed("build_extrinsic_origin_metadata"):
                metadata = self.build_extrinsic_origin_metadata()
            self.load_metadata_objects(metadata)
        except Exception as e:
            sentry_sdk.capture_exception(e)
            # Do not fail the whole task if this is the only failure
            self.log.exception(
                "Failure while loading extrinsic origin metadata.",
                extra={
                    "swh_task_args": [],
                    "swh_task_kwargs": {
                        "origin": self.origin.url,
                        "lister_name": self.lister_name,
                        "lister_instance_name": self.lister_instance_name,
                    },
                },
            )

        total_time_fetch_data = 0.0
        total_time_process_data = 0.0
        total_time_store_data = 0.0

        try:
            # Initially not a success, will be True when actually one
            success = False
            with self.statsd_timed("prepare"):
                self.prepare()

            while True:
                t1 = time.monotonic()
                more_data_to_fetch = self.fetch_data()
                t2 = time.monotonic()
                total_time_fetch_data += t2 - t1

                more_data_to_fetch = self.process_data() and more_data_to_fetch
                t3 = time.monotonic()
                total_time_process_data += t3 - t2

                self.store_data()
                t4 = time.monotonic()
                total_time_store_data += t4 - t3
                if not more_data_to_fetch:
                    break

            self.statsd_timing("fetch_data", total_time_fetch_data * 1000.0)
            self.statsd_timing("process_data",
                               total_time_process_data * 1000.0)
            self.statsd_timing("store_data", total_time_store_data * 1000.0)

            status = self.visit_status()
            visit_status = OriginVisitStatus(
                origin=self.origin.url,
                visit=self.visit.visit,
                type=self.visit_type,
                date=now(),
                status=status,
                snapshot=self.loaded_snapshot_id,
            )
            self.storage.origin_visit_status_add([visit_status])
            success = True
            with self.statsd_timed("post_load",
                                   tags={
                                       "success": success,
                                       "status": status
                                   }):
                self.post_load()
        except BaseException as e:
            success = False
            if isinstance(e, NotFound):
                status = "not_found"
                task_status = "uneventful"
            else:
                status = "partial" if self.loaded_snapshot_id else "failed"
                task_status = "failed"

            self.log.exception(
                "Loading failure, updating to `%s` status",
                status,
                extra={
                    "swh_task_args": [],
                    "swh_task_kwargs": {
                        "origin": self.origin.url,
                        "lister_name": self.lister_name,
                        "lister_instance_name": self.lister_instance_name,
                    },
                },
            )
            if not isinstance(e, (SystemExit, KeyboardInterrupt)):
                sentry_sdk.capture_exception()
            visit_status = OriginVisitStatus(
                origin=self.origin.url,
                visit=self.visit.visit,
                type=self.visit_type,
                date=now(),
                status=status,
                snapshot=self.loaded_snapshot_id,
            )
            self.storage.origin_visit_status_add([visit_status])
            with self.statsd_timed("post_load",
                                   tags={
                                       "success": success,
                                       "status": status
                                   }):
                self.post_load(success=success)
            if not isinstance(e, Exception):
                # e derives from BaseException but not Exception; this is most likely
                # SystemExit or KeyboardInterrupt, so we should re-raise it.
                raise
            return {"status": task_status}
        finally:
            with self.statsd_timed("flush",
                                   tags={
                                       "success": success,
                                       "status": status
                                   }):
                self.flush()
            with self.statsd_timed("cleanup",
                                   tags={
                                       "success": success,
                                       "status": status
                                   }):
                self.cleanup()

        return self.load_status()
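
To make the hook sequence above concrete, here is a minimal, hypothetical
subclass showing which methods a loader author typically overrides. The method
names come from the `load()` docstring and body above; the base class name and
the snapshot bookkeeping are assumptions, not part of the original:

    class SingleFetchLoader(BaseLoader):  # base class name assumed
        visit_type = "my-visit-type"

        def prepare(self):
            # set up any state (connections, temp dirs) used by fetch_data
            self.data = None

        def fetch_data(self) -> bool:
            # fetch one chunk; return True while there is more to fetch
            self.data = b"..."
            return False  # everything fetched in one go

        def store_data(self):
            # write the fetched chunk to self.storage and set
            # self.loaded_snapshot_id once the snapshot is stored
            pass

        def cleanup(self):
            # tear down whatever prepare() put in place
            pass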
Example #27
def test_load_upgrade_from_revision_extids(caplog):
    """Tests that, when loading incrementally based on a snapshot made by an old
    version of the loader, the loader will convert revisions to releases
    and add them to the storage.

    Also checks that, if an extid exists pointing to a non-existent revision
    (which should never happen, but you never know...), the release is loaded from
    scratch."""

    storage = get_storage("memory")

    origin = "http://example.org"
    dir1_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                           object_id=b"d" * 20)
    dir2_swhid = CoreSWHID(object_type=ObjectType.DIRECTORY,
                           object_id=b"e" * 20)

    date = TimestampWithTimezone.from_datetime(
        datetime.datetime.now(tz=datetime.timezone.utc))
    person = Person.from_fullname(b"Jane Doe <*****@*****.**>")

    rev1 = Revision(
        message=b"blah",
        author=person,
        date=date,
        committer=person,
        committer_date=date,
        directory=dir1_swhid.object_id,
        type=RevisionType.TAR,
        synthetic=True,
    )

    rel1 = Release(
        name=b"v1.0",
        message=b"blah\n",
        author=person,
        date=date,
        target=dir1_swhid.object_id,
        target_type=ModelObjectType.DIRECTORY,
        synthetic=True,
    )

    rev1_swhid = rev1.swhid()
    rel1_swhid = rel1.swhid()
    rev2_swhid = CoreSWHID(object_type=ObjectType.REVISION,
                           object_id=b"b" * 20)
    rel2_swhid = CoreSWHID(object_type=ObjectType.RELEASE, object_id=b"c" * 20)

    # Results of a previous load
    storage.extid_add([
        ExtID("extid-type1", b"extid-of-v1.0", rev1_swhid, 0),
        ExtID("extid-type1", b"extid-of-v2.0", rev2_swhid, 0),
    ])
    storage.revision_add([rev1])
    last_snapshot = Snapshot(
        branches={
            b"v1.0":
            SnapshotBranch(target_type=TargetType.REVISION,
                           target=rev1_swhid.object_id),
            b"v2.0":
            SnapshotBranch(target_type=TargetType.REVISION,
                           target=rev2_swhid.object_id),
        })
    storage.snapshot_add([last_snapshot])
    date = datetime.datetime.now(tz=datetime.timezone.utc)
    storage.origin_add([Origin(url=origin)])
    storage.origin_visit_add([
        OriginVisit(origin="http://example.org",
                    visit=1,
                    date=date,
                    type="tar")
    ])
    storage.origin_visit_status_add([
        OriginVisitStatus(
            origin=origin,
            visit=1,
            status="full",
            date=date,
            snapshot=last_snapshot.id,
        )
    ])

    loader = StubPackageLoader(storage, "http://example.org")
    patch.object(
        loader,
        "_load_release",
        return_value=(rel2_swhid.object_id, dir2_swhid.object_id),
        autospec=True,
    ).start()
    patch.object(
        loader,
        "get_versions",
        return_value=["v1.0", "v2.0", "v3.0"],
        autospec=True,
    ).start()

    caplog.set_level(logging.ERROR)

    loader.load()

    assert len(caplog.records) == 1
    (record, ) = caplog.records
    assert record.levelname == "ERROR"
    assert "Failed to upgrade branch branch-v2.0" in record.message

    assert loader._load_release.mock_calls == [
        # v1.0: not loaded because there is already a revision matching it
        # v2.0: loaded, as the revision is missing from the storage even though there
        #       is an extid
        call(StubPackageInfo(origin, "example-v2.0.tar", "v2.0"),
             Origin(url=origin)),
        # v3.0: loaded (did not exist yet)
        call(StubPackageInfo(origin, "example-v3.0.tar", "v3.0"),
             Origin(url=origin)),
    ]

    snapshot = Snapshot(
        branches={
            b"branch-v1.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel1_swhid.object_id),
            b"branch-v2.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel2_swhid.object_id),
            b"branch-v3.0":
            SnapshotBranch(target_type=TargetType.RELEASE,
                           target=rel2_swhid.object_id),
        })
    assert snapshot_get_latest(storage, origin) == snapshot

    extids = storage.extid_get_from_target(
        ObjectType.RELEASE,
        [
            rel1_swhid.object_id,
            rel2_swhid.object_id,
        ],
    )

    assert set(extids) == {
        ExtID("extid-type1", b"extid-of-v1.0", rel1_swhid),
        ExtID("extid-type1", b"extid-of-v2.0", rel2_swhid),
        ExtID("extid-type2", b"extid-of-v3.0", rel2_swhid),
    }
Example #28
    def test_load_incremental_from(
        self,
        parent_snapshot,
        previous_snapshot,
        expected_git_known_refs_percent,
        mocker,
    ):
        """Snapshot of parent origin has all branches, but previous snapshot was
        empty."""
        statsd_report = mocker.patch.object(self.loader.statsd, "_report")

        now = datetime.datetime.now(tz=datetime.timezone.utc)

        self.loader.storage.snapshot_add([parent_snapshot, previous_snapshot])
        self.loader.storage.origin_add(
            [Origin(url=f"base://{self.repo_url}"),
             Origin(url=self.repo_url)])
        self.loader.storage.origin_visit_add([
            OriginVisit(
                origin=f"base://{self.repo_url}",
                visit=42,
                date=now - datetime.timedelta(seconds=-1),
                type="git",
            ),
            OriginVisit(
                origin=self.repo_url,
                visit=42,
                date=now - datetime.timedelta(seconds=-1),
                type="git",
            ),
        ])
        self.loader.storage.origin_visit_status_add([
            OriginVisitStatus(
                origin=f"base://{self.repo_url}",
                visit=42,
                type="git",
                snapshot=parent_snapshot.id,
                date=now,
                status="full",
            ),
            OriginVisitStatus(
                origin=self.repo_url,
                visit=42,
                type="git",
                snapshot=previous_snapshot.id,
                date=now,
                status="full",
            ),
        ])
        self.loader.storage.flush()

        res = self.loader.load()
        assert res == {"status": "eventful"}

        self.fetcher_cls.assert_called_once_with(
            credentials={},
            lister_name="fake-lister",
            lister_instance_name="",
            origin=Origin(url=self.repo_url),
        )
        self.fetcher.get_parent_origins.assert_called_once_with()

        # First tries the same origin
        assert self.loader.storage.origin_visit_get_latest.mock_calls == [
            call(
                self.repo_url,
                allowed_statuses=None,
                require_snapshot=True,
                type=None,
            ),
            # As it does not already have a snapshot, fall back to the parent origin
            call(
                f"base://{self.repo_url}",
                allowed_statuses=None,
                require_snapshot=True,
                type=None,
            ),
        ]

        assert self.loader.statsd.constant_tags == {
            "visit_type": "git",
            "incremental_enabled": True,
            "has_parent_snapshot": True,
            "has_previous_snapshot": True,
            "has_parent_origins": True,
        }
        assert [
            c for c in statsd_report.mock_calls if c[1][0].startswith("git_")
        ] == [
            call("git_total", "c", 1, {}, 1),
            call("git_ignored_refs_percent", "h", 0.0, {}, 1),
            call("git_known_refs_percent", "h",
                 expected_git_known_refs_percent, {}, 1),
        ]
Example #29
    def test_load_incremental(self, mocker):
        statsd_report = mocker.patch.object(self.loader.statsd, "_report")

        snapshot_id = b"\x01" * 20
        now = datetime.datetime.now(tz=datetime.timezone.utc)

        def ovgl(origin_url, allowed_statuses, require_snapshot, type):
            if origin_url == f"base://{self.repo_url}":
                return OriginVisit(origin=origin_url,
                                   visit=42,
                                   date=now,
                                   type="git")
            else:
                return None

        self.loader.storage.origin_visit_get_latest.side_effect = ovgl
        self.loader.storage.origin_visit_status_get_latest.return_value = (
            OriginVisitStatus(
                origin=f"base://{self.repo_url}",
                visit=42,
                snapshot=snapshot_id,
                date=now,
                status="full",
            ))
        self.loader.storage.snapshot_get_branches.return_value = {
            "id": snapshot_id,
            "branches": {
                b"refs/heads/master": SNAPSHOT1.branches[b"refs/heads/master"]
            },
            "next_branch": None,
        }

        res = self.loader.load()
        assert res == {"status": "eventful"}

        self.fetcher_cls.assert_called_once_with(
            credentials={},
            lister_name="fake-lister",
            lister_instance_name="",
            origin=Origin(url=self.repo_url),
        )
        self.fetcher.get_parent_origins.assert_called_once_with()

        # First tries the same origin
        assert self.loader.storage.origin_visit_get_latest.mock_calls == [
            call(
                self.repo_url,
                allowed_statuses=None,
                require_snapshot=True,
                type=None,
            ),
            # As it does not already have a snapshot, fall back to the parent origin
            call(
                f"base://{self.repo_url}",
                allowed_statuses=None,
                require_snapshot=True,
                type=None,
            ),
        ]

        # TODO: assert "incremental*" is added to constant tags before these
        # metrics are sent
        assert [
            c for c in statsd_report.mock_calls if c[1][0].startswith("git_")
        ] == [
            call("git_total", "c", 1, {}, 1),
            call("git_ignored_refs_percent", "h", 0.0, {}, 1),
            call("git_known_refs_percent", "h", 0.25, {}, 1),
        ]
        assert self.loader.statsd.constant_tags == {
            "visit_type": "git",
            "incremental_enabled": True,
            "has_parent_snapshot": True,
            "has_previous_snapshot": False,
            "has_parent_origins": True,
        }

        self.fetcher.reset_mock()
        self.fetcher_cls.reset_mock()
        if sys.version_info >= (3, 9, 0):
            self.loader.storage.reset_mock(return_value=True, side_effect=True)
        else:
            # Reimplement https://github.com/python/cpython/commit/aef7dc89879d099dc704bd8037b8a7686fb72838  # noqa
            # for old Python versions:
            def reset_mock(m):
                m.reset_mock(return_value=True, side_effect=True)
                for child in m._mock_children.values():
                    reset_mock(child)

            reset_mock(self.loader.storage)
        statsd_report.reset_mock()

        # Load again
        res = self.loader.load()
        assert res == {"status": "uneventful"}

        self.fetcher_cls.assert_called_once_with(
            credentials={},
            lister_name="fake-lister",
            lister_instance_name="",
            origin=Origin(url=self.repo_url),
        )
        self.fetcher.get_parent_origins.assert_not_called()

        assert self.loader.storage.origin_visit_get_latest.mock_calls == [
            # Tries the same origin, and finds a snapshot
            call(
                self.repo_url,
                type=None,
                allowed_statuses=None,
                require_snapshot=True,
            ),
            # also fetches the parent, in case the origin was rebased on the parent
            # since the last visit
            call(
                f"base://{self.repo_url}",
                type=None,
                allowed_statuses=None,
                require_snapshot=True,
            ),
        ]

        # TODO: assert "incremental*" is added to constant tags before these
        # metrics are sent
        assert [
            c for c in statsd_report.mock_calls if c[1][0].startswith("git_")
        ] == [
            call("git_total", "c", 1, {}, 1),
            call("git_ignored_refs_percent", "h", 0.0, {}, 1),
            call("git_known_refs_percent", "h", 1.0, {}, 1),
        ]
        assert self.loader.statsd.constant_tags == {
            "visit_type": "git",
            "incremental_enabled": True,
            "has_parent_snapshot":
            False,  # Because we reset the mock since last time
            "has_previous_snapshot": True,
            "has_parent_origins": True,
        }
Example #30
        date=datetime.datetime(2015, 11, 27, 17, 20, 39, tzinfo=UTC),
        visit=2,
        type="hg",
    ),
]

# The origin-visit-status dates need to be shifted slightly into the future from
# their origin-visit date counterparts. Otherwise we hit the storage's "on
# conflict ignore" policy, because origin-visit-add creates an origin-visit-status
# with the same {origin, visit, date} parameters as the origin-visit.
ORIGIN_VISIT_STATUSES = [
    OriginVisitStatus(
        origin=ORIGINS[0].url,
        date=datetime.datetime(2013, 5, 7, 4, 20, 39, 432222, tzinfo=UTC),
        visit=1,
        type="git",
        status="ongoing",
        snapshot=None,
        metadata=None,
    ),
    OriginVisitStatus(
        origin=ORIGINS[1].url,
        date=datetime.datetime(2014, 11, 27, 17, 21, 12, tzinfo=UTC),
        visit=1,
        type="hg",
        status="ongoing",
        snapshot=None,
        metadata=None,
    ),
    OriginVisitStatus(
        origin=ORIGINS[0].url,