def test_journal_client_origin_visit_statuses_same_snapshot_permutation(
        visit_statuses, swh_scheduler):
    """Ensure out of order topic subscription ends up in the same final state"""
    process_journal_objects({"origin_visit_status": visit_statuses},
                            scheduler=swh_scheduler)

    actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([
        ("cavabarder", "hg")
    ])
    visit_stats = actual_origin_visit_stats[0]
    assert_visit_stats_ok(
        visit_stats,
        OriginVisitStats(
            url="cavabarder",
            visit_type="hg",
            last_successful=DATE1 + 2 * ONE_YEAR,
            last_visit=DATE1 + 2 * ONE_YEAR,
            last_visit_status=LastVisitStatus.successful,
            last_snapshot=hash_to_bytes(
                "aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"),
        ),
        ignore_fields=[
            "next_visit_queue_position",
            "next_position_offset",
            "successive_visits",
        ],
    )

    # We ignore out of order messages, so the next_position_offset isn't exact
    # depending on the permutation. What matters is consistency of the final
    # dates (last_visit and last_successful).
    assert 4 <= visit_stats.next_position_offset <= 6
    # same goes for successive_visits
    assert 1 <= visit_stats.successive_visits <= 3
def test_journal_client_origin_visit_status_from_journal_ignored_status(
        swh_scheduler):
    """Only final statuses (full, partial) are important, the rest remain ignored."""
    # Trace method calls on the swh_scheduler
    swh_scheduler = Mock(wraps=swh_scheduler)

    visit_statuses = [
        {
            "origin": "foo",
            "visit": 1,
            "status": "created",
            "date": utcnow(),
            "type": "git",
            "snapshot": None,
        },
        {
            "origin": "bar",
            "visit": 1,
            "status": "ongoing",
            "date": utcnow(),
            "type": "svn",
            "snapshot": None,
        },
    ]

    process_journal_objects({"origin_visit_status": visit_statuses},
                            scheduler=swh_scheduler)

    # All messages have been ignored: no stats have been upserted
    swh_scheduler.origin_visit_stats_upsert.assert_not_called()
def test_journal_client_origin_visit_status_duplicated_messages(swh_scheduler):
    """A duplicated message must be ignored"""
    visit_status = {
        "origin": "foo",
        "visit": 1,
        "status": "full",
        "date": DATE1,
        "type": "git",
        "snapshot": hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"),
    }

    process_journal_objects({"origin_visit_status": [visit_status]},
                            scheduler=swh_scheduler)

    process_journal_objects({"origin_visit_status": [visit_status]},
                            scheduler=swh_scheduler)

    actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("foo",
                                                                       "git")])
    assert_visit_stats_ok(
        actual_origin_visit_stats[0],
        OriginVisitStats(
            url="foo",
            visit_type="git",
            last_successful=DATE1,
            last_visit=DATE1,
            last_visit_status=LastVisitStatus.successful,
            last_snapshot=hash_to_bytes(
                "aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"),
            successive_visits=1,
        ),
    )
def test_journal_client_origin_visit_status_from_journal_last_successful(
        swh_scheduler):
    visit_statuses = [
        {
            "origin": "bar",
            "visit": 1,
            "status": "partial",
            "date": utcnow(),
            "type": "git",
            "snapshot":
            hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
        },
        {
            "origin": "foo",
            "visit": 1,
            "status": "full",
            "date": DATE1,
            "type": "git",
            "snapshot":
            hash_to_bytes("eeecc0710eb6cf9efd5b920a8453e1e07157bfff"),
        },
        {
            "origin": "foo",
            "visit": 2,
            "status": "partial",
            "date": DATE2,
            "type": "git",
            "snapshot":
            hash_to_bytes("aaacc0710eb6cf9efd5b920a8453e1e07157baaa"),
        },
        {
            "origin": "foo",
            "visit": 3,
            "status": "full",
            "date": DATE3,
            "type": "git",
            "snapshot":
            hash_to_bytes("dddcc0710eb6cf9efd5b920a8453e1e07157bddd"),
        },
    ]

    process_journal_objects({"origin_visit_status": visit_statuses},
                            scheduler=swh_scheduler)

    actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("foo",
                                                                       "git")])
    assert_visit_stats_ok(
        actual_origin_visit_stats[0],
        OriginVisitStats(
            url="foo",
            visit_type="git",
            last_successful=DATE3,
            last_visit=DATE3,
            last_visit_status=LastVisitStatus.successful,
            last_snapshot=hash_to_bytes(
                "dddcc0710eb6cf9efd5b920a8453e1e07157bddd"),
            next_position_offset=0,
            successive_visits=3,
        ),
    )
def test_journal_client_origin_visit_status_from_journal_last_failed(
        swh_scheduler):
    visit_statuses = [
        {
            "origin": "foo",
            "visit": 1,
            "status": "partial",
            "date": utcnow(),
            "type": "git",
            "snapshot": None,
        },
        {
            "origin": "bar",
            "visit": 1,
            "status": "full",
            "date": DATE1,
            "type": "git",
            "snapshot": None,
        },
        {
            "origin": "bar",
            "visit": 2,
            "status": "full",
            "date": DATE2,
            "type": "git",
            "snapshot": None,
        },
        {
            "origin": "bar",
            "visit": 3,
            "status": "full",
            "date": DATE3,
            "type": "git",
            "snapshot": None,
        },
    ]

    process_journal_objects({"origin_visit_status": visit_statuses},
                            scheduler=swh_scheduler)

    actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("bar",
                                                                       "git")])
    assert_visit_stats_ok(
        actual_origin_visit_stats[0],
        OriginVisitStats(
            url="bar",
            visit_type="git",
            last_visit=DATE3,
            last_visit_status=LastVisitStatus.failed,
            next_position_offset=6,
            successive_visits=3,
        ),
    )
def test_journal_client_origin_visit_status_from_journal_last_uneventful(
        swh_scheduler):
    visit_status = {
        "origin": "foo",
        "visit": 1,
        "status": "full",
        "date": DATE3 + ONE_DAY,
        "type": "git",
        "snapshot": hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
    }

    # Let's insert some visit stats with some previous visit information
    swh_scheduler.origin_visit_stats_upsert([
        OriginVisitStats(
            url=visit_status["origin"],
            visit_type=visit_status["type"],
            last_successful=DATE2,
            last_visit=DATE3,
            last_visit_status=LastVisitStatus.failed,
            last_snapshot=visit_status["snapshot"],
            next_visit_queue_position=None,
            next_position_offset=4,
            successive_visits=1,
        )
    ])

    process_journal_objects({"origin_visit_status": [visit_status]},
                            scheduler=swh_scheduler)

    actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([
        (visit_status["origin"], visit_status["type"])
    ])

    assert_visit_stats_ok(
        actual_origin_visit_stats[0],
        OriginVisitStats(
            url=visit_status["origin"],
            visit_type=visit_status["type"],
            last_visit=DATE3 + ONE_DAY,
            last_successful=DATE3 + ONE_DAY,
            last_visit_status=LastVisitStatus.successful,
            last_snapshot=visit_status["snapshot"],
            next_visit_queue_position=None,
            next_position_offset=5,
            successive_visits=1,
        ),
    )
def test_journal_client_origin_visit_status_several_upsert(swh_scheduler):
    """An old message updates old information"""
    visit_status1 = {
        "origin": "foo",
        "visit": 1,
        "status": "full",
        "date": DATE1,
        "type": "git",
        "snapshot": hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"),
    }

    visit_status2 = {
        "origin": "foo",
        "visit": 1,
        "status": "full",
        "date": DATE2,
        "type": "git",
        "snapshot": hash_to_bytes("aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"),
    }

    process_journal_objects({"origin_visit_status": [visit_status2]},
                            scheduler=swh_scheduler)

    process_journal_objects({"origin_visit_status": [visit_status1]},
                            scheduler=swh_scheduler)

    actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("foo",
                                                                       "git")])
    assert_visit_stats_ok(
        actual_origin_visit_stats[0],
        OriginVisitStats(
            url="foo",
            visit_type="git",
            last_successful=DATE2,
            last_visit=DATE2,
            last_visit_status=LastVisitStatus.successful,
            last_snapshot=hash_to_bytes(
                "aaaaaabbbeb6cf9efd5b920a8453e1e07157b6cd"),
            next_position_offset=4,
            successive_visits=1,
        ),
    )
Ejemplo n.º 8
0
def scheduler_journal_client_process(
        env: Environment,
        status_queue: Queue) -> Generator[Event, TaskEvent, None]:
    """Scheduler journal client. Every once in a while, pulls
    :class:`OriginVisitStatuses <swh.model.model.OriginVisitStatus>`
    from the status_queue to update the scheduler origin_visit_stats table."""
    BATCH_SIZE = 100

    statuses: List[Dict[str, Any]] = []
    while True:
        task_event = yield status_queue.get()
        statuses.append(task_event.status.to_dict())
        if len(statuses) < BATCH_SIZE:
            continue

        logger.debug("%s journal client: processing %s statuses", env.time,
                     len(statuses))
        process_journal_objects({"origin_visit_status": statuses},
                                scheduler=env.scheduler)

        statuses = []
def test_journal_client_origin_visit_status_after_grab_next_visits(
        swh_scheduler, stored_lister):
    """Ensure OriginVisitStat entries created in the db as a result of calling
    grab_next_visits() do not mess the OriginVisitStats upsert mechanism.

    """

    listed_origins = [
        ListedOrigin(lister_id=stored_lister.id,
                     url=url,
                     visit_type=visit_type) for (url, visit_type) in set(
                         (v["origin"], v["type"]) for v in VISIT_STATUSES_2)
    ]
    swh_scheduler.record_listed_origins(listed_origins)
    before = utcnow()
    swh_scheduler.grab_next_visits(visit_type="git",
                                   count=10,
                                   policy="oldest_scheduled_first")
    after = utcnow()

    assert swh_scheduler.origin_visit_stats_get([("cavabarder", "hg")]) == []
    assert swh_scheduler.origin_visit_stats_get([("cavabarder", "git")
                                                 ])[0] is not None

    process_journal_objects({"origin_visit_status": VISIT_STATUSES_2},
                            scheduler=swh_scheduler)

    for url in ("cavabarder", "iciaussi"):
        ovs = swh_scheduler.origin_visit_stats_get([(url, "git")])[0]
        assert before <= ovs.last_scheduled <= after

        ovs = swh_scheduler.origin_visit_stats_get([(url, "hg")])[0]
        assert ovs.last_scheduled is None

    ovs = swh_scheduler.origin_visit_stats_get([("cavabarder", "git")])[0]
    assert ovs.last_successful == DATE1 + 5 * ONE_DAY
    assert ovs.last_visit == DATE1 + 5 * ONE_DAY
    assert ovs.last_visit_status == LastVisitStatus.successful
    assert ovs.last_snapshot == hash_to_bytes(
        "5555555555555555555555555555555555555555")
def test_journal_client_ignore_missing_type(swh_scheduler):
    """Ignore statuses with missing type key"""
    # Trace method calls on the swh_scheduler
    swh_scheduler = Mock(wraps=swh_scheduler)

    date = utcnow()
    snapshot = hash_to_bytes("dddcc0710eb6cf9efd5b920a8453e1e07157bddd")
    visit_statuses = [
        {
            "origin": "foo",
            "visit": 1,
            "status": "full",
            "date": date,
            "snapshot": snapshot,
        },
    ]

    process_journal_objects({"origin_visit_status": visit_statuses},
                            scheduler=swh_scheduler)

    # The message has been ignored: no stats have been upserted
    swh_scheduler.origin_visit_stats_upsert.assert_not_called()
def test_journal_client_origin_visit_status_from_journal_last_failed2(
        swh_scheduler):
    visit_statuses = [
        {
            "origin": "bar",
            "visit": 2,
            "status": "failed",
            "date": DATE1,
            "type": "git",
            "snapshot":
            hash_to_bytes("d81cc0710eb6cf9efd5b920a8453e1e07157b6cd"),
        },
        {
            "origin": "bar",
            "visit": 3,
            "status": "failed",
            "date": DATE2,
            "type": "git",
            "snapshot": None,
        },
    ]

    process_journal_objects({"origin_visit_status": visit_statuses},
                            scheduler=swh_scheduler)

    actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("bar",
                                                                       "git")])
    assert_visit_stats_ok(
        actual_origin_visit_stats[0],
        OriginVisitStats(
            url="bar",
            visit_type="git",
            last_visit=DATE2,
            last_visit_status=LastVisitStatus.failed,
            next_position_offset=5,
            successive_visits=2,
        ),
    )
def test_disable_failing_origins(swh_scheduler):
    """Origin with too many failed attempts ends up being deactivated in the scheduler."""

    # actually store the origin in the scheduler so we can check it's deactivated in the
    # end.
    lister = swh_scheduler.get_or_create_lister(name="something",
                                                instance_name="something")
    origin = ListedOrigin(url="bar",
                          enabled=True,
                          visit_type="svn",
                          lister_id=lister.id)
    swh_scheduler.record_listed_origins([origin])

    visit_statuses = [
        {
            "origin": "bar",
            "visit": 2,
            "status": "failed",
            "date": DATE1,
            "type": "svn",
            "snapshot": None,
        },
        {
            "origin": "bar",
            "visit": 3,
            "status": "failed",
            "date": DATE2,
            "type": "svn",
            "snapshot": None,
        },
        {
            "origin": "bar",
            "visit": 3,
            "status": "failed",
            "date": DATE3,
            "type": "svn",
            "snapshot": None,
        },
    ]

    process_journal_objects({"origin_visit_status": visit_statuses},
                            scheduler=swh_scheduler)

    actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("bar",
                                                                       "svn")])
    assert_visit_stats_ok(
        actual_origin_visit_stats[0],
        OriginVisitStats(
            url="bar",
            visit_type="svn",
            last_successful=None,
            last_visit=DATE3,
            last_visit_status=LastVisitStatus.failed,
            next_position_offset=6,
            successive_visits=3,
        ),
    )

    # Now check that the origin in question is disabled
    actual_page = swh_scheduler.get_listed_origins(url="bar", enabled=False)

    assert len(actual_page.results) == 1
    assert actual_page.next_page_token is None

    for origin in actual_page.results:
        assert origin.enabled is False
        assert origin.lister_id == lister.id
        assert origin.url == "bar"
        assert origin.visit_type == "svn"
def test_journal_client_origin_visit_status_from_journal_last_not_found(
        swh_scheduler):
    visit_status = {
        "origin": "foo",
        "visit": 1,
        "status": "not_found",
        "date": DATE1,
        "type": "git",
        "snapshot": None,
    }

    process_journal_objects({"origin_visit_status": [visit_status]},
                            scheduler=swh_scheduler)

    actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("foo",
                                                                       "git")])
    assert_visit_stats_ok(
        actual_origin_visit_stats[0],
        OriginVisitStats(
            url="foo",
            visit_type="git",
            last_visit=visit_status["date"],
            last_visit_status=LastVisitStatus.not_found,
            next_position_offset=4,
            successive_visits=1,
        ),
    )

    visit_statuses = [
        {
            "origin": "foo",
            "visit": 3,
            "status": "not_found",
            "date": DATE2,
            "type": "git",
            "snapshot": None,
        },
        {
            "origin": "foo",
            "visit": 4,
            "status": "not_found",
            "date": DATE3,
            "type": "git",
            "snapshot": None,
        },
    ]

    process_journal_objects({"origin_visit_status": visit_statuses},
                            scheduler=swh_scheduler)

    actual_origin_visit_stats = swh_scheduler.origin_visit_stats_get([("foo",
                                                                       "git")])
    assert_visit_stats_ok(
        actual_origin_visit_stats[0],
        OriginVisitStats(
            url="foo",
            visit_type="git",
            last_visit=DATE3,
            last_visit_status=LastVisitStatus.not_found,
            next_position_offset=6,
            successive_visits=3,
        ),
    )