Python url_for_savepage Examples, pgark.archivers.wayback.url_for_savepage Python Examples

Example #1

0

Show file

File: foofixtures.py Project: dannguyen/pgark

def too_soon_response(too_soon_urls):
    """Yield a ``responses.RequestsMock`` primed with the "too soon" fixtures.

    Two mocked endpoints are registered, both served from the
    ``job-save-too-soon`` fixture directory:

    * the save-page POST for the first URL in ``too_soon_urls``, which must
      carry the form fields ``url`` and ``capture_all=on``;
    * the job-status GET for the job id found in the submit response.

    The mock context is still open while the consumer holds the yielded
    object; it closes once the generator is resumed.
    """
    url = too_soon_urls[0]

    fixture_dir = FIXTURES_DIR.joinpath("job-save-too-soon")
    submit_html = fixture_dir.joinpath("submit-response.html").read_text()

    with responses.RequestsMock() as mock:
        # Submission endpoint: only matches when the expected form is posted.
        save_url = wb.url_for_savepage(url)
        form_matcher = responses.urlencoded_params_matcher(
            {"url": url, "capture_all": "on"}
        )
        mock.add("POST", save_url, body=submit_html, status=200, match=[form_matcher])

        # Status endpoint: a single canned reply.
        status_url = wb.url_for_jobstatus(wb.extract_job_id(submit_html))
        status_json = fixture_dir.joinpath("status-0.json").read_text()
        mock.add("GET", status_url, body=status_json, status=200)

        yield mock

Example #2

0

Show file

File: test_wayback_subcommands.py Project: dannguyen/pgark

def test_snapshot_too_soon():
    """``wb.snapshot`` surfaces the server's "too soon" notice.

    The submit and job-status endpoints are mocked from the
    ``job-save-too-soon`` fixtures; the returned metadata must report that no
    new snapshot was created and expose the server's message verbatim.
    """
    fixture_dir = FIXTURES_DIR.joinpath("job-save-too-soon")
    target_url = "https://plainlanguage.gov/"
    expected_notice = "The same snapshot had been made 4 minutes and 18 seconds ago. We only allow new captures of the same URL every 20 minutes."

    submit_html = fixture_dir.joinpath("submit-response.html").read_text()

    # Mock the save-page submission; it only matches the expected form body.
    save_url = wb.url_for_savepage(target_url)
    form_matcher = responses.urlencoded_params_matcher(
        {"url": target_url, "capture_all": "on"}
    )
    responses.add("POST", save_url, body=submit_html, status=200, match=[form_matcher])

    # Mock the job-status lookup for the job id embedded in the submit page.
    status_url = wb.url_for_jobstatus(wb.extract_job_id(submit_html))
    status_json = fixture_dir.joinpath("status-0.json").read_text()
    responses.add("GET", status_url, body=status_json, status=200)

    answer, meta = wb.snapshot(target_url)

    assert answer == meta.snapshot_url
    assert meta.subcommand == "snapshot"
    assert meta.was_new_snapshot_created() is False
    assert meta.too_soon() == expected_notice

Example #3

0

Show file

File: foofixtures.py Project: dannguyen/pgark

def save_success_response(success_urls):
    """Yield a ``responses.RequestsMock`` primed with the "save success" fixtures.

    Registers, from the ``job-save-success`` fixture directory:

    * the save-page POST for the first URL in ``success_urls``, matched only
      when the form fields ``url`` and ``capture_all=on`` are posted;
    * a job-status GET callback that serves ``status-0.json``,
      ``status-1.json``, ``status-9.json`` and ``status-10.json`` in that
      order, one per request.

    A fifth status request makes the callback raise ``StopIteration``,
    because the underlying iterator is exhausted.
    """
    srcdir = FIXTURES_DIR.joinpath("job-save-success")
    target_url = success_urls[0]

    submit_resptext = srcdir.joinpath("submit-response.html").read_text()
    # The status URL is built from the job id parsed out of the submit
    # response, not from the raw HTML itself.
    expected_job_url = wb.url_for_jobstatus(wb.extract_job_id(submit_resptext))

    # One-shot iterator: each status poll consumes the next fixture file.
    status_paths = iter(
        [
            srcdir.joinpath("status-0.json"),
            srcdir.joinpath("status-1.json"),
            srcdir.joinpath("status-9.json"),
            srcdir.joinpath("status-10.json"),
        ]
    )

    with responses.RequestsMock() as rsps:
        rsps.add(
            "POST",
            wb.url_for_savepage(target_url),
            body=submit_resptext,
            status=200,
            match=[
                responses.urlencoded_params_matcher(
                    {"url": target_url, "capture_all": "on"}
                )
            ],
        )

        rsps.add_callback(
            "GET",
            expected_job_url,
            callback=lambda req: (
                200,
                {},
                next(status_paths).read_text(),
            ),  # 2nd arg is a headers dict
        )

        yield rsps

Example #4

0

Show file

File: test_wayback_subcommands.py Project: dannguyen/pgark

def test_snapshot_submit_request(session):
    """``wb.submit_snapshot_request`` posts the save form and returns the page.

    The save endpoint is mocked with the ``job-save-success`` submit page; the
    response text must contain the "Saving page" heading for the target URL.
    """
    target_url = "https://plainlanguage.gov/"
    save_url = wb.url_for_savepage(target_url)
    submit_html = FIXTURES_DIR.joinpath(
        "job-save-success/submit-response.html"
    ).read_text()

    # The mock only answers when the expected form fields are posted.
    form_matcher = responses.urlencoded_params_matcher(
        {"url": target_url, "capture_all": "on"}
    )
    responses.add("POST", save_url, body=submit_html, status=200, match=[form_matcher])

    resp = wb.submit_snapshot_request(session, target_url, headers={})

    expected_heading = f'<h2 id="spn-title">Saving page {target_url}</h2>'
    assert expected_heading in resp.text

Example #5

0

Show file

File: test_wayback_subcommands.py Project: dannguyen/pgark

def test_snapshot_too_many_for_period():
    """``wb.snapshot`` reports the "captured too many times today" notice.

    The save endpoint and the availability endpoint are mocked from the
    ``job-save-too-many-today`` fixtures. The returned metadata must report
    that no new snapshot was created, expose the server's message verbatim,
    and carry the availability response as its server payload.
    """
    srcdir = FIXTURES_DIR.joinpath("job-save-too-many-today")
    submit_resptext = srcdir.joinpath("submit-response.html").read_text()
    target_url = "https://nytimes.com/"

    responses.add(
        "POST",
        wb.url_for_savepage(target_url),
        body=submit_resptext,
        status=200,
        match=[
            responses.urlencoded_params_matcher(
                {"url": target_url, "capture_all": "on"}
            )
        ],
    )
    # mock request for availability URL
    responses.add(
        "GET",
        wb.url_for_availability(target_url),
        body=srcdir.joinpath("check-availability.json").read_text(),
    )

    answer, meta = wb.snapshot(target_url)

    assert answer == meta.snapshot_url
    assert meta.subcommand == "snapshot"
    # Identity check: the result must be the bool False, not merely falsy.
    assert meta.was_new_snapshot_created() is False
    assert (
        meta.too_many_during_period()
        == """This URL has been already captured 10 times today. Please email us at "*****@*****.**" if you would like to discuss this more."""
    )

    # server payload is the payload returned by availability API response
    assert meta.server_payload["archived_snapshots"]["closest"]["available"] is True

Example #6

0

Show file

File: test_wayback_subcommands.py Project: dannguyen/pgark

def test_snapshot_submit_request_not_ok(session):
    """``wb.submit_snapshot_request`` raises ``ServerStatusError`` on a 503.

    The save endpoint is mocked to answer with HTTP 503 (it is unclear when
    the real server would do this; possibly when it is down). The raised
    error's first argument must name the status code and the save URL.
    """
    target_url = "https://plainlanguage.gov/"
    save_url = wb.url_for_savepage(target_url)
    resptext = FIXTURES_DIR.joinpath(
        "job-save-success/submit-response.html"
    ).read_text()
    responses.add(
        "POST",
        save_url,
        body=resptext,
        status=503,
        match=[
            responses.urlencoded_params_matcher(
                {"url": target_url, "capture_all": "on"}
            )
        ],
    )

    # The call is expected to raise, so its return value is never bound.
    with pytest.raises(ServerStatusError) as err:
        wb.submit_snapshot_request(session, target_url, headers={})
    assert (
        f"Server status was NOT OK; returned 503 for: {save_url}" in err.value.args[0]
    )

Example #7

0

Show file

File: test_wayback_helpers.py Project: dannguyen/pgark

def test_url_for_savepage():
    """``wb.url_for_savepage`` prefixes the target with the save endpoint."""
    target = "https://example.com/foo"
    save_url = wb.url_for_savepage(target)
    assert save_url == "http://web.archive.org/save/https://example.com/foo"

Example #8

0

Show file

File: test_wayback_subcommands.py Project: dannguyen/pgark

def test_snapshot_successful(success_status_paths):
    """Drive ``wb.snapshot`` through a successful save against mocked endpoints.

    ``success_status_paths`` is an iterator of fixture paths; each job-status
    poll is answered with the text of the next one. The test checks that the
    iterator is fully consumed, then inspects the returned snapshot URL and
    the metadata (both the object and its ``to_dict()`` form).
    """
    # --- fixture setup (todo: refactor?) ---
    fixture_dir = FIXTURES_DIR.joinpath("job-save-success")

    target_url = "https://plainlanguage.gov/"
    save_url = wb.url_for_savepage(target_url)

    submit_html = fixture_dir.joinpath("submit-response.html").read_text()
    job_id = wb.extract_job_id(submit_html)
    job_status_url = wb.url_for_jobstatus(job_id)

    # --- mocked endpoints ---
    form_matcher = responses.urlencoded_params_matcher(
        {"url": target_url, "capture_all": "on"}
    )
    responses.add("POST", save_url, body=submit_html, status=200, match=[form_matcher])

    def serve_next_status(request):
        # Callback contract: (status code, headers dict, body text).
        return 200, {}, next(success_status_paths).read_text()

    responses.add_callback("GET", job_status_url, callback=serve_next_status)

    answer, meta = wb.snapshot(target_url, user_agent="guy incognito", poll_interval=0)

    # snapshot() must have polled until every prepared status file was served
    assert next(success_status_paths, False) is False

    # --- return values ---
    assert type(answer) is str
    assert type(meta) is wb.TaskMeta
    assert meta.subcommand == "snapshot"
    assert meta.target_url == target_url
    created_stamp = meta.created_at.strftime("%Y-%m-%d %H:%M:%S%z")
    assert created_stamp == "2020-09-01 14:30:55+0000"

    data = meta.to_dict()

    # the answer is the snapshot URL assembled from the server's timestamp
    expected_snapshot_url = (
        wb.BASE_DOMAIN + "/web/" + data["server_payload"]["timestamp"] + "/" + target_url
    )
    assert answer == expected_snapshot_url

    # --- serialized metadata ---
    assert data["subcommand"] == "snapshot"
    assert data["was_new_snapshot_created"] is True
    assert data["snapshot_url"] == answer
    assert data["request_meta"]["user_agent"] == "guy incognito"

    reported_issues = data["issues"]
    assert reported_issues["too_soon"] is False
    assert reported_issues["too_many_during_period"] is False

    payload = data["server_payload"]
    assert payload["status"] == "success"
    assert payload["timestamp"] in data["snapshot_url"]

    # not sure if this is always the case...what happens if there's a redirect?
    assert payload["original_url"] == target_url