def test_check_success_and_available():
    """check_availability() yields the expected snapshot URL and matching metadata.

    The availability endpoint is mocked with the ``available-true`` fixture.
    """
    target_url = "www.whitehouse.gov/issues/immigration/"
    mock_body = FIXTURES_DIR.joinpath("check/available-true.json").read_text()
    expected_snap_url = "http://web.archive.org/web/20200903230055/https://www.whitehouse.gov/issues/immigration/"

    responses.add("GET", wb.url_for_availability(target_url), body=mock_body)

    answer, meta = wb.check_availability(target_url)

    # The returned URL and the metadata object must agree with each other.
    assert answer == expected_snap_url
    assert meta.snapshot_url == expected_snap_url
    assert meta.target_url == target_url
    assert meta.is_success() is True
    assert meta.was_new_snapshot_created() is False
    assert meta.created_at.strftime("%Y-%m-%d %H:%M:%S%z") == "2020-09-01 14:30:55+0000"

    # The serialized form carries both the request info and the raw payload.
    serialized = meta.to_dict()
    assert serialized["request_meta"]["target_url"] == target_url
    # TODO: not sure if this is guaranteed, if target_url ends up being redirected??
    assert serialized["server_payload"]["url"] == target_url

    closest = serialized["server_payload"]["archived_snapshots"]["closest"]
    assert closest["available"] is True
    assert closest["status"] == "200"
    assert closest["url"] == answer
def test_check_not_available():
    """With no flags and no available snapshot, the CLI prints only a newline.

    The availability endpoint is mocked with the ``available-false`` fixture.
    """
    target_url = "http://danwin.com/is/poop"
    datatext = FIXTURES_DIR.joinpath("check/available-false.json").read_text()
    responses.add("GET", wb.url_for_availability(target_url), body=datatext, status=200)

    result = runner.invoke(checkcli, [target_url])

    assert result.output == "\n"
def test_check():
    """With no flags, the CLI prints just the closest snapshot URL and exits 0."""
    target_url = "www.whitehouse.gov/issues/immigration/"
    fixture_text = FIXTURES_DIR.joinpath("check/available-true.json").read_text()
    fixture = jsonlib.loads(fixture_text)
    responses.add(
        "GET",
        wb.url_for_availability(target_url),
        body=fixture_text,
        status=200,
    )

    cli_result = runner.invoke(checkcli, [target_url])

    # Expected output is the fixture's closest snapshot URL plus a trailing newline.
    expected_output = fixture["archived_snapshots"]["closest"]["url"] + "\n"
    assert cli_result.output == expected_output
    assert cli_result.exit_code == 0
def test_check_not_available_w_json():
    """With ``-j`` and no available snapshot, the CLI emits JSON with no snapshot URL.

    The JSON output must echo the target URL, carry a falsy ``snapshot_url``,
    and expose the server's empty ``archived_snapshots`` mapping.
    """
    target_url = "http://danwin.com/is/poop"
    datatext = FIXTURES_DIR.joinpath("check/available-false.json").read_text()
    responses.add("GET", wb.url_for_availability(target_url), body=datatext, status=200)

    result = runner.invoke(checkcli, [target_url, "-j"])

    jd = jsonlib.loads(result.output)
    assert jd["request_meta"]["target_url"] == target_url
    assert not jd["snapshot_url"]
    assert jd["server_payload"]["archived_snapshots"] == {}
def test_check_w_json():
    """With ``-j`` and an available snapshot, the CLI emits JSON describing it.

    The top-level ``snapshot_url`` must match the URL of the closest snapshot
    in the embedded server payload.
    """
    target_url = "www.whitehouse.gov/issues/immigration/"
    datatext = FIXTURES_DIR.joinpath("check/available-true.json").read_text()
    responses.add("GET", wb.url_for_availability(target_url), body=datatext, status=200)

    result = runner.invoke(checkcli, [target_url, "-j"])

    jd = jsonlib.loads(result.output)
    assert jd["request_meta"]["target_url"] == target_url

    ad = jd["server_payload"]["archived_snapshots"]["closest"]
    assert jd["snapshot_url"] == ad["url"]
    assert ad["available"] is True
def test_check_success_but_not_available():
    """check_availability() returns None when the payload lists no snapshots.

    The availability endpoint is mocked with the ``available-false`` fixture.
    """
    target_url = "http://danwin.com/is/poop"
    mock_body = FIXTURES_DIR.joinpath("check/available-false.json").read_text()
    responses.add("GET", wb.url_for_availability(target_url), body=mock_body)

    answer, meta = wb.check_availability(target_url)

    # Neither the return value nor the metadata carries a snapshot URL.
    assert answer is None
    assert meta.snapshot_url is None
    assert meta.request_meta["target_url"] == target_url

    payload = meta.server_payload
    # TODO: not sure if this is guaranteed, if target_url ends up being redirected??
    assert payload["url"] == target_url
    assert payload["archived_snapshots"] == {}
def test_save_unless_within_hours():
    """snapshot(within_hours=24) reports no new snapshot when one is available.

    Only the availability endpoint is mocked; the test checks that the
    returned metadata records ``within_hours`` and the raw payload.
    """
    target_url = "http://example.com/foo"
    # NOTE(review): "timestamp" (20151120...) and the date embedded in "url"
    # (20151118...) differ -- unclear whether that is intentional; confirm.
    closest_snapshot = {
        "available": True,
        "status": "200",
        "timestamp": "20151120111111",
        "url": "http://web.archive.org/web/20151118111111/http://example.com/foo",
    }
    payload = {
        "url": "http://example.com/foo",
        "archived_snapshots": {"closest": closest_snapshot},
    }
    mock_body = jsonlib.dumps(payload)
    responses.add("GET", wb.url_for_availability(target_url), body=mock_body)

    answer, meta = wb.snapshot(target_url, within_hours=24)

    assert answer == meta.snapshot_url
    assert meta.request_meta["within_hours"] == 24
    assert meta.was_new_snapshot_created() is False
    # Brittle: compares the entire server payload for exact equality.
    assert meta.server_payload == payload
def test_snapshot_too_many_for_period():
    """snapshot() surfaces the "too many captures" message from the save page.

    The save-page POST and the availability GET are both mocked from the
    ``job-save-too-many-today`` fixture directory. The test expects no new
    snapshot to be reported, with the server payload taken from the
    availability response.
    """
    srcdir = FIXTURES_DIR.joinpath("job-save-too-many-today")
    submit_resptext = srcdir.joinpath("submit-response.html").read_text()
    target_url = "https://nytimes.com/"

    # Mock the save-page submission; the matcher pins the expected form body.
    responses.add(
        "POST",
        wb.url_for_savepage(target_url),
        body=submit_resptext,
        status=200,
        match=[
            responses.urlencoded_params_matcher(
                {"url": target_url, "capture_all": "on"}
            )
        ],
    )
    # Mock the request for the availability URL.
    responses.add(
        "GET",
        wb.url_for_availability(target_url),
        body=srcdir.joinpath("check-availability.json").read_text(),
    )

    answer, meta = wb.snapshot(target_url)

    assert answer == meta.snapshot_url
    assert meta.subcommand == "snapshot"
    # `is False` (not `== False`) to match the other tests of this method.
    assert meta.was_new_snapshot_created() is False
    assert (
        meta.too_many_during_period()
        == """This URL has been already captured 10 times today. Please email us at "*****@*****.**" if you would like to discuss this more."""
    )

    # The server payload is the one returned by the availability API response.
    assert meta.server_payload["archived_snapshots"]["closest"]["available"] is True