def test_other_links():
    """Check the non-anchor URLs that Page extracts from other_links.html.

    The mocked reply is a 301 with a Location header, so the redirection URL is
    verified along with frames, scripts, images, the favicon and the JS / HTML
    redirections.
    """
    url = "http://perdu.com/"
    with open("tests/data/other_links.html") as fixture:
        html = fixture.read()

    responses.add(
        responses.GET,
        url,
        body=html,
        adding_headers={"Location": "https://perdu.com/login"},
        status=301,
    )
    response = requests.get(url, allow_redirects=False)
    page = Page(response)

    expected_frames = [
        "http://perdu.com/frame1.html",
        "http://perdu.com/frame2.html",
        "http://perdu.com/iframe.html",
    ]
    expected_images = {
        "http://perdu.com/img/logo.png",
        "http://perdu.com/img/header.png",
        "http://perdu.com/img/ads.php?id=5878545",
    }

    assert sorted(page.iter_frames()) == expected_frames
    assert page.scripts == ["http://perdu.com/script.js"]
    assert page.redirection_url == "https://perdu.com/login"
    assert set(page.images_urls) == expected_images
    assert page.js_redirections == ["http://perdu.com/maintenance.html"]
    assert page.favicon_url == "http://perdu.com/favicon.ico"
    assert page.html_redirections == ["http://perdu.com/adblock.html"]
def test_valid_content_type():
    """valid_xss_content_type() accepts a text/html page and rejects an image/png one."""
    html_url = "http://perdu.com/"
    responses.add(
        responses.GET,
        html_url,
        status=200,
        adding_headers={"Content-Type": "text/html"},
    )
    html_page = Page(requests.get(html_url))
    assert valid_xss_content_type(html_page)

    image_url = "http://perdu.com/picture.png"
    responses.add(
        responses.GET,
        image_url,
        status=200,
        adding_headers={"Content-Type": "image/png"},
    )
    image_page = Page(requests.get(image_url))
    assert not valid_xss_content_type(image_page)
def test_persister_forms():
    """Store the forms of forms.html in a SqlitePersister and read them back.

    The forms extracted from the page must survive the round trip unchanged,
    including file upload parameters.
    """
    url = "http://perdu.com/"
    db_path = "/tmp/crawl.db"

    with open("tests/data/forms.html") as fixture:
        html = fixture.read()

    responses.add(responses.GET, url, body=html)
    response = requests.get(url, allow_redirects=False)
    page = Page(response)
    forms = list(page.iter_forms())

    # Remove any database left over from a previous run.
    try:
        os.unlink(db_path)
    except FileNotFoundError:
        pass

    persister = SqlitePersister(db_path)
    persister.set_root_url("http://httpbin.org/")
    persister.set_to_browse(forms)
    assert persister.count_paths() == 9

    extracted_forms = list(persister.get_to_browse())
    assert len(extracted_forms) == 9
    assert set(forms) == set(extracted_forms)

    for restored in extracted_forms:
        path = restored.file_path
        if path == "/upload.php":
            expected_upload = ["file", ["pix.gif", "GIF89a", "image/gif"]]
            assert restored.file_params[0] == expected_upload
        elif path == "/fields.php":
            assert ["file", "pix.gif"] in restored.post_params
async def test_persister_forms():
    """Store the forms of forms.html in an SqlPersister and read them back.

    The forms extracted from the page must survive the round trip through the
    asynchronous persister unchanged, including file upload parameters.
    """
    url = "http://perdu.com/"
    db_path = "/tmp/crawl.db"

    with open("tests/data/forms.html") as fixture:
        html = fixture.read()

    respx.get(url).mock(return_value=httpx.Response(200, text=html))
    # httpx spells this flag `follow_redirects`; `allow_redirects` is the
    # requests keyword and is not the one used by the other httpx tests here.
    response = httpx.get(url, follow_redirects=False)
    page = Page(response)
    forms = list(page.iter_forms())

    # Remove any database left over from a previous run.
    try:
        os.unlink(db_path)
    except FileNotFoundError:
        pass

    persister = SqlPersister(db_path)
    await persister.create()
    await persister.set_root_url("http://httpbin.org/")
    await persister.set_to_browse(forms)
    assert await persister.count_paths() == 9

    extracted_forms = [form async for form in persister.get_to_browse()]
    assert len(extracted_forms) == 9
    assert set(forms) == set(extracted_forms)

    for form in extracted_forms:
        if form.file_path == "/upload.php":
            assert form.file_params[0] == [
                "file",
                ("pix.gif", "GIF89a", "image/gif"),
            ]
        elif form.file_path == "/fields.php":
            assert ["file", "pix.gif"] in form.post_params
def test_domain_scope():
    """Check Page.is_external_to_domain() for a page served from perdu.com."""
    url = "http://perdu.com/"
    responses.add(responses.GET, url, body="Hello world!")
    page = Page(requests.get(url))

    # (candidate URL, whether it must be reported as external)
    cases = [
        ("http://yolo.tld", True),
        ("http://www.google.com/", True),
        ("http://jesuisperdu.com/", True),
        ("http://perdu.com/robots.txt", False),
        ("http://www.perdu.com/blog/", False),
        ("https://perdu.com/blog/", False),
        ("http://perdu.com:80/blog/", False),
        ("http://perdu.com.org/blog/", True),
    ]
    for candidate, should_be_external in cases:
        assert bool(page.is_external_to_domain(candidate)) == should_be_external
def test_domain_scope():
    """Check Page.is_external_to_domain() for a page served from perdu.com."""
    url = "http://perdu.com/"
    mocked = httpx.Response(200, text="Hello world!")
    respx.get(url).mock(return_value=mocked)
    page = Page(httpx.get(url))

    # (candidate URL, whether it must be reported as external)
    cases = [
        ("http://yolo.tld", True),
        ("http://www.google.com/", True),
        ("http://jesuisperdu.com/", True),
        ("http://perdu.com/robots.txt", False),
        ("http://www.perdu.com/blog/", False),
        ("https://perdu.com/blog/", False),
        ("http://perdu.com:80/blog/", False),
        ("http://perdu.com.org/blog/", True),
    ]
    for candidate, should_be_external in cases:
        assert bool(page.is_external_to_domain(candidate)) == should_be_external
def test_csp_detection():
    """Exercise has_csp() with no CSP, a CSP response header and a CSP meta tag."""
    # 1. Plain HTML response: no policy anywhere.
    plain_url = "http://perdu.com/"
    responses.add(
        responses.GET,
        plain_url,
        status=200,
        adding_headers={"Content-Type": "text/html"},
    )
    plain_page = Page(requests.get(plain_url))
    assert not has_csp(plain_page)

    # 2. Policy delivered through the Content-Security-Policy header.
    header_url = "http://perdu.com/http_csp"
    responses.add(
        responses.GET,
        header_url,
        status=200,
        adding_headers={
            "Content-Type": "text/html",
            "Content-Security-Policy": "blahblah;",
        },
    )
    header_page = Page(requests.get(header_url))
    assert has_csp(header_page)

    # 3. Policy delivered through a <meta http-equiv> tag in the body.
    meta_url = "http://perdu.com/meta_csp"
    meta_html = (
        '<html> <head> '
        '<meta http-equiv="Content-Security-Policy" '
        """content="default-src 'self'; img-src https://*; child-src 'none';"> """
        '</head> <body>Hello there</body> </html>'
    )
    responses.add(
        responses.GET,
        meta_url,
        status=200,
        adding_headers={"Content-Type": "text/html"},
        body=meta_html,
    )
    meta_page = Page(requests.get(meta_url))
    assert has_csp(meta_page)
def test_http():
    """Check the HTTP-level attributes Page exposes for a mocked 418 response."""
    url = "http://perdu.com/"
    mocked_headers = {
        "X-Men": "Wolverine",
        "Server": "nginx",
        "Set-Cookie": "session_id=31337;",
        "Content-Type": "text/html",
    }
    responses.add(
        responses.GET,
        url,
        body="Hello world!",
        adding_headers=mocked_headers,
        status=418,
    )
    response = requests.get(url)
    page = Page(response)

    # Status line, headers and cookies
    assert page.status == 418
    assert page.headers["X-Men"] == "Wolverine"
    assert page.url == "http://perdu.com/"
    assert page.server == "nginx"
    assert page.cookies["session_id"] == "31337"

    # Body related attributes
    assert page.is_plain
    assert page.size == page.raw_size != 0
    assert page.delay > 0
    assert isinstance(page.bytes, bytes)
    assert len(page.bytes)

    # Content type and charset
    assert page.type == "text/html"
    assert page.encoding == "ISO-8859-1"
def test_base_relative_links():
    """Check the links Page resolves from the base_relative_links.html fixture."""
    url = "http://perdu.com/"
    with open("tests/data/base_relative_links.html") as fixture:
        html = fixture.read()

    respx.get(url).mock(return_value=httpx.Response(200, text=html))
    page = Page(httpx.get(url))

    expected_links = {
        url,
        "http://perdu.com/blog/file.html",
        "http://perdu.com/blog/resource",
        "http://perdu.com/blog/folder/",
        "http://perdu.com/blog/folder/file.html",
        "http://perdu.com/blog/folder/file2.html",
        "http://perdu.com/folder/file2.html",
        "http://perdu.com/",
        "http://perdu.com/blog/",
        "http://perdu.com/blog/file3.html",
        "http://perdu.com/blog/?k=v",
        "http://perdu.com/blog/?k=v2",
        "http://perdu.com/blog/file3.html?k=v",
        "http://perdu.com/blog/folder/?k=v",
        "http://perdu.com/blog/folder?k=v",
        "http://external.tld/",
        "http://external.tld/yolo?k=v",
    }
    assert set(page.links) == expected_links
def test_base_extra_links():
    """Check Page.extra_urls for the base_extra_links.html fixture."""
    url = "http://perdu.com/"
    with open("tests/data/base_extra_links.html") as fixture:
        html = fixture.read()

    responses.add(responses.GET, url, body=html)
    response = requests.get(url, allow_redirects=False)
    page = Page(response)

    expected_urls = {
        "http://perdu.com/blog/",  # extracted from base href
        "http://perdu.com/blog/planets.gif",
        "http://perdu.com/blog/sun.html",
        "http://perdu.com/blog/mercur.html",
        "http://perdu.com/blog/venus.html",
        "http://perdu.com/blog/link.html",
        "http://perdu.com/blog/audio.html",
        "http://perdu.com/blog/embed.html",
        "http://perdu.com/blog/horse.ogg",
        "http://perdu.com/blog/horse.mp3",
        "http://perdu.com/blog/video.html",
        "http://perdu.com/blog/subtitles_en.vtt",
        "http://perdu.com/blog/dopequote.html",
        "http://perdu.com/blog/del.html",
        "http://perdu.com/blog/ins.html",
        "http://perdu.com/blog/q.html",
        "http://perdu.com/blog/data.html",
        "http://perdu.com/blog/high-def.jpg",
        "http://perdu.com/blog/low-def.jpg",
        "http://perdu.com/blog/img_orange_flowers.jpg",
    }
    assert set(page.extra_urls) == expected_urls
def test_extra_links():
    """Check Page.extra_urls for the extra_links.html fixture."""
    url = "http://perdu.com/"
    with open("tests/data/extra_links.html") as fixture:
        html = fixture.read()

    responses.add(responses.GET, url, body=html)
    response = requests.get(url, allow_redirects=False)
    page = Page(response)

    expected_urls = {
        "http://perdu.com/planets.gif",
        "http://perdu.com/sun.html",
        "http://perdu.com/mercur.html",
        "http://perdu.com/venus.html",
        "http://perdu.com/link.html",
        "http://perdu.com/audio.html",
        "http://perdu.com/embed.html",
        "http://perdu.com/horse.ogg",
        "http://perdu.com/horse.mp3",
        "http://perdu.com/video.html",
        "http://perdu.com/subtitles_en.vtt",
        "http://perdu.com/dopequote.html",
        "http://perdu.com/del.html",
        "http://perdu.com/ins.html",
        "http://perdu.com/q.html",
        "http://perdu.com/data.html",
        "http://perdu.com/high-def.jpg",
        "http://perdu.com/low-def.jpg",
        "http://perdu.com/img_orange_flowers.jpg",
        "http://perdu.com/style.css?should_not_be_crawled",
        "http://perdu.com/yolo.js?v=53",
    }
    assert set(page.extra_urls) == expected_urls
def test_http():
    """Check the HTTP-level attributes Page exposes for a mocked 418 response."""
    url = "http://perdu.com/"
    mocked_headers = {
        "X-Men": "Wolverine",
        "Server": "nginx",
        "Set-Cookie": "session_id=31337;",
        "Content-Type": "text/html; charset=ISO-8859-1",
    }
    mocked = httpx.Response(418, headers=mocked_headers, text="Hello world!")
    respx.get(url).mock(return_value=mocked)

    response = httpx.get(url)
    page = Page(response)

    # Status line, headers and cookies
    assert page.status == 418
    assert page.headers["X-Men"] == "Wolverine"
    assert page.url == "http://perdu.com/"
    assert page.server == "nginx"
    assert page.cookies["session_id"] == "31337"

    # Body related attributes
    assert page.is_plain
    assert page.size == page.raw_size != 0
    assert page.delay > 0
    assert isinstance(page.bytes, bytes) and page.bytes

    # Content type and charset
    assert page.type == "text/html; charset=iso-8859-1"
    # see https://github.com/encode/httpx/pull/1269
    assert page.encoding == "ISO-8859-1"
def test_valid_content_type():
    """valid_xss_content_type() accepts a text/html page and rejects an image/png one."""
    html_url = "http://perdu.com/"
    html_reply = httpx.Response(200, headers={"Content-Type": "text/html"})
    respx.get(html_url).mock(return_value=html_reply)
    html_page = Page(httpx.get(html_url))
    assert valid_xss_content_type(html_page)

    image_url = "http://perdu.com/picture.png"
    image_reply = httpx.Response(200, headers={"Content-Type": "image/png"})
    respx.get(image_url).mock(return_value=image_reply)
    image_page = Page(httpx.get(image_url))
    assert not valid_xss_content_type(image_page)
def test_relative_links():
    """Check the links Page resolves from the relative_links.html fixture."""
    url = "http://perdu.com/"
    with open("tests/data/relative_links.html") as fixture:
        html = fixture.read()

    responses.add(responses.GET, url, body=html)
    page = Page(requests.get(url))

    expected_links = {
        url,
        "http://perdu.com/file.html",
        "http://perdu.com/resource",
        "http://perdu.com/folder/",
        "http://perdu.com/folder/file.html",
        "http://perdu.com/folder/file2.html",
        "http://perdu.com/file3.html",
        "http://perdu.com/?k=v",
        "http://perdu.com/file3.html?k=v",
        "http://perdu.com/folder/?k=v",
        "http://perdu.com/folder?k=v",
        "http://external.tld/",
        "http://external.tld/yolo?k=v",
    }
    assert set(page.links) == expected_links
def test_button_without_value():
    """A submit button without a value attribute yields an empty-string parameter."""
    url = "https://crazyandthebrains.net/"
    body = (
        '<html> <body> '
        '<form method="POST" action="/post"> '
        '<input type=text name="text" /><br /> '
        '<button name="btn" type=submit>submit</button> '
        '</form> '
    )
    mocked = httpx.Response(200, text=body)
    respx.get(url).mock(return_value=mocked)

    response = httpx.get(url, follow_redirects=False)
    page = Page(response)

    first_form = next(page.iter_forms())
    expected_params = [["text", "default"], ["btn", ""]]
    assert first_form.post_params == expected_params
def test_email_input():
    """A text input whose name mentions an email gets a value containing '@'."""
    url = "http://perdu.com/"
    body = (
        '<html> <body> '
        '<form method="POST"> '
        '<input type="text" name="email_address" /> '
        '</form> </body> </html> '
    )
    respx.get(url).mock(return_value=httpx.Response(200, text=body))

    # httpx spells this flag `follow_redirects`; `allow_redirects` is the
    # requests keyword and is not the one used by the other httpx tests here.
    resp = httpx.get(url, follow_redirects=False)
    page = Page(resp)

    form = next(page.iter_forms())
    # post_params holds [name, value] pairs; inspect the value of the first one.
    assert "@" in form.post_params[0][1]
def test_email_input():
    """A text input whose name mentions an email gets a value containing '@'."""
    url = "http://perdu.com/"
    body = (
        '<html> <body> '
        '<form method="POST"> '
        '<input type="text" name="email_address" /> '
        '</form> </body> </html> '
    )
    responses.add(responses.GET, url, body=body)

    response = requests.get(url, allow_redirects=False)
    page = Page(response)

    first_form = next(page.iter_forms())
    # post_params holds [name, value] pairs; inspect the value of the first one.
    _, generated_value = first_form.post_params[0][0], first_form.post_params[0][1]
    assert "@" in generated_value
def test_absolute_root():
    """The absolute_root_links.html fixture yields only the page URL as link."""
    url = "http://perdu.com/"
    with open("tests/data/absolute_root_links.html") as fixture:
        html = fixture.read()

    mocked = httpx.Response(200, text=html)
    respx.get(url).mock(return_value=mocked)

    page = Page(httpx.get(url))
    assert page.links == [url]
def test_http_redir():
    """A 301 from /folder to /folder/ is flagged as a directory redirection."""
    url = "http://perdu.com/folder"
    mocked = httpx.Response(
        301,
        text="Hello world!",
        headers={"Location": "http://perdu.com/folder/"},
    )
    respx.get(url).mock(return_value=mocked)

    response = httpx.get(url, follow_redirects=False)
    page = Page(response)
    assert page.is_directory_redirection
def test_json():
    """Page.json exposes the decoded body of an application/json response."""
    url = "http://perdu.com/"
    mocked = httpx.Response(
        200,
        json={"key": "v4lu3"},
        headers={"Content-Type": "application/json"},
    )
    respx.get(url).mock(return_value=mocked)

    page = Page(httpx.get(url))
    assert page.json["key"] == "v4lu3"
def test_relative_root():
    """Check the links Page resolves from the relative_root_links.html fixture."""
    url = "http://perdu.com/"
    with open("tests/data/relative_root_links.html") as fixture:
        html = fixture.read()

    mocked = httpx.Response(200, text=html)
    respx.get(url).mock(return_value=mocked)
    page = Page(httpx.get(url))

    # We will get invalid hostnames with dots. Browsers do that too.
    expected_links = {url, "http://./", "http://../"}
    assert set(page.links) == expected_links
def test_formactions():
    """Check the forms Page builds from the formactions.html fixture.

    Four forms are expected; the ones targeting /form, /form2 and / have their
    POST parameters (and, for /, the method) verified.
    """
    url = "http://perdu.com/"
    with open("tests/data/formactions.html") as fixture:
        html = fixture.read()

    respx.get(url).mock(return_value=httpx.Response(200, text=html))
    response = httpx.get(url, follow_redirects=False)
    page = Page(response)

    seen = 0
    for seen, form in enumerate(page.iter_forms(), start=1):
        target = form.file_path
        if target == "/form":
            assert form.post_params == [["name", "doe"]]
        elif target == "/form2":
            assert form.post_params == [["name2", "doe"]]
        elif target == "/":
            assert form.method == "POST"
            assert form.post_params[0][1] == "doe"

    assert seen == 4
def test_base_other_links():
    """Check the non-anchor URLs that Page extracts from base_other_links.html.

    Every expected body URL lives under /blog/; the redirection URL comes from
    the Location header of the mocked 301 response.
    """
    url = "http://perdu.com/"
    with open("tests/data/base_other_links.html") as fixture:
        html = fixture.read()

    mocked = httpx.Response(
        301,
        text=html,
        headers={"Location": "https://perdu.com/login"},
    )
    respx.get(url).mock(return_value=mocked)
    response = httpx.get(url, follow_redirects=False)
    page = Page(response)

    expected_frames = [
        "http://perdu.com/blog/frame1.html",
        "http://perdu.com/blog/frame2.html",
        "http://perdu.com/blog/iframe.html",
    ]
    assert sorted(page.iter_frames()) == expected_frames
    assert page.scripts == ["http://perdu.com/blog/script.js"]
    assert page.redirection_url == "https://perdu.com/login"
    assert set(page.images_urls) == {"http://perdu.com/blog/img/logo.png"}
    assert page.html_redirections == ["http://perdu.com/blog/adblock.html"]
def test_absolute_root():
    """The absolute_root_links.html fixture yields only the page URL as link."""
    url = "http://perdu.com/"
    with open("tests/data/absolute_root_links.html") as fixture:
        html = fixture.read()

    responses.add(responses.GET, url, body=html)

    page = Page(requests.get(url))
    assert page.links == [url]
def test_relative_root():
    """Check the links Page resolves from the relative_root_links.html fixture."""
    url = "http://perdu.com/"
    with open("tests/data/relative_root_links.html") as fixture:
        html = fixture.read()

    responses.add(responses.GET, url, body=html)
    page = Page(requests.get(url))

    # We will get invalid hostnames with dots. Browsers do that too.
    expected_links = {url, "http://./", "http://../"}
    assert set(page.links) == expected_links
def test_csp_detection():
    """Exercise has_csp() with no CSP, a CSP response header and a CSP meta tag."""
    # 1. Plain HTML response: no policy anywhere.
    plain_url = "http://perdu.com/"
    plain_reply = httpx.Response(200, headers={"Content-Type": "text/html"})
    respx.get(plain_url).mock(return_value=plain_reply)
    plain_page = Page(httpx.get(plain_url))
    assert not has_csp(plain_page)

    # 2. Policy delivered through the Content-Security-Policy header.
    header_url = "http://perdu.com/http_csp"
    header_reply = httpx.Response(
        200,
        headers={
            "Content-Type": "text/html",
            "Content-Security-Policy": "blahblah;",
        },
    )
    respx.get(header_url).mock(return_value=header_reply)
    header_page = Page(httpx.get(header_url))
    assert has_csp(header_page)

    # 3. Policy delivered through a <meta http-equiv> tag in the body.
    meta_url = "http://perdu.com/meta_csp"
    meta_html = (
        '<html> <head> '
        '<meta http-equiv="Content-Security-Policy" '
        """content="default-src 'self'; img-src https://*; child-src 'none';"> """
        '</head> <body>Hello there</body> </html>'
    )
    meta_reply = httpx.Response(
        200,
        headers={"Content-Type": "text/html"},
        text=meta_html,
    )
    respx.get(meta_url).mock(return_value=meta_reply)
    meta_page = Page(httpx.get(meta_url))
    assert has_csp(meta_page)
def test_other_links():
    """Check the non-anchor URLs that Page extracts from other_links.html.

    The mocked reply is a 301 with a Location header, so the redirection URL is
    verified along with frames, scripts, images, the favicon and the JS / HTML
    redirections. The redirect must not be followed, otherwise the 301 body
    would never reach Page.
    """
    with open("tests/data/other_links.html") as data_body:
        url = "http://perdu.com/"
        respx.get(url).mock(return_value=httpx.Response(
            301,
            text=data_body.read(),
            headers={"Location": "https://perdu.com/login"},
        ))

        # httpx spells this flag `follow_redirects`; `allow_redirects` is the
        # requests keyword and is not the one used by the other httpx tests here.
        resp = httpx.get(url, follow_redirects=False)
        page = Page(resp)

        assert sorted(page.iter_frames()) == [
            "http://perdu.com/frame1.html",
            "http://perdu.com/frame2.html",
            "http://perdu.com/iframe.html",
        ]
        assert page.scripts == ["http://perdu.com/script.js"]
        assert page.redirection_url == "https://perdu.com/login"
        assert set(page.images_urls) == {
            "http://perdu.com/img/logo.png",
            "http://perdu.com/img/header.png",
            "http://perdu.com/img/ads.php?id=5878545",
        }
        assert page.js_redirections == ["http://perdu.com/maintenance.html"]
        assert page.favicon_url == "http://perdu.com/favicon.ico"
        assert page.html_redirections == ["http://perdu.com/adblock.html"]
def test_http():
    """A 301 from /folder to /folder/ is flagged as a directory redirection."""
    url = "http://perdu.com/folder"
    redirect_headers = {
        "Location": "http://perdu.com/folder/",
    }
    responses.add(
        responses.GET,
        url,
        body="Hello world!",
        adding_headers=redirect_headers,
        status=301,
    )

    response = requests.get(url, allow_redirects=False)
    page = Page(response)
    assert page.is_directory_redirection
def test_formactions():
    """Check the forms Page builds from the formactions.html fixture.

    Four forms are expected; the ones targeting /form, /form2 and / have their
    POST parameters (and, for /, the method) verified.
    """
    url = "http://perdu.com/"
    with open("tests/data/formactions.html") as fixture:
        html = fixture.read()

    responses.add(responses.GET, url, body=html)
    response = requests.get(url, allow_redirects=False)
    page = Page(response)

    seen = 0
    for seen, form in enumerate(page.iter_forms(), start=1):
        target = form.file_path
        if target == "/form":
            assert form.post_params == [["name", "doe"]]
        elif target == "/form2":
            assert form.post_params == [["name2", "doe"]]
        elif target == "/":
            assert form.method == "POST"
            assert form.post_params[0][1] == "doe"

    assert seen == 4
def test_js_parser():
    """Check Page.extra_urls for the js_links.html fixture."""
    url = "http://perdu.com/"
    with open("tests/data/js_links.html") as fixture:
        html = fixture.read()

    mocked = httpx.Response(200, text=html)
    respx.get(url).mock(return_value=mocked)
    page = Page(httpx.get(url))

    expected_urls = {
        "http://perdu.com/onload.html",
        "http://perdu.com/popup.html",
        "http://perdu.com/redir.html",
        "http://perdu.com/concat.html",
        "http://perdu.com/concat.html?var=value",
        "http://perdu.com/link.html",
    }
    assert set(page.extra_urls) == expected_urls