def test_other_links():
    """Check the non-anchor URLs Page extracts from the other_links.html fixture.

    The fixture is served as a 301 with a Location header and fetched without
    following redirects, so the Page under test wraps the 301 response itself.
    """
    target = "http://perdu.com/"
    with open("tests/data/other_links.html") as fixture:
        html = fixture.read()

    responses.add(
        responses.GET,
        target,
        body=html,
        adding_headers={"Location": "https://perdu.com/login"},
        status=301,
    )
    page = Page(requests.get(target, allow_redirects=False))

    expected_frames = [
        "http://perdu.com/frame1.html",
        "http://perdu.com/frame2.html",
        "http://perdu.com/iframe.html",
    ]
    expected_images = {
        "http://perdu.com/img/logo.png",
        "http://perdu.com/img/header.png",
        "http://perdu.com/img/ads.php?id=5878545",
    }

    assert sorted(page.iter_frames()) == expected_frames
    assert page.scripts == ["http://perdu.com/script.js"]
    assert page.redirection_url == "https://perdu.com/login"
    assert set(page.images_urls) == expected_images
    assert page.js_redirections == ["http://perdu.com/maintenance.html"]
    assert page.favicon_url == "http://perdu.com/favicon.ico"
    assert page.html_redirections == ["http://perdu.com/adblock.html"]
Exemple #2
0
def test_valid_content_type():
    """Check valid_xss_content_type() is truthy for text/html and falsy for image/png."""
    # (mocked URL, Content-Type header served, expected verdict), checked in order.
    cases = (
        ("http://perdu.com/", "text/html", True),
        ("http://perdu.com/picture.png", "image/png", False),
    )

    for target, content_type, expected in cases:
        responses.add(
            responses.GET,
            target,
            status=200,
            adding_headers={"Content-Type": content_type},
        )
        page = Page(requests.get(target))
        assert bool(valid_xss_content_type(page)) is expected
Exemple #3
0
def test_persister_forms():
    """Round-trip the forms found in forms.html through a SqlitePersister.

    The forms extracted from the page are stored with set_to_browse() and read
    back with get_to_browse(); the test expects nine of them, equal as a set
    to what was stored, with file upload fields preserved.
    """
    target = "http://perdu.com/"
    db_path = "/tmp/crawl.db"

    with open("tests/data/forms.html") as fixture:
        responses.add(responses.GET, target, body=fixture.read())

    page = Page(requests.get(target, allow_redirects=False))
    forms = list(page.iter_forms())

    # Start from a fresh database; a missing file is the expected common case.
    try:
        os.unlink(db_path)
    except FileNotFoundError:
        pass

    persister = SqlitePersister(db_path)
    persister.set_root_url("http://httpbin.org/")
    persister.set_to_browse(forms)

    assert persister.count_paths() == 9

    extracted_forms = list(persister.get_to_browse())
    assert len(extracted_forms) == 9
    assert set(forms) == set(extracted_forms)

    for stored in extracted_forms:
        path = stored.file_path
        if path == "/upload.php":
            expected_upload = ["file", ["pix.gif", "GIF89a", "image/gif"]]
            assert stored.file_params[0] == expected_upload
        elif path == "/fields.php":
            assert ["file", "pix.gif"] in stored.post_params
Exemple #4
0
async def test_persister_forms():
    """Round-trip the forms found in forms.html through an async SqlPersister.

    The forms extracted from the page are stored with set_to_browse() and read
    back with get_to_browse(); the test expects nine of them, equal as a set
    to what was stored, with file upload fields preserved.
    """
    with open("tests/data/forms.html") as data_body:
        url = "http://perdu.com/"
        respx.get(url).mock(
            return_value=httpx.Response(200, text=data_body.read()))

        # The mocked response is a plain 200, so no redirect handling is needed.
        # The former `allow_redirects=False` keyword was removed in httpx 0.20
        # (renamed to `follow_redirects`) and raises TypeError there; omitting it
        # behaves the same on either side of the rename.
        resp = httpx.get(url)
        page = Page(resp)

        forms = list(page.iter_forms())

        # Start from a fresh database; a missing file is the expected common case.
        try:
            os.unlink("/tmp/crawl.db")
        except FileNotFoundError:
            pass

        persister = SqlPersister("/tmp/crawl.db")
        await persister.create()
        await persister.set_root_url("http://httpbin.org/")
        await persister.set_to_browse(forms)

        assert await persister.count_paths() == 9

        extracted_forms = [form async for form in persister.get_to_browse()]
        assert len(extracted_forms) == 9
        assert set(forms) == set(extracted_forms)

        for form in extracted_forms:
            if form.file_path == "/upload.php":
                assert form.file_params[0] == [
                    "file", ("pix.gif", "GIF89a", "image/gif")
                ]
            elif form.file_path == "/fields.php":
                assert ["file", "pix.gif"] in form.post_params
Exemple #5
0
def test_domain_scope():
    """Check Page.is_external_to_domain() for a page served from perdu.com."""
    target = "http://perdu.com/"
    responses.add(responses.GET, target, body="Hello world!")
    page = Page(requests.get(target))

    # (candidate URL, whether it is expected to be reported as external)
    expectations = (
        ("http://yolo.tld", True),
        ("http://www.google.com/", True),
        ("http://jesuisperdu.com/", True),
        ("http://perdu.com/robots.txt", False),
        ("http://www.perdu.com/blog/", False),
        ("https://perdu.com/blog/", False),
        ("http://perdu.com:80/blog/", False),
        ("http://perdu.com.org/blog/", True),
    )
    for candidate, is_external in expectations:
        assert bool(page.is_external_to_domain(candidate)) is is_external
Exemple #6
0
def test_domain_scope():
    """Check Page.is_external_to_domain() for a page served from perdu.com (httpx)."""
    target = "http://perdu.com/"
    mocked = httpx.Response(200, text="Hello world!")
    respx.get(target).mock(return_value=mocked)

    page = Page(httpx.get(target))

    # (candidate URL, whether it is expected to be reported as external)
    expectations = (
        ("http://yolo.tld", True),
        ("http://www.google.com/", True),
        ("http://jesuisperdu.com/", True),
        ("http://perdu.com/robots.txt", False),
        ("http://www.perdu.com/blog/", False),
        ("https://perdu.com/blog/", False),
        ("http://perdu.com:80/blog/", False),
        ("http://perdu.com.org/blog/", True),
    )
    for candidate, is_external in expectations:
        assert bool(page.is_external_to_domain(candidate)) is is_external
Exemple #7
0
def test_csp_detection():
    """Check has_csp() with no policy, a policy header, and a policy <meta> tag."""

    def fetch_page(target, headers, **mock_options):
        # Register a 200 response for `target`, then wrap the fetched response.
        responses.add(
            responses.GET,
            target,
            status=200,
            adding_headers=headers,
            **mock_options
        )
        return Page(requests.get(target))

    # No Content-Security-Policy anywhere.
    plain_page = fetch_page("http://perdu.com/", {"Content-Type": "text/html"})
    assert not has_csp(plain_page)

    # Policy delivered as an HTTP response header.
    header_page = fetch_page(
        "http://perdu.com/http_csp",
        {
            "Content-Type": "text/html",
            "Content-Security-Policy": "blahblah;",
        },
    )
    assert has_csp(header_page)

    # Policy delivered through a <meta http-equiv> element in the body.
    meta_html = """<html>
        <head>
        <meta http-equiv="Content-Security-Policy" content="default-src 'self'; img-src https://*; child-src 'none';">
        </head>
        <body>Hello there</body>
        </html>"""
    meta_page = fetch_page(
        "http://perdu.com/meta_csp",
        {"Content-Type": "text/html"},
        body=meta_html,
    )
    assert has_csp(meta_page)
Exemple #8
0
def test_http():
    """Check the basic HTTP attributes Page exposes for a mocked 418 response."""
    target = "http://perdu.com/"
    served_headers = {
        "X-Men": "Wolverine",
        "Server": "nginx",
        "Set-Cookie": "session_id=31337;",
        "Content-Type": "text/html",
    }
    responses.add(
        responses.GET,
        target,
        body="Hello world!",
        adding_headers=served_headers,
        status=418,
    )

    page = Page(requests.get(target))

    assert page.status == 418
    assert page.headers["X-Men"] == "Wolverine"
    assert page.url == "http://perdu.com/"
    assert page.server == "nginx"
    assert page.cookies["session_id"] == "31337"
    assert page.is_plain
    assert page.size == page.raw_size != 0
    assert page.delay > 0
    raw = page.bytes
    assert isinstance(raw, bytes) and raw
    assert page.type == "text/html"
    assert page.encoding == "ISO-8859-1"
Exemple #9
0
def test_base_relative_links():
    """Check the links Page resolves from the base_relative_links.html fixture."""
    target = "http://perdu.com/"
    with open("tests/data/base_relative_links.html") as fixture:
        html = fixture.read()

    respx.get(target).mock(return_value=httpx.Response(200, text=html))
    page = Page(httpx.get(target))

    expected_links = {
        target,
        "http://perdu.com/blog/file.html",
        "http://perdu.com/blog/resource",
        "http://perdu.com/blog/folder/",
        "http://perdu.com/blog/folder/file.html",
        "http://perdu.com/blog/folder/file2.html",
        "http://perdu.com/folder/file2.html",
        "http://perdu.com/",
        "http://perdu.com/blog/",
        "http://perdu.com/blog/file3.html",
        "http://perdu.com/blog/?k=v",
        "http://perdu.com/blog/?k=v2",
        "http://perdu.com/blog/file3.html?k=v",
        "http://perdu.com/blog/folder/?k=v",
        "http://perdu.com/blog/folder?k=v",
        "http://external.tld/",
        "http://external.tld/yolo?k=v",
    }
    assert set(page.links) == expected_links
def test_base_extra_links():
    """Check Page.extra_urls for the base_extra_links.html fixture."""
    target = "http://perdu.com/"
    with open("tests/data/base_extra_links.html") as fixture:
        html = fixture.read()

    responses.add(responses.GET, target, body=html)
    page = Page(requests.get(target, allow_redirects=False))

    expected_urls = {
        "http://perdu.com/blog/",  # extracted from base href
        "http://perdu.com/blog/planets.gif",
        "http://perdu.com/blog/sun.html",
        "http://perdu.com/blog/mercur.html",
        "http://perdu.com/blog/venus.html",
        "http://perdu.com/blog/link.html",
        "http://perdu.com/blog/audio.html",
        "http://perdu.com/blog/embed.html",
        "http://perdu.com/blog/horse.ogg",
        "http://perdu.com/blog/horse.mp3",
        "http://perdu.com/blog/video.html",
        "http://perdu.com/blog/subtitles_en.vtt",
        "http://perdu.com/blog/dopequote.html",
        "http://perdu.com/blog/del.html",
        "http://perdu.com/blog/ins.html",
        "http://perdu.com/blog/q.html",
        "http://perdu.com/blog/data.html",
        "http://perdu.com/blog/high-def.jpg",
        "http://perdu.com/blog/low-def.jpg",
        "http://perdu.com/blog/img_orange_flowers.jpg",
    }
    assert set(page.extra_urls) == expected_urls
def test_extra_links():
    """Check Page.extra_urls for the extra_links.html fixture."""
    target = "http://perdu.com/"
    with open("tests/data/extra_links.html") as fixture:
        html = fixture.read()

    responses.add(responses.GET, target, body=html)
    page = Page(requests.get(target, allow_redirects=False))

    expected_urls = {
        "http://perdu.com/planets.gif",
        "http://perdu.com/sun.html",
        "http://perdu.com/mercur.html",
        "http://perdu.com/venus.html",
        "http://perdu.com/link.html",
        "http://perdu.com/audio.html",
        "http://perdu.com/embed.html",
        "http://perdu.com/horse.ogg",
        "http://perdu.com/horse.mp3",
        "http://perdu.com/video.html",
        "http://perdu.com/subtitles_en.vtt",
        "http://perdu.com/dopequote.html",
        "http://perdu.com/del.html",
        "http://perdu.com/ins.html",
        "http://perdu.com/q.html",
        "http://perdu.com/data.html",
        "http://perdu.com/high-def.jpg",
        "http://perdu.com/low-def.jpg",
        "http://perdu.com/img_orange_flowers.jpg",
        "http://perdu.com/style.css?should_not_be_crawled",
        "http://perdu.com/yolo.js?v=53",
    }
    assert set(page.extra_urls) == expected_urls
Exemple #12
0
def test_http():
    """Check the basic HTTP attributes Page exposes for a mocked 418 httpx response."""
    target = "http://perdu.com/"
    served_headers = {
        "X-Men": "Wolverine",
        "Server": "nginx",
        "Set-Cookie": "session_id=31337;",
        "Content-Type": "text/html; charset=ISO-8859-1",
    }
    mocked = httpx.Response(418, headers=served_headers, text="Hello world!")
    respx.get(target).mock(return_value=mocked)

    page = Page(httpx.get(target))

    assert page.status == 418
    assert page.headers["X-Men"] == "Wolverine"
    assert page.url == "http://perdu.com/"
    assert page.server == "nginx"
    assert page.cookies["session_id"] == "31337"
    assert page.is_plain
    assert page.size == page.raw_size != 0
    assert page.delay > 0
    raw = page.bytes
    assert isinstance(raw, bytes) and raw
    assert page.type == "text/html; charset=iso-8859-1"
    # see https://github.com/encode/httpx/pull/1269
    assert page.encoding == "ISO-8859-1"
Exemple #13
0
def test_valid_content_type():
    """Check valid_xss_content_type() is truthy for text/html and falsy for image/png."""
    # (mocked URL, Content-Type header served, expected verdict), checked in order.
    cases = (
        ("http://perdu.com/", "text/html", True),
        ("http://perdu.com/picture.png", "image/png", False),
    )

    for target, content_type, expected in cases:
        mocked = httpx.Response(200, headers={"Content-Type": content_type})
        respx.get(target).mock(return_value=mocked)

        page = Page(httpx.get(target))
        assert bool(valid_xss_content_type(page)) is expected
def test_relative_links():
    """Check the links Page resolves from the relative_links.html fixture."""
    target = "http://perdu.com/"
    with open("tests/data/relative_links.html") as fixture:
        html = fixture.read()

    responses.add(responses.GET, target, body=html)
    page = Page(requests.get(target))

    expected_links = {
        target,
        "http://perdu.com/file.html",
        "http://perdu.com/resource",
        "http://perdu.com/folder/",
        "http://perdu.com/folder/file.html",
        "http://perdu.com/folder/file2.html",
        "http://perdu.com/file3.html",
        "http://perdu.com/?k=v",
        "http://perdu.com/file3.html?k=v",
        "http://perdu.com/folder/?k=v",
        "http://perdu.com/folder?k=v",
        "http://external.tld/",
        "http://external.tld/yolo?k=v",
    }
    assert set(page.links) == expected_links
Exemple #15
0
def test_button_without_value():
    """Check that a <button> lacking a value attribute yields an empty POST value."""
    target = "https://crazyandthebrains.net/"
    markup = """<html>
    <body>
        <form method="POST" action="/post">
            <input type=text name="text" /><br />
            <button name="btn" type=submit>submit</button>
        </form>
    """
    respx.get(target).mock(return_value=httpx.Response(200, text=markup))

    page = Page(httpx.get(target, follow_redirects=False))

    first_form = next(page.iter_forms())
    expected_params = [["text", "default"], ["btn", ""]]
    assert first_form.post_params == expected_params
Exemple #16
0
def test_email_input():
    """Check that a text input named "email_address" is filled with an e-mail-like value.

    The first POST parameter of the first form is expected to contain "@".
    """
    url = "http://perdu.com/"
    body = """<html>
    <body>
    <form method="POST">
    <input type="text" name="email_address" />
    </form>
    </body>
    </html>
    """

    respx.get(url).mock(return_value=httpx.Response(200, text=body))

    # The mocked response is a plain 200, so no redirect handling is needed.
    # The former `allow_redirects=False` keyword was removed in httpx 0.20
    # (renamed to `follow_redirects`) and raises TypeError there; omitting it
    # behaves the same on either side of the rename.
    resp = httpx.get(url)
    page = Page(resp)

    form = next(page.iter_forms())
    assert "@" in form.post_params[0][1]
def test_email_input():
    """Check that a text input named "email_address" is filled with an e-mail-like value.

    The first POST parameter of the first form is expected to contain "@".
    """
    target = "http://perdu.com/"
    markup = """<html>
        <body>
        <form method="POST">
        <input type="text" name="email_address" />
        </form>
        </body>
        </html>
        """
    responses.add(responses.GET, target, body=markup)

    page = Page(requests.get(target, allow_redirects=False))

    first_form = next(page.iter_forms())
    _name, filled_value = first_form.post_params[0][0], first_form.post_params[0][1]
    assert "@" in filled_value
Exemple #18
0
def test_absolute_root():
    """Check that absolute_root_links.html yields only the root URL as a link."""
    target = "http://perdu.com/"
    with open("tests/data/absolute_root_links.html") as fixture:
        html = fixture.read()

    respx.get(target).mock(return_value=httpx.Response(200, text=html))
    page = Page(httpx.get(target))

    assert page.links == [target]
Exemple #19
0
def test_http_redir():
    """Check that a 301 from /folder to /folder/ is flagged as a directory redirection."""
    target = "http://perdu.com/folder"
    redirect = httpx.Response(
        301,
        text="Hello world!",
        headers={"Location": "http://perdu.com/folder/"},
    )
    respx.get(target).mock(return_value=redirect)

    # Redirects are not followed so that Page wraps the 301 itself.
    page = Page(httpx.get(target, follow_redirects=False))
    assert page.is_directory_redirection
def test_json():
    """Check that Page.json exposes the decoded body of a JSON response."""
    target = "http://perdu.com/"
    mocked = httpx.Response(
        200,
        json={"key": "v4lu3"},
        headers={"Content-Type": "application/json"},
    )
    respx.get(target).mock(return_value=mocked)

    page = Page(httpx.get(target))

    assert page.json["key"] == "v4lu3"
Exemple #21
0
def test_relative_root():
    """Check the links Page resolves from the relative_root_links.html fixture."""
    target = "http://perdu.com/"
    with open("tests/data/relative_root_links.html") as fixture:
        html = fixture.read()

    respx.get(target).mock(return_value=httpx.Response(200, text=html))
    page = Page(httpx.get(target))

    # We will get invalid hostnames with dots. Browsers do that too.
    expected_links = {target, "http://./", "http://../"}
    assert set(page.links) == expected_links
Exemple #22
0
def test_formactions():
    """Check the forms Page builds from the formactions.html fixture.

    Four forms are expected in total; those targeting /form, /form2 and / have
    their POST parameters (and, for /, the method) verified.
    """
    target = "http://perdu.com/"
    with open("tests/data/formactions.html") as fixture:
        html = fixture.read()

    respx.get(target).mock(return_value=httpx.Response(200, text=html))
    page = Page(httpx.get(target, follow_redirects=False))

    seen = 0
    for seen, form in enumerate(page.iter_forms(), start=1):
        path = form.file_path
        if path == "/form":
            assert form.post_params == [["name", "doe"]]
        elif path == "/form2":
            assert form.post_params == [["name2", "doe"]]
        elif path == "/":
            assert form.method == "POST"
            assert form.post_params[0][1] == "doe"

    assert seen == 4
Exemple #23
0
def test_base_other_links():
    """Check the non-anchor URLs Page extracts from the base_other_links.html fixture.

    The fixture is served as a 301 with a Location header and fetched without
    following redirects, so the Page under test wraps the 301 response itself.
    """
    target = "http://perdu.com/"
    with open("tests/data/base_other_links.html") as fixture:
        html = fixture.read()

    redirect = httpx.Response(
        301,
        text=html,
        headers={"Location": "https://perdu.com/login"},
    )
    respx.get(target).mock(return_value=redirect)

    page = Page(httpx.get(target, follow_redirects=False))

    expected_frames = [
        "http://perdu.com/blog/frame1.html",
        "http://perdu.com/blog/frame2.html",
        "http://perdu.com/blog/iframe.html",
    ]
    assert sorted(page.iter_frames()) == expected_frames

    assert page.scripts == ["http://perdu.com/blog/script.js"]
    assert page.redirection_url == "https://perdu.com/login"
    assert set(page.images_urls) == {"http://perdu.com/blog/img/logo.png"}

    assert page.html_redirections == ["http://perdu.com/blog/adblock.html"]
def test_absolute_root():
    """Check that absolute_root_links.html yields only the root URL as a link."""
    target = "http://perdu.com/"
    with open("tests/data/absolute_root_links.html") as fixture:
        html = fixture.read()

    responses.add(responses.GET, target, body=html)
    page = Page(requests.get(target))

    assert page.links == [target]
def test_relative_root():
    """Check the links Page resolves from the relative_root_links.html fixture."""
    target = "http://perdu.com/"
    with open("tests/data/relative_root_links.html") as fixture:
        html = fixture.read()

    responses.add(responses.GET, target, body=html)
    page = Page(requests.get(target))

    # We will get invalid hostnames with dots. Browsers do that too.
    expected_links = {target, "http://./", "http://../"}
    assert set(page.links) == expected_links
Exemple #26
0
def test_csp_detection():
    """Check has_csp() with no policy, a policy header, and a policy <meta> tag."""

    def fetch_page(target, **response_options):
        # Mock a 200 response for `target`, then wrap the fetched response.
        respx.get(target).mock(
            return_value=httpx.Response(200, **response_options))
        return Page(httpx.get(target))

    # No Content-Security-Policy anywhere.
    plain_page = fetch_page(
        "http://perdu.com/",
        headers={"Content-Type": "text/html"},
    )
    assert not has_csp(plain_page)

    # Policy delivered as an HTTP response header.
    header_page = fetch_page(
        "http://perdu.com/http_csp",
        headers={
            "Content-Type": "text/html",
            "Content-Security-Policy": "blahblah;",
        },
    )
    assert has_csp(header_page)

    # Policy delivered through a <meta http-equiv> element in the body.
    meta_html = """<html>
            <head>
            <meta http-equiv="Content-Security-Policy" content="default-src 'self'; img-src https://*; child-src 'none';">
            </head>
            <body>Hello there</body>
            </html>"""
    meta_page = fetch_page(
        "http://perdu.com/meta_csp",
        headers={"Content-Type": "text/html"},
        text=meta_html,
    )
    assert has_csp(meta_page)
def test_other_links():
    """Check the non-anchor URLs Page extracts from the other_links.html fixture.

    The fixture is served as a 301 with a Location header and fetched without
    following redirects, so the Page under test wraps the 301 response itself.
    """
    with open("tests/data/other_links.html") as data_body:
        url = "http://perdu.com/"
        respx.get(url).mock(return_value=httpx.Response(
            301,
            text=data_body.read(),
            headers={"Location": "https://perdu.com/login"}))

        # httpx 0.20 renamed `allow_redirects` to `follow_redirects`; use the
        # current keyword, as the other httpx-based redirect tests here do, so
        # the call does not raise TypeError on current httpx releases.
        resp = httpx.get(url, follow_redirects=False)
        page = Page(resp)

        assert sorted(page.iter_frames()) == [
            "http://perdu.com/frame1.html", "http://perdu.com/frame2.html",
            "http://perdu.com/iframe.html"
        ]
        assert page.scripts == ["http://perdu.com/script.js"]
        assert page.redirection_url == "https://perdu.com/login"
        assert set(page.images_urls) == {
            "http://perdu.com/img/logo.png", "http://perdu.com/img/header.png",
            "http://perdu.com/img/ads.php?id=5878545"
        }
        assert page.js_redirections == ["http://perdu.com/maintenance.html"]
        assert page.favicon_url == "http://perdu.com/favicon.ico"
        assert page.html_redirections == ["http://perdu.com/adblock.html"]
Exemple #28
0
def test_http():
    """Check that a 301 from /folder to /folder/ is flagged as a directory redirection."""
    target = "http://perdu.com/folder"
    redirect_headers = {
        "Location": "http://perdu.com/folder/",
    }
    responses.add(
        responses.GET,
        target,
        body="Hello world!",
        adding_headers=redirect_headers,
        status=301,
    )

    # Redirects are not followed so that Page wraps the 301 itself.
    page = Page(requests.get(target, allow_redirects=False))
    assert page.is_directory_redirection
Exemple #29
0
def test_formactions():
    """Check the forms Page builds from the formactions.html fixture.

    Four forms are expected in total; those targeting /form, /form2 and / have
    their POST parameters (and, for /, the method) verified.
    """
    target = "http://perdu.com/"
    with open("tests/data/formactions.html") as fixture:
        html = fixture.read()

    responses.add(responses.GET, target, body=html)
    page = Page(requests.get(target, allow_redirects=False))

    seen = 0
    for seen, form in enumerate(page.iter_forms(), start=1):
        path = form.file_path
        if path == "/form":
            assert form.post_params == [["name", "doe"]]
        elif path == "/form2":
            assert form.post_params == [["name2", "doe"]]
        elif path == "/":
            assert form.method == "POST"
            assert form.post_params[0][1] == "doe"

    assert seen == 4
Exemple #30
0
def test_js_parser():
    """Check Page.extra_urls for the js_links.html fixture."""
    target = "http://perdu.com/"
    with open("tests/data/js_links.html") as fixture:
        html = fixture.read()

    respx.get(target).mock(return_value=httpx.Response(200, text=html))
    page = Page(httpx.get(target))

    expected_urls = {
        "http://perdu.com/onload.html",
        "http://perdu.com/popup.html",
        "http://perdu.com/redir.html",
        "http://perdu.com/concat.html",
        "http://perdu.com/concat.html?var=value",
        "http://perdu.com/link.html",
    }
    assert set(page.extra_urls) == expected_urls