Example 1
@responses.activate  # activate the responses mock so the responses.add() route below takes effect
def test_explorer_extract_links():
    crawler = Crawler("http://perdu.com/")
    explorer = Explorer(crawler)
    responses.add(responses.GET,
                  "http://perdu.com/",
                  body="""<html><body>
        <a href="http://perdu.com/index.html"></a>
        <a href="https://perdu.com/secure_index.html"></a>
        <a href="//perdu.com/protocol_relative.html"></a>
        <a href="//lol.com/protocol_relative.html"></a>
        <a href="http://perdu.com:8000/other_port.html"></a>
        <a href="http://microsoft.com/other_domain.html"></a>
        <a href="welcome.html"></a>
        <a href="/about.html"></a>
        <form method="POST" action="http://perdu.com/valid_form.html">
        <input name="field" type="hidden" value="hello"/></form>
        <form method="POST" action="http://external.com/external_form.html">
        <input name="field" type="hidden" value="hello"/></form>
        """)

    request = Request("http://perdu.com/")
    page = crawler.send(request)
    results = list(explorer.extract_links(page, request))
    # We should get 6 resources as the path from the form will also be used as a URL
    assert len(results) == 6
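
These snippets appear to come from the Wapiti scanner's test suite, and none of them includes its imports. A hedged scaffolding sketch follows; the wapitiCore import paths in particular are assumptions (they have moved between Wapiti releases) and may need adjusting:

import os
from asyncio import Event
from collections import deque
from tempfile import NamedTemporaryFile

import httpx
import pytest
import responses  # mocks requests-based HTTP (sync examples)
import respx      # mocks httpx-based HTTP (async examples)

# Assumed module paths -- adjust to the Wapiti version you are targeting.
from wapitiCore.net.web import Request
from wapitiCore.net.crawler import Crawler, AsyncCrawler, Explorer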
Example 2
@pytest.mark.asyncio
@respx.mock  # intercept httpx requests so the respx routes below apply
async def test_explorer_extract_links():
    crawler = AsyncCrawler(Request("http://perdu.com/"))
    explorer = Explorer(crawler, Event())

    respx.get("http://perdu.com/").mock(
        return_value=httpx.Response(200,
                                    text="""<html><body>
            <a href="http://perdu.com/index.html"></a>
            <a href="https://perdu.com/secure_index.html"></a>
            <a href="//perdu.com/protocol_relative.html"></a>
            <a href="//lol.com/protocol_relative.html"></a>
            <a href="http://perdu.com:8000/other_port.html"></a>
            <a href="http://microsoft.com/other_domain.html"></a>
            <a href="welcome.html"></a>
            <a href="/about.html"></a>
            <form method="POST" action="http://perdu.com/valid_form.html">
            <input name="field" type="hidden" value="hello"/></form>
            <form method="POST" action="http://external.com/external_form.html">
            <input name="field" type="hidden" value="hello"/></form>
            """))

    request = Request("http://perdu.com/")
    page = await crawler.async_send(request)
    results = list(explorer.extract_links(page, request))
    # We should get 6 resources as the path from the form will also be used as a URL
    assert len(results) == 6
    await crawler.close()
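
Note the differences from Example 1: the asynchronous crawler speaks httpx, so the mock layer is respx rather than responses; the test needs an event loop, hence the pytest.mark.asyncio decorator; and Explorer takes an asyncio.Event as a second argument, presumably used as a stop signal.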
Example 3
@pytest.mark.asyncio
async def test_explorer_filtering():
    crawler = AsyncCrawler("http://127.0.0.1:65080/")
    explorer = Explorer(crawler, Event())
    start_urls = deque(["http://127.0.0.1:65080/filters.html"])
    excluded_urls = []
    results = {resource.url async for resource in explorer.async_explore(start_urls, excluded_urls)}
    # We should have current URL and JS URL but without query string.
    # CSS URL should be excluded
    assert results == {"http://127.0.0.1:65080/filters.html", "http://127.0.0.1:65080/yolo.js"}
    await crawler.close()
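
The assertion implies two behaviors: stylesheets are filtered out entirely, and the query string is stripped from the JavaScript URL. A minimal sketch of that kind of rule, assuming a simple extension blocklist (the real Explorer logic is more involved and is not shown in these snippets):

from typing import Optional
from urllib.parse import urlparse, urlunparse

EXCLUDED_EXTENSIONS = {".css"}  # assumption for illustration only

def keep_resource(url: str) -> Optional[str]:
    """Return the URL stripped of query string and fragment, or None if excluded."""
    parsed = urlparse(url)
    if any(parsed.path.endswith(extension) for extension in EXCLUDED_EXTENSIONS):
        return None
    return urlunparse(parsed._replace(query="", fragment=""))

assert keep_resource("http://127.0.0.1:65080/yolo.js?version=2") == "http://127.0.0.1:65080/yolo.js"
assert keep_resource("http://127.0.0.1:65080/style.css") is None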
Example 4
@pytest.mark.asyncio
@respx.mock  # intercept httpx requests so the respx routes below apply
async def test_explorer_extract_links_from_js():
    crawler = AsyncCrawler(Request("http://perdu.com/"))
    explorer = Explorer(crawler, Event())

    respx.get("http://perdu.com/").mock(
        return_value=httpx.Response(200,
                                    text="""Hello there!
        <a href="http://perdu.com/index.html"></a>
        <script src="main-es5.1211ab72babef8.js"></script>
        """))

    respx.get("http://perdu.com/main-es5.1211ab72babef8.js").mock(
        return_value=httpx.Response(
            200,
            text="""
            AytR: function (e, t, n) {
                'use strict';n.d(t, 'a', (function () {return r}));
                const r = {
                    web: "http://perdu.com/",
                    host: "http://host.perdu.com/",
                    api: "http://perdu.com/api",
                }
            };
            const Ke = [{path: "/admin",submenu: [{path: "/admin/profile",submenu: []},{path: "/admin/users/add",submenu: []}]}],
            Ye = [{path: "/dashboard",submenu: [{path: "/dashboard/results",submenu: []},{path: "/dashboard/result.json",submenu: []}]}];
            router.navigate(["secret", "path"]); router.createUrlTree(["this", "is", "my" + "_path"]);
            router.navigateByUrl(this.url + "/api/admin"); router.parseUrl(this.url + "/test");
            """,
            headers={"content-type": "application/javascript"}))

    request = Request("http://perdu.com/")
    page = await crawler.async_send(request)
    results = list(explorer.extract_links(page, request))
    assert len(results) == 2

    request = Request("http://perdu.com/main-es5.1211ab72babef8.js")
    page = await crawler.async_send(request)

    results = list(explorer.extract_links(page, request))
    # http://host.perdu.com/ is out of scope since the default scope is folder
    assert len(results) == 12
    assert Request("http://perdu.com/secret/path", "GET",
                   link_depth=1) in results
    await crawler.close()
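
For reference, the twelve results presumably break down as follows (an inference from the mocked JavaScript, not stated in the source): http://perdu.com/ and http://perdu.com/api from the AytR configuration object; /admin, /admin/profile, /admin/users/add, /dashboard, /dashboard/results and /dashboard/result.json from the Ke and Ye route arrays; /secret/path from router.navigate; /this/is/my_path from router.createUrlTree with the string concatenation resolved; and /api/admin and /test from router.navigateByUrl and router.parseUrl. http://host.perdu.com/ is dropped as out of scope.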
Example 5
def test_qs_limit():
    crawler = Crawler("http://127.0.0.1:65080/")
    explorer = Explorer(crawler)
    start_urls = deque(["http://127.0.0.1:65080/"])
    excluded_urls = []
    # We should get 4 resources: the root URL, the huge form page, the target (GET) and the target with the POST method
    assert len(list(explorer.explore(start_urls, excluded_urls))) == 4

    crawler = Crawler("http://127.0.0.1:65080/")
    explorer = Explorer(crawler)
    # Exclude huge POST form with limit of parameters
    explorer.qs_limit = 500
    start_urls = deque(["http://127.0.0.1:65080/"])
    excluded_urls = []
    # With the parameter limit set, the huge form's POST request is dropped, leaving 3 resources
    assert len(list(explorer.explore(start_urls, excluded_urls))) == 3
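
The qs_limit attribute appears to cap how many parameters a request may carry: with qs_limit = 500 the huge form's POST request exceeds the limit and is skipped, which is why the expected count drops from 4 to 3 while the page hosting the form is still visited.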
Example 6
@pytest.mark.asyncio
async def test_qs_limit():
    crawler = AsyncCrawler("http://127.0.0.1:65080/")
    explorer = Explorer(crawler, Event())
    start_urls = deque(["http://127.0.0.1:65080/"])
    excluded_urls = []
    # We should get 4 resources: the root URL, the huge form page, the target (GET) and the target with the POST method
    assert len([__ async for __ in explorer.async_explore(start_urls, excluded_urls)]) == 4
    await crawler.close()

    crawler = AsyncCrawler("http://127.0.0.1:65080/")
    explorer = Explorer(crawler, Event())
    # Exclude huge POST form with limit of parameters
    explorer.qs_limit = 500
    start_urls = deque(["http://127.0.0.1:65080/"])
    excluded_urls = []
    # With the parameter limit set, the huge form's POST request is dropped, leaving 3 resources
    assert len([__ async for __ in explorer.async_explore(start_urls, excluded_urls)]) == 3
    await crawler.close()
Example 7
def test_save_and_restore_state():
    # Create a temporary file
    temp_file = NamedTemporaryFile(suffix=".pkl")
    # Get its name
    filename = temp_file.name
    # Close it so the file is deleted (NamedTemporaryFile defaults to delete=True)
    temp_file.close()
    explorer = Explorer(None)
    # Loading from a nonexistent file should leave the state empty
    explorer.load_saved_state(filename)
    assert not explorer._hostnames
    # Modify state, save it
    explorer._hostnames = {"perdu.com"}
    explorer.save_state(filename)
    # State is the same after saving
    assert explorer._hostnames == {"perdu.com"}

    # New empty explorer
    explorer = Explorer(None)
    # Load previous state
    explorer.load_saved_state(filename)
    assert explorer._hostnames == {"perdu.com"}
    os.unlink(filename)
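
The .pkl suffix suggests pickle-based persistence. A minimal sketch of the save/load pattern this test exercises, assuming (not confirmed) that the state is serialized with pickle and that a missing file is silently ignored; the real Explorer presumably persists more than _hostnames:

import os
import pickle

def save_state(filename, hostnames):
    # Serialize the crawl state to disk (assumed pickle format).
    with open(filename, "wb") as file_handle:
        pickle.dump(hostnames, file_handle)

def load_saved_state(filename):
    # A missing state file is not an error: start with an empty state.
    if not os.path.isfile(filename):
        return set()
    with open(filename, "rb") as file_handle:
        return pickle.load(file_handle)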