Exemple #1
0
async def test_qs_limit():
    """Check that ``Explorer.qs_limit`` reduces the number of explored resources.

    The site at http://127.0.0.1:65080/ is crawled twice from its root URL:
    first with the explorer's default settings, then with ``qs_limit`` set
    to 500. The second crawl must yield one resource less than the first.

    Each crawler is closed in a ``finally`` clause so that a failing
    assertion does not leave the crawler open.
    """
    crawler = AsyncCrawler("http://127.0.0.1:65080/")
    try:
        explorer = Explorer(crawler, Event())
        start_urls = deque(["http://127.0.0.1:65080/"])
        excluded_urls = []
        # We should have root url, huge form page, target and target with POST method
        assert len([__ async for __ in explorer.async_explore(start_urls, excluded_urls)]) == 4
    finally:
        await crawler.close()

    crawler = AsyncCrawler("http://127.0.0.1:65080/")
    try:
        explorer = Explorer(crawler, Event())
        # Exclude huge POST form with limit of parameters
        explorer.qs_limit = 500
        start_urls = deque(["http://127.0.0.1:65080/"])
        excluded_urls = []
        # Same crawl as above, minus the request dropped by the parameter
        # limit: only three resources are expected this time.
        assert len([__ async for __ in explorer.async_explore(start_urls, excluded_urls)]) == 3
    finally:
        await crawler.close()
Exemple #2
0
async def test_explorer_filtering():
    """Check which URLs the explorer yields when starting from ``filters.html``.

    The crawl starts at http://127.0.0.1:65080/filters.html and the URLs of
    all yielded resources are collected into a set, which must contain
    exactly the start page and ``yolo.js``.

    The crawler is closed in a ``finally`` clause so that a failing
    assertion does not leave the crawler open.
    """
    crawler = AsyncCrawler("http://127.0.0.1:65080/")
    try:
        explorer = Explorer(crawler, Event())
        start_urls = deque(["http://127.0.0.1:65080/filters.html"])
        excluded_urls = []
        # Set comprehension: no intermediate list is needed just to dedupe URLs.
        results = {
            resource.url
            async for resource in explorer.async_explore(start_urls, excluded_urls)
        }
        # We should have current URL and JS URL but without query string.
        # CSS URL should be excluded
        assert results == {"http://127.0.0.1:65080/filters.html", "http://127.0.0.1:65080/yolo.js"}
    finally:
        await crawler.close()