import lorem
from pytest_httpserver import HTTPServer

# Crawler and ResultData come from the package under test; adjust this import
# to match the project's actual layout.
from crawler import Crawler, ResultData


def test_ignoring_pdf(httpserver: HTTPServer):
    httpserver.expect_request("/page1").respond_with_data(
        "<h1>Page 1</h1>",
        content_type="text/html",
    )
    httpserver.expect_request("/page2.pdf").respond_with_data(
        "<h1>Page 2</h1>",
        content_type="application/pdf",
    )
    httpserver.expect_request("/").respond_with_data(
        f"""
        <a href="{httpserver.url_for("/page1")}">page1</a>
        <a href="{httpserver.url_for("/page2.pdf")}">page2</a>
        """,
        content_type="text/html",
    )

    crawler = Crawler(httpserver.url_for("/"),
                      verbose=False,
                      plugins=["IgnorePDF"])
    crawler.asyncio_crawl(save=False)

    assert sorted(crawler.all_urls) == sorted(
        [httpserver.url_for("/page1"), httpserver.url_for("/")])


def test_can_crawl(httpserver: HTTPServer):
    httpserver.expect_request("/page2").respond_with_data(
        "<h1>page2</h1>", content_type="text/html")
    page2_url = httpserver.url_for("/page2")
    httpserver.expect_request("/").respond_with_data(
        f'<a href="{page2_url}">page 2</a>', content_type="text/html")

    crawler = Crawler(httpserver.url_for("/"), verbose=False)
    crawler.asyncio_crawl(save=False)

    assert len(crawler.all_urls) == 2
    assert page2_url in crawler.all_urls


def test_external_links_by_url(httpserver: HTTPServer):
    link = "http://example.com/"
    httpserver.expect_request("/page1").respond_with_data(
        f'<a href="{link}">external</a>',
        content_type="text/html",
    )
    httpserver.expect_request("/page2").respond_with_data(
        f'<a href="{link}">external</a>',
        content_type="text/html",
    )
    httpserver.expect_request("/").respond_with_data(
        f"""
        <img src="internal.png" />
        <a href="{httpserver.url_for("/page1")}">page1</a>
        <a href="{httpserver.url_for("/page2")}">page2</a>
        """,
        content_type="text/html",
    )

    crawler = Crawler(httpserver.url_for("/"),
                      verbose=False,
                      plugins=["ExternalLinksByURL"])
    (res, ) = crawler.asyncio_crawl(save=False)

    expected_data = [
        ResultData(
            link,
            sorted(
                [httpserver.url_for("/page1"),
                 httpserver.url_for("/page2")]))
    ]
    assert res.data == expected_data


def test_internal_301(httpserver: HTTPServer):
    httpserver.expect_request("/page2").respond_with_data(
        "", status=301, content_type="text/html")
    page2_url = httpserver.url_for("/page2")
    httpserver.expect_request("/page3").respond_with_data(
        f'<a href="{page2_url}">page2</a>',
        status=301,
        content_type="text/html")
    page3_url = httpserver.url_for("/page3")
    httpserver.expect_request("/").respond_with_data(
        f"""
        <a href="{page2_url}">page2</a>
        <a href="{page3_url}">page3</a>
        """,
        content_type="text/html",
    )
    page1_url = httpserver.url_for("/")

    crawler = Crawler(httpserver.url_for("/"),
                      verbose=False,
                      plugins=["Internal301"])
    (res, ) = crawler.asyncio_crawl(save=False)

    expected_data = [
        ResultData(page2_url, page2_url, sorted([page1_url, page3_url])),
        ResultData(page3_url, page3_url, [page1_url]),
    ]
    assert res.data == expected_data
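

# The DuplicateTitle test below builds its responses with a get_page helper that
# is defined elsewhere in the real test suite. A minimal sketch, assuming it only
# wraps a body fragment in a complete HTML document with the given <title>
# (which is all the DuplicateTitle assertion relies on):
def get_page(body: str, title: str) -> str:
    return (
        "<html>"
        f"<head><title>{title}</title></head>"
        f"<body>{body}</body>"
        "</html>"
    )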


def test_duplicates(httpserver: HTTPServer):
    title = "content"
    httpserver.expect_request("/page2").respond_with_data(
        get_page("<h1>page2</h1>", title), content_type="text/html")
    page2_url = httpserver.url_for("/page2")
    httpserver.expect_request("/page3").respond_with_data(
        get_page("<h1>page3</h1>", "Other title"), content_type="text/html")
    page3_url = httpserver.url_for("/page3")
    httpserver.expect_request("/").respond_with_data(
        get_page(
            f"""
            <h1>page1</h1>
            <a href="{page2_url}">page 2</a>
            <a href="{page3_url}">page 3</a>
            """,
            title,
        ),
        content_type="text/html",
    )

    crawler = Crawler(httpserver.url_for("/"),
                      verbose=False,
                      plugins=["DuplicateTitle"])
    (res, ) = crawler.asyncio_crawl(save=False)

    expected_data = [
        ResultData(
            title,
            sorted([httpserver.url_for("/"),
                    httpserver.url_for("/page2")]))
    ]
    assert res.data == expected_data


def test_not_external_links(httpserver: HTTPServer):
    httpserver.expect_request("/").respond_with_data(
        '<a href="/bob">bob</a>',
        content_type="text/html",
    )

    crawler = Crawler(httpserver.url_for("/"),
                      verbose=False,
                      plugins=["ExternalLinks"])
    (res,) = crawler.asyncio_crawl(save=False)

    assert res.data == []


def test_runs_plugin(httpserver: HTTPServer):
    httpserver.expect_request("/").respond_with_data(
        "<h1>page2</h1>", content_type="text/html")

    crawler = Crawler(httpserver.url_for("/"),
                      verbose=False,
                      plugins=["MultipleH1", "MissingH1"])
    results = crawler.asyncio_crawl(save=False)

    assert len(results) == 2


def test_not_external_images(httpserver: HTTPServer):
    httpserver.expect_request("/").respond_with_data(
        '<img src="internal.png" />',
        content_type="text/html",
    )

    crawler = Crawler(httpserver.url_for("/"),
                      verbose=False,
                      plugins=["ExternalImages"])
    (res, ) = crawler.asyncio_crawl(save=False)

    assert res.data == []


def test_external_links(httpserver: HTTPServer):
    link = "http://example.com/"
    httpserver.expect_request("/").respond_with_data(
        f'<a href="{link}">external</a>',
        content_type="text/html",
    )

    crawler = Crawler(httpserver.url_for("/"),
                      verbose=False,
                      plugins=["ExternalLinks"])
    (res,) = crawler.asyncio_crawl(save=False)

    expected_data = [ResultData(link)]
    assert res.data == expected_data


def test_no_duplicate_content(httpserver: HTTPServer):
    httpserver.expect_request("/page2").respond_with_data(
        f"<p>{lorem.paragraph()}</p>", content_type="text/html")
    page2_link = f'<a href="{httpserver.url_for("/page2")}">page 2</a>'
    httpserver.expect_request("/page3").respond_with_data(
        f"<p>{lorem.paragraph()}</p>", content_type="text/html")
    page3_link = f'<a href="{httpserver.url_for("/page3")}">page 3</a>'
    httpserver.expect_request("/").respond_with_data(
        f"{page2_link}{page3_link}", content_type="text/html")

    crawler = Crawler(httpserver.url_for("/"),
                      verbose=False,
                      plugins=["DuplicateContent"])
    (res,) = crawler.asyncio_crawl(save=False)

    expected_data = []
    assert res.data == expected_data


def test_no_duplicate_h1(httpserver: HTTPServer):
    httpserver.expect_request("/page2").respond_with_data(
        "<h1>page2</h1>", content_type="text/html")
    page2_url = httpserver.url_for("/page2")
    httpserver.expect_request("/").respond_with_data(
        f'<h1>page1</h1><a href="{page2_url}">page 2</a>',
        content_type="text/html")

    crawler = Crawler(httpserver.url_for("/"),
                      verbose=False,
                      plugins=["DuplicateH1"])
    res = crawler.asyncio_crawl(save=False)[0]

    assert res.data == []


def test_external_images(httpserver: HTTPServer):
    img = "http://example.com/external.png"
    httpserver.expect_request("/").respond_with_data(
        f'<img src="{img}" />',
        content_type="text/html",
    )

    crawler = Crawler(httpserver.url_for("/"),
                      verbose=False,
                      plugins=["ExternalImages"])
    (res, ) = crawler.asyncio_crawl(save=False)

    expected_data = [ResultData(img)]
    assert res.data == expected_data
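

# A minimal usage sketch (not part of the test suite): driving the same Crawler
# entry points the tests above exercise, against a live site instead of the
# local test server. The URL and plugin selection here are illustrative only.
if __name__ == "__main__":
    crawler = Crawler(
        "https://example.com/",
        verbose=True,
        plugins=["ExternalLinks", "DuplicateTitle"],
    )
    results = crawler.asyncio_crawl(save=False)
    for result in results:
        print(result.data)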