Example #1 (IgnorePDF): links served with an application/pdf content type are kept out of crawler.all_urls.
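All twelve snippets share the same scaffolding: pytest-httpserver supplies the httpserver fixture, while Crawler and ResultData belong to the crawler project under test. A minimal sketch of the assumed preamble, with the project import path a guess since the excerpts never show the module layout:

import lorem  # placeholder-text generator, used by the DuplicateContent test
from pytest_httpserver import HTTPServer

# Hypothetical import path: Crawler and ResultData are the project's own
# names, but their module is never shown in these excerpts.
from crawler import Crawler, ResultData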
def test_ignoring_pdf(httpserver: HTTPServer):
    httpserver.expect_request("/page1").respond_with_data(
        "<h1>Page 1</h1>",
        content_type="text/html",
    )
    httpserver.expect_request("/page2.pdf").respond_with_data(
        "<h1>Page 2</h1>",
        content_type="application/pdf",
    )

    httpserver.expect_request("/").respond_with_data(
        f"""
        <a href="{httpserver.url_for("/page1")}">page1</a>
        <a href="{httpserver.url_for("/page2.pdf")}">page2</a>
        """,
        content_type="text/html",
    )

    crawler = Crawler(httpserver.url_for("/"),
                      verbose=False,
                      plugins=["IgnorePDF"])
    crawler.asyncio_crawl(save=False)

    assert sorted(crawler.all_urls) == sorted(
        [httpserver.url_for("/page1"),
         httpserver.url_for("/")])
Example #2 (basic crawl): the crawler follows an internal link and records both URLs.
def test_can_crawl(httpserver: HTTPServer):
    httpserver.expect_request("/page2").respond_with_data(
        "<h1>page2</h1>", content_type="text/html")
    page2_url = httpserver.url_for("/page2")
    httpserver.expect_request("/").respond_with_data(
        f'<a href="{page2_url}">page 2</a>', content_type="text/html")

    crawler = Crawler(httpserver.url_for("/"), verbose=False)
    crawler.asyncio_crawl(save=False)

    assert len(crawler.all_urls) == 2
    assert page2_url in crawler.all_urls
Example #3 (ExternalLinksByURL): each external URL is reported together with the sorted list of internal pages that link to it.
def test_external(httpserver: HTTPServer):
    link = "http://example.com/"
    httpserver.expect_request("/page1").respond_with_data(
        f'<a href="{link}">external</a>',
        content_type="text/html",
    )
    httpserver.expect_request("/page2").respond_with_data(
        f'<a href="{link}">external</a>',
        content_type="text/html",
    )
    httpserver.expect_request("/").respond_with_data(
        f"""
        <img src="internal.png" />
        <a href="{httpserver.url_for("/page1")}">page1</a>
        <a href="{httpserver.url_for("/page2")}">page2</a>
        """,
        content_type="text/html",
    )

    crawler = Crawler(httpserver.url_for("/"),
                      verbose=False,
                      plugins=["ExternalLinksByURL"])
    (res, ) = crawler.asyncio_crawl(save=False)

    expected_data = [
        ResultData(
            link,
            sorted(
                [httpserver.url_for("/page1"),
                 httpserver.url_for("/page2")]))
    ]

    assert res.data == expected_data
Example #4 (Internal301): internal URLs that answer 301 are reported along with the pages that link to them.
def test_internal_301(httpserver: HTTPServer):
    httpserver.expect_request("/page2").respond_with_data(
        "", status=301, content_type="text/html")
    page2_url = httpserver.url_for("/page2")

    httpserver.expect_request("/page3").respond_with_data(
        f'<a href="{page2_url}">page2</a>',
        status=301,
        content_type="text/html")
    page3_url = httpserver.url_for("/page3")

    httpserver.expect_request("/").respond_with_data(
        f"""
    <a href="{page2_url}">page2</a>
    <a href="{page3_url}">page3</a>
    """,
        content_type="text/html",
    )
    page1_url = httpserver.url_for("/")

    crawler = Crawler(httpserver.url_for("/"),
                      verbose=False,
                      plugins=["Internal301"])
    (res, ) = crawler.asyncio_crawl(save=False)

    expected_data = [
        ResultData(page2_url, page2_url, sorted([page1_url, page3_url])),
        ResultData(page3_url, page3_url, [page1_url]),
    ]

    assert res.data == expected_data
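Across the examples, ResultData is constructed with one (Example #9), two (Example #3), or three (Example #4) positional arguments, so its real definition is plugin-specific or variadic. Purely as a reading aid, a permissive stand-in that would satisfy the equality assertions:

class ResultData:
    # Hypothetical stand-in: the project's actual class never appears in
    # these excerpts. The assertions only need positional construction
    # and value equality.
    def __init__(self, *fields):
        self.fields = fields

    def __eq__(self, other):
        return isinstance(other, ResultData) and self.fields == other.fields

    def __repr__(self):
        return f"ResultData{self.fields}"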
Example #5 (DuplicateTitle): pages sharing the same <title> are reported together under that title.
def test_duplicates(httpserver: HTTPServer):
    title = "content"
    httpserver.expect_request("/page2").respond_with_data(
        get_page("<h1>page2</h1>", title), content_type="text/html")
    page2_url = httpserver.url_for("/page2")

    httpserver.expect_request("/page3").respond_with_data(
        get_page("<h1>page3</h1>", "Other title"), content_type="text/html")
    page3_url = httpserver.url_for("/page3")

    httpserver.expect_request("/").respond_with_data(
        get_page(
            f"""
        <h1>page1</h1>
        <a href="{page2_url}">page 2</a>
        <a href="{page3_url}">page 3</a>
        """,
            title,
        ),
        content_type="text/html",
    )

    crawler = Crawler(httpserver.url_for("/"),
                      verbose=False,
                      plugins=["DuplicateTitle"])
    (res, ) = crawler.asyncio_crawl(save=False)

    expected_data = [
        ResultData(
            title,
            sorted([httpserver.url_for("/"),
                    httpserver.url_for("/page2")]))
    ]

    assert res.data == expected_data
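Example #5 calls a get_page helper that is not shown. From its call sites it takes a body fragment and a title; a plausible reconstruction, hypothetical rather than the project's actual helper:

def get_page(body: str, title: str) -> str:
    # Hypothetical reconstruction: wrap the fragment in a minimal HTML
    # document with the given <title>, the field DuplicateTitle compares.
    return f"<html><head><title>{title}</title></head><body>{body}</body></html>"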
Example #6 (ExternalLinks, negative case): a relative link is not flagged as external.
def test_not_external(httpserver: HTTPServer):
    httpserver.expect_request("/").respond_with_data(
        '<a href="/bob">bob</a>',
        content_type="text/html",
    )

    crawler = Crawler(httpserver.url_for("/"), verbose=False, plugins=["ExternalLinks"])
    (res,) = crawler.asyncio_crawl(save=False)

    assert res.data == []
Example #7 (plugin dispatch): asyncio_crawl returns one result object per configured plugin.
def test_runs_plugin(httpserver: HTTPServer):
    httpserver.expect_request("/").respond_with_data("<h1>page2</h1>",
                                                     content_type="text/html")

    crawler = Crawler(httpserver.url_for("/"),
                      verbose=False,
                      plugins=["MultipleH1", "MissingH1"])
    results = crawler.asyncio_crawl(save=False)

    assert len(results) == 2
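Example #7 implies that asyncio_crawl returns one result per name passed in plugins, which is also why the other tests unpack with (res, ) = ...: the unpacking doubles as an assertion that exactly one plugin ran. The plugin interface itself never appears in these excerpts; the sketch below is only a guess at the shape the tests imply, with the hook name and BeautifulSoup argument both assumed:

class MissingH1:
    # Hypothetical plugin shape: the project's real base class and hook
    # names are not shown in these excerpts.
    def __init__(self):
        self.data = []  # collected findings, exposed to tests as res.data

    def process(self, url, soup):  # assumed per-page hook (BeautifulSoup doc)
        if soup.find("h1") is None:
            self.data.append(ResultData(url))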
Example #8 (ExternalImages, negative case): a relative image src is not flagged as external.
def test_not_external(httpserver: HTTPServer):
    httpserver.expect_request("/").respond_with_data(
        '<img src="internal.png" />',
        content_type="text/html",
    )

    crawler = Crawler(httpserver.url_for("/"),
                      verbose=False,
                      plugins=["ExternalImages"])
    (res, ) = crawler.asyncio_crawl(save=False)

    assert res.data == []
Example #9 (ExternalLinks): an absolute off-site link is reported.
def test_external(httpserver: HTTPServer):
    link = "http://example.com/"
    httpserver.expect_request("/").respond_with_data(
        f'<a href="{link}">external</a>',
        content_type="text/html",
    )

    crawler = Crawler(httpserver.url_for("/"), verbose=False, plugins=["ExternalLinks"])
    (res,) = crawler.asyncio_crawl(save=False)

    expected_data = [ResultData(link)]

    assert res.data == expected_data
Example #10 (DuplicateContent, negative case): pages with distinct generated bodies produce no duplicate-content results.
def test_no_duplicates(httpserver: HTTPServer):
    httpserver.expect_request("/page2").respond_with_data(
        f"<p>{lorem.paragraph()}</p>", content_type="text/html")
    page2_link = f'<a href="{httpserver.url_for("/page2")}">page 2</a>'
    httpserver.expect_request("/page3").respond_with_data(
        f"<p>{lorem.paragraph()}</p>", content_type="text/html")
    page3_link = f'<a href="{httpserver.url_for("/page3")}">page 3</a>'

    httpserver.expect_request("/").respond_with_data(
        f"{page2_link}{page3_link}", content_type="text/html")

    crawler = Crawler(httpserver.url_for("/"),
                      verbose=False,
                      plugins=["DuplicateContent"])
    (res,) = crawler.asyncio_crawl(save=False)

    expected_data = []

    assert res.data == expected_data
Example #11 (DuplicateH1, negative case): distinct <h1> texts produce no duplicates.
def test_no_duplicates(httpserver: HTTPServer):
    httpserver.expect_request("/page2").respond_with_data(
        "<h1>page2</h1>", content_type="text/html")
    page2_url = httpserver.url_for("/page2")
    httpserver.expect_request("/").respond_with_data(
        f'<h1>page1</h1><a href="{page2_url}">page 2</a>',
        content_type="text/html")

    crawler = Crawler(httpserver.url_for("/"),
                      verbose=False,
                      plugins=["DuplicateH1"])
    res = crawler.asyncio_crawl(save=False)[0]

    assert res.data == []
Example #12 (ExternalImages): an absolute off-site image src is reported.
def test_external(httpserver: HTTPServer):
    img = "http://example.com/external.png"
    httpserver.expect_request("/").respond_with_data(
        f'<img src="{img}" />',
        content_type="text/html",
    )

    crawler = Crawler(httpserver.url_for("/"),
                      verbose=False,
                      plugins=["ExternalImages"])
    (res, ) = crawler.asyncio_crawl(save=False)

    expected_data = [ResultData(img)]

    assert res.data == expected_data