Example #1
def test_external_external(servers):
    """ Redirections tested via added 2nd domain and extra external domains. """

    CONTENT = """ Example of the index page
        <a href="{}">external link 1</a> | <a href="{}">external link 2</a>
    """

    # these URLs are not supposed to end up in the index
    external_urls = (
        "http://example.com",
        "http://example.org",
    )

    # external domain whose index page baits the crawler with the decoy URLs above
    linked_domain = servers[0].router({
        '^/$':
        Page(CONTENT.format(*external_urls)).exists(),
    })

    site_to_index = servers[1].router({
        '^/$':
        Page("<a href='{0}'>{0}</a>".format(linked_domain)).exists(),
    })

    c = Crawler(Settings(site_to_index, check_external_urls=True))
    c.start()

    # collect the indexed URLs as plain strings
    links = [link.url() for link in c.index]

    assert linked_domain in links
    assert external_urls[0] not in links
    assert external_urls[1] not in links
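
With check_external_urls=True the crawler requests each external link once to verify it, but does not crawl the external pages any further, which is why the two decoy URLs never reach the index. A minimal standalone sketch of the same pattern (the base URL is illustrative, not part of the suite):

c = Crawler(Settings("http://site-under-test.local/", check_external_urls=True))
c.start()

indexed_urls = [link.url() for link in c.index]  # every checked URL as a plain string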
Example #2
def test_double_start(simple_site):

    c = Crawler(Settings(simple_site, threads=10))
    c.start()

    # a second start() should be a no-op and not take the same time again
    c.start()
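
The point of calling start() twice is that the second call is expected to be a cheap no-op rather than a full re-crawl. A rough way to observe that, sketched with an illustrative URL:

import time

c = Crawler(Settings("http://site.local/", threads=10))  # illustrative URL

began = time.monotonic()
c.start()
first_run = time.monotonic() - began

began = time.monotonic()
c.start()  # nothing left to crawl; should return almost immediately
second_run = time.monotonic() - began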
Example #3
def test_crawling_advanced(
    site_with_links,
    check_external,
    threads,
    ignore_domains,
    ignore_pathes,
    results,
):
    options = {
        'check_external_urls': check_external,
        'stay_within_path': False,
        'threads': threads,
        'ignore_domains': ignore_domains,
        'ignore_pathes': ignore_pathes,
    }
    c = Crawler(Settings(site_with_links, **options))
    c.start()

    indexed, failed, succeed, ignored, redirected = results

    assert len(c.index) == indexed
    assert len(c.redirected) == redirected
    assert len(c.failed) == failed
    assert len(c.succeed) == succeed
    assert len(c.ignored) == ignored
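
The fixture parameters map directly onto Settings keyword arguments. A standalone sketch with illustrative values (the base URL and the domain/path filters are placeholders, not taken from the suite):

settings = Settings(
    "http://site.local/",                  # placeholder base URL
    check_external_urls=True,
    stay_within_path=False,
    threads=4,
    ignore_domains=["ads.example.com"],    # placeholder domain filter
    ignore_pathes=["issues/new", "edit"],  # placeholder path filter
)
c = Crawler(settings)
c.start()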
Example #4
def test_robots_txt_reject_all_off(server):

    robots_txt = Page("User-agent: *\nDisallow: /").mime('text/plain').exists()
    address = server.router(pages(robots_txt))

    c = Crawler(Settings(address, check_robots_txt=False))
    c.start()
Example #5
def test_redirected_links(server):

    from random import sample

    pages = list(range(1, 51))
    format_link = lambda x: "<a href='/link-%s'>link</a>" % x

    routes = {
        '^/$': Page(" / ".join(map(format_link, sample(pages, 4)))).exists(),
        r'^/link-\d{1,2}$': Page("").exists().redirects(pattern='%s/'),
    }

    for step in pages:
        route_key = '^/link-%s/$' % step
        route_contents = Page(" / ".join(map(format_link,
                                             sample(pages, 4)))).exists()
        routes.update({route_key: route_contents})

    address = server.router(routes)

    settings = Settings(address, threads=10)
    c = Crawler(settings)
    c.start()

    assert 1 < len(c.index) <= (2 * len(pages) + 1)
    assert 1 < len(c.redirected) <= len(pages)
    assert len(c.failed) == 0
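
The bounds in the assertions follow from the routing table rather than from the crawl itself: the index can contain at most the root page plus every '/link-N' redirect source and every '/link-N/' target, and each '/link-N' contributes at most one redirect. Because every page body is a random sample of four links, not all pages are guaranteed to be reached, hence the upper bounds:

pages = list(range(1, 51))
max_indexed = 2 * len(pages) + 1   # root + 50 redirect sources + 50 targets = 101
max_redirected = len(pages)        # at most one redirect per '/link-N'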
Example #6
def test_crawler_update_link(server):

    address = server.router({
        '^/$':
        Page("ok").exists().unlock_after(2),
        r'^/link-\d{1,2}$':
        Page("").exists().redirects(pattern='%s/'),
    })

    c = Crawler(Settings(address, retry=1))
    c.start()

    assert len(c.failed) == 1
    url = c.failed[0]
    # print("OUT", url, url.status)
    c.update(url)
    assert len(c.failed) == 1

    # add a previously unseen redirected URL to the test
    with pytest.raises(TypeError):
        c.update(address + 1)

    url = address + "/link-1"
    c.update(url)
    assert len(c.failed) == 1
    assert len(c.index) == 2
    assert len(c.undefined) == 1
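
update() re-checks a single URL in place instead of re-running the whole crawl, and a non-string argument is rejected with a TypeError, as the pytest.raises block above shows. A sketch of the same call with a hypothetical sibling link:

# hypothetical follow-up, not part of the suite: re-check another redirecting link
c.update(address + "/link-2")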
Example #7
def test_defaults(server, threads):
    """ Solo/Multi - Threading with default settings """

    # there are 2*3 links on the page, and half of them are working
    links_number = 3

    HTML_FORMATTER = lambda x: "<a href='{}-{{0}}'>{{0}}</a>".format(
        x)  #pylint: disable-msg=W0108
    LINK_FORMATTER = lambda x: HTML_FORMATTER("link").format(x)
    LIMK_FORMATTER = lambda x: HTML_FORMATTER("limk").format(x)

    index_html = "<!-- index page -->"
    index_html += " - ".join(map(LINK_FORMATTER,
                                 range(links_number)))  # 10 good links
    index_html += " - ".join(map(LIMK_FORMATTER,
                                 range(links_number)))  # 10 bad links

    address = server.router({
        '^/$': Page(index_html).exists(),
        'link-\d{1,}': Page("ok").exists(),
        'limk-\d{1,}': Page("error").not_exists(),
    })

    c = Crawler(Settings(address, threads=threads))
    c.start()

    assert len(c.index) == (1 + 2 * links_number)
    assert len(c.failed) == links_number
    assert len(c.succeed) == (1 + links_number)
    assert not c.ignored
Example #8
def test_within_site_root(server):
    """
        This Test checks a case when url without trailing slash is ignored
        because it's not stays within path.
    """

    addr = server.acquire_new_addr()
    CONTENT = """
        <a href="http://{0}:{1}">link</a>
        <a href="http://{0}:{1}/">link</a>
    """.format(*addr)

    CONTENT_DOCS = CONTENT.replace('">',
                                   '/docs/">').replace('//docs/', '/docs')

    address = server.router({
        '^/$': Page(CONTENT).exists(),
        '^/docs/?$': Page(CONTENT_DOCS).exists(),
    })

    for base in {address.rstrip("/") + "/", address.rstrip("/") + "/docs/"}:
        settings = Settings(base, stay_within_path=True)
        c = Crawler(settings)
        c.start()

        assert len(c.ignored) == 0
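
With stay_within_path=True the crawler only follows links that live under the base path. The page links to both the slash-less and the slash-terminated form of the same location, and the assertion expects neither variant to be wrongly treated as leaving the path. Sketched standalone with an illustrative base URL:

c = Crawler(Settings("http://site.local/docs/", stay_within_path=True))
c.start()
out_of_path = len(c.ignored)   # links pointing outside /docs/ would be counted here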
Example #9
def test_robots_txt_reject_all(server):

    robots_txt = Page("User-agent: *\nDisallow: /").mime('text/plain').exists()
    address = server.router(pages(robots_txt))

    with pytest.raises(DeadlinksIgnoredURL):
        c = Crawler(Settings(address))
        c.start()
Example #10
def test_robots_txt_allow_user_agent(server):

    robots_txt = Page("User-agent: *\nDisallow: /link").mime(
        'text/plain').exists()
    address = server.router(pages(robots_txt))

    c = Crawler(Settings(address))
    c.start()

    assert len(c.ignored) == 100
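
pages(robots_txt) is a suite helper; judging from the assertion it presumably builds a routing table in which the robots.txt page is served at its well-known location next to a set of '/link-…' pages that the Disallow rule then excludes. A hedged guess at an equivalent explicit routing table, using only the Page and server.router API shown in the other examples (the route keys and link pages are assumptions):

routes = {
    '^/robots.txt$': Page("User-agent: *\nDisallow: /link").mime('text/plain').exists(),
    '^/$': Page("<a href='/link-1'>1</a> <a href='/link-2'>2</a>").exists(),
    r'^/link-\d+$': Page("ok").exists(),
}
address = server.router(routes)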
Example #11
def test_crawling_retry(server, unlocked_after, do_retries, fails):
    address = server.router({
        '^/$':
        Page("ok").exists().unlock_after(unlocked_after),
    })

    c = Crawler(Settings(address, retry=do_retries))
    c.start()

    assert len(c.failed) == fails
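
unlock_after(n) makes a page fail for its first n requests and succeed afterwards, while retry sets how many extra attempts the crawler makes, so fails is expected to be 1 whenever the retry budget runs out before the page unlocks (compare Example #6, where unlock_after(2) with retry=1 still fails). The parameters presumably arrive via something like the following parametrization; the values shown are illustrative only:

@pytest.mark.parametrize("unlocked_after, do_retries, fails", [
    (2, 1, 1),   # locked for two requests, a single retry is not enough (cf. Example #6)
])
def test_crawling_retry(server, unlocked_after, do_retries, fails):
    ...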
Example #12
def test_failed_domain():
    """ Some random domain should fails (robots.txt fails to be retrived)"""

    from random import choice
    from string import ascii_lowercase

    domain = "http://%s.com/" % ''.join(
        [choice(ascii_lowercase) for x in range(42)])
    c = Crawler(Settings(domain))
    c.start()

    assert len(c.failed) == 1
Example #13
def test_failed_google():

    c = Crawler(
        Settings(
            "http://google.com/search/about/",
            **{
                'stay_within_path': True,
                'check_external_urls': False,
            },
        ))
    c.start()

    assert len(c.succeed) == 1
Example #14
def test_redirections(server):

    address = server.router({
        '^/$':
        Page("<a href='/link-1'></a>").exists(),
        r'^/link-\d{1,}$':
        Page("ok").exists().redirects(pattern='%s/'),
        r'^/link-\d{1,}/$':
        Page("ok").exists(),
    })

    c = Crawler(Settings(address))
    c.start()

    assert len(c.redirected) == 1, "expected exactly one redirected link"
    assert len(c.succeed) == 2, "expected the root page and /link-1/ to succeed"
Example #15
def test_index_within_path(simple_site, stay_within_path, check_external,
                           results):

    baseurl = "{}/{}".format(simple_site.rstrip("/"), "projects/")
    options = {
        'stay_within_path': stay_within_path,
        'check_external_urls': check_external,
        'threads': 10,
    }
    c = Crawler(Settings(baseurl, **options))
    c.start()

    exists, failed, ignored = results

    assert len(c.succeed) == exists
    assert len(c.failed) == failed
    assert len(c.ignored) == ignored
Example #16
def test_mailto(server):
    """ Extra mailto test. """

    MAILTO = "mailto:[email protected]"
    CONTENT = """  <a href="{}">mail link</a>""".format(MAILTO)

    address = server.router({
        '^/$': Page(CONTENT).exists(),
    })

    c = Crawler(Settings(address, check_external_urls=True))
    c.start()

    assert len(c.ignored) == 1
    assert MAILTO in c.ignored

    assert len(c.failed) == 0
    assert len(c.index) == 2
Example #17
def test_no_index_page(server):

    from random import sample

    pages = list(range(1, 51))
    format_link = lambda x: "<a href='/link-%s'>link</a>" % x

    routes = {
        '^/$': Page("").exists(),
    }

    for step in pages:
        route_key = '^/link-%s/$' % step
        route_contents = Page(" / ".join(map(format_link,
                                             sample(pages, 4)))).exists()
        routes.update({route_key: route_contents})

    address = server.router(routes)

    settings = Settings(address, threads=10)
    c = Crawler(settings)
    c.start()

    assert len(c.index) == 1
Example #18
def test_gobyexample():
    """ special case - aws substitute robots.txt """

    with pytest.raises(DeadlinksIgnoredURL):
        c = Crawler(Settings("https://gobyexample.com"))
        c.start()