def test_external_external(servers):
    """ Redirections tested via an added 2nd domain and extra external domains. """

    CONTENT = """ Example of the index page
        <a href="{}">external link 1</a> | <a href="{}">external link 2</a>
    """

    # these urls are not supposed to be found in the index
    external_urls = (
        "http://example.com",
        "http://example.org",
    )

    # external domain with catfished urls
    linked_domain = servers[0].router({
        '^/$': Page(CONTENT.format(*external_urls)).exists(),
    })

    site_to_index = servers[1].router({
        '^/$': Page("<a href='{0}'>{0}</a>".format(linked_domain)).exists(),
    })

    c = Crawler(Settings(site_to_index, check_external_urls=True))
    c.start()

    # convert index to a list of urls
    links = [link.url() for link in c.index]

    assert linked_domain in links
    assert external_urls[0] not in links
    assert external_urls[1] not in links
def test_double_start(simple_site):
    c = Crawler(Settings(simple_site, threads=10))
    c.start()

    # a second start() should be a no-op and not take the same time again.
    c.start()
def test_crawling_advanced(
        site_with_links,
        check_external,
        threads,
        ignore_domains,
        ignore_pathes,
        results,
):
    options = {
        'check_external_urls': check_external,
        'stay_within_path': False,
        'threads': threads,
        'ignore_domains': ignore_domains,
        'ignore_pathes': ignore_pathes,
    }

    c = Crawler(Settings(site_with_links, **options))
    c.start()

    indexed, failed, succeed, ignored, redirected = results

    assert len(c.index) == indexed
    assert len(c.redirected) == redirected
    assert len(c.failed) == failed
    assert len(c.succeed) == succeed
    assert len(c.ignored) == ignored
def test_robots_txt_reject_all_off(server):
    robots_txt = Page("User-agent: *\nDisallow: /").mime('text/plain').exists()
    address = server.router(pages(robots_txt))

    c = Crawler(Settings(address, check_robots_txt=False))
    c.start()
def test_redirected_links(server):
    from random import sample

    pages = list(range(1, 51))
    format_link = lambda x: "<a href='/link-%s'>link</a>" % x

    routes = {
        '^/$': Page(" / ".join(map(format_link, sample(pages, 4)))).exists(),
        r'^/link-\d{1,2}$': Page("").exists().redirects(pattern='%s/'),
    }

    for step in pages:
        route_key = '^/link-%s/$' % step
        route_contents = Page(" / ".join(map(format_link, sample(pages, 4)))).exists()
        routes.update({route_key: route_contents})

    address = server.router(routes)

    settings = Settings(address, threads=10)
    c = Crawler(settings)
    c.start()

    assert 1 < len(c.index) <= (2 * len(pages) + 1)
    assert 1 < len(c.redirected) <= len(pages)
    assert len(c.failed) == 0
def test_crawler_update_link(server):
    address = server.router({
        '^/$': Page("ok").exists().unlock_after(2),
        r'^/link-\d{1,2}$': Page("").exists().redirects(pattern='%s/'),
    })

    c = Crawler(Settings(address, retry=1))
    c.start()

    assert len(c.failed) == 1

    url = c.failed[0]
    c.update(url)

    assert len(c.failed) == 1

    # add to the test a redirected url that has not been seen before
    with pytest.raises(TypeError):
        c.update(address + 1)

    url = address + "/link-1"
    c.update(url)

    assert len(c.failed) == 1
    assert len(c.index) == 2
    assert len(c.undefined) == 1
def test_defaults(server, threads):
    """ Solo/multi-threading with default settings. """

    # there are 2*3 links on the page, and half of them are working
    links_number = 3

    HTML_FORMATTER = lambda x: "<a href='{}-{{0}}'>{{0}}</a>".format(x) #pylint: disable-msg=W0108
    LINK_FORMATTER = lambda x: HTML_FORMATTER("link").format(x)
    LIMK_FORMATTER = lambda x: HTML_FORMATTER("limk").format(x)

    index_html = "<!-- index page -->"
    index_html += " - ".join(map(LINK_FORMATTER, range(links_number))) # 3 good links (link-*)
    index_html += " - ".join(map(LIMK_FORMATTER, range(links_number))) # 3 bad links (limk-*)

    address = server.router({
        '^/$': Page(index_html).exists(),
        r'link-\d{1,}': Page("ok").exists(),
        r'limk-\d{1,}': Page("error").not_exists(),
    })

    c = Crawler(Settings(address, threads=threads))
    c.start()

    assert len(c.index) == (1 + 2 * links_number)
    assert len(c.failed) == links_number
    assert len(c.succeed) == (1 + links_number)
    assert not c.ignored
def test_within_site_root(server):
    """ Checks the case when a url without a trailing slash is ignored
        because it does not stay within the path. """

    addr = server.acquire_new_addr()

    CONTENT = """
        <a href="http://{0}:{1}">link</a>
        <a href="http://{0}:{1}/">link</a>
    """.format(*addr)

    CONTENT_DOCS = CONTENT.replace('">', '/docs/">').replace('//docs/', '/docs')

    address = server.router({
        '^/$': Page(CONTENT).exists(),
        '^/docs/?$': Page(CONTENT_DOCS).exists(),
    })

    for base in {address.rstrip("/") + "/", address.rstrip("/") + "/docs/"}:
        settings = Settings(base, stay_within_path=True)
        c = Crawler(settings)
        c.start()

        assert len(c.ignored) == 0
def test_robots_txt_reject_all(server):
    robots_txt = Page("User-agent: *\nDisallow: /").mime('text/plain').exists()
    address = server.router(pages(robots_txt))

    with pytest.raises(DeadlinksIgnoredURL):
        c = Crawler(Settings(address))
        c.start()
def test_robots_txt_allow_user_agent(server):
    robots_txt = Page("User-agent: *\nDisallow: /link").mime('text/plain').exists()
    address = server.router(pages(robots_txt))

    c = Crawler(Settings(address))
    c.start()

    assert len(c.ignored) == 100
def test_crawling_retry(server, unlocked_after, do_retries, fails):
    address = server.router({
        '^/$': Page("ok").exists().unlock_after(unlocked_after),
    })

    c = Crawler(Settings(address, retry=do_retries))
    c.start()

    assert len(c.failed) == fails
def test_failed_domain():
    """ A random (non-existent) domain should fail: its robots.txt cannot be retrieved. """
    from random import choice
    from string import ascii_lowercase

    domain = "http://%s.com/" % ''.join([choice(ascii_lowercase) for x in range(42)])

    c = Crawler(Settings(domain))
    c.start()

    assert len(c.failed) == 1
def test_failed_google():
    c = Crawler(
        Settings(
            "http://google.com/search/about/",
            **{
                'stay_within_path': True,
                'check_external_urls': False,
            },
        ))
    c.start()

    assert len(c.succeed) == 1
def test_redirections(server):
    address = server.router({
        '^/$': Page("<a href='/link-1'></a>").exists(),
        r'^/link-\d{1,}$': Page("ok").exists().redirects(pattern='%s/'),
        r'^/link-\d{1,}/$': Page("ok").exists(),
    })

    c = Crawler(Settings(address))
    c.start()

    assert len(c.redirected) == 1, "one redirected url expected"
    assert len(c.succeed) == 2, "two successful urls expected"
def test_index_within_path(simple_site, stay_within_path, check_external, results):
    baseurl = "{}/{}".format(simple_site.rstrip("/"), "projects/")

    options = {
        'stay_within_path': stay_within_path,
        'check_external_urls': check_external,
        'threads': 10,
    }

    c = Crawler(Settings(baseurl, **options))
    c.start()

    exists, failed, ignored = results

    assert len(c.succeed) == exists
    assert len(c.failed) == failed
    assert len(c.ignored) == ignored
def test_mailto(server):
    """ Extra mailto test. """

    MAILTO = "mailto:[email protected]"
    CONTENT = """ <a href="{}">mail link</a>""".format(MAILTO)

    address = server.router({
        '^/$': Page(CONTENT).exists(),
    })

    c = Crawler(Settings(address, check_external_urls=True))
    c.start()

    assert len(c.ignored) == 1
    assert MAILTO in c.ignored
    assert len(c.failed) == 0
    assert len(c.index) == 2
def test_no_index_page(server):
    from random import sample

    pages = list(range(1, 51))
    format_link = lambda x: "<a href='/link-%s'>link</a>" % x

    routes = {
        '^/$': Page("").exists(),
    }

    for step in pages:
        route_key = '^/link-%s/$' % step
        route_contents = Page(" / ".join(map(format_link, sample(pages, 4)))).exists()
        routes.update({route_key: route_contents})

    address = server.router(routes)

    settings = Settings(address, threads=10)
    c = Crawler(settings)
    c.start()

    assert len(c.index) == 1
def test_gobyexample():
    """ Special case: AWS substitute robots.txt. """

    with pytest.raises(DeadlinksIgnoredURL):
        c = Crawler(Settings("https://gobyexample.com"))
        c.start()