def test_crawl_follow_external_links_false():
    serve()

    rsps = list(
        http_crawler.crawl('http://localhost:8000/', follow_external_links=False))

    assert len(rsps) == 12

    urls = [rsp.url for rsp in rsps]
    assert len(urls) == len(set(urls))
    assert set(urls) == {
        'http://localhost:8000/',
        'http://localhost:8000/pages/page-1/',
        'http://localhost:8000/pages/page-2/',
        'http://localhost:8000/pages/page-3/',
        'http://localhost:8000/assets/styles.css',
        'http://localhost:8000/assets/styles-2.css',
        'http://localhost:8000/assets/image.jpg',
        'http://localhost:8000/assets/script.js',
        'http://localhost:8000/assets/tile-1.jpg',
        'http://localhost:8000/assets/tile-2.jpg',
        'http://localhost:8000/assets/somefont.eot',
        'http://localhost:8000/assets/somefont.ttf',
    }
def test_crawl_ignore_fragments_false():
    serve()

    rsps = list(http_crawler.crawl('http://localhost:8000/', ignore_fragments=False))

    # assert len(rsps) == 14

    urls = [rsp.url for rsp in rsps]
    # assert len(urls) == len(set(urls))
    assert set(urls) == {
        'http://localhost:8000/',
        'http://localhost:8000/pages/page-1/',
        'http://localhost:8000/pages/page-1/#anchor',
        'http://localhost:8000/pages/page-2/',
        'http://localhost:8000/pages/page-2/#anchor',
        'http://localhost:8000/pages/page-3/',
        'http://localhost:8000/pages/page-3/#anchor',
        'http://localhost:8000/assets/styles.css',
        'http://localhost:8000/assets/styles-2.css',
        'http://localhost:8000/assets/image.jpg',
        'http://localhost:8000/assets/script.js',
        'http://localhost:8000/assets/tile-1.jpg',
        'http://localhost:8000/assets/tile-2.jpg',
        'http://localhost:8000/assets/somefont.eot',
        'http://localhost:8000/assets/somefont.ttf',
        'http://localhost:8001/pages/page-1/',
        'http://localhost:8001/pages/page-1/#anchor',
    }
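# A minimal sketch (not http_crawler's actual implementation) of the
# behaviour the two tests above exercise: with ignore_fragments=True (the
# default), URLs that differ only by fragment collapse to a single crawl
# target, whereas ignore_fragments=False keeps them distinct. The helper
# name `normalise_url` is made up for illustration.
from urllib.parse import urldefrag

def normalise_url(url, ignore_fragments=True):
    # urldefrag('http://x/#y').url == 'http://x/' -- strips the fragment
    return urldefrag(url).url if ignore_fragments else url

assert normalise_url('http://localhost:8000/pages/page-1/#anchor') == \
    'http://localhost:8000/pages/page-1/'
assert normalise_url('http://localhost:8000/pages/page-1/#anchor',
                     ignore_fragments=False) == \
    'http://localhost:8000/pages/page-1/#anchor'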
def test_no_links_are_broken(baseurl):
    responses = []
    for rsp in crawl(baseurl, follow_external_links=False):
        responses.append(rsp)

    failed_responses = [rsp for rsp in responses if rsp.status_code != 200]
    failures = [f'{rsp.url} ({rsp.status_code})' for rsp in failed_responses]
    print('\n'.join(failures))
    assert len(failures) == 0
def buildsite(self, port):
    call_command('loadpages')

    output_path = os.path.join(settings.BASE_DIR, 'output')
    shutil.rmtree(output_path, ignore_errors=True)

    crawl_options = getattr(settings, 'DJANGO_AMBER_CRAWL_OPTIONS', {})

    for rsp in http_crawler.crawl('http://localhost:{}/'.format(port), **crawl_options):
        rsp.raise_for_status()

        parsed_url = http_crawler.urlparse(rsp.url)

        if parsed_url.netloc != 'localhost:{}'.format(port):
            # This is an external request, which we don't care about
            continue

        path = parsed_url.path
        segments = path.split('/')
        assert segments[0] == ''

        if segments[-1] == '':
            rel_dir_path = os.path.join(*segments[:-1])
            filename = 'index.html'
        elif '.' in segments[-1]:
            rel_dir_path = os.path.join(*segments[:-1])
            filename = segments[-1]
        else:
            rel_dir_path = os.path.join(*segments)
            filename = 'index.html'

        dir_path = os.path.join(output_path, rel_dir_path)
        os.makedirs(dir_path, exist_ok=True)

        with open(os.path.join(dir_path, filename), 'wb') as f:
            f.write(rsp.content)

    cname = getattr(settings, 'DJANGO_AMBER_CNAME', None)
    if cname:
        with open(os.path.join(output_path, 'CNAME'), 'w') as f:
            f.write(cname)
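# A standalone sketch of the URL-to-file mapping buildsite applies above,
# assuming the same rules: a trailing slash or an extension-less final path
# segment maps to .../index.html, otherwise the final segment is kept as the
# filename. `output_path_for` is a hypothetical helper, not part of
# django-amber or http_crawler.
import os
from urllib.parse import urlparse

def output_path_for(url, output_root='output'):
    segments = urlparse(url).path.split('/')
    assert segments[0] == ''
    if segments[-1] == '':
        return os.path.join(output_root, *segments[:-1], 'index.html')
    elif '.' in segments[-1]:
        return os.path.join(output_root, *segments[:-1], segments[-1])
    else:
        return os.path.join(output_root, *segments, 'index.html')

# For example (on a POSIX filesystem):
#   output_path_for('http://localhost:8000/')                  -> 'output/index.html'
#   output_path_for('http://localhost:8000/pages/page-1/')     -> 'output/pages/page-1/index.html'
#   output_path_for('http://localhost:8000/assets/styles.css') -> 'output/assets/styles.css'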
def test_ssl_verify_true():
    serve()

    with pytest.raises(SSLError):
        list(http_crawler.crawl('https://localhost:8002', verify=True))
def test_ssl_verify_false():
    serve()

    rsps = list(http_crawler.crawl('https://localhost:8002', verify=False))
    assert len(rsps) == 1