def test_crawl_follow_external_links_false():
    """Crawl localhost:8000 with ``follow_external_links=False``.

    Expects exactly twelve responses, no URL reported twice, and every
    reported URL to be one of the localhost:8000 pages/assets listed below.
    """
    # presumably starts the local fixture server(s) -- confirm against serve()
    serve()

    expected_urls = {
        'http://localhost:8000/',
        'http://localhost:8000/pages/page-1/',
        'http://localhost:8000/pages/page-2/',
        'http://localhost:8000/pages/page-3/',
        'http://localhost:8000/assets/styles.css',
        'http://localhost:8000/assets/styles-2.css',
        'http://localhost:8000/assets/image.jpg',
        'http://localhost:8000/assets/script.js',
        'http://localhost:8000/assets/tile-1.jpg',
        'http://localhost:8000/assets/tile-2.jpg',
        'http://localhost:8000/assets/somefont.eot',
        'http://localhost:8000/assets/somefont.ttf',
    }

    # Exhaust the crawl up front so the count can be checked.
    responses = list(
        http_crawler.crawl(
            'http://localhost:8000/',
            follow_external_links=False,
        )
    )
    assert len(responses) == 12

    crawled_urls = [response.url for response in responses]
    unique_urls = set(crawled_urls)

    # No URL may be reported more than once.
    assert len(crawled_urls) == len(unique_urls)
    assert unique_urls == expected_urls
Esempio n. 2
0
def test_crawl_ignore_fragments_false():
    """Crawl localhost:8000 with ``ignore_fragments=False``.

    The expected result lists ``#anchor`` URLs as entries separate from
    their fragment-less counterparts, and includes pages on localhost:8001.
    """
    # presumably starts the local fixture server(s) -- confirm against serve()
    serve()

    expected_urls = {
        'http://localhost:8000/',
        'http://localhost:8000/pages/page-1/',
        'http://localhost:8000/pages/page-1/#anchor',
        'http://localhost:8000/pages/page-2/',
        'http://localhost:8000/pages/page-2/#anchor',
        'http://localhost:8000/pages/page-3/',
        'http://localhost:8000/pages/page-3/#anchor',
        'http://localhost:8000/assets/styles.css',
        'http://localhost:8000/assets/styles-2.css',
        'http://localhost:8000/assets/image.jpg',
        'http://localhost:8000/assets/script.js',
        'http://localhost:8000/assets/tile-1.jpg',
        'http://localhost:8000/assets/tile-2.jpg',
        'http://localhost:8000/assets/somefont.eot',
        'http://localhost:8000/assets/somefont.ttf',
        'http://localhost:8001/pages/page-1/',
        'http://localhost:8001/pages/page-1/#anchor',
    }

    responses = list(
        http_crawler.crawl('http://localhost:8000/', ignore_fragments=False)
    )

    # NOTE: the response-count check (`len(rsps) == 14`) and the
    # no-duplicates check (`len(urls) == len(set(urls))`) are disabled in
    # this test; only the set of distinct URLs is verified.
    crawled_urls = {response.url for response in responses}

    assert crawled_urls == expected_urls
Esempio n. 3
0
def test_crawl_follow_external_links_false():
    """Check the crawl result when external links are not followed.

    Twelve responses are expected, each with a distinct URL, and together
    they must match the localhost:8000 URLs enumerated in the test.
    """
    # presumably starts the local fixture server(s) -- confirm against serve()
    serve()

    crawl_results = http_crawler.crawl(
        'http://localhost:8000/',
        follow_external_links=False,
    )
    collected = [result for result in crawl_results]

    assert len(collected) == 12

    url_list = [item.url for item in collected]
    url_set = set(url_list)

    # Equal sizes mean the crawler reported no URL twice.
    assert len(url_list) == len(url_set)
    assert url_set == {
        'http://localhost:8000/',
        'http://localhost:8000/pages/page-1/',
        'http://localhost:8000/pages/page-2/',
        'http://localhost:8000/pages/page-3/',
        'http://localhost:8000/assets/styles.css',
        'http://localhost:8000/assets/styles-2.css',
        'http://localhost:8000/assets/image.jpg',
        'http://localhost:8000/assets/script.js',
        'http://localhost:8000/assets/tile-1.jpg',
        'http://localhost:8000/assets/tile-2.jpg',
        'http://localhost:8000/assets/somefont.eot',
        'http://localhost:8000/assets/somefont.ttf',
    }
Esempio n. 4
0
def test_crawl_ignore_fragments_false():
    """Check the crawl result when URL fragments are not ignored.

    The expected set contains both the plain page URLs and their
    ``#anchor`` variants, plus page-1 on localhost:8001.
    """
    # presumably starts the local fixture server(s) -- confirm against serve()
    serve()

    crawl_results = http_crawler.crawl(
        'http://localhost:8000/',
        ignore_fragments=False,
    )
    collected = [result for result in crawl_results]

    # NOTE: two stricter checks are disabled in this test -- the response
    # count (`len(rsps) == 14`) and URL uniqueness
    # (`len(urls) == len(set(urls))`). Only set equality is asserted.
    distinct_urls = set(item.url for item in collected)

    assert distinct_urls == {
        'http://localhost:8000/',
        'http://localhost:8000/pages/page-1/',
        'http://localhost:8000/pages/page-1/#anchor',
        'http://localhost:8000/pages/page-2/',
        'http://localhost:8000/pages/page-2/#anchor',
        'http://localhost:8000/pages/page-3/',
        'http://localhost:8000/pages/page-3/#anchor',
        'http://localhost:8000/assets/styles.css',
        'http://localhost:8000/assets/styles-2.css',
        'http://localhost:8000/assets/image.jpg',
        'http://localhost:8000/assets/script.js',
        'http://localhost:8000/assets/tile-1.jpg',
        'http://localhost:8000/assets/tile-2.jpg',
        'http://localhost:8000/assets/somefont.eot',
        'http://localhost:8000/assets/somefont.ttf',
        'http://localhost:8001/pages/page-1/',
        'http://localhost:8001/pages/page-1/#anchor',
    }
def test_no_links_are_broken(baseurl):
    """Crawl ``baseurl`` and fail if any response has a non-200 status.

    External links are not followed. Each offending response is printed as
    ``<url> (<status_code>)`` before the assertion so the failing links
    are visible in the test output.
    """
    broken = [
        f'{rsp.url} ({rsp.status_code})'
        for rsp in crawl(baseurl, follow_external_links=False)
        if rsp.status_code != 200
    ]

    # Printed unconditionally; with no failures this emits an empty line.
    print('\n'.join(broken))
    assert not broken
Esempio n. 6
0
    def buildsite(self, port):
        """Crawl the server on ``localhost:<port>`` and dump it under ``output``.

        Steps, in order:

        1. Invoke ``call_command('loadpages')``.
        2. Remove ``<settings.BASE_DIR>/output`` (errors ignored).
        3. Crawl ``http://localhost:<port>/``, passing the optional
           ``settings.DJANGO_AMBER_CRAWL_OPTIONS`` dict as keyword arguments.
        4. Write the body of every response served by ``localhost:<port>``
           to a file whose location mirrors the URL path.
        5. If ``settings.DJANGO_AMBER_CNAME`` is set and truthy, write it to
           ``output/CNAME``.

        URL path to file mapping:

        * path ends in ``/``             -> ``<path>/index.html``
        * last segment contains a ``.``  -> file named after that segment
        * anything else                  -> ``<path>/index.html``

        Raises whatever ``rsp.raise_for_status()`` raises for a failed
        response, and ``AssertionError`` if a URL path does not start with
        ``/``.
        """
        call_command('loadpages')

        output_path = os.path.join(settings.BASE_DIR, 'output')
        # Start from a clean slate; a missing directory is not an error.
        shutil.rmtree(output_path, ignore_errors=True)

        crawl_options = getattr(settings, 'DJANGO_AMBER_CRAWL_OPTIONS', {})

        start_url = 'http://localhost:{}/'.format(port)
        local_netloc = 'localhost:{}'.format(port)

        for rsp in http_crawler.crawl(start_url, **crawl_options):
            rsp.raise_for_status()

            location = http_crawler.urlparse(rsp.url)
            if location.netloc != local_netloc:
                # This is an external request, which we don't care about
                continue

            segments = location.path.split('/')
            # A path starts with '/', so the first split piece is empty.
            assert segments[0] == ''

            last_segment = segments[-1]
            if last_segment == '' or '.' in last_segment:
                # Trailing slash, or a name with an extension: the final
                # segment is not part of the directory.
                dir_segments = segments[:-1]
                filename = last_segment if last_segment else 'index.html'
            else:
                # Extension-less final segment: treat it as a directory.
                dir_segments = segments
                filename = 'index.html'

            rel_dir_path = os.path.join(*dir_segments)
            dir_path = os.path.join(output_path, rel_dir_path)
            os.makedirs(dir_path, exist_ok=True)

            target = os.path.join(dir_path, filename)
            with open(target, 'wb') as out_file:
                out_file.write(rsp.content)

        cname = getattr(settings, 'DJANGO_AMBER_CNAME', None)
        if not cname:
            return

        with open(os.path.join(output_path, 'CNAME'), 'w') as cname_file:
            cname_file.write(cname)
Esempio n. 7
0
def test_ssl_verify_true():
    """Crawling https://localhost:8002 with ``verify=True`` must raise SSLError."""
    # presumably starts the local fixture server(s) -- confirm against serve()
    serve()

    with pytest.raises(SSLError):
        # Iterate to completion so the crawl actually issues its requests.
        for _ in http_crawler.crawl('https://localhost:8002', verify=True):
            pass
Esempio n. 8
0
def test_ssl_verify_false():
    """Crawling https://localhost:8002 with ``verify=False`` yields one response."""
    # presumably starts the local fixture server(s) -- confirm against serve()
    serve()

    crawl_results = http_crawler.crawl('https://localhost:8002', verify=False)
    responses = [response for response in crawl_results]

    assert len(responses) == 1