Esempio n. 1
0
def test_decode():
    '''Check that decode_response() yields a result for bytes and for objects with a data attribute.'''
    # payload opening with the bytes \x1f\x8b, followed by arbitrary characters
    payload = b'\x1f\x8babcdef'
    # the identical payload is submitted twice in a row
    # NOTE(review): the second call looks redundant -- confirm whether a different input was intended
    for _ in range(2):
        assert decode_response(payload) is not None
    # stand-in object whose content is exposed through ``data``
    response = Mock(data=b' ')
    assert decode_response(response) is not None
Esempio n. 2
0
def test_fetch():
    '''Check the return values of URL fetching and response decoding.'''
    # NOTE(review): the calls below presumably need network access -- confirm before running offline
    # a string that is not a URL is expected to give an empty string
    invalid_result = utils.fetch_url('1234')
    assert invalid_result == ''
    # a page answering with status 404 is expected to give None
    missing_result = utils.fetch_url('https://httpbin.org/status/404')
    assert missing_result is None
    # bytes starting with \x1f\x8b must still decode to something
    decoded = utils.decode_response(b'\x1f\x8babcdef')
    assert decoded is not None
    # expired certificate, fetched with the no_ssl flag switched on
    insecure_result = utils.fetch_url('https://expired.badssl.com/', no_ssl=True)
    assert insecure_result is not None
Esempio n. 3
0
def test_extraction():
    '''Test link extraction from sitemaps: XML, nested, hreflang, gzipped and plain text.

    The text fixtures are opened with an explicit UTF-8 encoding so that the
    outcome does not depend on the default encoding of the platform.
    Compound conditions are split into separate assertions so that a failure
    points at the exact expectation which was not met.
    '''
    url, domain, baseurl = 'https://www.sitemaps.org/sitemap.xml', 'sitemaps.org', 'https://www.sitemaps.org'
    # fixing partial URLs: every variant has to resolve to the same absolute URL
    for partial in ('https://example.org/test.html', '/test.html', 'test.html', '../../test.html'):
        assert fix_relative_urls('https://example.org', partial) == 'https://example.org/test.html'
    # link handling
    assert sitemaps.handle_link(url, url, domain, baseurl, None) == (url, '0')
    assert sitemaps.handle_link('https://mydomain.wordpress.com/1', 'https://example.org/sitemap.xml', 'example.org', 'https://example.org', None) == ('https://mydomain.wordpress.com/1', 'link')
    # safety belts: content which is not a usable sitemap is rejected
    assert sitemaps.check_sitemap('http://example.org/sitemap.xml.gz', b'\x1f\x8bABC') is None
    assert sitemaps.check_sitemap('http://example.org/sitemap.xml', 'ABC') is None
    # parsing a file
    filepath = os.path.join(RESOURCES_DIR, 'sitemap.xml')
    with open(filepath, encoding='utf-8') as f:
        teststring = f.read()
    contents = sitemaps.check_sitemap('http://example.org/sitemap.xml', teststring)
    sitemapurls, linklist = sitemaps.extract_sitemap_links(contents, url, domain, baseurl, None)
    assert len(sitemapurls) == 0
    assert len(linklist) == 84
    # hreflang: the plain sitemap yields nothing through the language-aware extractor
    assert sitemaps.extract_sitemap_langlinks(teststring, url, domain, baseurl, None) == ([], [])
    # nested sitemaps
    url, domain, baseurl = 'http://www.example.com/sitemap.xml', 'example.com', 'http://www.example.com'
    filepath = os.path.join(RESOURCES_DIR, 'sitemap2.xml')
    with open(filepath, encoding='utf-8') as f:
        teststring = f.read()
    sitemapurls, linklist = sitemaps.extract_sitemap_links(teststring, url, domain, baseurl, None)
    assert sitemapurls == ['http://www.example.com/sitemap1.xml.gz', 'http://www.example.com/sitemap2.xml.gz']
    assert linklist == []
    # invalid
    assert sitemaps.extract_sitemap_links('<html>\n</html>', url, domain, baseurl, None) == ([], [])
    # hreflang
    filepath = os.path.join(RESOURCES_DIR, 'sitemap-hreflang.xml')
    with open(filepath, encoding='utf-8') as f:
        teststring = f.read()
    sitemapurls, linklist = sitemaps.extract_sitemap_langlinks(teststring, url, domain, baseurl, target_lang='de')
    assert sitemapurls == ['http://www.example.com/sitemap-de.xml.gz']
    assert len(linklist) > 0
    # GZ-compressed sitemaps: read as bytes and decode before checking
    url, domain, baseurl = 'https://www.sitemaps.org/sitemap.xml', 'sitemaps.org', 'https://www.sitemaps.org'
    filepath = os.path.join(RESOURCES_DIR, 'sitemap.xml.gz')
    with open(filepath, 'rb') as f:
        teststring = f.read()
    teststring = decode_response(teststring)
    contents = sitemaps.check_sitemap('http://example.org/sitemap.xml.gz', teststring)
    sitemapurls, linklist = sitemaps.extract_sitemap_links(contents, url, domain, baseurl, None)
    assert len(sitemapurls) == 0
    assert len(linklist) == 84
    # check contents: a query string after the file name must not prevent acceptance
    assert sitemaps.check_sitemap('http://example.org/sitemap.xml.gz?value=1', teststring) is not None
    # TXT links
    assert sitemaps.process_sitemap('https://test.org/sitemap', 'test.org', 'https://test.org/', 'Tralala\nhttps://test.org/1\nhttps://test.org/2') == ([], ['https://test.org/1', 'https://test.org/2'])
    # unique and sorted URLs
    urlfilter = 'category'
    myurls = ['/category/xyz', '/category/abc', '/cat/test', '/category/abc']
    assert filter_urls(myurls, urlfilter) == ['/category/abc', '/category/xyz']
Esempio n. 4
0
def test_fetch():
    '''Check URL fetching, handling of response objects and user-agent selection.'''
    # NOTE(review): the calls below presumably need network access -- confirm before running offline
    # a string that is not a URL is expected to give an empty string
    invalid_result = utils.fetch_url('1234')
    assert invalid_result == ''
    # a page answering with status 404 is expected to give None
    missing_result = utils.fetch_url('https://httpbin.org/status/404')
    assert missing_result is None
    # bytes starting with \x1f\x8b must still decode to something
    decoded = utils.decode_response(b'\x1f\x8babcdef')
    assert decoded is not None
    # expired certificate, fetched with the no_ssl flag switched on
    insecure_result = utils.fetch_url('https://expired.badssl.com/', no_ssl=True)
    assert insecure_result is not None
    # decoding switched off: the status/200 endpoint is expected to give an empty string
    undecoded = utils.fetch_url('https://httpbin.org/status/200', decode=False)
    assert undecoded == ''
    # low-level path: send the request, then wrap the raw response
    target = 'https://httpbin.org/encoding/utf8'
    raw_response = utils._send_request(target, False, DEFAULT_CONFIG)
    handled = utils._handle_response(target, raw_response, False, DEFAULT_CONFIG)
    assert handled.data.startswith(b'<h1>Unicode Demo</h1>')
    # the raw response object can be handed to the HTML loader directly
    tree = utils.load_html(raw_response)
    assert tree is not None
    # with ZERO_CONFIG the extraction is expected to return nothing
    extracted = extract(raw_response, url=raw_response.geturl(), config=ZERO_CONFIG)
    assert extracted is None
    # user-agent rotation: both configured agents are parsed, one of them ends up in the headers
    agents = utils._parse_config(UA_CONFIG)
    assert agents == ['Firefox', 'Chrome']
    headers = utils._determine_headers(UA_CONFIG)
    assert headers['User-Agent'] in ('Chrome', 'Firefox')