def test_decode():
    '''Test how responses are being decoded.'''
    # bytes payload starting with the gzip magic number (\x1f\x8b) but with a
    # truncated/garbage body: the decoder must still return a usable value
    # rather than raising. (The original asserted this twice verbatim; the
    # duplicate has been removed.)
    assert decode_response(b'\x1f\x8babcdef') is not None
    # response-like object: decode_response also accepts an object exposing
    # a .data attribute instead of raw bytes
    mock = Mock()
    mock.data = b' '
    assert decode_response(mock) is not None
def test_fetch():
    '''test URL fetching'''
    # NOTE(review): a second `def test_fetch` appears later in this file with a
    # superset of these assertions; this earlier definition is shadowed, so
    # pytest only collects the later one. Consider removing one of the two.
    # malformed URL -> empty string; HTTP 404 -> None
    assert utils.fetch_url('1234') == ''
    assert utils.fetch_url('https://httpbin.org/status/404') is None
    # gzip-prefixed garbage still yields a non-None decoded value
    assert utils.decode_response(b'\x1f\x8babcdef') is not None
    # certificate checks can be bypassed via no_ssl
    assert utils.fetch_url('https://expired.badssl.com/', no_ssl=True) is not None
def test_extraction():
    '''Test simple link extraction from sitemaps (plain, nested, hreflang, gzipped, TXT).'''
    url, domain, baseurl = 'https://www.sitemaps.org/sitemap.xml', 'sitemaps.org', 'https://www.sitemaps.org'
    # fixing partial URLs: absolute, root-relative, relative, and parent-relative
    # inputs all normalize to the same absolute URL
    assert fix_relative_urls('https://example.org', 'https://example.org/test.html') == 'https://example.org/test.html'
    assert fix_relative_urls('https://example.org', '/test.html') == 'https://example.org/test.html'
    assert fix_relative_urls('https://example.org', 'test.html') == 'https://example.org/test.html'
    assert fix_relative_urls('https://example.org', '../../test.html') == 'https://example.org/test.html'
    # link handling: same-domain link vs. external (off-domain) link
    assert sitemaps.handle_link(url, url, domain, baseurl, None) == (url, '0')
    assert sitemaps.handle_link('https://mydomain.wordpress.com/1', 'https://example.org/sitemap.xml', 'example.org', 'https://example.org', None) == ('https://mydomain.wordpress.com/1', 'link')
    # safety belts: raw gzip bytes for a .gz URL and non-XML content are rejected
    assert sitemaps.check_sitemap('http://example.org/sitemap.xml.gz', b'\x1f\x8bABC') is None
    assert sitemaps.check_sitemap('http://example.org/sitemap.xml', 'ABC') is None
    # parsing a file
    filepath = os.path.join(RESOURCES_DIR, 'sitemap.xml')
    with open(filepath) as f:
        teststring = f.read()
    contents = sitemaps.check_sitemap('http://example.org/sitemap.xml', teststring)
    sitemapurls, linklist = sitemaps.extract_sitemap_links(contents, url, domain, baseurl, None)
    assert len(sitemapurls) == 0 and len(linklist) == 84
    # hreflang: this fixture carries no alternate-language links
    assert sitemaps.extract_sitemap_langlinks(teststring, url, domain, baseurl, None) == ([], [])
    # nested sitemaps: a sitemap index yields child sitemap URLs, no page links
    url, domain, baseurl = 'http://www.example.com/sitemap.xml', 'example.com', 'http://www.example.com'
    filepath = os.path.join(RESOURCES_DIR, 'sitemap2.xml')
    with open(filepath) as f:
        teststring = f.read()
    sitemapurls, linklist = sitemaps.extract_sitemap_links(teststring, url, domain, baseurl, None)
    assert sitemapurls == ['http://www.example.com/sitemap1.xml.gz', 'http://www.example.com/sitemap2.xml.gz'] and linklist == []
    # invalid input: plain HTML produces nothing
    assert sitemaps.extract_sitemap_links('<html>\n</html>', url, domain, baseurl, None) == ([], [])
    # hreflang extraction with a target language
    # (variable renamed from `sitemapsurls` for consistency with the rest of the function)
    filepath = os.path.join(RESOURCES_DIR, 'sitemap-hreflang.xml')
    with open(filepath) as f:
        teststring = f.read()
    sitemapurls, linklist = sitemaps.extract_sitemap_langlinks(teststring, url, domain, baseurl, target_lang='de')
    assert sitemapurls == ['http://www.example.com/sitemap-de.xml.gz']
    assert len(linklist) > 0
    # GZ-compressed sitemaps: decode first, then the usual checks apply
    url, domain, baseurl = 'https://www.sitemaps.org/sitemap.xml', 'sitemaps.org', 'https://www.sitemaps.org'
    filepath = os.path.join(RESOURCES_DIR, 'sitemap.xml.gz')
    with open(filepath, 'rb') as f:
        teststring = f.read()
    teststring = decode_response(teststring)
    contents = sitemaps.check_sitemap('http://example.org/sitemap.xml.gz', teststring)
    sitemapurls, linklist = sitemaps.extract_sitemap_links(contents, url, domain, baseurl, None)
    assert len(sitemapurls) == 0 and len(linklist) == 84
    # check contents: query string after the .gz extension is tolerated
    assert sitemaps.check_sitemap('http://example.org/sitemap.xml.gz?value=1', teststring) is not None
    # TXT sitemaps: non-URL lines are skipped, URLs are collected
    assert sitemaps.process_sitemap('https://test.org/sitemap', 'test.org', 'https://test.org/', 'Tralala\nhttps://test.org/1\nhttps://test.org/2') == ([], ['https://test.org/1', 'https://test.org/2'])
    # unique and sorted URLs: filter keeps matches, deduplicates, and sorts
    urlfilter = 'category'
    myurls = ['/category/xyz', '/category/abc', '/cat/test', '/category/abc']
    assert filter_urls(myurls, urlfilter) == ['/category/abc', '/category/xyz']
def test_fetch():
    '''test URL fetching'''
    # failure modes: malformed URL -> empty string, HTTP 404 -> None
    assert utils.fetch_url('1234') == ''
    assert utils.fetch_url('https://httpbin.org/status/404') is None
    # gzip-prefixed garbage still decodes to a non-None value
    assert utils.decode_response(b'\x1f\x8babcdef') is not None
    # certificate checks can be bypassed via no_ssl
    assert utils.fetch_url('https://expired.badssl.com/', no_ssl=True) is not None
    # no decoding: an empty-body 200 response comes back as ''
    response = utils.fetch_url('https://httpbin.org/status/200', decode=False)
    assert response == ''
    # response object: low-level request/response handling keeps raw bytes
    url = 'https://httpbin.org/encoding/utf8'
    response = utils._send_request(url, False, DEFAULT_CONFIG)
    result = utils._handle_response(url, response, False, DEFAULT_CONFIG)
    assert result.data.startswith(b'<h1>Unicode Demo</h1>')
    # straight handling of response object
    assert utils.load_html(response) is not None
    # nothing to see here: extraction on this page yields no result
    assert extract(response, url=response.geturl(), config=ZERO_CONFIG) is None
    # user-agents rotation: config parsing and per-request header selection
    assert utils._parse_config(UA_CONFIG) == ['Firefox', 'Chrome']
    headers = utils._determine_headers(UA_CONFIG)
    assert headers['User-Agent'] in ('Firefox', 'Chrome')