def test_seen():
    c = {'Robots': {'RobotsCacheSize': 1, 'RobotsCacheTimeout': 1}}
    config.set_config(c)
    dl = datalayer.Datalayer()
    assert not dl.seen_url(URL('http://example.com'))
    dl.add_seen_url(URL('http://example.com'))
    assert dl.seen_url(URL('http://example.com'))
def create_queue():
    queue = asyncio.Queue()

    # add a fake domain to make sure the dns doesn't send unknown hosts to a search
    # note that mail.foo.com and mx.foo.com don't generally get bogus answers,
    # it's foo.com or www.foo.com that do
    for _ in range(2):
        r = random.Random()
        host = str(r.randrange(1000000000)) + str(r.randrange(1000000000)) + str(r.randrange(1000000000))
        queue.put_nowait((URL('http://' + host + '.com'), 'fake'))

    # read list of domains to query -- from alexa top million
    head, tail = os.path.split(__file__)
    alexa = os.path.join(head, os.pardir, 'data', 'top-1k.txt')
    alexa_count = 0
    try:
        with open(alexa, 'r') as f:
            print('Using top-1k from Alexa, expect a few failures')
            for line in f:
                queue.put_nowait((URL('http://' + line.strip()), 'real'))
                alexa_count += 1
                if alexa_count > args.count:
                    break
    except FileNotFoundError:
        # the alexa file wasn't available (it is not in the repo) so just do a few
        print('Cannot find top-1k file, so all queries are www.google.com')
        for _ in range(args.count):
            queue.put_nowait((URL('http://www.google.com'), 'real'))

    return queue
def test_facets_from_embeds():
    embeds = (URL('http://example.com'),
              URL('http://cdn.ampproject.org'),
              URL('googletagmanager.com?asdf&id=GTM-ZZZXXX&fdsa'),
              URL('https://www.facebook.com/tr?foo&id=1234567890123456'))
    facets = facet.facets_from_embeds(embeds)
    assert facets == [('thing-google amp', True),
                      ('thing-google tag manager', 'GTM-ZZZXXX'),
                      ('thing-facebook events', '1234567890123456')]
def test_summarize(capsys):
    c = {'Robots': {'RobotsCacheSize': 1, 'RobotsCacheTimeout': 1}}
    config.set_config(c)
    dl = datalayer.Datalayer()
    dl.add_seen_url(URL('http://example.com'))
    dl.add_seen_url(URL('http://example2.com'))
    dl.summarize()

    out, err = capsys.readouterr()
    assert len(err) == 0
    assert out.startswith('2 seen_urls')
async def test_cocrawler(capsys):
    config.config(None, None)

    # we have to get around the useragent checks
    config.write('pytest', 'UserAgent', 'MyPrefix')
    config.write('http://example.com/pytest-test-cocrawler.py', 'UserAgent', 'URL')
    # and configure url_allowed
    config.write('AllDomains', 'Plugins', 'url_allowed')

    crawler = cocrawler.Crawler()

    crawler.add_url(0, {'url': URL('http://example1.com/')})
    crawler.add_url(0, {'url': URL('http://example2.com/')})
    crawler.add_url(0, {'url': URL('http://example3.com/')})
    assert crawler.qsize == 3

    f = tempfile.NamedTemporaryFile(delete=False)
    name = f.name
    with open(name, 'wb') as f:
        crawler.save(f)
    assert crawler.qsize == 0
    crawler.add_url(0, {'url': URL('http://example4.com/')})
    assert crawler.qsize == 1
    with open(name, 'rb') as f:
        crawler.load(f)
    assert crawler.qsize == 3
    os.unlink(name)
    assert not os.path.exists(name)

    # clear out the existing capture
    out, err = capsys.readouterr()

    crawler.summarize()
    out, err = capsys.readouterr()
    assert err == ''
    assert len(out) >= 200  # not a very good test, but at least it is something

    await crawler.close()  # needed for smooth shutdown
def test_do_burner_work_html():
    urlj = URL('http://example.com')
    test_html_bytes = test_html.encode(encoding='utf-8', errors='replace')
    headers = {}
    links, embeds, sha1, facets = parse.do_burner_work_html(test_html, test_html_bytes, headers, url=urlj)
    assert len(links) == 3
    assert len(embeds) == 2
    linkset = set(u.url for u in links)
    embedset = set(u.url for u in embeds)
    assert 'http://example.com/foo3.html' in linkset
    assert 'http://example.com/foo.gif' in embedset
    assert sha1 == 'sha1:3458e0857ec379ec56d4c7fb39d33c90c8b5ae93'

    # as a handwave, let's expect these defective pages to also work.
    test_html_bytes = test_html_no_body.encode(encoding='utf-8', errors='replace')
    links, embeds, sha1, facets = parse.do_burner_work_html(test_html, test_html_bytes, headers, url=urlj)
    assert len(links) == 3
    assert len(embeds) == 2

    test_html_bytes = test_html_no_head.encode(encoding='utf-8', errors='replace')
    links, embeds, sha1, facets = parse.do_burner_work_html(test_html, test_html_bytes, headers, url=urlj)
    assert len(links) == 3
    assert len(embeds) == 2

    test_html_bytes = test_html_no_nothing.encode(encoding='utf-8', errors='replace')
    links, embeds, sha1, facets = parse.do_burner_work_html(test_html, test_html_bytes, headers, url=urlj)
    assert len(links) == 3
    assert len(embeds) == 2
def test_do_burner_work_html():
    urlj = URL('http://example.com')
    test_html_bytes = test_html.encode(encoding='utf-8', errors='replace')
    headers = {}
    links, embeds, sha1, facets, base = parse.do_burner_work_html(test_html, test_html_bytes, headers, url=urlj)
    assert len(links) == 4
    assert len(embeds) == 2
    linkset = set(u.url for u in links)
    embedset = set(e.url for e in embeds)
    assert 'http://example.com/foo3.html' in linkset
    assert 'http://example.com/foo.gif' in embedset
    assert sha1 == 'sha1:cdcb087d39afd827d5d523e165a6566d65a2e9b3'
    assert base is None

    # as a handwave, let's expect these defective pages to also work.
    test_html_bytes = test_html_no_body.encode(encoding='utf-8', errors='replace')
    links, embeds, sha1, facets, base = parse.do_burner_work_html(test_html_no_body, test_html_bytes, headers, url=urlj)
    assert len(links) == 3
    assert len(embeds) == 2

    test_html_bytes = test_html_no_head.encode(encoding='utf-8', errors='replace')
    links, embeds, sha1, facets, base = parse.do_burner_work_html(test_html_no_head, test_html_bytes, headers, url=urlj)
    assert len(links) == 3
    assert len(embeds) == 1

    test_html_bytes = test_html_no_nothing.encode(encoding='utf-8', errors='replace')
    links, embeds, sha1, facets, base = parse.do_burner_work_html(test_html_no_nothing, test_html_bytes, headers, url=urlj)
    assert len(links) == 3
    assert len(embeds) == 1
async def test_prefetch():
    url = URL('http://example.com/')
    config.config(None, None)
    resolver = dns.get_resolver()

    iplist = await dns.prefetch(url, resolver)
    assert len(iplist) > 0

    iplist2 = await dns.prefetch(url, resolver)
    assert iplist == iplist2
def test_cocrawler(capsys):
    config = conf.config(None, None, confighome=False)

    # ok, we have to get around the useragent checks
    config['UserAgent']['MyPrefix'] = 'pytest'
    config['UserAgent']['URL'] = 'http://example.com/pytest-test-cocrawler.py'

    loop = asyncio.get_event_loop()
    crawler = cocrawler.Crawler(loop, config)

    crawler.add_url(0, URL('http://example1.com/'), seed=True)
    crawler.add_url(0, URL('http://example2.com/'), seed=True)
    crawler.add_url(0, URL('http://example3.com/'), seed=True)
    assert crawler.qsize == 3

    f = tempfile.NamedTemporaryFile(delete=False)
    name = f.name
    with open(name, 'wb') as f:
        crawler.save(f)
    assert crawler.qsize == 0
    crawler.add_url(0, URL('http://example4.com/'), seed=True)
    assert crawler.qsize == 1
    with open(name, 'rb') as f:
        crawler.load(f)
    assert crawler.qsize == 3
    os.unlink(name)
    assert not os.path.exists(name)

    # clear out the existing capture
    out, err = capsys.readouterr()

    crawler.summarize()
    out, err = capsys.readouterr()
    assert err == ''
    assert len(out) >= 242  # not a very good test, but at least it is something
def test_setup_seeds_prefix():
    seeds = {
        'http://example.com/asdf',
        'http://example.com/a',
        'http://example.com/a',
        'http://example.com/b',
        'http://example.com/asdff',
        'http://example2.com/a',
    }
    url_allowed.setup(policy='SeedsPrefix')
    url_allowed.setup_seeds([URL(s) for s in seeds])
    SEEDS = {'example.com': {'/a', '/b'}, 'example2.com': {'/a'}}
    assert SEEDS == url_allowed.SEEDS
def test_saveload():
    tf = tempfile.NamedTemporaryFile(delete=False)
    name = tf.name

    c = {'Robots': {'RobotsCacheSize': 1, 'RobotsCacheTimeout': 1}}
    config.set_config(c)
    dl = datalayer.Datalayer()
    dl.add_seen_url(URL('http://example.com'))
    assert dl.seen_url(URL('http://example.com'))

    with open(name, 'wb') as f:
        dl.save(f)
    dl.add_seen_url(URL('http://example2.com'))
    with open(name, 'rb') as f:
        dl.load(f)

    assert dl.seen_url(URL('http://example.com'))
    assert not dl.seen_url(URL('http://example2.com'))

    os.unlink(name)
    assert not os.path.exists(name)
async def test_prefetch_dns():
    url = URL('http://google.com/')
    mock_url = None
    resolver = aiohttp.resolver.AsyncResolver(nameservers=ns)
    connector = aiohttp.connector.TCPConnector(resolver=resolver, family=socket.AF_INET)
    session = aiohttp.ClientSession(connector=connector)  # whew

    iplist = await dns.prefetch_dns(url, mock_url, session)
    assert len(iplist) > 0
def test_extension_allowed():
    assert url_allowed.extension_allowed(URL('https://example.com/'))
    assert url_allowed.extension_allowed(URL('https://example.com/thing.with.dots/'))
    assert url_allowed.extension_allowed(URL('https://example.com/thing.with.dots'))
    assert url_allowed.extension_allowed(URL('https://example.com/index.html'))
    assert not url_allowed.extension_allowed(URL('https://example.com/foo.jpg'))
    assert not url_allowed.extension_allowed(URL('https://example.com/foo.tar.gz'))
                            '127.0.0.1', description='desc', creator='test', operator='alice')
    sub.create_default_info('1.0', '0.99', '127.0.0.1')

    fake_dns_result = [{'host': '172.217.6.78'},
                       {'host': '172.217.6.78'},
                       {'host': '172.217.6.78'}]
    with mock.patch('cocrawler.warc.timestamp_now', return_value='20190215073137'):
        main.write_dns(fake_dns_result, 10, URL('http://google.com'))

    fake_url = 'https://www.google.com/'
    fake_req_headers = [('Host', 'www.google.com')]
    fake_resp_headers = [(b'Content-Type', b'text/html; charset=UTF-8')]
    fake_payload = b'<html><body>Hello, world!</body></html>'

    # to make sure that warcio is actually using our digest, mutilate it
    # this means we can't use a warc checker!
    fake_digest = 'sha1:FAKE_DIGEST'

    main.write_request_response_pair(fake_url, '1.2.3.4', fake_req_headers, fake_resp_headers, False,
def test_URL():
    url = URL('http://www.example.com/')
    assert url.url == 'http://www.example.com/'
    assert list(url.urlparse) == ['http', 'www.example.com', '/', '', '', '']
    assert url.netloc == 'www.example.com'
    assert url.hostname == 'www.example.com'
    assert url.hostname_without_www == 'example.com'
    assert url.registered_domain == 'example.com'
    assert url.original_frag is None
    url = URL('http://www.example.com/#foo#foo')
    assert url.original_frag == 'foo#foo'
    url = URL('http://www.example.com/#')
    assert url.original_frag is None

    # canonicalization
    url = URL('http://www.example.com/?')
    assert url.url == 'http://www.example.com/'
    url = URL('http://www.example.com')
    assert url.url == 'http://www.example.com/'
    url = URL('http://www.example.com/?#')
    assert url.url == 'http://www.example.com/'
    url = URL('http://www.example.com/foo')
    assert url.url == 'http://www.example.com/foo'
    url = URL('http://www.example.com/foo/')
    assert url.url == 'http://www.example.com/foo/'

    # urljoin
    urlj1 = URL('http://www.example.com/foo/')
    urlj2 = 'http://www.example.com/foo/'
    url = URL('foo', urljoin=urlj1)
    assert url.url == 'http://www.example.com/foo/foo'
    url = URL('foo', urljoin=urlj1)
    assert url.url == 'http://www.example.com/foo/foo'
    url = URL('/bar', urljoin=urlj1)
    assert url.url == 'http://www.example.com/bar'
    url = URL('/bar', urljoin=urlj2)
    assert url.url == 'http://www.example.com/bar'
    url = URL('http://sub.example.com/', urljoin=urlj1)
    assert url.url == 'http://sub.example.com/'
    url = URL('http://sub.example.com/', urljoin=urlj2)
    assert url.url == 'http://sub.example.com/'

    # read-only
    with pytest.raises(AttributeError):
        url.url = 'foo'
def test_facets_from_embeds():
    embeds = set((URL('http://example.com'), URL('http://cdn.ampproject.org')))
    facets = facet.facets_from_embeds(embeds)
    assert facets == [('google amp', True)]
def test_URL():
    url = URL('http://www.example.com/')
    assert url.url == 'http://www.example.com/'
    assert list(url.urlsplit) == ['http', 'www.example.com', '/', '', '']
    assert url.netloc == 'www.example.com'
    assert url.hostname == 'www.example.com'
    assert url.hostname_without_www == 'example.com'
    assert url.registered_domain == 'example.com'
    assert url.original_frag is None
    url = URL('http://www.example.com/#foo#foo')
    assert url.original_frag == '#foo#foo'
    url = URL('http://www.example.com/#')
    assert url.original_frag is None

    # canonicalization
    url = URL('http://www.example.com/?')
    assert url.url == 'http://www.example.com/'
    url = URL('http://www.example.com')
    assert url.url == 'http://www.example.com/'
    url = URL('http://www.example.com/?#')
    assert url.url == 'http://www.example.com/'
    url = URL('http://www.example.com/foo')
    assert url.url == 'http://www.example.com/foo'
    url = URL('http://www.example.com/foo/')
    assert url.url == 'http://www.example.com/foo/'

    # urljoin
    urlj1 = URL('http://www.example.com/foo/')
    urlj2 = 'http://www.example.com/foo/'
    url = URL('foo', urljoin=urlj1)
    assert url.url == 'http://www.example.com/foo/foo'
    url = URL('foo', urljoin=urlj1)
    assert url.url == 'http://www.example.com/foo/foo'
    url = URL('/bar', urljoin=urlj1)
    assert url.url == 'http://www.example.com/bar'
    url = URL('/bar', urljoin=urlj2)
    assert url.url == 'http://www.example.com/bar'
    url = URL('http://sub.example.com/', urljoin=urlj1)
    assert url.url == 'http://sub.example.com/'
    url = URL('http://sub.example.com/', urljoin=urlj2)
    assert url.url == 'http://sub.example.com/'
    url = URL('foo', urljoin='http://example.com/subdir/')  # base can cause this
    assert url.url == 'http://example.com/subdir/foo'

    # read-only
    with pytest.raises(AttributeError):
        url.url = 'foo'

    # urljoin examples from RFC 3986 -- python takes care of . and ..
    urlj = URL('http://a/b/c/d;p?q')
    # assert URL('g:h', urljoin=urlj).url == 'g:h'  # absolute url missing hostname
    assert URL('g', urljoin=urlj).url == 'http://a/b/c/g'
    assert URL('./g', urljoin=urlj).url == 'http://a/b/c/g'
    assert URL('g/', urljoin=urlj).url == 'http://a/b/c/g/'
    assert URL('/g', urljoin=urlj).url == 'http://a/g'
    assert URL('//g', urljoin=urlj).url == 'http://g/'  # altered because I insist on the trailing /
    assert URL('?y', urljoin=urlj).url == 'http://a/b/c/d;p?y'
    assert URL('g?y', urljoin=urlj).url == 'http://a/b/c/g?y'
    assert URL('#s', urljoin=urlj).url == 'http://a/b/c/d;p?q'  # I drop the frag
    assert URL('g#s', urljoin=urlj).url == 'http://a/b/c/g'  # I drop the frag
    assert URL('g?y#s', urljoin=urlj).url == 'http://a/b/c/g?y'  # I drop the frag
    assert URL(';x', urljoin=urlj).url == 'http://a/b/c/;x'
    assert URL('g;x', urljoin=urlj).url == 'http://a/b/c/g;x'
    assert URL('g;x?y#s', urljoin=urlj).url == 'http://a/b/c/g;x?y'  # I drop the frag
    assert URL('', urljoin=urlj).url == 'http://a/b/c/d;p?q'
    assert URL('.', urljoin=urlj).url == 'http://a/b/c/'
    assert URL('./', urljoin=urlj).url == 'http://a/b/c/'
    assert URL('..', urljoin=urlj).url == 'http://a/b/'
    assert URL('../', urljoin=urlj).url == 'http://a/b/'
    assert URL('../g', urljoin=urlj).url == 'http://a/b/g'
    assert URL('../..', urljoin=urlj).url == 'http://a/'
    assert URL('../../', urljoin=urlj).url == 'http://a/'
    assert URL('../../g', urljoin=urlj).url == 'http://a/g'
def test_special_redirect():
    sr = urls.special_redirect
    assert sr(URL('http://example.com/'), URL('http://example.com/foo')) is None
    assert sr(URL('http://example.com/'), URL('https://example.com/foo')) is None
    assert sr(URL('http://example.com/'), URL('https://www.example.com/foo')) is None
    assert sr(URL('http://example.com/'), URL('http://example.com/?foo=1')) is None
    assert sr(URL('http://example.com/'), URL('http://example.com/bar?foo=1')) is None

    url1 = URL('http://example.com/')
    assert sr(url1, url1) == 'same'
    assert sr(url1, URL('https://example.com/')) == 'tohttps'
    assert sr(url1, URL('http://www.example.com/')) == 'towww'
    assert sr(url1, URL('https://www.example.com/')) == 'towww+tohttps'

    url2str = 'http://www.example.com/'
    url2 = URL(url2str)
    assert sr(url2, URL('https://www.example.com/')) == 'tohttps'
    assert sr(url2, URL('http://example.com/')) == 'tononwww'
    assert sr(url2, URL('https://example.com/')) == 'tononwww+tohttps'
    assert sr(url2str, 'https://www.example.com/') == 'tohttps'
    assert sr(url2str, 'http://example.com/') == 'tononwww'
    assert sr(url2str, 'https://example.com/') == 'tononwww+tohttps'

    url3 = URL('https://www.example.com/')
    assert sr(url3, URL('http://www.example.com/')) == 'tohttp'
    assert sr(url3, URL('https://example.com/')) == 'tononwww'
    assert sr(url3, URL('http://example.com/')) == 'tononwww+tohttp'

    url4 = URL('https://example.com/')
    assert sr(url4, URL('http://www.example.com/')) == 'towww+tohttp'

    url5 = URL('https://example.com/foo')
    url6 = URL('https://example.com/foo/')
    assert sr(url5, url6) == 'addslash'
    assert sr(url6, url5) == 'removeslash'
def test_scheme_allowed():
    assert url_allowed.scheme_allowed(URL('http://example.com'))
    assert url_allowed.scheme_allowed(URL('https://example.com'))
    assert not url_allowed.scheme_allowed(URL('ftp://example.com'))
def test_url_allowed():
    assert not url_allowed.url_allowed(URL('ftp://example.com'))

    url_allowed.setup(policy='SeedsDomain')
    url_allowed.setup_seeds([URL('http://example.com')])
    assert url_allowed.url_allowed(URL('http://example.com'))
    assert url_allowed.url_allowed(URL('http://www.example.com'))
    assert url_allowed.url_allowed(URL('http://sub.example.com'))

    url_allowed.setup(policy='SeedsHostname')
    url_allowed.setup_seeds([URL('http://example.com')])
    assert url_allowed.url_allowed(URL('http://example.com'))
    assert url_allowed.url_allowed(URL('http://www.example.com'))
    assert not url_allowed.url_allowed(URL('http://sub.example.com'))

    url_allowed.setup(policy='SeedsPrefix')
    url_allowed.setup_seeds([URL('http://example.com/prefix1')])
    url_allowed.setup_seeds([URL('http://example2.com/prefix2/')])
    assert not url_allowed.url_allowed(URL('http://example.com'))
    assert url_allowed.url_allowed(URL('http://www.example.com/prefix11'))
    assert not url_allowed.url_allowed(URL('http://example2.com'))
    assert not url_allowed.url_allowed(URL('http://www.example2.com/prefix21'))
    assert not url_allowed.url_allowed(URL('http://www.example2.com/prefix2'))
    assert url_allowed.url_allowed(URL('http://www.example2.com/prefix2/'))
    assert url_allowed.url_allowed(URL('http://www.example2.com/prefix2/foo'))

    url_allowed.setup(policy='OnlySeeds')
    url_allowed.setup_seeds([URL('http://example.com')])
    assert url_allowed.url_allowed(URL('http://example.com'))
    assert not url_allowed.url_allowed(URL('http://example.com/foo'))

    url_allowed.setup(policy='AllDomains')
    url_allowed.setup_seeds([URL('http://example.com')])
    assert url_allowed.url_allowed(URL('http://example.com'))
    assert url_allowed.url_allowed(URL('http://exa2mple.com'))
    assert url_allowed.url_allowed(URL('http://exa3mple.com'))