def test_favicons(self):
    """One site exposes a favicon, the other returns 404; only the first
    may appear in the checker result."""
    # Site 1: favicon exists.
    url1 = 'http://example1.com/favicon.ico'
    httpretty.register_uri(
        httpretty.HEAD, url1,
        body='',
        adding_headers={
            "Content-type": "image/x-ico",
        })

    # Site 2: favicon request answered with 404.
    url2 = 'http://example2.com/favicon.ico'
    httpretty.register_uri(
        httpretty.HEAD, url2,
        status=404,
        body='Not found',
        adding_headers={
            "Content-type": "text/plain",
        })

    config = Config(
        urls=['http://example1.com/path/', 'http://example2.com/'])
    checker = load_favicons.Checker(config=config)
    result = checker.run()
    pprint(result)

    # The favicon URL is reported for the site root, keyed by input URL.
    expected = {
        'http://example1.com/path/': {
            'url': 'http://example1.com/favicon.ico'
        }
    }
    self.assertEqual(result, expected)
def test_feed_rss2(self):
    """Checks parsing of an RSS 2.0 feed with two items."""
    feed = """<?xml version="1.0"?>
    <rss version="2.0">
      <channel>
        <title>Liftoff News</title>
        <link>http://liftoff.msfc.nasa.gov/</link>
        <description>Liftoff to Space Exploration.</description>
        <pubDate>Tue, 10 Jun 2003 04:00:00 GMT</pubDate>
        <item>
          <title>Star City</title>
          <link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link>
          <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
          <guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid>
        </item>
        <item>
          <description>Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a <a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm">partial eclipse of the Sun</a> on Saturday, May 31st.</description>
          <pubDate>Fri, 30 May 2003 11:06:42 GMT</pubDate>
          <guid>http://liftoff.msfc.nasa.gov/2003/05/30.html#item572</guid>
        </item>
      </channel>
    </rss>
    """
    feed_url = 'http://example.com/feed.xml'
    httpretty.register_uri(
        httpretty.GET, feed_url,
        body=feed,
        adding_headers={
            "Content-type": "application/rss+xml",
        })

    # Simulate a prior html_head result that advertises the feed URL.
    results = {
        'html_head': {
            'http://example.com/': {
                'link_rss_atom': ['http://example.com/feed.xml']
            }
        }
    }

    config = Config(urls=['http://example.com/'])
    checker = load_feeds.Checker(config=config, previous_results=results)
    result = checker.run()
    pprint(result)

    # average_interval: 340359 seconds between the two pubDates above.
    self.assertEqual(
        result,
        {
            'http://example.com/feed.xml': {
                'exception': None,
                'title': 'Liftoff News',
                'latest_entry': datetime(2003, 6, 3, 9, 39, 21),
                'first_entry': datetime(2003, 5, 30, 11, 6, 42),
                'average_interval': 340359,
                'num_entries': 2,
            }
        })
def test_redirect(self):
    """A 302 redirect must be followed and recorded in redirect_history."""
    url = 'http://www.example.com/'
    url2 = 'http://www2.example.com/'
    # First host redirects to the second.
    httpretty.register_uri(
        httpretty.HEAD, url,
        status=302,
        body="",
        adding_headers={"Location": url2})
    # Second host answers normally.
    httpretty.register_uri(
        httpretty.HEAD, url2,
        status=200,
        body="<html></html>")

    config = Config(urls=[url])
    checker = url_reachability.Checker(config=config, previous_results={})
    result = checker.run()

    self.assertIn(url, result)
    entry = result[url]
    self.assertEqual(entry['url'], url)
    self.assertEqual(entry['status'], 200)
    self.assertIsNone(entry['exception'])
    self.assertTrue(0 < entry['duration'] < 100)

    # Exactly one hop, and it must describe the 302 towards url2.
    history = entry['redirect_history']
    self.assertEqual(len(history), 1)
    self.assertEqual(history[0]['status'], 302)
    self.assertEqual(history[0]['redirect_to'], url2)
def test_frameset_negative(self):
    """A page without a <frameset> must be flagged frameset: False."""
    page_body = """
    <!doctype html public "-//w3c//dtd html 4.0 transitional//en">
    <html>
    <head>
      <title>A frameset page</title>
    </head>
    <body>
      <p>Here we have some body content</p>
    </body>
    </html>
    """
    url = 'http://example.com/'
    httpretty.register_uri(httpretty.GET, url, body=page_body)

    # The frameset check consumes the page_content check's output.
    results = {}
    config = Config(urls=[url])
    page_content_checker = page_content.Checker(
        config=config, previous_results={})
    results['page_content'] = page_content_checker.run()

    checker = frameset.Checker(
        config=page_content_checker.config, previous_results=results)
    result = checker.run()

    self.assertEqual(result, {
        'http://example.com/': {'frameset': False}
    })
    # The URL list must be left untouched by this check.
    self.assertEqual(checker.config.urls, ['http://example.com/'])
def test_kaarst(self):
    """Real-world example (needs network access to the live site)."""
    url = 'https://www.gruenekaarst.de/'
    config = Config(urls=[url])
    checker = certificate.Checker(config=config, previous_results={})
    result = checker.run()

    self.assertIn(url, result)
    self.assertIsNone(result[url]['exception'])
    # NOTE(review): issuer assertion is brittle — it breaks whenever the
    # site switches certificate authorities.
    self.assertEqual(result[url]['issuer']['O'], 'Sectigo Limited')
def test_tls_v_1_2(self):
    """Load a certificate from a TLS v1.2-only badssl.com endpoint."""
    url = 'https://tls-v1-2.badssl.com:1012/'
    config = Config(urls=[url])
    checker = certificate.Checker(config=config, previous_results={})
    result = checker.run()

    self.assertIn(url, result)
    entry = result[url]
    self.assertIsNone(entry['exception'])
    self.assertEqual(entry['subject']['CN'], '*.badssl.com')
def test_google(self):
    """Load the certificate from a site that should always work."""
    url = 'https://www.google.com/'
    config = Config(urls=[url])
    checker = certificate.Checker(config=config, previous_results={})
    result = checker.run()

    self.assertIn(url, result)
    entry = result[url]
    self.assertIsNone(entry['exception'])
    self.assertEqual(entry['issuer']['O'], 'Google Trust Services')
def perform_checks(input_url):
    """
    Executes all our URL/site checks and returns a big-ass result dict.
    """
    # The sequence of checks to run. Order is important!
    # Checks which expand the URLs list must come first.
    # After that, dependencies (encoded in the checks) have to be fulfilled.
    check_modules = [
        ('domain_variations', domain_variations),
        ('http_and_https', http_and_https),
        ('dns_resolution', dns_resolution),
        ('url_reachability', url_reachability),
        ('certificate', certificate),
        ('url_canonicalization', url_canonicalization),
        ('page_content', page_content),
        ('duplicate_content', duplicate_content),
        ('charset', charset),
        ('html_head', html_head),
        ('frameset', frameset),
        ('hyperlinks', hyperlinks),
        ('generator', generator),
        ('load_favicons', load_favicons),
        ('load_feeds', load_feeds),
        ('load_in_browser', load_in_browser),
    ]

    results = {}

    config = Config(
        urls=[input_url],
        user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) ' +
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 ' +
        'Safari/537.36 green-spider/0.2')

    for check_name, check in check_modules:
        checker = check.Checker(config=config, previous_results=results)

        # See if dependencies are met.
        # BUG FIX: the original `continue` was inside the inner loop over
        # dependencies, so it only advanced that loop and the check ran
        # anyway. Collect the unmet dependencies first and skip the check
        # itself when any are missing.
        unmet = [
            dep for dep in checker.depends_on_results()
            if dep not in results
            or results[dep] is None
            or results[dep] == {}
            or results[dep] == []
        ]
        if unmet:
            for dep in unmet:
                logging.debug(
                    "Skipping check %s as dependency %s is not met"
                    % (check_name, dep))
            continue

        result = checker.run()
        results[check_name] = result

        # Update config for the next check.
        config = checker.config
        logging.debug("config after check %s: %r" % (check_name, config))

    return results
def test_simple(self):
    """The domain_variations check must add the www. variant to the URL list."""
    url = 'http://example.org/'
    config = Config(urls=[url])
    checker = domain_variations.Checker(config=config, previous_results={})
    checker.run()

    self.assertEqual(
        checker.config.urls,
        ['http://example.org/', 'http://www.example.org/'])
def runTest(self):
    """Resolves www.google.com"""
    url = 'https://www.google.com/'
    config = Config(urls=[url])
    checker = dns_resolution.Checker(config=config, previous_results={})
    result = checker.run()

    self.assertIn(url, result)
    self.assertEqual(result[url]['hostname'], 'www.google.com')
    # BUG FIX: the original called assertTrue(result[url], 'resolvable_ipv4'),
    # which passes the key name as the assertion *message* and only checks
    # that the result dict is truthy. Assert the actual resolvability flags.
    self.assertTrue(result[url]['resolvable_ipv4'])
    # NOTE(review): this requires IPv6 connectivity on the test host.
    self.assertTrue(result[url]['resolvable_ipv6'])
    self.assertIsInstance(result[url]['ipv4_addresses'], list)
    self.assertNotEqual(result[url]['ipv4_addresses'], [])
def test_empty_feed_rss2(self):
    """Checks RSS 2.0 parsing for a feed that contains no items."""
    feed = """<?xml version="1.0"?>
    <rss version="2.0">
      <channel>
        <title>Empty Feed</title>
        <link>http://example.com/</link>
        <pubDate></pubDate>
      </channel>
    </rss>
    """
    feed_url = 'http://example.com/feed.xml'
    httpretty.register_uri(
        httpretty.GET, feed_url,
        body=feed,
        adding_headers={
            "Content-type": "application/rss+xml",
        })

    # Simulate a prior html_head result that advertises the feed URL.
    results = {
        'html_head': {
            'http://example.com/': {
                'link_rss_atom': ['http://example.com/feed.xml']
            }
        }
    }

    config = Config(urls=['http://example.com/'])
    checker = load_feeds.Checker(config=config, previous_results=results)
    result = checker.run()
    pprint(result)

    # No entries -> date fields and interval are None, count is zero.
    self.assertEqual(
        result,
        {
            'http://example.com/feed.xml': {
                'exception': None,
                'title': 'Empty Feed',
                'latest_entry': None,
                'first_entry': None,
                'average_interval': None,
                'num_entries': 0,
            }
        })
def test_success(self):
    """A plain 200 response: no redirects, no exception, sane duration."""
    url = 'http://www.example.com/'
    httpretty.register_uri(
        httpretty.HEAD, url,
        status=200,
        body="<html></html>")

    config = Config(urls=[url])
    checker = url_reachability.Checker(config=config, previous_results={})
    result = checker.run()

    entry = result[url]
    self.assertEqual(entry['url'], url)
    self.assertEqual(entry['redirect_history'], [])
    self.assertEqual(entry['status'], 200)
    self.assertIsNone(entry['exception'])
    self.assertTrue(0 < entry['duration'] < 100)
def test_basics(self):
    """Loads a simple HTML web page to check basic functionality"""
    url = 'https://httpbin.org/html'
    config = Config(urls=[url])
    checker = load_in_browser.Checker(config=config, previous_results={})
    result = checker.run()

    self.assertIn(url, result)
    entry = result[url]

    # All expected keys must be present in the per-URL result.
    for key in ('cookies', 'font_families', 'logs',
                'min_document_width', 'sizes'):
        self.assertIn(key, entry)

    # The page is responsive down to below 360px and produces neither
    # cookies nor console log output.
    self.assertTrue(entry['min_document_width'] < 360)
    self.assertEqual(entry['cookies'], [])
    self.assertEqual(entry['logs'], [])
    self.assertEqual(entry['font_families'], ['"times new roman"'])
def test_notfound(self):
    """A 404 URL is reported with its status and removed from the URL list."""
    url = 'http://www.example.com/'
    httpretty.register_uri(
        httpretty.HEAD, url,
        status=404,
        body="<html><body>Not found</body></html>")

    config = Config(urls=[url])
    checker = url_reachability.Checker(config=config, previous_results={})
    result = checker.run()

    entry = result[url]
    self.assertEqual(entry['url'], url)
    self.assertEqual(entry['redirect_history'], [])
    self.assertEqual(entry['status'], 404)
    self.assertIsNone(entry['exception'])

    # Unreachable URLs must be dropped from the config for later checks.
    self.assertEqual(len(checker.config.urls), 0)
def test_identical(self):
    """Two hosts serving identical content: similarity 1.0, one URL dropped."""
    page_body = """
    <html>
    <head>
      <title>Title</title>
    </head>
    <body>
      <h1 class="title">Headline</h1>
      <p class="intro">Second paragraph with <strong>strong words</strong></p>
      <p class="text">Third paragraph</p>
      <ul class="somelist">
        <li>A list item</li>
      </ul>
    </body>
    </html>
    """
    url1 = 'http://example.com/'
    url2 = 'http://www.example.com/'
    # Both hosts return the exact same document.
    httpretty.register_uri(httpretty.GET, url1, body=page_body)
    httpretty.register_uri(httpretty.GET, url2, body=page_body)

    results = {}
    config = Config(urls=[url1, url2])
    page_content_checker = page_content.Checker(
        config=config, previous_results={})
    results['page_content'] = page_content_checker.run()

    checker = duplicate_content.Checker(
        config=page_content_checker.config, previous_results=results)
    result = checker.run()

    # The pair is keyed by both URLs joined with a space.
    self.assertEqual(
        result,
        {
            'http://example.com/ http://www.example.com/': {
                'exception': None,
                'similarity': 1.0
            }
        })
    # The duplicate URL must have been removed from the config.
    self.assertEqual(checker.config.urls, ['http://example.com/'])
def test_http_response(self):
    """The meta charset tag wins over a conflicting Content-Type header."""
    url = 'http://www.example.com/'
    httpretty.register_uri(
        httpretty.GET, url,
        body="""<html>
        <head>
          <meta http-equiv="Content-type" value="text/html; charset=foo">
          <meta charset="utf-8">
          <title>Hello</title>
        </head>
        </html>""",
        adding_headers={
            "Content-Type": "text/html; charset=ISO-8859-1",
        })

    results = {}
    config = Config(urls=[url])
    page_content_checker = page_content.Checker(
        config=config, previous_results={})
    results['page_content'] = page_content_checker.run()

    # Sanity-check the page_content output the charset check relies on.
    self.assertIn(url, results['page_content'])
    self.assertIn('response_headers', results['page_content'][url])
    self.assertIn(
        'content-type',
        results['page_content'][url]['response_headers'])

    charset_checker = charset.Checker(
        config=page_content_checker.config, previous_results=results)
    result = charset_checker.run()

    self.assertIn(url, result)
    self.assertEqual(
        result[url],
        {
            'meta_charset_tag': 'utf-8',
            'content_type_header_charset': 'iso-8859-1',
            'charset': 'utf-8',
            'valid': True,
            'exception': None,
        })
def test_frameset_positive(self):
    """A page built with <frameset> must be flagged frameset: True."""
    page_body = """
    <!doctype html public "-//w3c//dtd html 4.0 transitional//en">
    <html>
    <head>
      <title>A frameset page</title>
    </head>
    <frameset framespacing="0" border="false" frameborder="0" rows="30,*">
      <frame name="top" src="top.htm" scrolling="no">
      <frame name="base" src="titel.htm" target="_top">
      <noframes>
        <body>
          <p>Here we have some body content</p>
        </body>
      </noframes>
    </frameset>
    </html>
    """
    url = 'http://example.com/'
    httpretty.register_uri(httpretty.GET, url, body=page_body)

    # The frameset check consumes the page_content check's output.
    results = {}
    config = Config(urls=[url])
    page_content_checker = page_content.Checker(
        config=config, previous_results={})
    results['page_content'] = page_content_checker.run()

    checker = frameset.Checker(
        config=page_content_checker.config, previous_results=results)
    result = checker.run()

    self.assertEqual(result, {
        'http://example.com/': {'frameset': True}
    })
    # The URL list must be left untouched by this check.
    self.assertEqual(checker.config.urls, ['http://example.com/'])
def test_links(self):
    """All anchors are collected, with surrounding whitespace stripped
    from link texts; hidden links are included as well."""
    page_body = """
    <html>
    <head>
      <title>Title</title>
    </head>
    <body>
      <a href="/">Home</a>
      <a href="/sub/">Sub page</a>
      <a href="/"> Spaces </a>
      <a href="https://www.google.com/">External</a>
      <a href="/" style="display: hidden">Hidden</a>
      <a href="/" style="display: none">Hidden</a>
    </body>
    </html>
    """
    url = 'http://example.com/'
    httpretty.register_uri(httpretty.GET, url, body=page_body)

    # The hyperlinks check consumes the page_content check's output.
    results = {}
    config = Config(urls=[url])
    page_content_checker = page_content.Checker(
        config=config, previous_results={})
    results['page_content'] = page_content_checker.run()

    checker = hyperlinks.Checker(
        config=page_content_checker.config, previous_results=results)
    result = checker.run()

    expected_links = [
        {'href': '/', 'text': 'Home'},
        {'href': '/sub/', 'text': 'Sub page'},
        {'href': '/', 'text': 'Spaces'},
        {'href': 'https://www.google.com/', 'text': 'External'},
        {'href': '/', 'text': 'Hidden'},
        {'href': '/', 'text': 'Hidden'},
    ]
    self.assertEqual(
        result,
        {
            'http://example.com/': {
                'links': expected_links,
                'exception': None,
            }
        })
    # The URL list must be left untouched by this check.
    self.assertEqual(checker.config.urls, ['http://example.com/'])
def perform_checks(input_url):
    """
    Executes all our URL/site checks and returns a big-ass result dict.
    """
    # The sequence of checks to run. Order is important!
    # Checks which expand the URLs list must come first.
    # After that, dependencies (encoded in the checks) have to be fulfilled.
    check_modules = [
        ('domain_variations', domain_variations),
        ('http_and_https', http_and_https),
        ('dns_resolution', dns_resolution),
        ('url_reachability', url_reachability),
        ('certificate', certificate),
        ('url_canonicalization', url_canonicalization),
        ('page_content', page_content),
        ('duplicate_content', duplicate_content),
        ('charset', charset),
        ('html_head', html_head),
        ('frameset', frameset),
        ('hyperlinks', hyperlinks),
        ('generator', generator),
        ('load_favicons', load_favicons),
        ('load_feeds', load_feeds),
        ('load_in_browser', load_in_browser),
    ]

    results = {}

    # TODO:
    # Set screenshot_bucket_name and storage_credentials_path
    # based on flags.
    config = Config(
        urls=[input_url],
        user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) ' +
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 ' +
        'Safari/537.36 green-spider/0.2',
        screenshot_bucket_name='green-spider-screenshots.sendung.de',
        screenshot_datastore_kind='webscreenshot',
        storage_credentials_path='/secrets/screenshots-uploader.json',
        datastore_credentials_path='/secrets/datastore-writer.json')

    # Iterate over all checks.
    for check_name, check in check_modules:
        # checker is the individual test/assertion handler we instantiate
        # for each check step.
        checker = check.Checker(config=config, previous_results=results)

        # Ensure that dependencies are met for the checker.
        # BUG FIX: the original `continue` was inside the inner loop over
        # dependencies, so it only advanced that loop and the check ran
        # anyway. Collect the unmet dependencies first and skip the check
        # itself when any are missing.
        unmet = [
            dep for dep in checker.depends_on_results()
            if dep not in results
            or results[dep] is None
            or results[dep] == {}
            or results[dep] == []
        ]
        if unmet:
            for dep in unmet:
                logging.debug(
                    "Skipping check %s as dependency %s is not met"
                    % (check_name, dep))
            continue

        # Execute the checker's main function.
        result = checker.run()
        results[check_name] = result

        # Execute any cleanup/aftermath function (if given) for the checker.
        modified_results = checker.post_hook(result)
        if modified_results is not None:
            results[check_name] = modified_results

        # Update config for the next check(s) in the sequence.
        config = checker.config
        logging.debug("config after check %s: %r" % (check_name, config))

    return results