def test_favicons(self):
        """Only the site whose favicon request succeeds appears in the result."""
        # First site: the HEAD request for the icon is answered with an image.
        icon_present = 'http://example1.com/favicon.ico'
        httpretty.register_uri(
            httpretty.HEAD,
            icon_present,
            body='',
            adding_headers={"Content-type": "image/x-ico"})

        # Second site: the HEAD request for the icon is answered with a 404.
        icon_missing = 'http://example2.com/favicon.ico'
        httpretty.register_uri(
            httpretty.HEAD,
            icon_missing,
            status=404,
            body='Not found',
            adding_headers={"Content-type": "text/plain"})

        site_urls = ['http://example1.com/path/', 'http://example2.com/']
        checker = load_favicons.Checker(config=Config(urls=site_urls))

        # Only the first site is expected to be listed.
        expected = {
            'http://example1.com/path/': {
                'url': 'http://example1.com/favicon.ico',
            },
        }

        result = checker.run()
        pprint(result)

        self.assertEqual(result, expected)
    def test_feed_rss2(self):
        """Loads a mocked RSS 2.0 feed with two items and checks its summary."""
        rss_document = """<?xml version="1.0"?>
            <rss version="2.0">
                <channel>
                    <title>Liftoff News</title>
                    <link>http://liftoff.msfc.nasa.gov/</link>
                    <description>Liftoff to Space Exploration.</description>
                    <pubDate>Tue, 10 Jun 2003 04:00:00 GMT</pubDate>
                    <item>
                        <title>Star City</title>
                        <link>http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp</link>
                        <pubDate>Tue, 03 Jun 2003 09:39:21 GMT</pubDate>
                        <guid>http://liftoff.msfc.nasa.gov/2003/06/03.html#item573</guid>
                    </item>
                    <item>
                        <description>Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a &lt;a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm"&gt;partial eclipse of the Sun&lt;/a&gt; on Saturday, May 31st.</description>
                        <pubDate>Fri, 30 May 2003 11:06:42 GMT</pubDate>
                        <guid>http://liftoff.msfc.nasa.gov/2003/05/30.html#item572</guid>
                    </item>
                </channel>
            </rss>
        """

        feed_location = 'http://example.com/feed.xml'
        httpretty.register_uri(
            httpretty.GET,
            feed_location,
            body=rss_document,
            adding_headers={"Content-type": "application/rss+xml"})

        # Stand-in for the html_head output that lists the feed link of a page.
        earlier_results = {
            'html_head': {
                'http://example.com/': {
                    'link_rss_atom': ['http://example.com/feed.xml'],
                },
            },
        }
        checker = load_feeds.Checker(
            config=Config(urls=['http://example.com/']),
            previous_results=earlier_results)

        expected = {
            'http://example.com/feed.xml': {
                'exception': None,
                'title': 'Liftoff News',
                'latest_entry': datetime(2003, 6, 3, 9, 39, 21),
                'first_entry': datetime(2003, 5, 30, 11, 6, 42),
                'average_interval': 340359,
                'num_entries': 2,
            },
        }

        result = checker.run()
        pprint(result)

        self.assertEqual(result, expected)
    def test_redirect(self):
        """A 302 answer ends at status 200 with one redirect history entry."""
        origin = 'http://www.example.com/'
        target = 'http://www2.example.com/'

        # The origin answers with a redirect to the target, which returns 200.
        httpretty.register_uri(httpretty.HEAD, origin, status=302, body="",
                               adding_headers={"Location": target})
        httpretty.register_uri(httpretty.HEAD, target, status=200,
                               body="<html></html>")

        checker = url_reachability.Checker(config=Config(urls=[origin]),
                                           previous_results={})
        result = checker.run()

        self.assertIn(origin, result)
        entry = result[origin]
        self.assertEqual(entry['url'], origin)
        self.assertEqual(entry['status'], 200)
        self.assertIsNone(entry['exception'])
        self.assertTrue(0 < entry['duration'] < 100)

        # Exactly one hop is expected, pointing from origin to target.
        history = entry['redirect_history']
        self.assertEqual(len(history), 1)
        first_hop = history[0]
        self.assertEqual(first_hop['status'], 302)
        self.assertEqual(first_hop['redirect_to'], target)
Example #4
0
    def test_frameset_negative(self):
        """A page without a frameset element is reported as frameset: False."""
        html = """
            <!doctype html public "-//w3c//dtd html 4.0 transitional//en">
            <html>
                <head>
                    <title>A frameset page</title>
                </head>
                <body>
                    <p>Here we have some body content</p>
                </body>
            </html>
        """

        page_url = 'http://example.com/'
        httpretty.register_uri(httpretty.GET, page_url, body=html)

        # Run page_content first; its output is handed to the frameset
        # checker as previous results.
        content_checker = page_content.Checker(config=Config(urls=[page_url]),
                                               previous_results={})
        earlier = {'page_content': content_checker.run()}

        checker = frameset.Checker(config=content_checker.config,
                                   previous_results=earlier)
        result = checker.run()
        remaining_urls = checker.config.urls

        expected = {'http://example.com/': {'frameset': False}}
        self.assertEqual(result, expected)
        self.assertEqual(remaining_urls, ['http://example.com/'])
 def test_kaarst(self):
     """Real-world example: loads the certificate of www.gruenekaarst.de."""
     site = 'https://www.gruenekaarst.de/'
     checker = certificate.Checker(config=Config(urls=[site]),
                                   previous_results={})
     result = checker.run()

     self.assertIn(site, result)
     cert_info = result[site]
     self.assertIsNone(cert_info['exception'])
     self.assertEqual(cert_info['issuer']['O'], 'Sectigo Limited')
 def test_tls_v_1_2(self):
     """Loads the certificate of the badssl.com TLS v1.2 test server."""
     site = 'https://tls-v1-2.badssl.com:1012/'
     checker = certificate.Checker(config=Config(urls=[site]),
                                   previous_results={})
     result = checker.run()

     self.assertIn(site, result)
     cert_info = result[site]
     self.assertIsNone(cert_info['exception'])
     self.assertEqual(cert_info['subject']['CN'], '*.badssl.com')
 def test_google(self):
     """Loads the certificate of www.google.com, which is expected to work."""
     site = 'https://www.google.com/'
     checker = certificate.Checker(config=Config(urls=[site]),
                                   previous_results={})
     result = checker.run()

     self.assertIn(site, result)
     cert_info = result[site]
     self.assertIsNone(cert_info['exception'])
     self.assertEqual(cert_info['issuer']['O'], 'Google Trust Services')
def perform_checks(input_url):
    """
    Executes all our URL/site checks and returns a big-ass result dict.

    Args:
        input_url: URL that the initial ``Config`` is seeded with.

    Returns:
        A dict keyed by check name, holding what each check's ``run()``
        returned. A check that was skipped because one of its dependencies
        is not met has no entry.
    """

    # The sequence of checks to run. Order is important!
    # Checks which expand the URLs list must come first.
    # After that, dependencies (encoded in the checks) have to be fulfilled.
    check_modules = [
        ('domain_variations', domain_variations),
        ('http_and_https', http_and_https),
        ('dns_resolution', dns_resolution),
        ('url_reachability', url_reachability),
        ('certificate', certificate),
        ('url_canonicalization', url_canonicalization),
        ('page_content', page_content),
        ('duplicate_content', duplicate_content),
        ('charset', charset),
        ('html_head', html_head),
        ('frameset', frameset),
        ('hyperlinks', hyperlinks),
        ('generator', generator),
        ('load_favicons', load_favicons),
        ('load_feeds', load_feeds),
        ('load_in_browser', load_in_browser),
    ]

    results = {}

    config = Config(
        urls=[input_url],
        user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) ' +
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 ' +
        'Safari/537.36 green-spider/0.2')

    for check_name, check in check_modules:
        checker = check.Checker(config=config, previous_results=results)

        # A dependency is unmet when its check left no entry in results or
        # the entry is empty (None, {} or []).
        unmet = [
            dep for dep in checker.depends_on_results()
            if (dep not in results or results[dep] is None
                or results[dep] == {} or results[dep] == [])
        ]
        if unmet:
            for dep in unmet:
                logging.debug(
                    "Skipping check %s as dependency %s is not met",
                    check_name, dep)
            # This continue belongs to the loop over check_modules, so the
            # check is really skipped. Previously it sat inside the loop over
            # the dependencies and the check ran anyway.
            continue

        result = checker.run()
        results[check_name] = result

        # update config for the next check
        config = checker.config
        logging.debug("config after check %s: %r", check_name, config)

    return results
    def test_simple(self):
        """After the check, the config also lists the www. variant of the URL."""
        start_url = 'http://example.org/'
        checker = domain_variations.Checker(config=Config(urls=[start_url]),
                                            previous_results={})
        checker.run()

        expected_urls = ['http://example.org/', 'http://www.example.org/']
        self.assertEqual(checker.config.urls, expected_urls)
Example #10
0
    def runTest(self):
        """Resolves www.google.com"""
        url = 'https://www.google.com/'
        config = Config(urls=[url])
        checker = dns_resolution.Checker(config=config, previous_results={})
        result = checker.run()

        self.assertIn(url, result)
        self.assertEqual(result[url]['hostname'], 'www.google.com')

        # assertTrue(expr, msg) treats its second argument as the failure
        # message, so the key name must be used as a lookup, not passed along.
        self.assertTrue(result[url]['resolvable_ipv4'])
        # NOTE(review): only the presence of the IPv6 flag is required here,
        # as the test environment may not resolve IPv6 -- tighten to
        # assertTrue if IPv6 resolution is guaranteed where this runs.
        self.assertIn('resolvable_ipv6', result[url])

        self.assertIsInstance(result[url]['ipv4_addresses'], list)
        self.assertNotEqual(result[url]['ipv4_addresses'], [])
    def test_empty_feed_rss2(self):
        """Loads a mocked RSS 2.0 feed without items and checks its summary."""
        rss_document = """<?xml version="1.0"?>
            <rss version="2.0">
                <channel>
                    <title>Empty Feed</title>
                    <link>http://example.com/</link>
                    <pubDate></pubDate>
                </channel>
            </rss>
        """

        feed_location = 'http://example.com/feed.xml'
        httpretty.register_uri(
            httpretty.GET,
            feed_location,
            body=rss_document,
            adding_headers={"Content-type": "application/rss+xml"})

        # Stand-in for the html_head output that lists the feed link of a page.
        earlier_results = {
            'html_head': {
                'http://example.com/': {
                    'link_rss_atom': ['http://example.com/feed.xml'],
                },
            },
        }
        checker = load_feeds.Checker(
            config=Config(urls=['http://example.com/']),
            previous_results=earlier_results)

        # With no items, all entry-derived fields are expected to be None.
        expected = {
            'http://example.com/feed.xml': {
                'exception': None,
                'title': 'Empty Feed',
                'latest_entry': None,
                'first_entry': None,
                'average_interval': None,
                'num_entries': 0,
            },
        }

        result = checker.run()
        pprint(result)

        self.assertEqual(result, expected)
    def test_success(self):
        """A plain 200 answer is reported without redirects or exception."""
        target = 'http://www.example.com/'
        httpretty.register_uri(httpretty.HEAD, target, status=200,
                               body="<html></html>")

        checker = url_reachability.Checker(config=Config(urls=[target]),
                                           previous_results={})
        result = checker.run()

        entry = result[target]
        self.assertEqual(entry['url'], target)
        self.assertEqual(entry['redirect_history'], [])
        self.assertEqual(entry['status'], 200)
        self.assertIsNone(entry['exception'])
        self.assertTrue(0 < entry['duration'] < 100)
    def test_basics(self):
        """Opens httpbin.org/html in the browser check and inspects the result."""
        page_url = 'https://httpbin.org/html'
        checker = load_in_browser.Checker(config=Config(urls=[page_url]),
                                          previous_results={})
        result = checker.run()

        self.assertIn(page_url, result)
        page = result[page_url]

        # All of these keys have to be present for the page.
        expected_keys = ('cookies', 'font_families', 'logs',
                         'min_document_width', 'sizes')
        for key in expected_keys:
            self.assertIn(key, page)

        self.assertTrue(page['min_document_width'] < 360)
        self.assertEqual(page['cookies'], [])
        self.assertEqual(page['logs'], [])
        self.assertEqual(page['font_families'], ['"times new roman"'])
    def test_notfound(self):
        """A 404 answer is recorded and leaves no URL in the checker config."""
        target = 'http://www.example.com/'
        httpretty.register_uri(httpretty.HEAD, target, status=404,
                               body="<html><body>Not found</body></html>")

        checker = url_reachability.Checker(config=Config(urls=[target]),
                                           previous_results={})
        result = checker.run()

        entry = result[target]
        self.assertEqual(entry['url'], target)
        self.assertEqual(entry['redirect_history'], [])
        self.assertEqual(entry['status'], 404)
        self.assertIsNone(entry['exception'])

        # The config held by the checker must not list any URL afterwards.
        remaining_urls = checker.config.urls
        self.assertEqual(len(remaining_urls), 0)
    def test_identical(self):
        """Two URLs serving the same body get similarity 1.0; one URL remains."""
        html = """
            <html>
                <head>
                    <title>Title</title>
                </head>
                <body>
                    <h1 class="title">Headline</h1>
                    <p class="intro">Second paragraph with <strong>strong words</strong></p>
                    <p class="text">Third paragraph</p>
                    <ul class="somelist">
                        <li>A list item</li>
                    </ul>
                </body>
            </html>
        """

        # Both host names serve exactly the same document.
        bare_url = 'http://example.com/'
        httpretty.register_uri(httpretty.GET, bare_url, body=html)

        www_url = 'http://www.example.com/'
        httpretty.register_uri(httpretty.GET, www_url, body=html)

        # Run page_content first; its output is handed to the
        # duplicate_content checker as previous results.
        content_checker = page_content.Checker(
            config=Config(urls=[bare_url, www_url]),
            previous_results={})
        earlier = {'page_content': content_checker.run()}

        checker = duplicate_content.Checker(config=content_checker.config,
                                            previous_results=earlier)
        result = checker.run()
        remaining_urls = checker.config.urls

        expected = {
            'http://example.com/ http://www.example.com/': {
                'exception': None,
                'similarity': 1.0,
            },
        }
        self.assertEqual(result, expected)
        self.assertEqual(remaining_urls, ['http://example.com/'])
Example #16
0
    def test_http_response(self):
        """Runs page_content, then charset, and checks the charset details."""
        target = 'http://www.example.com/'
        markup = """<html>
                <head>
                <meta http-equiv="Content-type" value="text/html; charset=foo">
                <meta charset="utf-8">
                <title>Hello</title>
                </head>
            </html>"""
        httpretty.register_uri(
            httpretty.GET,
            target,
            body=markup,
            adding_headers={"Content-Type": "text/html; charset=ISO-8859-1"})

        content_checker = page_content.Checker(config=Config(urls=[target]),
                                               previous_results={})
        earlier = {'page_content': content_checker.run()}

        # The page_content output must expose the content-type response header.
        content_result = earlier['page_content']
        self.assertIn(target, content_result)
        self.assertIn('response_headers', content_result[target])
        self.assertIn('content-type',
                      content_result[target]['response_headers'])

        charset_checker = charset.Checker(config=content_checker.config,
                                          previous_results=earlier)
        result = charset_checker.run()

        expected = {
            'meta_charset_tag': 'utf-8',
            'content_type_header_charset': 'iso-8859-1',
            'charset': 'utf-8',
            'valid': True,
            'exception': None,
        }
        self.assertIn(target, result)
        self.assertEqual(result[target], expected)
Example #17
0
    def test_frameset_positive(self):
        """A page containing a frameset element is reported as frameset: True."""
        html = """
            <!doctype html public "-//w3c//dtd html 4.0 transitional//en">
            <html>
                <head>
                    <title>A frameset page</title>
                </head>
                <frameset framespacing="0" border="false" frameborder="0" rows="30,*">
                    <frame name="top" src="top.htm" scrolling="no">
                    <frame name="base" src="titel.htm" target="_top">
                    <noframes>
                        <body>
                            <p>Here we have some body content</p>
                        </body>
                    </noframes>
                </frameset>
            </html>
        """

        page_url = 'http://example.com/'
        httpretty.register_uri(httpretty.GET, page_url, body=html)

        # Run page_content first; its output is handed to the frameset
        # checker as previous results.
        content_checker = page_content.Checker(config=Config(urls=[page_url]),
                                               previous_results={})
        earlier = {'page_content': content_checker.run()}

        checker = frameset.Checker(config=content_checker.config,
                                   previous_results=earlier)
        result = checker.run()
        remaining_urls = checker.config.urls

        expected = {'http://example.com/': {'frameset': True}}
        self.assertEqual(result, expected)
        self.assertEqual(remaining_urls, ['http://example.com/'])
Example #18
0
    def test_links(self):
        """All six anchors of the page are listed with href and trimmed text."""
        html = """
            <html>
                <head>
                    <title>Title</title>
                </head>
                <body>
                    <a href="/">Home</a>
                    <a href="/sub/">Sub page</a>
                    <a href="/"> Spaces </a>
                    <a href="https://www.google.com/">External</a>
                    <a href="/" style="display: hidden">Hidden</a>
                    <a href="/" style="display: none">Hidden</a>
                </body>
            </html>
        """

        page_url = 'http://example.com/'
        httpretty.register_uri(httpretty.GET, page_url, body=html)

        # Run page_content first; its output is handed to the hyperlinks
        # checker as previous results.
        content_checker = page_content.Checker(config=Config(urls=[page_url]),
                                               previous_results={})
        earlier = {'page_content': content_checker.run()}

        checker = hyperlinks.Checker(config=content_checker.config,
                                     previous_results=earlier)
        result = checker.run()
        remaining_urls = checker.config.urls

        # (href, text) pairs in document order.
        expected_pairs = [
            ('/', 'Home'),
            ('/sub/', 'Sub page'),
            ('/', 'Spaces'),
            ('https://www.google.com/', 'External'),
            ('/', 'Hidden'),
            ('/', 'Hidden'),
        ]
        expected = {
            'http://example.com/': {
                'links': [{'href': href, 'text': text}
                          for href, text in expected_pairs],
                'exception': None,
            },
        }

        self.assertEqual(result, expected)
        self.assertEqual(remaining_urls, ['http://example.com/'])
Example #19
0
def perform_checks(input_url):
    """
    Executes all our URL/site checks and returns a big-ass result dict.

    Args:
        input_url: URL that the initial ``Config`` is seeded with.

    Returns:
        A dict keyed by check name. Each value is what the check's ``run()``
        returned, or the value returned by its ``post_hook()`` when that is
        not ``None``. A check that was skipped because one of its
        dependencies is not met has no entry.
    """

    # The sequence of checks to run. Order is important!
    # Checks which expand the URLs list must come first.
    # After that, dependencies (encoded in the checks) have to be fulfilled.
    check_modules = [
        ('domain_variations', domain_variations),
        ('http_and_https', http_and_https),
        ('dns_resolution', dns_resolution),
        ('url_reachability', url_reachability),
        ('certificate', certificate),
        ('url_canonicalization', url_canonicalization),
        ('page_content', page_content),
        ('duplicate_content', duplicate_content),
        ('charset', charset),
        ('html_head', html_head),
        ('frameset', frameset),
        ('hyperlinks', hyperlinks),
        ('generator', generator),
        ('load_favicons', load_favicons),
        ('load_feeds', load_feeds),
        ('load_in_browser', load_in_browser),
    ]

    results = {}

    # TODO:
    # Set screenshot_bucket_name and storage_credentials_path
    # based on flags.
    config = Config(
        urls=[input_url],
        user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) ' +
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 ' +
        'Safari/537.36 green-spider/0.2',
        screenshot_bucket_name='green-spider-screenshots.sendung.de',
        screenshot_datastore_kind='webscreenshot',
        storage_credentials_path='/secrets/screenshots-uploader.json',
        datastore_credentials_path='/secrets/datastore-writer.json')

    # Iterate over all checks.
    for check_name, check in check_modules:

        # checker is the individual test/assertion handler we instantiate
        # for each check step.
        checker = check.Checker(config=config, previous_results=results)

        # Ensure that dependencies are met for the checker. A dependency is
        # unmet when its check left no entry in results or the entry is
        # empty (None, {} or []).
        unmet = [
            dep for dep in checker.depends_on_results()
            if (dep not in results or results[dep] is None
                or results[dep] == {} or results[dep] == [])
        ]
        if unmet:
            for dep in unmet:
                logging.debug(
                    "Skipping check %s as dependency %s is not met",
                    check_name, dep)
            # This continue belongs to the loop over check_modules, so the
            # check (run and post_hook) is really skipped. Previously it sat
            # inside the loop over the dependencies and the check ran anyway.
            continue

        # Execute the checker's main function.
        result = checker.run()
        results[check_name] = result

        # Execute any cleanup/aftermath function (if given) for the checker.
        modified_results = checker.post_hook(result)
        if modified_results is not None:
            results[check_name] = modified_results

        # Update config for the next check(s) in the sequence.
        config = checker.config
        logging.debug("config after check %s: %r", check_name, config)

    return results