Example #1
0
class TestCases(unittest.TestCase):
    """Tests for ``Parser`` link extraction and ``Checker`` queue handling."""

    # Load the HTML fixture from data/ and build a fresh Checker and Parser
    # for every test
    def setUp(self):
        self.testfile = open(HTMLDATA)
        # Register the close immediately: unittest does not call tearDown()
        # when setUp() raises, but registered cleanups still run, so the
        # handle cannot leak if read() or the constructors below fail.
        self.addCleanup(self.testfile.close)
        self.data = self.testfile.read()
        self.url = "https://example.com"
        self.check = Checker(self.url)
        self.parser = Parser()

    # Parser gives expected values: feeding it the fixture must return the
    # links in exactly this order
    def test_parser_expected_output(self):
        links = self.parser.feed_me(self.data)
        expected_output = [
            "style.css",
            "scripts.js",
            "http://baddomain.com/i-donut-exist",
            "image.png",
            "www.anotherbaddomain.com/multithreading-is-fun",
            "https://example.com/i-have-links",
            "https://example.com",
        ]
        self.assertEqual(links, expected_output)

    # Checker uses correct domain for comparison (scheme is stripped)
    def test_domain_extraction(self):
        self.assertEqual(self.check.extract_domain(self.url), "example.com")

    # Checker doesn't add visited links to queue
    def test_process_queue_length(self):
        self.pagedata = {
            "url": "https://example.com/test-page.html",
            "parent": "https://example.com/test-page.html",
            "data":
            '<!DOCTYPE html>\n<html>\n\n    <head>\n        <title>Test Data Page</title>\n\n        <meta charset="utf-8">\n        <meta http-equiv="Content-type" content="text/html; charset=UTF-8">\n        <meta name="viewport" content="width=device-width, initial-scale=1">\n        <link rel="stylesheet" href="style.css" type="text/css">\n        <script type="text/javascript" src="scripts.js"></script>\n    </head>\n\n    <body>\n        <div>\n            <h1>Test Data Page</h1>\n            <p>This page does not exist: <a href="/i-donut-exist">Whale</a></p>\n            <p>This is not a link: <a>No Spoon</a></p>\n            <img src="image.png" />\n            <p>This page does not exist: <a href="/multithreading-is-fun">Petunias</a></p>\n            <p>This page contains more links: <a href="/i-have-links">Crawl Me</a></p>\n            <p>This domain is for use in illustrative examples in documents. You may use this\n                domain in literature without prior coordination or asking for permission: <a\n                    href="https://example.com">Example</a></p>\n        </div>\n\n\n    </body>\n\n</html>',
            "valid_content_type": True,
        }
        # There are 7 links in pagedata["data"] (the bare <a> without an
        # href does not count), so the first parse should queue all 7
        first_parse = 7
        self.check.parse_page(self.pagedata)
        self.assertEqual(len(self.check.TO_PROCESS.queue), first_parse)
        # Mark one of the page's links as already visited before re-parsing
        self.check.visited.add("https://example.com/style.css")
        # Checker should add to queue all but the one visited link:
        # 7 already queued + 6 newly added = 13
        second_parse = 13
        self.check.parse_page(self.pagedata)
        self.assertEqual(len(self.check.TO_PROCESS.queue), second_parse)
Example #2
0
 # Load the HTML fixture and build a fresh Checker and Parser for every test
 def setUp(self):
     self.testfile = open(HTMLDATA)
     # Register the close immediately: unittest does not call tearDown()
     # when setUp() raises, but registered cleanups still run, so the
     # handle cannot leak if read() or the constructors below fail.
     # NOTE(review): assumes the enclosing class is a unittest.TestCase
     # (which provides addCleanup) -- confirm against the class header.
     self.addCleanup(self.testfile.close)
     self.data = self.testfile.read()
     self.url = "https://example.com"
     self.check = Checker(self.url, Config())
     self.parser = Parser(Config())