Example #1
0
 def test_for_adding_relation_between_pages(self):
     """An undirected relation must list each page as the other's neighbour."""
     first = WebPage("test1", load_page=False)
     second = WebPage("test2", load_page=False)
     web = WebGraph()
     web.add_relation(first, second, directed=False)
     # directed=False: the edge has to be visible from both endpoints.
     for source, target in ((first, second), (second, first)):
         self.assertIn(target, web.graph[source])
Example #2
0
 def test_for_presence_of_page_in_graph(self):
     """Check the ``in`` operator of WebGraph for URLs and WebPage objects.

     A page that was added must be found both by its URL string and by the
     page object; a page that was never added must not be found.
     """
     wg = WebGraph()
     p1 = wg.add_page(WebPage("http://localhost:5000/test",
                              load_page=False))
     p2 = WebPage("http://localhost:5000/home", load_page=False)
     # assertIn/assertNotIn report both operands on failure, unlike
     # assertTrue(x in y), which only reports "False is not true".
     self.assertIn("http://localhost:5000/test", wg)
     self.assertIn(p1, wg)
     self.assertNotIn(p2, wg)
Example #3
0
 def test_get_page_with_getitem_operator(self):
     """get_page must resolve both a URL string and a WebPage instance."""
     # NOTE(review): the name mentions the getitem operator, but only
     # get_page() is exercised here -- confirm whether wg[...] was intended.
     graph = WebGraph()
     test_page = graph.add_page(
         WebPage("http://localhost:5000/test", load_page=False))
     home_page = graph.add_page(
         WebPage("http://localhost:5000/home", load_page=False))
     found_by_url = graph.get_page("http://localhost:5000/test")
     self.assertEqual(found_by_url, test_page)
     found_by_page = graph.get_page(home_page)
     self.assertEqual(found_by_page, home_page)
Example #4
0
 def test_add_relation_does_not_recreate_pages(self):
     """Relating two already-added pages must keep the graph at two pages."""
     endpoints = (WebPage("test1", load_page=False),
                  WebPage("test2", load_page=False))
     graph = WebGraph()
     for endpoint in endpoints:
         graph.add_page(endpoint)
     graph.add_relation(*endpoints, directed=False)
     self.assertEqual(len(graph), 2)
Example #5
0
 def test_path_between_directly_related_pages_contains_only_them(self):
     """A path between two adjacent pages consists of just those two pages."""
     graph = WebGraph()
     start = graph.add_page(
         WebPage("http://localhost:5000/test", load_page=False))
     goal = graph.add_page(
         WebPage("http://localhost:5000/home", load_page=False))
     graph.add_relation(start, goal, directed=False)
     # assertCountEqual compares the elements regardless of their order.
     self.assertCountEqual(graph.find_path(start, goal), (start, goal))
Example #6
0
 def test_reload_only_headers(self, head_mock):
     """reload(head_request=True) must expose the mocked response on the page.

     ``head_mock`` is supplied from outside this method (presumably a patch
     of the HTTP HEAD call -- confirm against the decorator).
     """
     fake_response = Mock(content=b"",
                          headers={"Content-Type": "text/html"})
     head_mock.return_value = fake_response
     page = WebPage("http://localhost:5000", load_page=False)
     page.reload(head_request=True)
     self.assertEqual(page.content, b"")
     self.assertEqual(page.headers, {"Content-Type": "text/html"})
Example #7
0
 def test_find_path_finds_indirect_paths(self):
     """find_path must connect two pages that are only linked via a third."""
     graph = WebGraph()
     first = graph.add_page(
         WebPage("http://localhost:5000/test", load_page=False))
     middle = graph.add_page(
         WebPage("http://localhost:5000/home", load_page=False))
     last = graph.add_page(
         WebPage("http://localhost:5000/new", load_page=False))
     # Chain: first - middle - last.
     for left, right in ((first, middle), (middle, last)):
         graph.add_relation(left, right, directed=False)
     self.assertEqual(len(graph), 3)
     route = graph.find_path(first, last)
     self.assertCountEqual(route, (first, middle, last))
Example #8
0
 def test_find_path_returns_None_when_no_path(self):
     """Pages in separate components have no path, so None is expected."""
     graph = WebGraph()
     test_page = graph.add_page(
         WebPage("http://localhost:5000/test", load_page=False))
     home_page = graph.add_page(
         WebPage("http://localhost:5000/home", load_page=False))
     next_page = graph.add_page(
         WebPage("http://localhost:5000/next", load_page=False))
     prev_page = graph.add_page(
         WebPage("http://localhost:5000/prev", load_page=False))
     # Two unconnected pairs: test - home and next - prev.
     graph.add_relation(test_page, home_page, directed=False)
     graph.add_relation(next_page, prev_page, directed=False)
     self.assertIsNone(graph.find_path(test_page, prev_page))
Example #9
0
 def test_for_extracting_urls(self, get_mock):
     """find_urls must return exactly one link, ``test.html``, for the page.

     The mocked document contains one regular anchor, one ``mailto:`` anchor
     and a plain-text address; the assertions expect a single result.
     ``get_mock`` is supplied from outside this method (presumably a patch
     of the HTTP GET call -- confirm against the decorator).
     """
     response = Mock()
     response.content = b"""
         <html>
         <body>
             <a href='test.html'>Test</a>
             <a href="mailto:[email protected]">E-Mail</a>
             Contact: [email protected]
         </body>
         </html>
     """
     response.encoding = "utf-8"
     response.headers = {"Content-Type": "text/html"}
     get_mock.return_value = response
     page = WebPage("http://localhost:5000", load_page=False)
     page.reload()
     links = find_urls(page)
     # assertEqual, not assertTrue: assertTrue(len(links), 1) treats 1 as
     # the failure message and passes for any non-empty result.
     self.assertEqual(len(links), 1)
     self.assertEqual(links[0], "test.html")
Example #10
0
 def test_find_paths_finds_the_shortest_path(self):
     """find_path must pick the three-page route over the longer chain.

     The pages form the chain test - home - next - prev - back plus a direct
     test - prev edge, so the expected route is test, prev, back.
     """
     graph = WebGraph()
     test_page = graph.add_page(
         WebPage("http://localhost:5000/test", load_page=False))
     home_page = graph.add_page(
         WebPage("http://localhost:5000/home", load_page=False))
     next_page = graph.add_page(
         WebPage("http://localhost:5000/next", load_page=False))
     prev_page = graph.add_page(
         WebPage("http://localhost:5000/prev", load_page=False))
     back_page = graph.add_page(
         WebPage("http://localhost:5000/back", load_page=False))
     edges = (
         (test_page, home_page),
         (home_page, next_page),
         (next_page, prev_page),
         (prev_page, back_page),
         (test_page, prev_page),  # shortcut that skips home and next
     )
     for left, right in edges:
         graph.add_relation(left, right, directed=False)
     route = graph.find_path(test_page, back_page)
     self.assertCountEqual(route, (test_page, prev_page, back_page))
Example #11
0
 def test_find_nearest_neighbours_returns_closest_pages(self):
     """find_nearest_neighbours(p[0], max_dist=2) must return pages 1, 2, 4, 5.

     Seven pages are arranged in a ring; pages 3 and 6 are three steps away
     from page 0 and must therefore be left out of the result.
     """
     wg = WebGraph()
     p = [
         wg.add_page(WebPage("fake %d" % i, load_page=False))
         for i in range(7)
     ]
     # Graph layout. Kept as comments: inside a plain string literal the
     # backslash of this drawing forms an invalid escape sequence.
     #
     #   / 1 - 2 - 3
     # 0           |
     #   \ 4 - 5 - 6
     wg.add_relation(p[0], p[1], directed=False)
     wg.add_relation(p[1], p[2], directed=False)
     wg.add_relation(p[2], p[3], directed=False)
     wg.add_relation(p[0], p[4], directed=False)
     wg.add_relation(p[4], p[5], directed=False)
     wg.add_relation(p[5], p[6], directed=False)
     wg.add_relation(p[3], p[6], directed=False)
     pages = wg.find_nearest_neighbours(p[0], max_dist=2)
     # Each result unpacks into a (page, distance) pair; only pages matter.
     pages = [page for page, _ in pages]
     self.assertCountEqual(pages, [p[1], p[2], p[4], p[5]])
Example #12
0
 def test_for_finding_sites(self):
     """search_webpage on the test page must yield one URL: the page's own."""
     target = WebPage("http://localhost:5000/test")
     found = search_webpage(target)
     self.assertEqual(len(found.urls), 1)
     self.assertIn("http://localhost:5000/test", list(found.urls))
Example #13
0
 def test_get_page_returns_correct_page(self):
     """get_page(url, create_new=False) must return the page added for url."""
     wg = WebGraph()
     expected = wg.add_page(
         WebPage("http://localhost:5000/test", load_page=False))
     wg.add_page(WebPage("http://localhost:5000/home", load_page=False))
     page = wg.get_page("http://localhost:5000/test", create_new=False)
     self.assertIsNotNone(page)
     # Not-None alone would also accept the "home" page; compare against
     # the page that was registered under the requested URL.
     self.assertEqual(page, expected)
Example #14
0
 def test_for_finding_emails(self):
     # Appears to be a check of e-mail extraction (it reads ``result.emails``).
     # NOTE(review): the statement below looks truncated/garbled -- ``result``
     # is never assigned in this method and the URL is masked. Restore the
     # original body from version control before relying on this test.
     page = WebPage("http://*****:*****@for.it", result.emails)
Example #15
0
 def test_reload_sets_content_and_headers_attribute(self):
     """A page built with default arguments must expose content and headers."""
     loaded = WebPage("http://localhost:5000")
     expected_content = b"<html></html>"
     expected_headers = {"Content-Type": "text/html"}
     self.assertEqual(loaded.content, expected_content)
     self.assertEqual(loaded.headers, expected_headers)
Example #16
0
 def test_add_page_adds_new_page(self):
     """A page returned by add_page must be retrievable through get_page."""
     graph = WebGraph()
     url = "http://localhost:5000/test"
     added = graph.add_page(WebPage(url, load_page=False))
     fetched = graph.get_page(url)
     self.assertEqual(added, fetched)
Example #17
0
 def test_constructor_loads_page_by_default(self, get_mock):
     """Constructing a WebPage must call get_mock once with the page URL.

     ``get_mock`` is supplied from outside this method (presumably a patch
     of the HTTP GET call -- confirm against the decorator).
     """
     # The instance itself is not needed; only the call it triggers is checked.
     WebPage("http://localhost:5000/")
     get_mock.assert_called_once_with("http://localhost:5000/", params=None)
Example #18
0
 def test_for_turning_off_loading_page_in_constructor(self, get_mock):
     """With load_page=False the constructor must not call get_mock.

     ``get_mock`` is supplied from outside this method (presumably a patch
     of the HTTP GET call -- confirm against the decorator).
     """
     # The instance itself is not needed; only the absence of a call matters.
     WebPage("http://localhost:5000", load_page=False)
     self.assertFalse(get_mock.called)
Example #19
0
 def test_pages_with_the_same_url_are_equal(self):
     """Two WebPage objects built from one URL must compare equal."""
     url = "http://www.test.page.com"
     first, second = WebPage(url), WebPage(url)
     self.assertEqual(first, second)
Example #20
0
 def test_raises_error_when_changing_url(self):
     """Assigning to the url attribute of a page must raise AttributeError."""
     page = WebPage("http://www.you.can.change.me")
     # setattr(page, "url", value) performs the same assignment as
     # ``page.url = value``; assertRaises invokes it and checks the error.
     self.assertRaises(
         AttributeError,
         setattr, page, "url", "http://www.I.can.do.everything")
Example #21
0
 def test_pages_with_the_same_url_have_similar_hash(self):
     """Two WebPage objects built from one URL must hash to the same value."""
     url = "http://www.test.page.com"
     # Both pages are created (and kept alive) before either is hashed.
     first, second = WebPage(url), WebPage(url)
     self.assertEqual(hash(first), hash(second))
Example #22
0
 def test_for_creating_webpage_from_repr(self):
     """eval(repr(page)) must rebuild an unloaded page with the same URL."""
     original = WebPage("http://www.test.page.com", load_page=False)
     # The evaluated text is the repr of an object created in this test,
     # not external input.
     rebuilt = eval(repr(original))
     self.assertEqual(rebuilt.url, original.url)
     self.assertFalse(rebuilt.loaded)
Example #23
0
 def test_add_page_called_with_parent_adds_relation_between_pages(self):
     """add_page(..., parent=root) must list the new page under root."""
     graph = WebGraph()
     parent_page = WebPage(url="http://localhost:5000/", load_page=False)
     child = graph.add_page("http://localhost:5000/test", parent=parent_page)
     neighbours = graph.graph[parent_page]
     self.assertIn(child, neighbours)
Example #24
0
 def test_max_depth_limits_depth_of_traversed_web_pages(self):
     """With max_depth=1 the search must end with exactly 11 visited pages."""
     start = WebPage("http://localhost:5000")
     manager = SearchManager(max_workers=5)
     manager.search(start, max_depth=1)
     visited_count = len(manager.visited)
     self.assertEqual(visited_count, 11)
Example #25
0
 def test_find_path_returns_None_when_page_not_in_graph(self):
     """find_path must return None when the target was never added."""
     graph = WebGraph()
     known = graph.add_page(
         WebPage("http://localhost:5000/test", load_page=False))
     # Deliberately not passed to add_page.
     stranger = WebPage("http://localhost:5000/home", load_page=False)
     outcome = graph.find_path(known, stranger)
     self.assertIsNone(outcome)
Example #26
0
 def test_updates_graph_of_web_site(self):
     """After searching with max_depth=100 the start page has 10 graph entries."""
     start = WebPage("http://localhost:5000")
     manager = SearchManager(max_workers=5)
     manager.search(start, max_depth=100)
     neighbours = manager.webgraph.graph[start]
     self.assertEqual(len(neighbours), 10)
Example #27
0
    print("\nPress CTRL+C to stop the script.\n")

    # The manager that runs the search; worker count is taken from
    # args.max_workers.
    sm = SearchManager(max_workers=args.max_workers)

    if args.verbose:

        # Progress reporter: prints the URL of the object returned by
        # future.result(). Presumably called once per finished page --
        # confirm against SearchManager's use of ``callback``.
        def complete(future):
            print("COMPLETE: %s" % future.result().url)

        sm.callback = complete

    if args.skip:
        # Register a filter built from args.skip via avoid_extensions.
        sm.add_filter(avoid_extensions(args.skip))

    # Run crawler
    sm.search(WebPage(args.url),
              max_depth=args.max_depth,
              within_domain=args.domain_limited)

    # The summary below is printed only in verbose mode.
    if args.verbose:
        print("\nEmails:")
        if sm.emails:
            for email in sm.emails:
                print("\t%s" % email)
        else:
            print("-no emails found")

        print("\nVisited web pages:")
        for page in sm.visited:
            print("\t%s" % page.url)