def test_for_adding_relation_between_pages(self):
    """An undirected relation makes each page appear in the other's graph entry."""
    first = WebPage("test1", load_page=False)
    second = WebPage("test2", load_page=False)
    web_graph = WebGraph()

    web_graph.add_relation(first, second, directed=False)

    # Both directions must be present because directed=False was requested.
    self.assertIn(second, web_graph.graph[first])
    self.assertIn(first, web_graph.graph[second])
def test_for_presence_of_page_in_graph(self):
    """Check ``in`` on a WebGraph for a URL string, an added page and a page never added."""
    wg = WebGraph()
    p1 = wg.add_page(WebPage("http://localhost:5000/test", load_page=False))
    p2 = WebPage("http://localhost:5000/home", load_page=False)

    # assertIn/assertNotIn report both operands on failure, whereas
    # assertTrue(x in y) can only say "False is not true".
    self.assertIn("http://localhost:5000/test", wg)
    self.assertIn(p1, wg)
    self.assertNotIn(p2, wg)
def test_get_page_with_getitem_operator(self):
    """get_page, given a URL string or a page object, equals the page add_page returned.

    NOTE(review): despite the test name, only ``get_page`` is called here;
    the ``[]`` operator is never exercised -- confirm whether that is intended.
    """
    graph = WebGraph()
    test_page = graph.add_page(
        WebPage("http://localhost:5000/test", load_page=False))
    home_page = graph.add_page(
        WebPage("http://localhost:5000/home", load_page=False))

    found_by_url = graph.get_page("http://localhost:5000/test")
    self.assertEqual(found_by_url, test_page)

    found_by_page = graph.get_page(home_page)
    self.assertEqual(found_by_page, home_page)
def test_add_relation_does_not_recreate_pages(self):
    """Relating two pages that were already added leaves len(graph) at 2."""
    first = WebPage("test1", load_page=False)
    second = WebPage("test2", load_page=False)
    graph = WebGraph()
    for page in (first, second):
        graph.add_page(page)

    graph.add_relation(first, second, directed=False)

    self.assertEqual(len(graph), 2)
def test_path_between_directly_related_pages_contains_only_them(self):
    """find_path between two directly related pages holds exactly those two pages."""
    graph = WebGraph()
    start = graph.add_page(
        WebPage("http://localhost:5000/test", load_page=False))
    goal = graph.add_page(
        WebPage("http://localhost:5000/home", load_page=False))
    graph.add_relation(start, goal, directed=False)

    route = graph.find_path(start, goal)

    # assertCountEqual ignores ordering; only membership and counts are checked.
    self.assertCountEqual(route, (start, goal))
def test_reload_only_headers(self, head_mock):
    """reload(head_request=True) copies content and headers from head_mock's response."""
    # Keyword arguments to Mock() are set as attributes on the new mock.
    head_mock.return_value = Mock(
        content=b"",
        headers={"Content-Type": "text/html"},
    )
    target = WebPage("http://localhost:5000", load_page=False)

    target.reload(head_request=True)

    self.assertEqual(target.content, b"")
    self.assertEqual(target.headers, {"Content-Type": "text/html"})
def test_find_path_finds_indirect_paths(self):
    """find_path connects two pages that are only linked through a third one."""
    graph = WebGraph()
    urls = (
        "http://localhost:5000/test",
        "http://localhost:5000/home",
        "http://localhost:5000/new",
    )
    first, middle, last = [
        graph.add_page(WebPage(url, load_page=False)) for url in urls
    ]
    # Chain: first - middle - last (no direct first - last edge).
    graph.add_relation(first, middle, directed=False)
    graph.add_relation(middle, last, directed=False)
    self.assertEqual(len(graph), 3)

    route = graph.find_path(first, last)

    self.assertCountEqual(route, (first, middle, last))
def test_find_path_returns_None_when_no_path(self):
    """find_path yields None for pages that sit in two unconnected pairs."""
    graph = WebGraph()
    urls = (
        "http://localhost:5000/test",
        "http://localhost:5000/home",
        "http://localhost:5000/next",
        "http://localhost:5000/prev",
    )
    test_page, home_page, next_page, prev_page = [
        graph.add_page(WebPage(url, load_page=False)) for url in urls
    ]
    # Two separate pairs with no edge between them.
    graph.add_relation(test_page, home_page, directed=False)
    graph.add_relation(next_page, prev_page, directed=False)

    self.assertIsNone(graph.find_path(test_page, prev_page))
def test_for_extracting_urls(self, get_mock):
    """find_urls must return exactly one link, ``test.html``.

    The mocked page body also contains a ``mailto:`` anchor and plain contact
    text, neither of which is expected in the result.
    """
    response = Mock()
    response.content = b""" <html> <body> <a href='test.html'>Test</a> <a href="mailto:[email protected]">E-Mail</a> Contact: [email protected] </body> </html> """
    response.encoding = "utf-8"
    response.headers = {"Content-Type": "text/html"}
    get_mock.return_value = response

    page = WebPage("http://localhost:5000", load_page=False)
    page.reload()
    links = find_urls(page)

    # Was assertTrue(len(links), 1): that treats 1 as the failure message and
    # passes for any non-empty result, so the count was never actually checked.
    self.assertEqual(len(links), 1)
    self.assertEqual(links[0], "test.html")
def test_find_paths_finds_the_shortest_path(self):
    """With a long chain and a shortcut edge, find_path uses the shortcut."""
    graph = WebGraph()
    urls = (
        "http://localhost:5000/test",
        "http://localhost:5000/home",
        "http://localhost:5000/next",
        "http://localhost:5000/prev",
        "http://localhost:5000/back",
    )
    test_page, home_page, next_page, prev_page, back_page = [
        graph.add_page(WebPage(url, load_page=False)) for url in urls
    ]
    edges = (
        (test_page, home_page),
        (home_page, next_page),
        (next_page, prev_page),
        (prev_page, back_page),
        (test_page, prev_page),  # shortcut that skips home and next
    )
    for left, right in edges:
        graph.add_relation(left, right, directed=False)

    route = graph.find_path(test_page, back_page)

    self.assertCountEqual(route, (test_page, prev_page, back_page))
def test_find_nearest_neighbours_returns_closest_pages(self):
    """find_nearest_neighbours(p[0], max_dist=2) yields exactly pages 1, 2, 4 and 5."""
    wg = WebGraph()
    p = [
        wg.add_page(WebPage("fake %d" % i, load_page=False))
        for i in range(7)
    ]

    # Graph layout (all edges undirected). This used to be a bare string
    # statement containing the invalid escape "\ "; a comment has no such
    # problem and is not evaluated at runtime.
    #
    #     1 - 2 - 3
    #   /         |
    #  0          |
    #   \         |
    #     4 - 5 - 6
    #
    wg.add_relation(p[0], p[1], directed=False)
    wg.add_relation(p[1], p[2], directed=False)
    wg.add_relation(p[2], p[3], directed=False)
    wg.add_relation(p[0], p[4], directed=False)
    wg.add_relation(p[4], p[5], directed=False)
    wg.add_relation(p[5], p[6], directed=False)
    wg.add_relation(p[3], p[6], directed=False)

    pages = wg.find_nearest_neighbours(p[0], max_dist=2)
    # Each result item unpacks as (page, distance); only the pages are compared.
    pages = [page for page, dist in pages]

    self.assertCountEqual(pages, [p[1], p[2], p[4], p[5]])
def test_for_finding_sites(self):
    """search_webpage reports exactly one URL, and it is the expected one."""
    found = search_webpage(WebPage("http://localhost:5000/test"))

    self.assertEqual(len(found.urls), 1)
    self.assertIn("http://localhost:5000/test", list(found.urls))
def test_get_page_returns_correct_page(self):
    """get_page(url, create_new=False) returns something for an added URL."""
    graph = WebGraph()
    for url in ("http://localhost:5000/test", "http://localhost:5000/home"):
        graph.add_page(WebPage(url, load_page=False))

    found = graph.get_page("http://localhost:5000/test", create_new=False)

    self.assertIsNotNone(found)
def test_for_finding_emails(self):
    """Presumably checks that a page search finds an e-mail address -- TODO confirm.

    NOTE(review): this body looks garbled or redacted. The URL literal is
    masked with asterisks, and ``result`` is not assigned anywhere in this
    function, so unless a module-level ``result`` exists the call below raises
    NameError. Restore the original statements from version control.
    """
    page = WebPage("http://*****:*****@for.it", result.emails)
def test_reload_sets_content_and_headers_attribute(self):
    """A page built with default arguments exposes the expected content and headers.

    NOTE(review): presumably the HTTP layer is stubbed outside this method
    (fixture or decorator not visible here) -- confirm.
    """
    loaded = WebPage("http://localhost:5000")

    self.assertEqual(loaded.content, b"<html></html>")
    self.assertEqual(loaded.headers, {"Content-Type": "text/html"})
def test_add_page_adds_new_page(self):
    """The page returned by add_page equals what get_page returns for its URL."""
    graph = WebGraph()
    added = graph.add_page(
        WebPage("http://localhost:5000/test", load_page=False))

    looked_up = graph.get_page("http://localhost:5000/test")

    self.assertEqual(added, looked_up)
def test_constructor_loads_page_by_default(self, get_mock):
    """Constructing a WebPage with default arguments calls get_mock exactly once.

    The call is expected to receive the page URL and ``params=None``.
    """
    # The instance itself is not needed; only the call it triggers is checked,
    # so the result is deliberately not bound to an (unused) local.
    WebPage("http://localhost:5000/")

    get_mock.assert_called_once_with("http://localhost:5000/", params=None)
def test_for_turning_off_loading_page_in_constructor(self, get_mock):
    """Constructing a WebPage with load_page=False must not call get_mock."""
    # The instance itself is not needed; only the absence of a call is checked,
    # so the result is deliberately not bound to an (unused) local.
    WebPage("http://localhost:5000", load_page=False)

    self.assertFalse(get_mock.called)
def test_pages_with_the_same_url_are_equal(self):
    """Two WebPage objects built from the same URL compare equal."""
    url = "http://www.test.page.com"
    left = WebPage(url)
    right = WebPage(url)

    self.assertEqual(left, right)
def test_raises_error_when_changing_url(self):
    """Assigning to ``url`` on an existing page raises AttributeError."""
    page = WebPage("http://www.you.can.change.me")

    # setattr(obj, "url", value) is the function form of ``obj.url = value``.
    self.assertRaises(
        AttributeError,
        setattr, page, "url", "http://www.I.can.do.everything",
    )
def test_pages_with_the_same_url_have_similar_hash(self):
    """Two WebPage objects built from the same URL produce the same hash."""
    url = "http://www.test.page.com"
    left = WebPage(url)
    right = WebPage(url)

    left_hash = hash(left)
    right_hash = hash(right)

    self.assertEqual(left_hash, right_hash)
def test_for_creating_webpage_from_repr(self):
    """Evaluating repr(page) rebuilds a page with the same URL that is not loaded."""
    original = WebPage("http://www.test.page.com", load_page=False)

    # The repr comes from an object built in this test, not from external input.
    rebuilt = eval(repr(original))

    self.assertEqual(rebuilt.url, original.url)
    self.assertFalse(rebuilt.loaded)
def test_add_page_called_with_parent_adds_relation_between_pages(self):
    """add_page(url, parent=...) puts the new page into the parent's graph entry."""
    graph = WebGraph()
    parent_page = WebPage(url="http://localhost:5000/", load_page=False)

    child = graph.add_page("http://localhost:5000/test", parent=parent_page)

    self.assertIn(child, graph.graph[parent_page])
def test_max_depth_limits_depth_of_traversed_web_pages(self):
    """A search limited to max_depth=1 leaves 11 entries in ``visited``."""
    start_page = WebPage("http://localhost:5000")
    crawler = SearchManager(max_workers=5)

    crawler.search(start_page, max_depth=1)

    visited_count = len(crawler.visited)
    self.assertEqual(visited_count, 11)
def test_find_path_returns_None_when_page_not_in_graph(self):
    """find_path yields None when the target page was never added to the graph."""
    graph = WebGraph()
    known = graph.add_page(
        WebPage("http://localhost:5000/test", load_page=False))
    # Built directly, never passed to add_page.
    unknown = WebPage("http://localhost:5000/home", load_page=False)

    route = graph.find_path(known, unknown)

    self.assertIsNone(route)
def test_updates_graph_of_web_site(self):
    """After a search with max_depth=100 the start page has 10 graph entries."""
    start_page = WebPage("http://localhost:5000")
    crawler = SearchManager(max_workers=5)

    crawler.search(start_page, max_depth=100)

    related = crawler.webgraph.graph[start_page]
    self.assertEqual(len(related), 10)
# Tail of the command-line entry point: build a SearchManager from the parsed
# arguments in `args`, run the search, then print the collected results.
# NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
# source. In particular, confirm which statements the second
# `if args.verbose:` is meant to guard (only the "Emails:" header, the e-mail
# section, or the whole report as laid out here).
print("\nPress CTRL+C to stop the script.\n")

sm = SearchManager(max_workers=args.max_workers)

if args.verbose:
    # Presumably invoked by SearchManager with each finished future -- confirm.
    def complete(future):
        print("COMPLETE: %s" % future.result().url)
    sm.callback = complete

if args.skip:
    # Register a filter built from args.skip; presumably it excludes URLs with
    # the listed extensions -- verify against avoid_extensions.
    sm.add_filter(avoid_extensions(args.skip))

# Run crawler
sm.search(WebPage(args.url), max_depth=args.max_depth,
          within_domain=args.domain_limited)

if args.verbose:
    print("\nEmails:")
    if sm.emails:
        for email in sm.emails:
            print("\t%s" % email)
    else:
        print("-no emails found")

    print("\nVisited web pages:")
    for page in sm.visited:
        print("\t%s" % page.url)