def test_extract_relative_urls(self):
    page = """
    <html>
      <body>
        <p>
          <a href="foo.html">FooPage</a>
        </p>
        <div>
          <div>
            <div>
              <div>
                <a href="/sub/page/../bar.html">BarPage</a>
              </div>
            </div>
          </div>
        </div>
      </body>
    </html>
    """
    expected_links = [
        Link(self.crawled_page_url, "/foo.html"),
        Link(self.crawled_page_url, "/sub/bar.html"),
    ]

    actual_links = LinkExtractor.extract(self.crawled_page_url, page)

    self.assertEqual(actual_links, expected_links)
class TestPage(unittest.TestCase):

    @patch("crawler.pages.page.LinkExtractor.extract", return_value=[
        Link("http://www.example.com", "foo/index.html"),
        Link("http://www.example.com", "bar/index.html"),
    ])
    def test_link(self, mock_link_extractor):
        link = Link("http://www.example.com/", "index.html")

        page = Page(link, "mocked_page_body")

        self.assertEqual(page.link, link)
        mock_link_extractor.assert_called_with(
            "http://www.example.com/index.html", "mocked_page_body")

    @patch("crawler.pages.page.LinkExtractor.extract", return_value=[
        Link("http://www.example.com", "foo/index.html"),
        Link("http://www.example.com", "bar/index.html"),
    ])
    def test_out_links(self, mocked_link_extractor):
        link = Link("http://www.example.com/", "index.html")

        page = Page(link, "mocked_page_body")

        self.assertEqual(page.out_links, [
            Link("http://www.example.com", "foo/index.html"),
            Link("http://www.example.com", "bar/index.html"),
        ])
def test_extract_multiple_links(self):
    page = """
    <html>
      <head>
        <title>Test Page Two Links</title>
      </head>
      <body>
        <p>
          <a href="http://www.example.com/foo.html">FooPage</a>
        </p>
        <div>
          <div>
            <div>
              <div>
                <a href="https://www.example.com/sub/page/bar.html">BarPage</a>
              </div>
            </div>
          </div>
        </div>
      </body>
    </html>
    """
    expected_links = [
        Link(self.crawled_page_url, "/foo.html"),
        Link(self.crawled_page_url, "/sub/page/bar.html"),
    ]

    actual_links = LinkExtractor.extract(self.crawled_page_url, page)

    self.assertEqual(actual_links, expected_links)
def test_equal_different_not_equivalent_complex_path(self):
    self.assertNotEqual(
        Link(self.crawled_page, "http://www.example.com/sub/path/../foo/bar.html"),
        Link(self.crawled_page, "http://www.example.com/sub/path/bar.html"),
    )
def extract(crawled_page_url, page_text):
    """Given a web page, extract all <a> links, turn them into
    crawler.links.link.Link instances and return the results.

    Note:
        Silently ignores links that are semantically invalid (not links that
        merely lead nowhere) and links with an unknown url scheme.

    Args:
        crawled_page_url (string): The url of the crawled page
        page_text (string): The web page text

    Returns:
        list: List of crawler.links.link.Link instances representing every
            unique link on the page
    """
    parsed_page = PyQuery(page_text)
    links = []

    anchor_elements = parsed_page("a[href]")
    for anchor_element in anchor_elements:
        try:
            link = Link(crawled_page_url, anchor_element.attrib["href"])
            links.append(link)
        except (InvalidPathError, UnknownSchemeError):
            # Skip hrefs that cannot be resolved into a valid Link.
            continue

    return links
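# Usage sketch for LinkExtractor.extract, illustrative only: the sample markup and
# page url below are invented for demonstration. LinkExtractor's own module is not
# shown in this excerpt, but the tests patch it as crawler.pages.page.LinkExtractor,
# so that name is used here.
from crawler.pages.page import LinkExtractor

SAMPLE_PAGE = """
<html>
  <body>
    <a href="/about.html">About</a>
    <a href="ftp://www.example.com/file.txt">Download</a>
  </body>
</html>
"""

links = LinkExtractor.extract("http://www.example.com", SAMPLE_PAGE)
# The ftp:// href should be skipped (unknown scheme, cf. test_discards_invalid_links),
# leaving a single Link that resolves to "http://www.example.com/about.html".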
def __init__(self, start_domain):
    """Initialiser

    Args:
        start_domain (string): The domain to start crawling
    """
    self._start_link = Link(start_domain, "/")
    self.site_map = SiteMap()
    self._links_to_visit = set()
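# Construction sketch, illustrative only: the enclosing class (assumed here to be
# named Crawler) and its import path are not shown in this excerpt, so the name
# below is a placeholder for whichever class owns this initialiser.
crawler = Crawler("http://www.example.com")  # assumes Crawler is already in scope

# After construction the crawl is anchored at Link("http://www.example.com", "/"),
# site_map is an empty SiteMap, and _links_to_visit is an empty set.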
def test_get(self):
    responses.add(
        **{
            "method": responses.GET,
            "url": "http://www.example.com/index.html",
            "body": TestPageFetcher.MOCK_PAGE,
            "status": 200,
            "content_type": "application/html",
        })
    expected_out_links = [
        Link("http://www.example.com/index.html", "/foo.html"),
        Link("http://www.example.com/index.html", "/sub/page/bar.html"),
    ]
    expected_link = Link("http://www.example.com/", "index.html")

    actual_page = PageFetcher.get(
        Link("http://www.example.com/", "index.html"))

    self.assertEqual(actual_page.link, expected_link)
    self.assertEqual(actual_page.out_links, expected_out_links)
def test_extract_includes_external_links(self):
    page = """
    <html>
      <body>
        <p>
          <a href="http://www.example.com/foo.html">FooPage</a>
          <a href="http://example.com/bar.html">BarPage</a>
          <a href="http://www.example.net/baz.html">BazPage</a>
        </p>
      </body>
    </html>
    """
    expected_links = [
        Link(self.crawled_page_url, "/foo.html"),
        Link(self.crawled_page_url, "http://example.com/bar.html"),
        Link(self.crawled_page_url, "http://www.example.net/baz.html"),
    ]

    actual_links = LinkExtractor.extract(self.crawled_page_url, page)

    self.assertEqual(actual_links, expected_links)
def test_extract_one_link(self):
    page = """
    <html>
      <head>
        <title>Test Page One Link</title>
      </head>
      <body>
        <p>
          <a href="http://www.example.com/foo.html">FooPage</a>
        </p>
      </body>
    </html>
    """
    expected_links = [
        Link(self.crawled_page_url, "/foo.html"),
    ]

    actual_links = LinkExtractor.extract(self.crawled_page_url, page)

    self.assertEqual(actual_links, expected_links)
def test_discards_invalid_links(self):
    page = """
    <html>
      <body>
        <p>
          <a href="ftp://www.example.com/foo.html">FooPage</a>
          <a href="example.com/../../bar.html">BarPage</a>
          <a href="/baz.html">BazPage</a>
        </p>
      </body>
    </html>
    """
    expected_links = [
        Link("http://www.example.com", "/baz.html"),
    ]

    actual_links = LinkExtractor.extract(self.crawled_page_url, page)

    self.assertEqual(actual_links, expected_links)
def test_in_crawled_domain(self):
    link = Link(self.crawled_page, "")

    self.assertTrue(link.in_crawled_domain())
def test_url_simple_absolute_url_different_port(self):
    self.assertEqual(
        Link(self.crawled_page, "http://www.example.net:123/index.html").url,
        "http://www.example.net:123/index.html",
    )
def test_url_simple_absolute_url_same_domain(self):
    self.assertEqual(
        Link(self.crawled_page, "http://www.example.com/index.html").url,
        "http://www.example.com/index.html",
    )
def test_url_complex_relative_url(self):
    self.assertEqual(
        Link(self.crawled_page, "/sub/path/../path/index.html").url,
        "http://www.example.com/sub/path/index.html",
    )
def test_equal_different_path(self):
    self.assertNotEqual(
        Link(self.crawled_page, "index.html"),
        Link(self.crawled_page, "index.htm"),
    )
def test_in_crawled_domain_parent_domain(self):
    link = Link(self.crawled_page, "http://example.com/foo.html")

    self.assertFalse(link.in_crawled_domain())
def test_in_crawled_domain_with_absolute_path_different_port(self):
    link = Link(self.crawled_page, "http://www.example.com:123/foo.html")

    self.assertTrue(link.in_crawled_domain())
def test_init_with_simple_relative_path_that_escapes_root(self):
    with self.assertRaises(InvalidPathError):
        Link(self.crawled_page, "../foo.html")
def test_hash(self):
    self.assertEqual(
        Link(self.crawled_page, "/foo/bar.html").__hash__(),
        hash("www.example.com/foo/bar.html"),
    )
def test_equal_different_tld(self):
    self.assertNotEqual(
        Link(self.crawled_page, "http://www.example.com/index.html"),
        Link(self.crawled_page, "http://www.example.net/index.html"),
    )
def test_equal_different_subdomain(self):
    self.assertNotEqual(
        Link(self.crawled_page, "http://www.example.com/index.html"),
        Link(self.crawled_page, "http://foo.example.com/index.html"),
    )
def test_in_crawled_domain_with_relative_path(self):
    link = Link(self.crawled_page, "foo.html")

    self.assertTrue(link.in_crawled_domain())
def test_init_with_unknown_scheme(self):
    with self.assertRaises(UnknownSchemeError):
        Link(self.crawled_page, "foo://127.0.0.1")
def test_init_with_complex_relative_path_that_escapes_root(self):
    with self.assertRaises(InvalidPathError):
        Link(self.crawled_page, "/path/foo/../../../bar.html")
def test_in_crawled_domain_different_scheme(self):
    link = Link(self.crawled_page, "https://www.example.com/foo.html")

    self.assertTrue(link.in_crawled_domain())
def test_url_simple_relative_url(self):
    self.assertEqual(
        Link(self.crawled_page, "index.html").url,
        "http://www.example.com/index.html",
    )
def test_init_with_complex_absolute_path_that_escapes_root(self):
    with self.assertRaises(InvalidPathError):
        Link(self.crawled_page,
             "http://www.example.com/path/../foo/../../index.html")
def test_url_crawled_subpage_relative_url(self):
    self.assertEqual(
        Link("http://www.example.com/sub/path/index.html", "foo.html").url,
        "http://www.example.com/sub/path/foo.html",
    )