def test_extract_relative_urls(self):
        """Relative hrefs ("foo.html", "/sub/page/../bar.html") should be
        normalised and resolved against the crawled page URL."""
        markup = """
        <html>
        <body>
            <p>
                <a href="foo.html">FooPage</a>
            </p>
            <div>
                <div>
                    <div>
                        <div>
                            <a href="/sub/page/../bar.html">BarPage</a>
                        </div>
                    </div>
                </div>
            </div>
        </body>
        </html>
        """
        expected = [
            Link(self.crawled_page_url, "/foo.html"),
            Link(self.crawled_page_url, "/sub/bar.html"),
        ]

        extracted = LinkExtractor.extract(self.crawled_page_url, markup)

        self.assertEqual(extracted, expected)
class TestPage(unittest.TestCase):
    """Tests for Page, with LinkExtractor.extract replaced by a mock."""

    @patch("crawler.pages.page.LinkExtractor.extract",
           return_value=[
               Link("http://www.example.com", "foo/index.html"),
               Link("http://www.example.com", "bar/index.html"),
           ])
    def test_link(self, extract_mock):
        """Page keeps the Link it was built from and hands the resolved
        URL plus the raw body to the extractor."""
        source_link = Link("http://www.example.com/", "index.html")
        page = Page(source_link, "mocked_page_body")

        self.assertEqual(page.link, source_link)
        extract_mock.assert_called_with(
            "http://www.example.com/index.html", "mocked_page_body")

    @patch("crawler.pages.page.LinkExtractor.extract",
           return_value=[
               Link("http://www.example.com", "foo/index.html"),
               Link("http://www.example.com", "bar/index.html"),
           ])
    def test_out_links(self, extract_mock):
        """Page.out_links exposes exactly what the extractor returned."""
        source_link = Link("http://www.example.com/", "index.html")
        page = Page(source_link, "mocked_page_body")

        expected = [
            Link("http://www.example.com", "foo/index.html"),
            Link("http://www.example.com", "bar/index.html"),
        ]
        self.assertEqual(page.out_links, expected)
    def test_extract_multiple_links(self):
        """Both http and https absolute links on the page are extracted,
        in document order."""
        markup = """
        <html>
        <head>
            <title>Test Page Two Links</title>
        </head>
        <body>
            <p>
                <a href="http://www.example.com/foo.html">FooPage</a>
            </p>
            <div>
                <div>
                    <div>
                        <div>
                            <a href="https://www.example.com/sub/page/bar.html">BarPage</a>
                        </div>
                    </div>
                </div>
            </div>
        </body>
        </html>
        """
        expected = [
            Link(self.crawled_page_url, "/foo.html"),
            Link(self.crawled_page_url, "/sub/page/bar.html"),
        ]

        extracted = LinkExtractor.extract(self.crawled_page_url, markup)

        self.assertEqual(extracted, expected)
 def test_equal_different_not_equivalent_complex_path(self):
     """A normalised .com path must not compare equal to a .net link
     whose path happens to match after normalisation."""
     dot_com = Link(self.crawled_page,
                    "http://www.example.com/sub/path/../foo/bar.html")
     dot_net = Link(self.crawled_page,
                    "http://www.example.net/sub/path/bar.html")
     self.assertNotEqual(dot_com, dot_net)
    def test_out_links(self, mocked_link_extractor):
        """Page.out_links exposes what the (patched) extractor returned."""
        # NOTE(review): this method takes a mock argument but no @patch
        # decorator is visible here -- presumably lost when the file was
        # extracted; confirm against the original source.
        origin = Link("http://www.example.com/", "index.html")
        page = Page(origin, "mocked_page_body")

        expected = [
            Link("http://www.example.com", "foo/index.html"),
            Link("http://www.example.com", "bar/index.html"),
        ]
        self.assertEqual(page.out_links, expected)
    def test_link(self, mock_link_extractor):
        """Page keeps its source Link and calls the extractor with the
        resolved URL and raw body."""
        # NOTE(review): takes a mock argument but no @patch decorator is
        # visible here -- presumably lost in extraction; confirm.
        origin = Link("http://www.example.com/", "index.html")
        page = Page(origin, "mocked_page_body")

        self.assertEqual(page.link, origin)
        mock_link_extractor.assert_called_with(
            "http://www.example.com/index.html", "mocked_page_body")
Exemple #7
0
    def extract(crawled_page_url, page_text):
        """Given a web page will extract all <a> links, turn them into
           crawler.links.link.Link instances and return the results.

           Note: Will silently ignore all invalid (semantically, not whether
                they lead somewhere) links, and all links with an unknown
                url scheme.

           NOTE(review): the original docstring promised "every unique link",
           but nothing here deduplicates -- a href appearing twice yields two
           Link instances. Confirm which behavior callers expect.

           Args:
               crawled_page_url (string): The url of the crawled page
               page_text (string): The web page text

           Returns:
               list: List of crawler.links.link.Link instances, one per
               extracted <a href> on the page, in document order
        """

        parsed_page = PyQuery(page_text)
        links = []

        anchor_elements = parsed_page("a[href]")
        for anchor_element in anchor_elements:
            try:
                link = Link(crawled_page_url, anchor_element.attrib["href"])
                links.append(link)
            except (InvalidPathError, UnknownSchemeError):
                # Bug fix: the original used the bare expression `next`,
                # which just evaluates the builtin and does nothing;
                # `continue` states the intent of skipping the bad link.
                continue

        return links
Exemple #8
0
    def __init__(self, start_domain):
        """Set up crawler state for a fresh crawl of *start_domain*.

            Args:
                start_domain (string): The domain to start crawling
        """
        # The crawl seed: the root path of the starting domain.
        self._start_link = Link(start_domain, "/")
        # Accumulates the pages discovered during the crawl.
        self.site_map = SiteMap()
        # Frontier of links discovered but not yet fetched.
        self._links_to_visit = set()
    def test_get(self):
        """PageFetcher.get fetches the page body over HTTP (stubbed via
        `responses`) and parses its out-links."""
        responses.add(
            method=responses.GET,
            url="http://www.example.com/index.html",
            body=TestPageFetcher.MOCK_PAGE,
            status=200,
            content_type="application/html",
        )

        fetched = PageFetcher.get(
            Link("http://www.example.com/", "index.html"))

        self.assertEqual(fetched.link,
                         Link("http://www.example.com/", "index.html"))
        self.assertEqual(fetched.out_links, [
            Link("http://www.example.com/index.html", "/foo.html"),
            Link("http://www.example.com/index.html", "/sub/page/bar.html"),
        ])
    def test_extract_includes_external_links(self):
        """Links pointing outside the crawled domain are still extracted."""
        markup = """
        <html>
        <body>
            <p>
                <a href="http://www.example.com/foo.html">FooPage</a>
                <a href="http://example.com/bar.html">BarPage</a>
                <a href="http://www.example.net/baz.html">BazPage</a>
            </p>
        </body>
        </html>

        """
        expected = [
            Link(self.crawled_page_url, "/foo.html"),
            Link(self.crawled_page_url, "http://example.com/bar.html"),
            Link(self.crawled_page_url, "http://www.example.net/baz.html")
        ]

        extracted = LinkExtractor.extract(self.crawled_page_url, markup)

        self.assertEqual(extracted, expected)
    def test_extract_one_link(self):
        """A page with a single anchor yields exactly one Link."""
        markup = """
        <html>
        <head>
            <title>Test Page One Link</title>
        </head>
        <body>
            <p>
                <a href="http://www.example.com/foo.html">FooPage</a>
            </p>
        </body>
        </html>
        """
        expected = [
            Link(self.crawled_page_url, "/foo.html"),
        ]

        extracted = LinkExtractor.extract(self.crawled_page_url, markup)

        self.assertEqual(extracted, expected)
    def test_discards_invalid_links(self):
        """Unknown-scheme (ftp) and root-escaping hrefs are silently
        dropped; only the valid link survives."""
        page = """
        <html>
        <body>
            <p>
                <a href="ftp://www.example.com/foo.html">FooPage</a>
                <a href="example.com/../../bar.html">BarPage</a>
                <a href="/baz.html">BazPage</a>
            </p>
        </body>
        </html>

        """
        expected_links = [
            # Consistency fix: sibling tests build expected links from the
            # self.crawled_page_url fixture rather than hardcoding the URL,
            # so this test no longer breaks if the fixture changes.
            Link(self.crawled_page_url, "/baz.html"),
        ]

        actual_links = LinkExtractor.extract(self.crawled_page_url, page)

        self.assertEqual(actual_links, expected_links)
 def test_in_crawled_domain(self):
     """An empty href stays within the crawled domain."""
     self.assertTrue(Link(self.crawled_page, "").in_crawled_domain())
 def test_url_simple_absolute_url_different_port(self):
     """An absolute URL with an explicit port is passed through unchanged."""
     resolved = Link(self.crawled_page,
                     "http://www.example.net:123/index.html").url
     self.assertEqual(resolved, "http://www.example.net:123/index.html")
 def test_url_simple_absolute_url_same_domain(self):
     """An absolute same-domain URL is passed through unchanged."""
     resolved = Link(self.crawled_page, "http://www.example.com/index.html").url
     self.assertEqual(resolved, "http://www.example.com/index.html")
 def test_url_complex_relative_url(self):
     """A ".." segment in an absolute path is normalised away."""
     resolved = Link(self.crawled_page, "/sub/path/../path/index.html").url
     self.assertEqual(resolved, "http://www.example.com/sub/path/index.html")
 def test_equal_different_port(self):
     """Links with differing targets must not compare equal."""
     # NOTE(review): the name says "different port" but the bodies differ
     # only in path ("index.html" vs "index.htm"); a port-difference case
     # lives in test_url_simple_absolute_url_different_port. Confirm intent.
     html_link = Link(self.crawled_page, "index.html")
     htm_link = Link(self.crawled_page, "index.htm")
     self.assertNotEqual(html_link, htm_link)
 def test_in_crawled_domain_parent_domain(self):
     """The bare parent domain does not count as the crawled www domain."""
     parent = Link(self.crawled_page, "http://example.com/foo.html")
     self.assertFalse(parent.in_crawled_domain())
 def test_in_crawled_domain_with_absolute_path_different_port(self):
     """A non-default port on the same host still counts as in-domain."""
     self.assertTrue(
         Link(self.crawled_page,
              "http://www.example.com:123/foo.html").in_crawled_domain())
 def test_init_with_simple_relative_path_that_escapes_root(self):
     """A relative path that climbs above the root is rejected."""
     self.assertRaises(InvalidPathError,
                       Link, self.crawled_page, "../foo.html")
 def test_hash(self):
     """A Link hashes like its scheme-less host+path string."""
     link = Link(self.crawled_page, "/foo/bar.html")
     self.assertEqual(hash(link), hash("www.example.com/foo/bar.html"))
 def test_equal_different_tld(self):
     """Same path on a different TLD must not compare equal."""
     dot_com = Link(self.crawled_page, "http://www.example.com/index.html")
     dot_net = Link(self.crawled_page, "http://www.example.net/index.html")
     self.assertNotEqual(dot_com, dot_net)
 def test_equal_different_subdomain(self):
     """Same path on a different subdomain must not compare equal."""
     www_link = Link(self.crawled_page, "http://www.example.com/index.html")
     foo_link = Link(self.crawled_page, "http://foo.example.com/index.html")
     self.assertNotEqual(www_link, foo_link)
 def test_in_crawled_domain_with_relative_path(self):
     """A plain relative href resolves inside the crawled domain."""
     self.assertTrue(Link(self.crawled_page, "foo.html").in_crawled_domain())
 def test_init_with_unknown_scheme(self):
     """An unrecognised URL scheme is rejected at construction time."""
     self.assertRaises(UnknownSchemeError,
                       Link, self.crawled_page, "foo://127.0.0.1")
 def test_init_with_complex_relative_path_that_escapes_root(self):
     """Multiple ".." segments that climb above the root are rejected."""
     self.assertRaises(InvalidPathError,
                       Link, self.crawled_page, "/path/foo/../../../bar.html")
 def test_in_crawled_domain_different_scheme(self):
     """An https link to the same host still counts as in-domain."""
     secure = Link(self.crawled_page, "https://www.example.com/foo.html")
     self.assertTrue(secure.in_crawled_domain())
 def test_url_simple_relative_url(self):
     """A bare filename resolves against the crawled page's root."""
     resolved = Link(self.crawled_page, "index.html").url
     self.assertEqual(resolved, "http://www.example.com/index.html")
 def test_init_with_complex_absolute_path_that_escapes_root(self):
     """An absolute URL whose ".." segments escape the root is rejected."""
     self.assertRaises(
         InvalidPathError, Link, self.crawled_page,
         "http://www.example.com/path/../foo/../../index.html")
 def test_url_crawled_subpage_relative_url(self):
     """A relative href resolves against the crawled subpage's directory."""
     resolved = Link("http://www.example.com/sub/path/index.html",
                     "foo.html").url
     self.assertEqual(resolved, "http://www.example.com/sub/path/foo.html")