Example #1
0
    def test_simple_link_matcher(self):
        # The simple link matcher must yield exactly the expected
        # (url, flag) pairs, in order, both when external links are ignored
        # and when they are followed.
        crawler = Crawler(follow_externals=False)
        index_url = crawler.index_url
        homepage_url = "%stest" % index_url
        archive_url = '%stest/foobar-1.tar.gz#md5=abcdef' % index_url

        # Preconditions for the fixture below: a URL under the index is
        # browsable, the external homepage is not (externals are off).
        self.assertTrue(crawler._is_browsable(homepage_url))
        self.assertFalse(crawler._is_browsable("http://dl-link2"))

        # The page holds four anchors:
        #   1. a rel="download" link, which has to be yielded;
        #   2. an external rel="homepage" link, which has to be skipped while
        #      _is_browsable() returns False for it;
        #   3. a rel="homepage" link under the index, which is browsable and
        #      therefore yielded;
        #   4. a link carrying an md5 fragment, which has to be yielded.
        # NOTE(review): the href of anchor 4 has no closing quote. It is
        # unclear from here whether that is deliberate (lenient parsing) or a
        # typo -- confirm before "fixing" the markup.
        page = """
        <a href="http://dl-link1" rel="download">download_link1</a>
        <a href="http://dl-link2" rel="homepage">homepage_link1</a>
        <a href="%(index_url)stest" rel="homepage">homepage_link2</a>
        <a href="%(index_url)stest/foobar-1.tar.gz#md5=abcdef>download_link2</a>
        """ % {'index_url': index_url}

        def assert_yields(expected_links):
            # Consume a fresh matcher item by item, then make sure it is
            # exhausted right after the last expected pair.
            links = crawler._simple_link_matcher(page, index_url)
            for expected in expected_links:
                self.assertEqual(expected, next(links))
            self.assertRaises(StopIteration, next, links)

        # Externals disabled: the external homepage must not show up.
        assert_yields([
            (archive_url, True),
            ('http://dl-link1', True),
            (homepage_url, False),
        ])

        # Externals enabled: the external homepage is now yielded as well.
        crawler.follow_externals = True
        assert_yields([
            (archive_url, True),
            ('http://dl-link1', True),
            ('http://dl-link2', False),
            (homepage_url, False),
        ])
Example #2
0
    def test_default_link_matcher(self):
        crawler = Crawler("http://example.org", mirrors=[])
        crawler.follow_externals = True
        crawler._is_browsable = lambda *args: True
        base_url = "http://example.org/some/file/"
        content = """
<a href="../homepage" rel="homepage">link</a>
<a href="../download" rel="download">link2</a>
<a href="../simpleurl">link2</a>
        """
        found_links = set(uri for uri, _ in
                          crawler._default_link_matcher(content, base_url))
        self.assertIn('http://example.org/some/homepage', found_links)
        self.assertIn('http://example.org/some/simpleurl', found_links)
        self.assertIn('http://example.org/some/download', found_links)
Example #3
0
    def test_is_browsable(self):
        # Exercise _is_browsable() under four crawler configurations.

        # 1. Externals off: a URL below the index itself is browsable.
        internal_only = Crawler(follow_externals=False)
        index_page = internal_only.index_url + "test"
        self.assertTrue(internal_only._is_browsable(index_page))

        # 2. Externals on with an explicit host list: only URLs on the
        #    listed hosts are browsable, any other external URL is refused.
        restricted = Crawler(hosts=["pypi.python.org", "example.org"],
                             follow_externals=True)
        trusted_urls = (
            "http://pypi.python.org/foo/bar",
            "http://pypi.python.org/simple/foobar",
            "http://example.org",
            "http://example.org/",
            "http://example.org/simple/",
        )
        untrusted_urls = (
            "http://python.org",
            "http://example.tld",
        )

        for trusted in trusted_urls:
            self.assertTrue(restricted._is_browsable(trusted))

        for untrusted in untrusted_urls:
            self.assertFalse(restricted._is_browsable(untrusted))

        # 3. A lone "*" host pattern lets every host through, including a
        #    URL written without a scheme.
        permissive = Crawler(follow_externals=True, hosts=("*",))
        self.assertTrue(
            permissive._is_browsable("http://an-external.link/path"))
        self.assertTrue(permissive._is_browsable("pypi.example.org/a/path"))

        # 4. A wildcard pattern restricts browsing to the matching hosts.
        subdomains_only = Crawler(follow_externals=True,
                                  hosts=("*.example.org",))
        self.assertFalse(
            subdomains_only._is_browsable("http://an-external.link/path"))
        self.assertTrue(
            subdomains_only._is_browsable("http://pypi.example.org/a/path"))