    def test_simple_link_matcher(self):
        # Test that the simple link matcher finds the right links.
        crawler = Crawler(follow_externals=False)

        # Here, we define:
        #   1. one link that must be followed, because it is a download link
        #   2. one link that must *not* be followed, because _is_browsable
        #      returns False for it
        #   3. one link that must be followed, because it is a browsable
        #      homepage
        #   4. one link that must be followed, because it contains an md5 hash
        self.assertTrue(crawler._is_browsable("%stest" % crawler.index_url))
        self.assertFalse(crawler._is_browsable("http://dl-link2"))
        content = """
        <a href="http://dl-link1" rel="download">download_link1</a>
        <a href="http://dl-link2" rel="homepage">homepage_link1</a>
        <a href="%(index_url)stest" rel="homepage">homepage_link2</a>
        <a href="%(index_url)stest/foobar-1.tar.gz#md5=abcdef">download_link2</a>
        """ % {'index_url': crawler.index_url}

        # Test that the simple link matcher yields the right links.
        generator = crawler._simple_link_matcher(content, crawler.index_url)
        self.assertEqual(('%stest/foobar-1.tar.gz#md5=abcdef' %
                          crawler.index_url, True), next(generator))
        self.assertEqual(('http://dl-link1', True), next(generator))
        self.assertEqual(('%stest' % crawler.index_url, False),
                         next(generator))
        self.assertRaises(StopIteration, generator.__next__)

        # Following external links is possible (e.g. homepages).
        crawler.follow_externals = True
        generator = crawler._simple_link_matcher(content, crawler.index_url)
        self.assertEqual(('%stest/foobar-1.tar.gz#md5=abcdef' %
                          crawler.index_url, True), next(generator))
        self.assertEqual(('http://dl-link1', True), next(generator))
        self.assertEqual(('http://dl-link2', False), next(generator))
        self.assertEqual(('%stest' % crawler.index_url, False),
                         next(generator))
        self.assertRaises(StopIteration, generator.__next__)
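    # A minimal sketch of the behaviour the assertions above exercise.
    # ``_demo_link_matcher`` is a hypothetical helper, not the real
    # Crawler._simple_link_matcher: it shows one plausible way to yield
    # (url, is_download) pairs, with md5-pinned links first, rel="download"
    # links always followed, and other links filtered through a
    # browsability predicate.
    @staticmethod
    def _demo_link_matcher(content, base_url, browsable):
        import re
        from urllib.parse import urljoin
        href_re = re.compile(
            r'<a\s+href="?([^" >]+)"?(?:\s+rel="(\w+)")?', re.I)
        links = [(urljoin(base_url, url), rel)
                 for url, rel in href_re.findall(content)]
        # Links carrying an md5 fragment are trusted and yielded first.
        for url, rel in links:
            if '#md5=' in url:
                yield url, True
        for url, rel in links:
            if '#md5=' in url:
                continue
            if rel == 'download':
                yield url, True          # download links: always follow
            elif browsable(url):
                yield url, False         # others: only when browsable
    # With browsable=crawler._is_browsable, this sketch reproduces the
    # yield order asserted above; it is illustrative only.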
    def test_default_link_matcher(self):
        crawler = Crawler("http://example.org", mirrors=[])
        crawler.follow_externals = True
        crawler._is_browsable = lambda *args: True
        base_url = "http://example.org/some/file/"
        content = """
        <a href="../homepage" rel="homepage">link1</a>
        <a href="../download" rel="download">link2</a>
        <a href="../simpleurl">link3</a>
        """
        found_links = set(uri for uri, _ in
                          crawler._default_link_matcher(content, base_url))
        self.assertIn('http://example.org/some/homepage', found_links)
        self.assertIn('http://example.org/some/simpleurl', found_links)
        self.assertIn('http://example.org/some/download', found_links)
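    # The expected URLs above follow standard relative-reference resolution
    # (RFC 3986): "../homepage" resolved against the base
    # "http://example.org/some/file/" climbs one path level. A minimal
    # sketch of that resolution step using the stdlib urljoin; the helper
    # itself is ours for illustration, not part of Crawler.
    @staticmethod
    def _demo_resolve(base_url, href):
        from urllib.parse import urljoin
        # urljoin("http://example.org/some/file/", "../homepage")
        # -> "http://example.org/some/homepage"
        return urljoin(base_url, href)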
    def test_is_browsable(self):
        crawler = Crawler(follow_externals=False)
        self.assertTrue(crawler._is_browsable(crawler.index_url + "test"))

        # When following externals, we can give a list of hosts to trust;
        # external links pointing to any other host are not followed.
        crawler = Crawler(hosts=["pypi.python.org", "example.org"],
                          follow_externals=True)
        good_urls = (
            "http://pypi.python.org/foo/bar",
            "http://pypi.python.org/simple/foobar",
            "http://example.org",
            "http://example.org/",
            "http://example.org/simple/",
        )
        bad_urls = (
            "http://python.org",
            "http://example.tld",
        )
        for url in good_urls:
            self.assertTrue(crawler._is_browsable(url))
        for url in bad_urls:
            self.assertFalse(crawler._is_browsable(url))

        # Allow all hosts.
        crawler = Crawler(follow_externals=True, hosts=("*",))
        self.assertTrue(crawler._is_browsable("http://an-external.link/path"))
        self.assertTrue(crawler._is_browsable("pypi.example.org/a/path"))

        # Specify a list of hosts we want to allow.
        crawler = Crawler(follow_externals=True, hosts=("*.example.org",))
        self.assertFalse(crawler._is_browsable("http://an-external.link/path"))
        self.assertTrue(
            crawler._is_browsable("http://pypi.example.org/a/path"))
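    # A hedged sketch of the host filtering the last two crawlers exercise:
    # assuming the hosts entries are fnmatch-style patterns, "*" trusts
    # every host while "*.example.org" trusts only subdomains of
    # example.org. This demo helper is illustrative only, not the real
    # Crawler._is_browsable.
    @staticmethod
    def _demo_host_allowed(url, hosts):
        from fnmatch import fnmatch
        from urllib.parse import urlparse
        netloc = urlparse(url).netloc
        # e.g. fnmatch("pypi.example.org", "*.example.org") -> True,
        # while fnmatch("an-external.link", "*.example.org") -> False
        return any(fnmatch(netloc, pattern) for pattern in hosts)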