def test_default_link_matcher(self):
    # The default link matcher should return every anchor of the page,
    # with relative hrefs resolved against the URL of the page itself,
    # whether or not the anchor carries a rel attribute.
    page_url = "http://example.org/some/file/"
    html = (
        ' <a href="../homepage" rel="homepage">link</a>'
        ' <a href="../download" rel="download">link2</a>'
        ' <a href="../simpleurl">link2</a> '
    )

    crawler = Crawler("http://example.org", mirrors=[])
    crawler.follow_externals = True
    # Replace the browsability check with one that accepts any URL.
    crawler._is_browsable = lambda *args: True

    # Only the URL of each yielded pair matters here; the second item of
    # the pair is ignored.
    matches = crawler._default_link_matcher(html, page_url)
    found_links = {uri for uri, _ in matches}

    expected_links = (
        'http://example.org/some/homepage',
        'http://example.org/some/simpleurl',
        'http://example.org/some/download',
    )
    for expected in expected_links:
        self.assertIn(expected, found_links)
def test_simple_link_matcher(self):
    # Test that the simple link matcher finds the right links, both when
    # external links are ignored and when they are followed.
    crawler = Crawler(follow_externals=False)
    index_url = crawler.index_url
    index_page = '%stest' % index_url
    hashed_archive = '%stest/foobar-1.tar.gz#md5=abcdef' % index_url

    # Preconditions on the fixture: a page located under the index is
    # browsable, while the external homepage is not.
    self.assertTrue(crawler._is_browsable(index_page))
    self.assertFalse(crawler._is_browsable("http://dl-link2"))

    # The page contains four anchors:
    #   - an external link tagged rel="download",
    #   - an external link tagged rel="homepage" (not browsable),
    #   - a rel="homepage" link under the index (browsable),
    #   - a link under the index whose URL ends with an md5 fragment.
    # NOTE(review): the href of the last anchor has no closing quote; it
    # is reproduced verbatim -- confirm whether this is intentional.
    content = (
        ' <a href="http://dl-link1" rel="download">download_link1</a>'
        ' <a href="http://dl-link2" rel="homepage">homepage_link1</a>'
        ' <a href="%(index_url)stest" rel="homepage">homepage_link2</a>'
        ' <a href="%(index_url)stest/foobar-1.tar.gz#md5=abcdef>'
        'download_link2</a> '
    ) % {'index_url': index_url}

    def check_yielded(expected_pairs):
        # The matcher must yield exactly these (url, flag) pairs, in this
        # order, and then be exhausted.
        found = crawler._simple_link_matcher(content, index_url)
        for pair in expected_pairs:
            self.assertEqual(pair, next(found))
        self.assertRaises(StopIteration, next, found)

    # Without following externals, the non-browsable external homepage
    # is not yielded.
    check_yielded([
        (hashed_archive, True),
        ('http://dl-link1', True),
        (index_page, False),
    ])

    # Once externals are followed, the external homepage shows up too.
    crawler.follow_externals = True
    check_yielded([
        (hashed_archive, True),
        ('http://dl-link1', True),
        ('http://dl-link2', False),
        (index_page, False),
    ])